diff options
Diffstat (limited to 'arch/x86')
323 files changed, 38571 insertions, 24286 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80b7ba4056d..65b449134cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,81 +17,69 @@ config X86_64 ### Arch settings config X86 - bool - default y + def_bool y + +config GENERIC_LOCKBREAK + def_bool n config GENERIC_TIME - bool - default y + def_bool y config GENERIC_CMOS_UPDATE - bool - default y + def_bool y config CLOCKSOURCE_WATCHDOG - bool - default y + def_bool y config GENERIC_CLOCKEVENTS - bool - default y + def_bool y config GENERIC_CLOCKEVENTS_BROADCAST - bool - default y + def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) config LOCKDEP_SUPPORT - bool - default y + def_bool y config STACKTRACE_SUPPORT - bool - default y + def_bool y config SEMAPHORE_SLEEPERS - bool - default y + def_bool y config MMU - bool - default y + def_bool y config ZONE_DMA - bool - default y + def_bool y config QUICKLIST - bool - default X86_32 + def_bool X86_32 config SBUS bool config GENERIC_ISA_DMA - bool - default y + def_bool y config GENERIC_IOMAP - bool - default y + def_bool y config GENERIC_BUG - bool - default y + def_bool y depends on BUG config GENERIC_HWEIGHT - bool - default y + def_bool y + +config GENERIC_GPIO + def_bool n config ARCH_MAY_HAVE_PC_FDC - bool - default y + def_bool y config DMI - bool - default y + def_bool y config RWSEM_GENERIC_SPINLOCK def_bool !X86_XADD @@ -112,10 +100,14 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config HAVE_SETUP_PER_CPU_AREA + def_bool X86_64 + config ARCH_SUPPORTS_OPROFILE bool default y +select HAVE_KVM config ZONE_DMA32 bool @@ -144,9 +136,17 @@ config GENERIC_PENDING_IRQ config X86_SMP bool - depends on X86_32 && SMP && !X86_VOYAGER + depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) default y +config X86_32_SMP + def_bool y + depends on X86_32 && SMP + +config X86_64_SMP + def_bool y + depends on X86_64 && SMP + config X86_HT bool depends on SMP @@ -292,6 +292,18 @@ config X86_ES7000 Only choose this option if you have such a system, otherwise you should say N here. +config X86_RDC321X + bool "RDC R-321x SoC" + depends on X86_32 + select M486 + select X86_REBOOTFIXUPS + select GENERIC_GPIO + select LEDS_GPIO + help + This option is needed for RDC R-321x system-on-chip, also known + as R-8610-(G). + If you don't have one of these chips, you should say N here. + config X86_VSMP bool "Support for ScaleMP vSMP" depends on X86_64 && PCI @@ -303,8 +315,8 @@ config X86_VSMP endchoice config SCHED_NO_NO_OMIT_FRAME_POINTER - bool "Single-depth WCHAN output" - default y + def_bool y + prompt "Single-depth WCHAN output" depends on X86_32 help Calculate simpler /proc/<PID>/wchan values. If this option @@ -314,18 +326,8 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER If in doubt, say "Y". -config PARAVIRT - bool - depends on X86_32 && !(X86_VISWS || X86_VOYAGER) - help - This changes the kernel so it can modify itself when it is run - under a hypervisor, potentially improving performance significantly - over full virtualization. However, when run without a hypervisor - the kernel is theoretically slower and slightly larger. - menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" - depends on X86_32 help Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -339,6 +341,7 @@ source "arch/x86/xen/Kconfig" config VMI bool "VMI Guest support" select PARAVIRT + depends on X86_32 depends on !(X86_VISWS || X86_VOYAGER) help VMI provides a paravirtualized interface to the VMware ESX server @@ -348,40 +351,43 @@ config VMI source "arch/x86/lguest/Kconfig" +config PARAVIRT + bool "Enable paravirtualization code" + depends on !(X86_VISWS || X86_VOYAGER) + help + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. However, when run without a hypervisor + the kernel is theoretically slower and slightly larger. + endif config ACPI_SRAT - bool - default y + def_bool y depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) select ACPI_NUMA config HAVE_ARCH_PARSE_SRAT - bool - default y - depends on ACPI_SRAT + def_bool y + depends on ACPI_SRAT config X86_SUMMIT_NUMA - bool - default y + def_bool y depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) config X86_CYCLONE_TIMER - bool - default y + def_bool y depends on X86_32 && X86_SUMMIT || X86_GENERICARCH config ES7000_CLUSTERED_APIC - bool - default y + def_bool y depends on SMP && X86_ES7000 && MPENTIUMIII source "arch/x86/Kconfig.cpu" config HPET_TIMER - bool + def_bool X86_64 prompt "HPET Timer Support" if X86_32 - default X86_64 help Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -399,9 +405,8 @@ config HPET_TIMER Choose N to continue using the legacy 8254 timer. config HPET_EMULATE_RTC - bool - depends on HPET_TIMER && RTC=y - default y + def_bool y + depends on HPET_TIMER && (RTC=y || RTC=m) # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. @@ -441,8 +446,8 @@ config CALGARY_IOMMU If unsure, say Y. config CALGARY_IOMMU_ENABLED_BY_DEFAULT - bool "Should Calgary be enabled by default?" - default y + def_bool y + prompt "Should Calgary be enabled by default?" depends on CALGARY_IOMMU help Should Calgary be enabled by default? if you choose 'y', Calgary @@ -486,9 +491,9 @@ config SCHED_SMT N here. config SCHED_MC - bool "Multi-core scheduler support" + def_bool y + prompt "Multi-core scheduler support" depends on (X86_64 && SMP) || (X86_32 && X86_HT) - default y help Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly @@ -522,19 +527,16 @@ config X86_UP_IOAPIC an IO-APIC, then the kernel will still run with no slowdown at all. config X86_LOCAL_APIC - bool + def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) - default y config X86_IO_APIC - bool + def_bool y depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) - default y config X86_VISWS_APIC - bool + def_bool y depends on X86_32 && X86_VISWS - default y config X86_MCE bool "Machine Check Exception" @@ -554,17 +556,17 @@ config X86_MCE the 386 and 486, so nearly everyone can say Y here. config X86_MCE_INTEL - bool "Intel MCE features" + def_bool y + prompt "Intel MCE features" depends on X86_64 && X86_MCE && X86_LOCAL_APIC - default y help Additional support for intel specific MCE features such as the thermal monitor. config X86_MCE_AMD - bool "AMD MCE features" + def_bool y + prompt "AMD MCE features" depends on X86_64 && X86_MCE && X86_LOCAL_APIC - default y help Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -637,9 +639,9 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - bool "Enable X86 board specific fixups for reboot" + def_bool n + prompt "Enable X86 board specific fixups for reboot" depends on X86_32 && X86 - default n ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -648,7 +650,7 @@ config X86_REBOOTFIXUPS system. Currently, the only fixup is for the Geode machines using - CS5530A and CS5536 chipsets. + CS5530A and CS5536 chipsets and the RDC R-321x SoC. Say Y if you want to enable the fixup. Currently, it's safe to enable this option even if you don't need it. @@ -672,9 +674,8 @@ config MICROCODE module will be called microcode. config MICROCODE_OLD_INTERFACE - bool + def_bool y depends on MICROCODE - default y config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" @@ -798,13 +799,12 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - bool + def_bool y depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) - default y config X86_PAE - bool "PAE (Physical Address Extension) Support" - default n + def_bool n + prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G select RESOURCES_64BIT help @@ -836,10 +836,10 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) config K8_NUMA - bool "Old style AMD Opteron NUMA detection" - depends on X86_64 && NUMA && PCI - default y - help + def_bool y + prompt "Old style AMD Opteron NUMA detection" + depends on X86_64 && NUMA && PCI + help Enable K8 NUMA node topology detection. You should say Y here if you have a multi processor AMD K8 system. This uses an old method to read the NUMA configuration directly from the builtin @@ -847,10 +847,10 @@ config K8_NUMA instead, which also takes priority if both are compiled in. config X86_64_ACPI_NUMA - bool "ACPI NUMA detection" + def_bool y + prompt "ACPI NUMA detection" depends on X86_64 && NUMA && ACPI && PCI select ACPI_NUMA - default y help Enable ACPI SRAT based node topology detection. @@ -864,52 +864,53 @@ config NUMA_EMU config NODES_SHIFT int + range 1 15 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" depends on NEED_MULTIPLE_NODES config HAVE_ARCH_BOOTMEM_NODE - bool + def_bool y depends on X86_32 && NUMA - default y config ARCH_HAVE_MEMORY_PRESENT - bool + def_bool y depends on X86_32 && DISCONTIGMEM - default y config NEED_NODE_MEMMAP_SIZE - bool + def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - default y config HAVE_ARCH_ALLOC_REMAP - bool + def_bool y depends on X86_32 && NUMA - default y config ARCH_FLATMEM_ENABLE def_bool y - depends on (X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC) || (X86_64 && !NUMA) + depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA && X86_32 config ARCH_DISCONTIGMEM_DEFAULT def_bool y - depends on NUMA + depends on NUMA && X86_32 + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA || (EXPERIMENTAL && (X86_PC || X86_64)) + depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SELECT_MEMORY_MODEL def_bool y - depends on X86_32 && ARCH_SPARSEMEM_ENABLE + depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE def_bool X86_64 @@ -987,42 +988,32 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. config EFI - bool "Boot from EFI support" - depends on X86_32 && ACPI - default n + def_bool n + prompt "EFI runtime service support" + depends on ACPI ---help--- - This enables the kernel to boot on EFI platforms using - system configuration information passed to it from the firmware. - This also enables the kernel to use any EFI runtime services that are + This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). - This option is only useful on systems that have EFI firmware - and will result in a kernel image that is ~8k larger. In addition, - you must use the latest ELILO loader available at - <http://elilo.sourceforge.net> in order to take advantage of - kernel initialization using EFI information (neither GRUB nor LILO know - anything about EFI). However, even with this option, the resultant - kernel should continue to boot on existing non-EFI platforms. + This option is only useful on systems that have EFI firmware. + In addition, you should use the latest ELILO loader available + at <http://elilo.sourceforge.net> in order to take advantage + of EFI runtime services. However, even with this option, the + resultant kernel should continue to boot on existing non-EFI + platforms. config IRQBALANCE - bool "Enable kernel irq balancing" + def_bool y + prompt "Enable kernel irq balancing" depends on X86_32 && SMP && X86_IO_APIC - default y help The default yes will allow the kernel to do irq load balancing. Saying no will keep the kernel from doing irq load balancing. -# turning this on wastes a bunch of space. -# Summit needs it only when NUMA is on -config BOOT_IOREMAP - bool - depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) - default y - config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" + def_bool y + prompt "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS - default y help This kernel feature is useful for number crunching applications that may need to compute untrusted bytecode during their @@ -1189,11 +1180,11 @@ config HOTPLUG_CPU suspend. config COMPAT_VDSO - bool "Compat VDSO support" - default y - depends on X86_32 + def_bool y + prompt "Compat VDSO support" + depends on X86_32 || IA32_EMULATION help - Map the VDSO to the predictable old-style address too. + Map the 32-bit VDSO to the predictable old-style address too. ---help--- Say N here if you are running a sufficiently recent glibc version (2.3.3 or later), to remove the high-mapped @@ -1207,30 +1198,26 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) -config MEMORY_HOTPLUG_RESERVE - def_bool X86_64 - depends on (MEMORY_HOTPLUG && DISCONTIGMEM) - config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -config OUT_OF_LINE_PFN_TO_PAGE - def_bool X86_64 - depends on DISCONTIGMEM - menu "Power management options" depends on !X86_VOYAGER config ARCH_HIBERNATION_HEADER - bool + def_bool y depends on X86_64 && HIBERNATION - default y source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +config X86_APM_BOOT + bool + default y + depends on APM || APM_MODULE + menuconfig APM tristate "APM (Advanced Power Management) BIOS support" depends on X86_32 && PM_SLEEP && !X86_VISWS @@ -1371,7 +1358,7 @@ menu "Bus options (PCI etc.)" config PCI bool "PCI support" if !X86_VISWS depends on !X86_VOYAGER - default y if X86_VISWS + default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) help Find out whether you have a PCI motherboard. PCI is the name of a @@ -1418,25 +1405,21 @@ config PCI_GOANY endchoice config PCI_BIOS - bool + def_bool y depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) - default y # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT - bool + def_bool y depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) - default y config PCI_MMCONFIG - bool + def_bool y depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) - default y config PCI_DOMAINS - bool + def_bool y depends on PCI - default y config PCI_MMCONFIG bool "Support mmconfig PCI config space access" @@ -1453,9 +1436,9 @@ config DMAR remapping devices. config DMAR_GFX_WA - bool "Support for Graphics workaround" + def_bool y + prompt "Support for Graphics workaround" depends on DMAR - default y help Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config @@ -1464,9 +1447,8 @@ config DMAR_GFX_WA to use physical addresses for DMA. config DMAR_FLOPPY_WA - bool + def_bool y depends on DMAR - default y help Floppy disk drivers are know to bypass DMA API calls thereby failing to work when IOMMU is enabled. This @@ -1479,8 +1461,7 @@ source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but do have ISA-style DMA. config ISA_DMA_API - bool - default y + def_bool y if X86_32 @@ -1546,9 +1527,9 @@ config SCx200HR_TIMER other workaround is idle=poll boot option. config GEODE_MFGPT_TIMER - bool "Geode Multi-Function General Purpose Timer (MFGPT) events" + def_bool y + prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS - default y help This driver provides a clock event source based on the MFGPT timer(s) in the CS5535 and CS5536 companion chip for the geode. @@ -1575,6 +1556,7 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select COMPAT_BINFMT_ELF help Include code to run 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any @@ -1587,18 +1569,16 @@ config IA32_AOUT Support old a.out binaries in the 32bit emulation. config COMPAT - bool + def_bool y depends on IA32_EMULATION - default y config COMPAT_FOR_U64_ALIGNMENT def_bool COMPAT depends on X86_64 config SYSVIPC_COMPAT - bool + def_bool y depends on X86_64 && COMPAT && SYSVIPC - default y endmenu @@ -1619,4 +1599,6 @@ source "security/Kconfig" source "crypto/Kconfig" +source "arch/x86/kvm/Kconfig" + source "lib/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index c30162202dc..e09a6b73a1a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -219,10 +219,10 @@ config MGEODEGX1 Select this for a Geode GX1 (Cyrix MediaGX) chip. config MGEODE_LX - bool "Geode GX/LX" + bool "Geode GX/LX" depends on X86_32 - help - Select this for AMD Geode GX and LX processors. + help + Select this for AMD Geode GX and LX processors. config MCYRIXIII bool "CyrixIII/VIA-C3" @@ -258,7 +258,7 @@ config MPSC Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey Xeon CPUs with Intel 64bit which is compatible with x86-64. Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them + Netburst core and shouldn't use this option. You can distinguish them using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. @@ -317,81 +317,75 @@ config X86_L1_CACHE_SHIFT default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config X86_XADD - bool + def_bool y depends on X86_32 && !M386 - default y config X86_PPRO_FENCE - bool + bool "PentiumPro memory ordering errata workaround" depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 - default y + help + Old PentiumPro multiprocessor systems had errata that could cause memory + operations to violate the x86 ordering standard in rare cases. Enabling this + option will attempt to work around some (but not all) occurances of + this problem, at the cost of much heavier spinlock and memory barrier + operations. + + If unsure, say n here. Even distro kernels should think twice before enabling + this: there are few systems, and an unlikely bug. config X86_F00F_BUG - bool + def_bool y depends on M586MMX || M586TSC || M586 || M486 || M386 - default y config X86_WP_WORKS_OK - bool + def_bool y depends on X86_32 && !M386 - default y config X86_INVLPG - bool + def_bool y depends on X86_32 && !M386 - default y config X86_BSWAP - bool + def_bool y depends on X86_32 && !M386 - default y config X86_POPAD_OK - bool + def_bool y depends on X86_32 && !M386 - default y config X86_ALIGNMENT_16 - bool + def_bool y depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 - default y config X86_GOOD_APIC - bool + def_bool y depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 - default y config X86_INTEL_USERCOPY - bool + def_bool y depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 - default y config X86_USE_PPRO_CHECKSUM - bool + def_bool y depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 - default y config X86_USE_3DNOW - bool + def_bool y depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML - default y config X86_OOSTORE - bool + def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR - default y config X86_TSC - bool + def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 - default y # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV - bool + def_bool y depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) - default y config X86_MINIMUM_CPU_FAMILY int @@ -399,3 +393,6 @@ config X86_MINIMUM_CPU_FAMILY default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" +config X86_DEBUGCTLMSR + def_bool y + depends on !(M586MMX || M586TSC || M586 || M486 || M386) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 761ca7b5f12..2e1e3af28c3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" config EARLY_PRINTK - bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32 + bool "Early printk" if EMBEDDED default y help Write kernel log output directly into the VGA buffer or to a serial @@ -40,22 +40,49 @@ comment "Page alloc debug is incompatible with Software Suspend on i386" config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS - depends on X86_32 + depends on DEBUG_KERNEL && X86_32 help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types of memory corruptions. +config DEBUG_PER_CPU_MAPS + bool "Debug access to per_cpu maps" + depends on DEBUG_KERNEL + depends on X86_64_SMP + default n + help + Say Y to verify that the per_cpu map being accessed has + been setup. Adds a fair amount of code to kernel memory + and decreases performance. + + Say N if unsure. + config DEBUG_RODATA bool "Write protect kernel read-only data structures" + default y depends on DEBUG_KERNEL help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const - data. This option may have a slight performance impact because a - portion of the kernel code won't be covered by a 2MB TLB anymore. - If in doubt, say "N". + data. This is recommended so that we can catch kernel bugs sooner. + If in doubt, say "Y". + +config DEBUG_RODATA_TEST + bool "Testcase for the DEBUG_RODATA feature" + depends on DEBUG_RODATA + help + This option enables a testcase for the DEBUG_RODATA + feature as well as for the change_page_attr() infrastructure. + If in doubt, say "N" + +config DEBUG_NX_TEST + tristate "Testcase for the NX non-executable stack feature" + depends on DEBUG_KERNEL && m + help + This option enables a testcase for the CPU NX capability + and the software setup of this feature. + If in doubt, say "N" config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" @@ -75,8 +102,7 @@ config X86_FIND_SMP_CONFIG config X86_MPPARSE def_bool y - depends on X86_LOCAL_APIC && !X86_VISWS - depends on X86_32 + depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64 config DOUBLEFAULT default y @@ -112,4 +138,91 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +# +# IO delay types: +# + +config IO_DELAY_TYPE_0X80 + int + default "0" + +config IO_DELAY_TYPE_0XED + int + default "1" + +config IO_DELAY_TYPE_UDELAY + int + default "2" + +config IO_DELAY_TYPE_NONE + int + default "3" + +choice + prompt "IO delay type" + default IO_DELAY_0XED + +config IO_DELAY_0X80 + bool "port 0x80 based port-IO delay [recommended]" + help + This is the traditional Linux IO delay used for in/out_p. + It is the most tested hence safest selection here. + +config IO_DELAY_0XED + bool "port 0xed based port-IO delay" + help + Use port 0xed as the IO delay. This frees up port 0x80 which is + often used as a hardware-debug port. + +config IO_DELAY_UDELAY + bool "udelay based port-IO delay" + help + Use udelay(2) as the IO delay method. This provides the delay + while not having any side-effect on the IO port space. + +config IO_DELAY_NONE + bool "no port-IO delay" + help + No port-IO delay. Will break on old boxes that require port-IO + delay for certain operations. Should work on most new machines. + +endchoice + +if IO_DELAY_0X80 +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0X80 +endif + +if IO_DELAY_0XED +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0XED +endif + +if IO_DELAY_UDELAY +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_UDELAY +endif + +if IO_DELAY_NONE +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_NONE +endif + +config DEBUG_BOOT_PARAMS + bool "Debug boot parameters" + depends on DEBUG_KERNEL + depends on DEBUG_FS + help + This option will cause struct boot_params to be exported via debugfs. + +config CPA_DEBUG + bool "CPA self test code" + depends on DEBUG_KERNEL + help + Do change_page_attr self tests at boot. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7aa1dc6d67c..da8f4129780 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,13 +7,254 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -# No need to remake these files -$(srctree)/arch/x86/Makefile%: ; +core-$(CONFIG_KVM) += arch/x86/kvm/ + +# BITS is used as extension for files which are available in a 32 bit +# and a 64 bit version to simplify shared Makefiles. +# e.g.: obj-y += foo_$(BITS).o +export BITS ifeq ($(CONFIG_X86_32),y) + BITS := 32 UTS_MACHINE := i386 - include $(srctree)/arch/x86/Makefile_32 + CHECKFLAGS += -D__i386__ + + biarch := $(call cc-option,-m32) + KBUILD_AFLAGS += $(biarch) + KBUILD_CFLAGS += $(biarch) + + ifdef CONFIG_RELOCATABLE + LDFLAGS_vmlinux := --emit-relocs + endif + + KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return + + # prevent gcc from keeping the stack 16 byte aligned + KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + + # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use + # a lot more stack due to the lack of sharing of stacklots: + KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ + echo $(call cc-option,-fno-unit-at-a-time); fi ;) + + # CPU-specific tuning. Anything which can be shared with UML should go here. + include $(srctree)/arch/x86/Makefile_32.cpu + KBUILD_CFLAGS += $(cflags-y) + + # temporary until string.h is fixed + KBUILD_CFLAGS += -ffreestanding else + BITS := 64 UTS_MACHINE := x86_64 - include $(srctree)/arch/x86/Makefile_64 + CHECKFLAGS += -D__x86_64__ -m64 + + KBUILD_AFLAGS += -m64 + KBUILD_CFLAGS += -m64 + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ + $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + + KBUILD_CFLAGS += -mno-red-zone + KBUILD_CFLAGS += -mcmodel=kernel + + # -funit-at-a-time shrinks the kernel .text considerably + # unfortunately it makes reading oopses harder. + KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) + + # this works around some issues with generating unwind tables in older gccs + # newer gccs do it by default + KBUILD_CFLAGS += -maccumulate-outgoing-args + + stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh + stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ + "$(CC)" -fstack-protector ) + stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ + "$(CC)" -fstack-protector-all ) + + KBUILD_CFLAGS += $(stackp-y) endif + +# Stackpointer is addressed different for 32 bit and 64 bit x86 +sp-$(CONFIG_X86_32) := esp +sp-$(CONFIG_X86_64) := rsp + +# do binutils support CFI? +cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) +# is .cfi_signal_frame supported too? +cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) + +LDFLAGS := -m elf_$(UTS_MACHINE) +OBJCOPYFLAGS := -O binary -R .note -R .comment -S + +# Speed up the build +KBUILD_CFLAGS += -pipe +# Workaround for a gcc prelease that unfortunately was shipped in a suse release +KBUILD_CFLAGS += -Wno-sign-compare +# +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# prevent gcc from generating any FP code by mistake +KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) + +### +# Sub architecture support +# fcore-y is linked before mcore-y files. + +# Default subarch .c files +mcore-y := arch/x86/mach-default/ + +# Voyager subarch support +mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager +mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ + +# VISWS subarch support +mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws +mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/ + +# NUMAQ subarch support +mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq +mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/ + +# BIGSMP subarch support +mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp +mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/ + +#Summit subarch support +mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit +mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/ + +# generic subarchitecture +mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic +fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ +mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ + + +# ES7000 subarch support +mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000 +fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ +mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/ + +# RDC R-321x subarch support +mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x +mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default +core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/ + +# default subarch .h files +mflags-y += -Iinclude/asm-x86/mach-default + +# 64 bit does not support subarch support - clear sub arch variables +fcore-$(CONFIG_X86_64) := +mcore-$(CONFIG_X86_64) := +mflags-$(CONFIG_X86_64) := + +KBUILD_CFLAGS += $(mflags-y) +KBUILD_AFLAGS += $(mflags-y) + +### +# Kernel objects + +head-y := arch/x86/kernel/head_$(BITS).o +head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o +head-y += arch/x86/kernel/init_task.o + +libs-y += arch/x86/lib/ + +# Sub architecture files that needs linking first +core-y += $(fcore-y) + +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/x86/xen/ + +# lguest paravirtualization support +core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ + +core-y += arch/x86/kernel/ +core-y += arch/x86/mm/ + +# Remaining sub architecture files +core-y += $(mcore-y) + +core-y += arch/x86/crypto/ +core-y += arch/x86/vdso/ +core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ + +# drivers-y are linked after core-y +drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ +drivers-$(CONFIG_PCI) += arch/x86/pci/ + +# must be linked after kernel/ +drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ + +ifeq ($(CONFIG_X86_32),y) +drivers-$(CONFIG_PM) += arch/x86/power/ +drivers-$(CONFIG_FB) += arch/x86/video/ +endif + +#### +# boot loader support. Several targets are kept for legacy purposes + +boot := arch/x86/boot + +PHONY += zImage bzImage compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install + +# Default kernel to build +all: bzImage + +# KBUILD_IMAGE specify target image being built + KBUILD_IMAGE := $(boot)/bzImage +zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage + +zImage bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage + +compressed: zImage + +zlilo bzlilo: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo + +zdisk bzdisk: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk + +fdimage fdimage144 fdimage288 isoimage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ + +install: vdso_install + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install + +PHONY += vdso_install +vdso_install: + $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + +archclean: + $(Q)rm -rf $(objtree)/arch/i386 + $(Q)rm -rf $(objtree)/arch/x86_64 + $(Q)$(MAKE) $(clean)=$(boot) + +define archhelp + echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' + echo ' install - Install kernel using' + echo ' (your) ~/bin/installkernel or' + echo ' (distribution) /sbin/installkernel or' + echo ' install to $$(INSTALL_PATH) and run lilo' + echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' + echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' FDARGS="..." arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' +endef + +CLEAN_FILES += arch/x86/boot/fdimage \ + arch/x86/boot/image.iso \ + arch/x86/boot/mtools.conf diff --git a/arch/x86/Makefile_32 b/arch/x86/Makefile_32 deleted file mode 100644 index 50394da2f6c..00000000000 --- a/arch/x86/Makefile_32 +++ /dev/null @@ -1,175 +0,0 @@ -# -# i386 Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to do have actions -# for "archclean" cleaning up for this architecture. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@geocities.com> -# Added '-march' and '-mpreferred-stack-boundary' support -# -# 20050320 Kianusch Sayah Karadji <kianusch@sk-tech.net> -# Added support for GEODE CPU - -# BITS is used as extension for files which are available in a 32 bit -# and a 64 bit version to simplify shared Makefiles. -# e.g.: obj-y += foo_$(BITS).o -BITS := 32 -export BITS - -HAS_BIARCH := $(call cc-option-yn, -m32) -ifeq ($(HAS_BIARCH),y) -AS := $(AS) --32 -LD := $(LD) -m elf_i386 -CC := $(CC) -m32 -endif - -LDFLAGS := -m elf_i386 -OBJCOPYFLAGS := -O binary -R .note -R .comment -S -ifdef CONFIG_RELOCATABLE -LDFLAGS_vmlinux := --emit-relocs -endif -CHECKFLAGS += -D__i386__ - -KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return - -# prevent gcc from keeping the stack 16 byte aligned -KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) - -# CPU-specific tuning. Anything which can be shared with UML should go here. -include $(srctree)/arch/x86/Makefile_32.cpu - -# temporary until string.h is fixed -cflags-y += -ffreestanding - -# this works around some issues with generating unwind tables in older gccs -# newer gccs do it by default -cflags-y += -maccumulate-outgoing-args - -# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use -# a lot more stack due to the lack of sharing of stacklots: -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) - -# do binutils support CFI? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) - -# is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) - -KBUILD_CFLAGS += $(cflags-y) - -# Default subarch .c files -mcore-y := arch/x86/mach-default - -# Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager -mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager - -# VISWS subarch support -mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws -mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws - -# NUMAQ subarch support -mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq -mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default - -# BIGSMP subarch support -mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp -mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default - -#Summit subarch support -mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit -mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default - -# generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-x86/mach-generic -mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default -core-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ - -# ES7000 subarch support -mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000 -mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default -core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -# default subarch .h files -mflags-y += -Iinclude/asm-x86/mach-default - -head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task.o - -libs-y += arch/x86/lib/ -core-y += arch/x86/kernel/ \ - arch/x86/mm/ \ - $(mcore-y)/ \ - arch/x86/crypto/ -drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ -drivers-$(CONFIG_PCI) += arch/x86/pci/ -# must be linked after kernel/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -drivers-$(CONFIG_PM) += arch/x86/power/ -drivers-$(CONFIG_FB) += arch/x86/video/ - -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - -boot := arch/x86/boot - -PHONY += zImage bzImage compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install - -all: bzImage - -# KBUILD_IMAGE specify target image being built - KBUILD_IMAGE := $(boot)/bzImage -zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage - -zImage bzImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) - $(Q)mkdir -p $(objtree)/arch/i386/boot - $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/i386/boot/bzImage - -compressed: zImage - -zlilo bzlilo: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo - -zdisk bzdisk: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk - -fdimage fdimage144 fdimage288 isoimage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ - -install: - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install - -archclean: - $(Q)rm -rf $(objtree)/arch/i386/boot - $(Q)$(MAKE) $(clean)=arch/x86/boot - -define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' bzdisk - Create a boot floppy in /dev/fd0' - echo ' fdimage - Create a boot floppy image' - echo ' isoimage - Create a boot CD-ROM image' -endef - -CLEAN_FILES += arch/x86/boot/fdimage \ - arch/x86/boot/image.iso \ - arch/x86/boot/mtools.conf diff --git a/arch/x86/Makefile_64 b/arch/x86/Makefile_64 deleted file mode 100644 index a804860022e..00000000000 --- a/arch/x86/Makefile_64 +++ /dev/null @@ -1,144 +0,0 @@ -# -# x86_64 Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to do have actions -# for "archclean" and "archdep" for cleaning up and making dependencies for -# this architecture -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@geocities.com> -# Added '-march' and '-mpreferred-stack-boundary' support -# 20000913 Pavel Machek <pavel@suse.cz> -# Converted for x86_64 architecture -# 20010105 Andi Kleen, add IA32 compiler. -# ....and later removed it again.... -# -# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ - -# BITS is used as extension for files which are available in a 32 bit -# and a 64 bit version to simplify shared Makefiles. -# e.g.: obj-y += foo_$(BITS).o -BITS := 64 -export BITS - -LDFLAGS := -m elf_x86_64 -OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := -CHECKFLAGS += -D__x86_64__ -m64 - -cflags-y := -cflags-kernel-y := -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) -cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) -# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it -# will eventually. Use -mtune=generic as fallback -cflags-$(CONFIG_MCORE2) += \ - $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) -cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) - -cflags-y += -m64 -cflags-y += -mno-red-zone -cflags-y += -mcmodel=kernel -cflags-y += -pipe -cflags-y += -Wno-sign-compare -cflags-y += -fno-asynchronous-unwind-tables -ifneq ($(CONFIG_DEBUG_INFO),y) -# -fweb shrinks the kernel a bit, but the difference is very small -# it also messes up debugging, so don't use it for now. -#cflags-y += $(call cc-option,-fweb) -endif -# -funit-at-a-time shrinks the kernel .text considerably -# unfortunately it makes reading oopses harder. -cflags-y += $(call cc-option,-funit-at-a-time) -# prevent gcc from generating any FP code by mistake -cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) -# this works around some issues with generating unwind tables in older gccs -# newer gccs do it by default -cflags-y += -maccumulate-outgoing-args - -# do binutils support CFI? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) - -# is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) - -cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector ) -cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all ) - -KBUILD_CFLAGS += $(cflags-y) -CFLAGS_KERNEL += $(cflags-kernel-y) -KBUILD_AFLAGS += -m64 - -head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task.o - -libs-y += arch/x86/lib/ -core-y += arch/x86/kernel/ \ - arch/x86/mm/ \ - arch/x86/crypto/ \ - arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ -drivers-$(CONFIG_PCI) += arch/x86/pci/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ - -boot := arch/x86/boot - -PHONY += bzImage bzlilo install archmrproper \ - fdimage fdimage144 fdimage288 isoimage archclean - -#Default target when executing "make" -all: bzImage - -BOOTIMAGE := arch/x86/boot/bzImage -KBUILD_IMAGE := $(BOOTIMAGE) - -bzImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) - $(Q)mkdir -p $(objtree)/arch/x86_64/boot - $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage - -bzlilo: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo - -bzdisk: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk - -fdimage fdimage144 fdimage288 isoimage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ - -install: vdso_install - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ - -vdso_install: -ifeq ($(CONFIG_IA32_EMULATION),y) - $(Q)$(MAKE) $(build)=arch/x86/ia32 $@ -endif - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ - -archclean: - $(Q)rm -rf $(objtree)/arch/x86_64/boot - $(Q)$(MAKE) $(clean)=$(boot) - -define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' bzdisk - Create a boot floppy in /dev/fd0' - echo ' fdimage - Create a boot floppy image' - echo ' isoimage - Create a boot CD-ROM image' -endef - -CLEAN_FILES += arch/x86/boot/fdimage \ - arch/x86/boot/image.iso \ - arch/x86/boot/mtools.conf - - diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7a3116ccf38..349b81a39c4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -28,9 +28,11 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf zImage bzImage subdir- := compressed -setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o version.o voyager.o +setup-y += printf.o string.o tty.o video.o version.o +setup-$(CONFIG_X86_APM_BOOT) += apm.o +setup-$(CONFIG_X86_VOYAGER) += voyager.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. @@ -49,10 +51,7 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE) # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. -cflags-$(CONFIG_X86_32) := -cflags-$(CONFIG_X86_64) := -m32 KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ - $(cflags-y) \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/code16gcc.h \ @@ -62,6 +61,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ $(call cc-option, -fno-unit-at-a-time)) \ $(call cc-option, -fno-stack-protector) \ $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ $(obj)/zImage: IMAGE_OFFSET := 0x1000 diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index eab50c55a3a..c117c7fb859 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -19,8 +19,6 @@ #include "boot.h" -#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) - int query_apm_bios(void) { u16 ax, bx, cx, dx, di; @@ -95,4 +93,3 @@ int query_apm_bios(void) return 0; } -#endif diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index d2b5adf4651..7822a4983da 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -109,7 +109,7 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { u8 v; - asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); return v; } static inline u16 rdfs16(addr_t addr) @@ -127,21 +127,21 @@ static inline u32 rdfs32(addr_t addr) static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v)); + asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v)); + asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v)); + asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { u8 v; - asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); return v; } static inline u16 rdgs16(addr_t addr) @@ -159,15 +159,15 @@ static inline u32 rdgs32(addr_t addr) static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v)); + asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v)); + asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v)); + asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! */ @@ -241,6 +241,7 @@ int query_apm_bios(void); /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); +int cmdline_find_option_bool(const char *option); /* cpu.c, cpucheck.c */ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 34bb778c435..680408a0f46 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -95,3 +95,68 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) return len; } + +/* + * Find a boolean option (like quiet,noapic,nosmp....) + * + * Returns the position of that option (starts counting with 1) + * or 0 on not found + */ +int cmdline_find_option_bool(const char *option) +{ + u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; + addr_t cptr; + char c; + int pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline_ptr || cmdline_ptr >= 0x100000) + return -1; /* No command line, or inaccessible */ + + cptr = cmdline_ptr & 0xf; + set_fs(cmdline_ptr >> 4); + + while (cptr < 0x10000) { + c = rdfs8(cptr++); + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 52c1db85452..fe24ceabd90 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -1,5 +1,63 @@ +# +# linux/arch/x86/boot/compressed/Makefile +# +# create a compressed vmlinux image from the original vmlinux +# + +targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o + +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS += -fno-strict-aliasing -fPIC +cflags-$(CONFIG_X86_64) := -mcmodel=small +KBUILD_CFLAGS += $(cflags-y) +KBUILD_CFLAGS += $(call cc-option,-ffreestanding) +KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) + +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ + +LDFLAGS := -m elf_$(UTS_MACHINE) +LDFLAGS_vmlinux := -T + +$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE + $(call if_changed,ld) + @: + +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/boot/compressed/Makefile_32 +targets += vmlinux.bin.all vmlinux.relocs +hostprogs-y := relocs + +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE + $(call if_changed,relocs) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin +vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs +quiet_cmd_relocbin = BUILD $@ + cmd_relocbin = cat $(filter-out FORCE,$^) > $@ +$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE + $(call if_changed,relocbin) + +ifdef CONFIG_RELOCATABLE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE + $(call if_changed,gzip) else -include ${srctree}/arch/x86/boot/compressed/Makefile_64 +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) endif +LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T + +else +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) + +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T +endif + + +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE + $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32 deleted file mode 100644 index e43ff7c56e6..00000000000 --- a/arch/x86/boot/compressed/Makefile_32 +++ /dev/null @@ -1,50 +0,0 @@ -# -# linux/arch/x86/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \ - vmlinux.bin.all vmlinux.relocs -EXTRA_AFLAGS := -traditional - -LDFLAGS_vmlinux := -T -hostprogs-y := relocs - -KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ - -fno-strict-aliasing -fPIC \ - $(call cc-option,-ffreestanding) \ - $(call cc-option,-fno-stack-protector) -LDFLAGS := -m elf_i386 - -$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE - $(call if_changed,relocs) - -vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) - -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,gzip) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -endif - -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T - -$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64 deleted file mode 100644 index 7801e8dd90b..00000000000 --- a/arch/x86/boot/compressed/Makefile_64 +++ /dev/null @@ -1,30 +0,0 @@ -# -# linux/arch/x86/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o - -KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ - -fno-strict-aliasing -fPIC -mcmodel=small \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) -KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -LDFLAGS := -m elf_x86_64 - -LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T - -$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc.c index b74d60d1b2f..8182e32c1b4 100644 --- a/arch/x86/boot/compressed/misc_32.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,7 +1,7 @@ /* * misc.c - * - * This is a collection of several routines from gzip-1.0.3 + * + * This is a collection of several routines from gzip-1.0.3 * adapted for Linux. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 @@ -9,9 +9,18 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +/* + * we have to be careful, because no indirections are allowed here, and + * paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening + */ #undef CONFIG_PARAVIRT +#ifdef CONFIG_X86_64 +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 +#endif + #include <linux/linkage.h> -#include <linux/vmalloc.h> #include <linux/screen_info.h> #include <asm/io.h> #include <asm/page.h> @@ -186,10 +195,20 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -static unsigned long free_mem_ptr; -static unsigned long free_mem_end_ptr; +#ifdef CONFIG_X86_64 +#define memptr long +#else +#define memptr unsigned +#endif + +static memptr free_mem_ptr; +static memptr free_mem_end_ptr; +#ifdef CONFIG_X86_64 +#define HEAP_SIZE 0x7000 +#else #define HEAP_SIZE 0x4000 +#endif static char *vidmem = (char *)0xb8000; static int vidport; @@ -230,7 +249,7 @@ static void gzip_mark(void **ptr) static void gzip_release(void **ptr) { - free_mem_ptr = (unsigned long) *ptr; + free_mem_ptr = (memptr) *ptr; } static void scroll(void) @@ -247,8 +266,10 @@ static void putstr(const char *s) int x,y,pos; char c; +#ifdef CONFIG_X86_32 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) return; +#endif x = RM_SCREEN_INFO.orig_x; y = RM_SCREEN_INFO.orig_y; @@ -261,7 +282,7 @@ static void putstr(const char *s) y--; } } else { - vidmem [ ( x + cols * y ) * 2 ] = c; + vidmem [(x + cols * y) * 2] = c; if ( ++x >= cols ) { x = 0; if ( ++y >= lines ) { @@ -276,16 +297,16 @@ static void putstr(const char *s) RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); + outb(14, vidport); + outb(0xff & (pos >> 9), vidport+1); + outb(15, vidport); + outb(0xff & (pos >> 1), vidport+1); } static void* memset(void* s, int c, unsigned n) { int i; - char *ss = (char*)s; + char *ss = s; for (i=0;i<n;i++) ss[i] = c; return s; @@ -294,7 +315,8 @@ static void* memset(void* s, int c, unsigned n) static void* memcpy(void* dest, const void* src, unsigned n) { int i; - char *d = (char *)dest, *s = (char *)src; + const char *s = src; + char *d = dest; for (i=0;i<n;i++) d[i] = s[i]; return dest; @@ -339,11 +361,13 @@ static void error(char *x) putstr(x); putstr("\n\n -- System halted"); - while(1); /* Halt */ + while (1) + asm("hlt"); } -asmlinkage void decompress_kernel(void *rmode, unsigned long end, - uch *input_data, unsigned long input_len, uch *output) +asmlinkage void decompress_kernel(void *rmode, memptr heap, + uch *input_data, unsigned long input_len, + uch *output) { real_mode = rmode; @@ -358,25 +382,32 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end, lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - window = output; /* Output buffer (Normally at 1M) */ - free_mem_ptr = end; /* Heap */ - free_mem_end_ptr = end + HEAP_SIZE; - inbuf = input_data; /* Input buffer */ + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ insize = input_len; inptr = 0; +#ifdef CONFIG_X86_64 + if ((ulg)output & (__KERNEL_ALIGN - 1)) + error("Destination address not 2M aligned"); + if ((ulg)output >= 0xffffffffffUL) + error("Destination address too large"); +#else if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); - if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) + if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); #ifndef CONFIG_RELOCATABLE if ((u32)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif +#endif makecrc(); - putstr("Uncompressing Linux... "); + putstr("\nDecompressing Linux... "); gunzip(); - putstr("Ok, booting the kernel.\n"); + putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c deleted file mode 100644 index 6ea015aa65e..00000000000 --- a/arch/x86/boot/compressed/misc_64.c +++ /dev/null @@ -1,371 +0,0 @@ -/* - * misc.c - * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. - * - * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 - * puts by Nick Holloway 1993, better puts by Martin Mares 1995 - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - */ - -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 - -#include <linux/linkage.h> -#include <linux/screen_info.h> -#include <asm/io.h> -#include <asm/page.h> - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ - -/* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. - * The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a block - * adding an extra 32767 bytes (the worst case uncompressed block size) is - * sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * - */ - -/* - * gzip declarations - */ - -#define OF(args) args -#define STATIC static - -#undef memset -#undef memcpy -#define memzero(s, n) memset ((s), 0, (n)) - -typedef unsigned char uch; -typedef unsigned short ush; -typedef unsigned long ulg; - -#define WSIZE 0x80000000 /* Window size must be at least 32k, - * and a power of two - * We don't actually have a window just - * a huge output buffer so I report - * a 2G windows size, as that should - * always be larger than our output buffer. - */ - -static uch *inbuf; /* input buffer */ -static uch *window; /* Sliding window buffer, (and final output buffer) */ - -static unsigned insize; /* valid bytes in inbuf */ -static unsigned inptr; /* index of next byte to be processed in inbuf */ -static unsigned outcnt; /* bytes in output buffer */ - -/* gzip flag byte */ -#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ -#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ -#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ -#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ -#define COMMENT 0x10 /* bit 4 set: file comment present */ -#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ -#define RESERVED 0xC0 /* bit 6,7: reserved */ - -#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) - -/* Diagnostic functions */ -#ifdef DEBUG -# define Assert(cond,msg) {if(!(cond)) error(msg);} -# define Trace(x) fprintf x -# define Tracev(x) {if (verbose) fprintf x ;} -# define Tracevv(x) {if (verbose>1) fprintf x ;} -# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} -# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} -#else -# define Assert(cond,msg) -# define Trace(x) -# define Tracev(x) -# define Tracevv(x) -# define Tracec(c,x) -# define Tracecv(c,x) -#endif - -static int fill_inbuf(void); -static void flush_window(void); -static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); - -/* - * This is set up by the setup-routine at boot-time - */ -static unsigned char *real_mode; /* Pointer to real-mode data */ - -#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) -#ifndef STANDARD_MEMORY_BIOS_CALL -#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) -#endif -#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) - -extern unsigned char input_data[]; -extern int input_len; - -static long bytes_out = 0; - -static void *malloc(int size); -static void free(void *where); - -static void *memset(void *s, int c, unsigned n); -static void *memcpy(void *dest, const void *src, unsigned n); - -static void putstr(const char *); - -static long free_mem_ptr; -static long free_mem_end_ptr; - -#define HEAP_SIZE 0x7000 - -static char *vidmem = (char *)0xb8000; -static int vidport; -static int lines, cols; - -#include "../../../../lib/inflate.c" - -static void *malloc(int size) -{ - void *p; - - if (size <0) error("Malloc error"); - if (free_mem_ptr <= 0) error("Memory error"); - - free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ - - p = (void *)free_mem_ptr; - free_mem_ptr += size; - - if (free_mem_ptr >= free_mem_end_ptr) - error("Out of memory"); - - return p; -} - -static void free(void *where) -{ /* Don't care */ -} - -static void gzip_mark(void **ptr) -{ - *ptr = (void *) free_mem_ptr; -} - -static void gzip_release(void **ptr) -{ - free_mem_ptr = (long) *ptr; -} - -static void scroll(void) -{ - int i; - - memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); - for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) - vidmem[i] = ' '; -} - -static void putstr(const char *s) -{ - int x,y,pos; - char c; - - x = RM_SCREEN_INFO.orig_x; - y = RM_SCREEN_INFO.orig_y; - - while ( ( c = *s++ ) != '\0' ) { - if ( c == '\n' ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } else { - vidmem [ ( x + cols * y ) * 2 ] = c; - if ( ++x >= cols ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } - } - } - - RM_SCREEN_INFO.orig_x = x; - RM_SCREEN_INFO.orig_y = y; - - pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); -} - -static void* memset(void* s, int c, unsigned n) -{ - int i; - char *ss = (char*)s; - - for (i=0;i<n;i++) ss[i] = c; - return s; -} - -static void* memcpy(void* dest, const void* src, unsigned n) -{ - int i; - char *d = (char *)dest, *s = (char *)src; - - for (i=0;i<n;i++) d[i] = s[i]; - return dest; -} - -/* =========================================================================== - * Fill the input buffer. This is called only when the buffer is empty - * and at least one byte is really needed. - */ -static int fill_inbuf(void) -{ - error("ran out of input data"); - return 0; -} - -/* =========================================================================== - * Write the output window window[0..outcnt-1] and update crc and bytes_out. - * (Used for the decompressed data only.) - */ -static void flush_window(void) -{ - /* With my window equal to my output buffer - * I only need to compute the crc here. - */ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - - in = window; - for (n = 0; n < outcnt; n++) { - ch = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - -static void error(char *x) -{ - putstr("\n\n"); - putstr(x); - putstr("\n\n -- System halted"); - - while(1); /* Halt */ -} - -asmlinkage void decompress_kernel(void *rmode, unsigned long heap, - uch *input_data, unsigned long input_len, uch *output) -{ - real_mode = rmode; - - if (RM_SCREEN_INFO.orig_video_mode == 7) { - vidmem = (char *) 0xb0000; - vidport = 0x3b4; - } else { - vidmem = (char *) 0xb8000; - vidport = 0x3d4; - } - - lines = RM_SCREEN_INFO.orig_video_lines; - cols = RM_SCREEN_INFO.orig_video_cols; - - window = output; /* Output buffer (Normally at 1M) */ - free_mem_ptr = heap; /* Heap */ - free_mem_end_ptr = heap + HEAP_SIZE; - inbuf = input_data; /* Input buffer */ - insize = input_len; - inptr = 0; - - if ((ulg)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((ulg)output >= 0xffffffffffUL) - error("Destination address too large"); - - makecrc(); - putstr(".\nDecompressing Linux..."); - gunzip(); - putstr("done.\nBooting the kernel.\n"); - return; -} diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index 7a0d00b2cf2..d01ea42187e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -27,11 +27,6 @@ static unsigned long *relocs; * absolute relocations present w.r.t these symbols. */ static const char* safe_abs_relocs[] = { - "__kernel_vsyscall", - "__kernel_rt_sigreturn", - "__kernel_sigreturn", - "SYSENTER_RETURN", - "VDSO_NOTE_MASK", "xen_irq_disable_direct_reloc", "xen_save_fl_direct_reloc", }; @@ -45,6 +40,8 @@ static int is_safe_abs_reloc(const char* sym_name) /* Match found */ return 1; } + if (strncmp(sym_name, "VDSO", 4) == 0) + return 1; if (strncmp(sym_name, "__crc_", 6) == 0) return 1; return 0; diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux.scr index bd1429ce193..f02382ae5c4 100644 --- a/arch/x86/boot/compressed/vmlinux_64.scr +++ b/arch/x86/boot/compressed/vmlinux.scr @@ -1,6 +1,6 @@ SECTIONS { - .text.compressed : { + .rodata.compressed : { input_len = .; LONG(input_data_end - input_data) input_data = .; *(.data) diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds index cc4854f6c6c..bb3c48379c4 100644 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ b/arch/x86/boot/compressed/vmlinux_32.lds @@ -3,17 +3,17 @@ OUTPUT_ARCH(i386) ENTRY(startup_32) SECTIONS { - /* Be careful parts of head.S assume startup_32 is at - * address 0. + /* Be careful parts of head_32.S assume startup_32 is at + * address 0. */ - . = 0 ; + . = 0; .text.head : { _head = . ; *(.text.head) _ehead = . ; } - .data.compressed : { - *(.data.compressed) + .rodata.compressed : { + *(.rodata.compressed) } .text : { _text = .; /* Text */ diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr deleted file mode 100644 index 707a88f7f29..00000000000 --- a/arch/x86/boot/compressed/vmlinux_32.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .data.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds index 94c13e557fb..f6e5b445f45 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux_64.lds @@ -3,15 +3,19 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) SECTIONS { - /* Be careful parts of head.S assume startup_32 is at - * address 0. + /* Be careful parts of head_64.S assume startup_64 is at + * address 0. */ . = 0; - .text : { + .text.head : { _head = . ; *(.text.head) _ehead = . ; - *(.text.compressed) + } + .rodata.compressed : { + *(.rodata.compressed) + } + .text : { _text = .; /* Text */ *(.text) *(.text.*) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index bd138e442ec..8721dc46a0b 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -129,6 +129,7 @@ void query_edd(void) char eddarg[8]; int do_mbr = 1; int do_edd = 1; + int be_quiet; int devno; struct edd_info ei, *edp; u32 *mbrptr; @@ -140,12 +141,21 @@ void query_edd(void) do_edd = 0; } + be_quiet = cmdline_find_option_bool("quiet"); + edp = boot_params.eddbuf; mbrptr = boot_params.edd_mbr_sig_buffer; if (!do_edd) return; + /* Bugs in OnBoard or AddOnCards Bios may hang the EDD probe, + * so give a hint if this happens. + */ + + if (!be_quiet) + printf("Probing EDD (edd=off to disable)... "); + for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { /* * Scan the BIOS-supported hard disks and query EDD @@ -162,6 +172,9 @@ void query_edd(void) if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++)) boot_params.edd_mbr_sig_buf_entries = devno-0x80+1; } + + if (!be_quiet) + printf("ok\n"); } #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 4cc5b0411db..64ad9016585 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -195,10 +195,13 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff +ramdisk_max: .long 0x7fffffff # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd + # The current kernel allows up to 4 GB, + # but leave it at 2 GB to avoid + # possible bootloader bugs. kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment #required for protected mode diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 1f95750ede2..7828da5cfd0 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -100,20 +100,32 @@ static void set_bios_mode(void) #endif } -void main(void) +static void init_heap(void) { - /* First, copy the boot header into the "zeropage" */ - copy_boot_params(); + char *stack_end; - /* End of heap check */ if (boot_params.hdr.loadflags & CAN_USE_HEAP) { - heap_end = (char *)(boot_params.hdr.heap_end_ptr - +0x200-STACK_SIZE); + asm("leal %P1(%%esp),%0" + : "=r" (stack_end) : "i" (-STACK_SIZE)); + + heap_end = (char *) + ((size_t)boot_params.hdr.heap_end_ptr + 0x200); + if (heap_end > stack_end) + heap_end = stack_end; } else { /* Boot protocol 2.00 only, no heap available */ puts("WARNING: Ancient bootloader, some functionality " "may be limited!\n"); } +} + +void main(void) +{ + /* First, copy the boot header into the "zeropage" */ + copy_boot_params(); + + /* End of heap check */ + init_heap(); /* Make sure we have all the proper CPU support */ if (validate_cpu()) { @@ -131,9 +143,6 @@ void main(void) /* Set keyboard repeat rate (why?) */ keyboard_set_repeat(); - /* Set the video mode */ - set_video(); - /* Query MCA information */ query_mca(); @@ -154,6 +163,10 @@ void main(void) #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) query_edd(); #endif + + /* Set the video mode */ + set_video(); + /* Do the last things and invoke protected mode */ go_to_protected_mode(); } diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 09fb342cc62..1a0f936c160 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -104,7 +104,7 @@ static void reset_coprocessor(void) (((u64)(base & 0xff000000) << 32) | \ ((u64)flags << 40) | \ ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffff00) << 16) | \ + ((u64)(base & 0x00ffffff) << 16) | \ ((u64)(limit & 0x0000ffff))) struct gdt_ptr { @@ -121,6 +121,10 @@ static void setup_gdt(void) [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + /* TSS: 32-bit tss, 104 bytes, base 4096 */ + /* We only have a TSS here to keep Intel VT happy; + we don't actually use it for anything. */ + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), }; /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead of the gdt_ptr contents. Thus, make it static so it will diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index fa6bed1fac1..f5402d51f7c 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -15,6 +15,7 @@ */ #include <asm/boot.h> +#include <asm/processor-flags.h> #include <asm/segment.h> .text @@ -29,28 +30,55 @@ */ protected_mode_jump: movl %edx, %esi # Pointer to boot_params table - movl %eax, 2f # Patch ljmpl instruction + + xorl %ebx, %ebx + movw %cs, %bx + shll $4, %ebx + addl %ebx, 2f movw $__BOOT_DS, %cx - xorl %ebx, %ebx # Per the 32-bit boot protocol - xorl %ebp, %ebp # Per the 32-bit boot protocol - xorl %edi, %edi # Per the 32-bit boot protocol + movw $__BOOT_TSS, %di movl %cr0, %edx - orb $1, %dl # Protected mode (PE) bit + orb $X86_CR0_PE, %dl # Protected mode movl %edx, %cr0 jmp 1f # Short jump to serialize on 386/486 1: - movw %cx, %ds - movw %cx, %es - movw %cx, %fs - movw %cx, %gs - movw %cx, %ss - - # Jump to the 32-bit entrypoint + # Transition to 32-bit mode .byte 0x66, 0xea # ljmpl opcode -2: .long 0 # offset +2: .long in_pm32 # offset .word __BOOT_CS # segment .size protected_mode_jump, .-protected_mode_jump + + .code32 + .type in_pm32, @function +in_pm32: + # Set up data segments for flat 32-bit mode + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + # The 32-bit code sets up its own stack, but this way we do have + # a valid stack if some debugging hack wants to use it. + addl %ebx, %esp + + # Set up TR to make Intel VT happy + ltr %di + + # Clear registers to allow for future extensions to the + # 32-bit boot protocol + xorl %ecx, %ecx + xorl %edx, %edx + xorl %ebx, %ebx + xorl %ebp, %ebp + xorl %edi, %edi + + # Set up LDTR to make Intel VT happy + lldt %cx + + jmpl *%eax # Jump to the 32-bit entrypoint + + .size in_pm32, .-in_pm32 diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index ed0672a8187..ff664a11709 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -104,6 +104,7 @@ static int bios_probe(void) mi = GET_HEAP(struct mode_info, 1); mi->mode = VIDEO_FIRST_BIOS+mode; + mi->depth = 0; /* text */ mi->x = rdfs16(0x44a); mi->y = rdfs8(0x484)+1; nmodes++; @@ -116,7 +117,7 @@ static int bios_probe(void) __videocard video_bios = { - .card_name = "BIOS (scanned)", + .card_name = "BIOS", .probe = bios_probe, .set_mode = bios_set_mode, .unsafe = 1, diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4716b9a9635..662dd2f1306 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -79,20 +79,28 @@ static int vesa_probe(void) /* Text Mode, TTY BIOS supported, supported by hardware */ mi = GET_HEAP(struct mode_info, 1); - mi->mode = mode + VIDEO_FIRST_VESA; - mi->x = vminfo.h_res; - mi->y = vminfo.v_res; + mi->mode = mode + VIDEO_FIRST_VESA; + mi->depth = 0; /* text */ + mi->x = vminfo.h_res; + mi->y = vminfo.v_res; nmodes++; - } else if ((vminfo.mode_attr & 0x99) == 0x99) { + } else if ((vminfo.mode_attr & 0x99) == 0x99 && + (vminfo.memory_layout == 4 || + vminfo.memory_layout == 6) && + vminfo.memory_planes == 1) { #ifdef CONFIG_FB /* Graphics mode, color, linear frame buffer - supported -- register the mode but hide from - the menu. Only do this if framebuffer is - configured, however, otherwise the user will - be left without a screen. */ + supported. Only register the mode if + if framebuffer is configured, however, + otherwise the user will be left without a screen. + We don't require CONFIG_FB_VESA, however, since + some of the other framebuffer drivers can use + this mode-setting, too. */ mi = GET_HEAP(struct mode_info, 1); mi->mode = mode + VIDEO_FIRST_VESA; - mi->x = mi->y = 0; + mi->depth = vminfo.bpp; + mi->x = vminfo.h_res; + mi->y = vminfo.v_res; nmodes++; #endif } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index aef02f9ec0c..7259387b7d1 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -18,22 +18,22 @@ #include "video.h" static struct mode_info vga_modes[] = { - { VIDEO_80x25, 80, 25 }, - { VIDEO_8POINT, 80, 50 }, - { VIDEO_80x43, 80, 43 }, - { VIDEO_80x28, 80, 28 }, - { VIDEO_80x30, 80, 30 }, - { VIDEO_80x34, 80, 34 }, - { VIDEO_80x60, 80, 60 }, + { VIDEO_80x25, 80, 25, 0 }, + { VIDEO_8POINT, 80, 50, 0 }, + { VIDEO_80x43, 80, 43, 0 }, + { VIDEO_80x28, 80, 28, 0 }, + { VIDEO_80x30, 80, 30, 0 }, + { VIDEO_80x34, 80, 34, 0 }, + { VIDEO_80x60, 80, 60, 0 }, }; static struct mode_info ega_modes[] = { - { VIDEO_80x25, 80, 25 }, - { VIDEO_8POINT, 80, 43 }, + { VIDEO_80x25, 80, 25, 0 }, + { VIDEO_8POINT, 80, 43, 0 }, }; static struct mode_info cga_modes[] = { - { VIDEO_80x25, 80, 25 }, + { VIDEO_80x25, 80, 25, 0 }, }; __videocard video_vga; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index ad9712f0173..696d08f3843 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -293,13 +293,28 @@ static void display_menu(void) struct mode_info *mi; char ch; int i; + int nmodes; + int modes_per_line; + int col; - puts("Mode: COLSxROWS:\n"); + nmodes = 0; + for (card = video_cards; card < video_cards_end; card++) + nmodes += card->nmodes; + modes_per_line = 1; + if (nmodes >= 20) + modes_per_line = 3; + + for (col = 0; col < modes_per_line; col++) + puts("Mode: Resolution: Type: "); + putchar('\n'); + + col = 0; ch = '0'; for (card = video_cards; card < video_cards_end; card++) { mi = card->modes; for (i = 0; i < card->nmodes; i++, mi++) { + char resbuf[32]; int visible = mi->x && mi->y; u16 mode_id = mi->mode ? mi->mode : (mi->y << 8)+mi->x; @@ -307,8 +322,18 @@ static void display_menu(void) if (!visible) continue; /* Hidden mode */ - printf("%c %04X %3dx%-3d %s\n", - ch, mode_id, mi->x, mi->y, card->card_name); + if (mi->depth) + sprintf(resbuf, "%dx%d", mi->y, mi->depth); + else + sprintf(resbuf, "%d", mi->y); + + printf("%c %03X %4dx%-7s %-6s", + ch, mode_id, mi->x, resbuf, card->card_name); + col++; + if (col >= modes_per_line) { + putchar('\n'); + col = 0; + } if (ch == '9') ch = 'a'; @@ -318,6 +343,8 @@ static void display_menu(void) ch++; } } + if (col) + putchar('\n'); } #define H(x) ((x)-'a'+10) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index b92447d5121..d69347f79e8 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -83,7 +83,8 @@ void store_screen(void); struct mode_info { u16 mode; /* Mode number (vga= style) */ - u8 x, y; /* Width, height */ + u16 x, y; /* Width, height */ + u16 depth; /* Bits per pixel, 0 for text mode */ }; struct card_info { diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c index 61c8fe0453b..6499e3239b4 100644 --- a/arch/x86/boot/voyager.c +++ b/arch/x86/boot/voyager.c @@ -16,8 +16,6 @@ #include "boot.h" -#ifdef CONFIG_X86_VOYAGER - int query_voyager(void) { u8 err; @@ -42,5 +40,3 @@ int query_voyager(void) copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ return 0; } - -#endif /* CONFIG_X86_VOYAGER */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 54ee1764fda..77562e7cdab 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -99,9 +99,9 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y -CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_AS is not set # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="anticipatory" diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 38a83f9c966..9e2b0ef851d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -145,15 +145,6 @@ CONFIG_K8_NUMA=y CONFIG_NODES_SHIFT=6 CONFIG_X86_64_ACPI_NUMA=y CONFIG_NUMA_EMU=y -CONFIG_ARCH_DISCONTIGMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_SELECT_MEMORY_MODEL=y -# CONFIG_FLATMEM_MANUAL is not set -CONFIG_DISCONTIGMEM_MANUAL=y -# CONFIG_SPARSEMEM_MANUAL is not set -CONFIG_DISCONTIGMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_NEED_MULTIPLE_NODES=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e2edda255a8..52d0ccfcf6e 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,9 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \ - mmap32.o +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) @@ -13,40 +11,3 @@ obj-$(CONFIG_IA32_AOUT) += ia32_aout.o audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) - -$(obj)/syscall32_syscall.o: \ - $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) - -# Teach kbuild about targets -targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\ - $F.o $F.so $F.so.dbg) - -# The DSO images are built using a special linker script -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m32 -nostdlib -shared \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-soname=linux-gate.so.1 -o $@ \ - -Wl,-T,$(filter-out FORCE,$^) - -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ -$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE - $(call if_changed,syscall) - -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 - -vdsos := vdso32-sysenter.so vdso32-syscall.so - -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \ - $(MODLIB)/vdso/$@ - -$(vdsos): - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: $(vdsos) diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 91b7b5922df..5d7b381da69 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -27,7 +27,7 @@ unsigned ia32_signal_class[] = { int ia32_classify_syscall(unsigned syscall) { - switch(syscall) { + switch (syscall) { case __NR_open: return 2; case __NR_openat: diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c deleted file mode 100644 index 2c8209a3605..00000000000 --- a/arch/x86/ia32/fpu32.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. - * This is used for ptrace, signals and coredumps in 32bit emulation. - */ - -#include <linux/sched.h> -#include <asm/sigcontext32.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/i387.h> - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for (i = 0 ; i < 8 ; i++) { - if (twd & 0x1) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if (st->significand[3] & 0x8000) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - - -static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, - struct _fpstate_ia32 __user *buf) -{ - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - u32 v; - int err = 0; - -#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) - G(0, fxsave->cwd); - G(1, fxsave->swd); - G(2, fxsave->twd); - fxsave->twd = twd_i387_to_fxsr(fxsave->twd); - G(3, fxsave->rip); - G(4, v); - fxsave->fop = v>>16; /* cs ignored */ - G(5, fxsave->rdp); - /* 6: ds ignored */ -#undef G - if (err) - return -1; - - to = (struct _fpxreg *)&fxsave->st_space[0]; - from = &buf->_st[0]; - for (i = 0 ; i < 8 ; i++, to++, from++) { - if (__copy_from_user(to, from, sizeof(*from))) - return -1; - } - return 0; -} - - -static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, - struct i387_fxsave_struct *fxsave, - struct pt_regs *regs, - struct task_struct *tsk) -{ - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - u16 cs,ds; - int err = 0; - - if (tsk == current) { - /* should be actually ds/cs at fpu exception time, - but that information is not available in 64bit mode. */ - asm("movw %%ds,%0 " : "=r" (ds)); - asm("movw %%cs,%0 " : "=r" (cs)); - } else { /* ptrace. task has stopped. */ - ds = tsk->thread.ds; - cs = regs->cs; - } - -#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) - P(0, (u32)fxsave->cwd | 0xffff0000); - P(1, (u32)fxsave->swd | 0xffff0000); - P(2, twd_fxsr_to_i387(fxsave)); - P(3, (u32)fxsave->rip); - P(4, cs | ((u32)fxsave->fop) << 16); - P(5, fxsave->rdp); - P(6, 0xffff0000 | ds); -#undef P - - if (err) - return -1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - if (__copy_to_user(to, from, sizeof(*to))) - return -1; - } - return 0; -} - -int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) -{ - clear_fpu(tsk); - if (!fsave) { - if (__copy_from_user(&tsk->thread.i387.fxsave, - &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct))) - return -1; - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - set_stopped_child_used_math(tsk); - } - return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); -} - -int save_i387_ia32(struct task_struct *tsk, - struct _fpstate_ia32 __user *buf, - struct pt_regs *regs, - int fsave) -{ - int err = 0; - - init_fpu(tsk); - if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) - return -1; - if (fsave) - return 0; - err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); - if (fsave) - return err ? -1 : 1; - err |= __put_user(X86_FXSR_MAGIC, &buf->magic); - err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct)); - return err ? -1 : 1; -} diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index f82e1a94fcb..e4c12079171 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -25,6 +25,7 @@ #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> +#include <linux/jiffies.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -36,61 +37,67 @@ #undef WARN_OLD #undef CORE_DUMP /* probably broken */ -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); -static int load_aout_library(struct file*); +static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); +static int load_aout_library(struct file *); #ifdef CORE_DUMP -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, + unsigned long limit); /* * fill in the user structure for a core dump.. */ -static void dump_thread32(struct pt_regs * regs, struct user32 * dump) +static void dump_thread32(struct pt_regs *regs, struct user32 *dump) { - u32 fs,gs; + u32 fs, gs; /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) + (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; - - if (dump->start_stack < 0xc0000000) - dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->rbx; - dump->regs.ecx = regs->rcx; - dump->regs.edx = regs->rdx; - dump->regs.esi = regs->rsi; - dump->regs.edi = regs->rdi; - dump->regs.ebp = regs->rbp; - dump->regs.eax = regs->rax; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < 0xc0000000) { + unsigned long tmp; + + tmp = (unsigned long) (0xc0000000 - dump->start_stack); + dump->u_ssize = tmp >> PAGE_SHIFT; + } + + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; dump->regs.ds = current->thread.ds; dump->regs.es = current->thread.es; asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; - asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; - dump->regs.orig_eax = regs->orig_rax; - dump->regs.eip = regs->rip; + asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; dump->regs.cs = regs->cs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->rsp; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; dump->regs.ss = regs->ss; #if 1 /* FIXME */ dump->u_fpvalid = 0; #else - dump->u_fpvalid = dump_fpu (regs, &dump->i387); + dump->u_fpvalid = dump_fpu(regs, &dump->i387); #endif } @@ -128,15 +135,19 @@ static int dump_write(struct file *file, const void *addr, int nr) return file->f_op->write(file, addr, nr, &file->f_pos) == nr; } -#define DUMP_WRITE(addr, nr) \ +#define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ -if (file->f_op->llseek) { \ - if (file->f_op->llseek(file,(offset),0) != (offset)) \ - goto end_coredump; \ -} else file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (file->f_op->llseek) { \ + if (file->f_op->llseek(file, (offset), 0) != (offset)) \ + goto end_coredump; \ + } else \ + file->f_pos = (offset) + +#define START_DATA() (u.u_tsize << PAGE_SHIFT) +#define START_STACK(u) (u.start_stack) /* * Routine writes a core dump image in the current directory. @@ -148,62 +159,70 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, + unsigned long limit) { mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; struct user32 dump; -# define START_DATA(u) (u.u_tsize << PAGE_SHIFT) -# define START_STACK(u) (u.start_stack) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; current->flags |= PF_DUMPCORE; - strncpy(dump.u_comm, current->comm, sizeof(current->comm)); - dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); + strncpy(dump.u_comm, current->comm, sizeof(current->comm)); + dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - + ((unsigned long)(&dump))); dump.signal = signr; dump_thread32(regs, &dump); -/* If the size of the dump file exceeds the rlimit, then see what would happen - if we wrote the stack, but not the data area. */ + /* + * If the size of the dump file exceeds the rlimit, then see + * what would happen if we wrote the stack, but not the data + * area. + */ if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_dsize = 0; -/* Make sure we have enough room to write the stack and data areas. */ + /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_ssize = 0; -/* make sure we actually have a data and stack area to dump */ + /* make sure we actually have a data and stack area to dump */ set_fs(USER_DS); - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), + dump.u_dsize << PAGE_SHIFT)) dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), + dump.u_ssize << PAGE_SHIFT)) dump.u_ssize = 0; set_fs(KERNEL_DS); -/* struct user */ - DUMP_WRITE(&dump,sizeof(dump)); -/* Now dump all of the user data. Include malloced stuff as well */ + /* struct user */ + DUMP_WRITE(&dump, sizeof(dump)); + /* Now dump all of the user data. Include malloced stuff as well */ DUMP_SEEK(PAGE_SIZE); -/* now we start writing out the user space info */ + /* now we start writing out the user space info */ set_fs(USER_DS); -/* Dump the data area */ + /* Dump the data area */ if (dump.u_dsize != 0) { dump_start = START_DATA(dump); dump_size = dump.u_dsize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + DUMP_WRITE(dump_start, dump_size); } -/* Now prepare to dump the stack area */ + /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); dump_size = dump.u_ssize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + DUMP_WRITE(dump_start, dump_size); } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ + /* + * Finally dump the task struct. Not be used by gdb, but + * could be useful + */ set_fs(KERNEL_DS); - DUMP_WRITE(current,sizeof(*current)); + DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; @@ -217,35 +236,34 @@ end_coredump: */ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) { - u32 __user *argv; - u32 __user *envp; - u32 __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; + u32 __user *argv, *envp, *sp; + int argc = bprm->argc, envc = bprm->envc; sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); sp -= envc+1; envp = sp; sp -= argc+1; argv = sp; - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); - put_user(argc,--sp); + put_user((unsigned long) envp, --sp); + put_user((unsigned long) argv, --sp); + put_user(argc, --sp); current->mm->arg_start = (unsigned long) p; - while (argc-->0) { + while (argc-- > 0) { char c; - put_user((u32)(unsigned long)p,argv++); + + put_user((u32)(unsigned long)p, argv++); do { - get_user(c,p++); + get_user(c, p++); } while (c); } put_user(0, argv); current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { + while (envc-- > 0) { char c; - put_user((u32)(unsigned long)p,envp++); + + put_user((u32)(unsigned long)p, envp++); do { - get_user(c,p++); + get_user(c, p++); } while (c); } put_user(0, envp); @@ -257,20 +275,18 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) * These are the functions used to load a.out style executables and shared * libraries. There is no binary dependent code anywhere else. */ - -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) { + unsigned long error, fd_offset, rlim; struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; int retval; ex = *((struct exec *) bprm->buf); /* exec-header */ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(bprm->file->f_path.dentry->d_inode) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { return -ENOEXEC; } @@ -291,13 +307,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (retval) return retval; - regs->cs = __USER32_CS; + regs->cs = __USER32_CS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; /* OK, This is the point of no return */ set_personality(PER_LINUX); - set_thread_flag(TIF_IA32); + set_thread_flag(TIF_IA32); clear_thread_flag(TIF_ABI_PENDING); current->mm->end_code = ex.a_text + @@ -311,7 +327,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->mmap = NULL; compute_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; + current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; @@ -338,30 +354,31 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) send_sig(SIGKILL, current, 0); return error; } - + flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { #ifdef WARN_OLD static unsigned long error_time, error_time2; if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) - { + (N_MAGIC(ex) != NMAGIC) && + time_after(jiffies, error_time2 + 5*HZ)) { printk(KERN_NOTICE "executable not page aligned\n"); error_time2 = jiffies; } if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %s\n", + time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert " + "program: %s\n", bprm->file->f_path.dentry->d_name.name); error_time = jiffies; } #endif - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { loff_t pos = fd_offset; + down_write(¤t->mm->mmap_sem); do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); up_write(¤t->mm->mmap_sem); @@ -376,9 +393,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) down_write(¤t->mm->mmap_sem); error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, - fd_offset); + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, + fd_offset); up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { @@ -387,9 +405,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) } down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, fd_offset + ex.a_text); up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { @@ -403,9 +422,9 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); return retval; } @@ -414,10 +433,10 @@ beyond_if: /* start thread */ asm volatile("movl %0,%%fs" :: "r" (0)); \ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); - load_gs_index(0); - (regs)->rip = ex.a_entry; - (regs)->rsp = current->mm->start_stack; - (regs)->eflags = 0x200; + load_gs_index(0); + (regs)->ip = ex.a_entry; + (regs)->sp = current->mm->start_stack; + (regs)->flags = 0x200; (regs)->cs = __USER32_CS; (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = @@ -425,7 +444,7 @@ beyond_if: set_fs(USER_DS); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); else send_sig(SIGTRAP, current, 0); } @@ -434,9 +453,8 @@ beyond_if: static int load_aout_library(struct file *file) { - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; + struct inode *inode; + unsigned long bss, start_addr, len, error; int retval; struct exec ex; @@ -450,7 +468,8 @@ static int load_aout_library(struct file *file) /* We come in here for the regular a.out style of shared libraries */ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(inode) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { goto out; } @@ -467,10 +486,10 @@ static int load_aout_library(struct file *file) #ifdef WARN_OLD static unsigned long error_time; - if ((jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %s\n", + if (time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert " + "library: %s\n", file->f_path.dentry->d_name.name); error_time = jiffies; } @@ -478,11 +497,12 @@ static int load_aout_library(struct file *file) down_write(¤t->mm->mmap_sem); do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); up_write(¤t->mm->mmap_sem); - + file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); + (unsigned long) start_addr + ex.a_text + + ex.a_data); retval = 0; goto out; diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c deleted file mode 100644 index 55822d2cf05..00000000000 --- a/arch/x86/ia32/ia32_binfmt.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Written 2000,2002 by Andi Kleen. - * - * Loosely based on the sparc64 and IA64 32bit emulation loaders. - * This tricks binfmt_elf.c into loading 32bit binaries using lots - * of ugly preprocessor tricks. Talk about very very poor man's inheritance. - */ - -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/compat.h> -#include <linux/string.h> -#include <linux/binfmts.h> -#include <linux/mm.h> -#include <linux/security.h> -#include <linux/elfcore-compat.h> - -#include <asm/segment.h> -#include <asm/ptrace.h> -#include <asm/processor.h> -#include <asm/user32.h> -#include <asm/sigcontext32.h> -#include <asm/fpu32.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/ia32.h> -#include <asm/vsyscall32.h> - -#undef ELF_ARCH -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 -#define ELF_ARCH EM_386 - -#undef elfhdr -#undef elf_phdr -#undef elf_note -#undef elf_addr_t -#define elfhdr elf32_hdr -#define elf_phdr elf32_phdr -#define elf_note elf32_note -#define elf_addr_t Elf32_Off - -#define ELF_NAME "elf/i386" - -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 - -int sysctl_vsyscall32 = 1; - -#undef ARCH_DLINFO -#define ARCH_DLINFO do { \ - if (sysctl_vsyscall32) { \ - current->mm->context.vdso = (void *)VSYSCALL32_BASE; \ - NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ - } \ -} while(0) - -struct file; - -#define IA32_EMULATOR 1 - -#undef ELF_ET_DYN_BASE - -#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) - -#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) - -#define _GET_SEG(x) \ - ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; }) - -/* Assumes current==process to be dumped */ -#undef ELF_CORE_COPY_REGS -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - pr_reg[0] = regs->rbx; \ - pr_reg[1] = regs->rcx; \ - pr_reg[2] = regs->rdx; \ - pr_reg[3] = regs->rsi; \ - pr_reg[4] = regs->rdi; \ - pr_reg[5] = regs->rbp; \ - pr_reg[6] = regs->rax; \ - pr_reg[7] = _GET_SEG(ds); \ - pr_reg[8] = _GET_SEG(es); \ - pr_reg[9] = _GET_SEG(fs); \ - pr_reg[10] = _GET_SEG(gs); \ - pr_reg[11] = regs->orig_rax; \ - pr_reg[12] = regs->rip; \ - pr_reg[13] = regs->cs; \ - pr_reg[14] = regs->eflags; \ - pr_reg[15] = regs->rsp; \ - pr_reg[16] = regs->ss; - - -#define elf_prstatus compat_elf_prstatus -#define elf_prpsinfo compat_elf_prpsinfo -#define elf_fpregset_t struct user_i387_ia32_struct -#define elf_fpxregset_t struct user32_fxsr_struct -#define user user32 - -#undef elf_read_implies_exec -#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) - -#define elf_core_copy_regs elf32_core_copy_regs -static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs, - struct pt_regs *regs) -{ - ELF_CORE_COPY_REGS((&elfregs->ebx), regs) -} - -#define elf_core_copy_task_regs elf32_core_copy_task_regs -static inline int elf32_core_copy_task_regs(struct task_struct *t, - compat_elf_gregset_t* elfregs) -{ - struct pt_regs *pp = task_pt_regs(t); - ELF_CORE_COPY_REGS((&elfregs->ebx), pp); - /* fix wrong segments */ - elfregs->ds = t->thread.ds; - elfregs->fs = t->thread.fsindex; - elfregs->gs = t->thread.gsindex; - elfregs->es = t->thread.es; - return 1; -} - -#define elf_core_copy_task_fpregs elf32_core_copy_task_fpregs -static inline int -elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, - elf_fpregset_t *fpu) -{ - struct _fpstate_ia32 *fpstate = (void*)fpu; - mm_segment_t oldfs = get_fs(); - - if (!tsk_used_math(tsk)) - return 0; - if (!regs) - regs = task_pt_regs(tsk); - if (tsk == current) - unlazy_fpu(tsk); - set_fs(KERNEL_DS); - save_i387_ia32(tsk, fpstate, regs, 1); - /* Correct for i386 bug. It puts the fop into the upper 16bits of - the tag word (like FXSAVE), not into the fcs*/ - fpstate->cssel |= fpstate->tag & 0xffff0000; - set_fs(oldfs); - return 1; -} - -#define ELF_CORE_COPY_XFPREGS 1 -#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG -#define elf_core_copy_task_xfpregs elf32_core_copy_task_xfpregs -static inline int -elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) -{ - struct pt_regs *regs = task_pt_regs(t); - if (!tsk_used_math(t)) - return 0; - if (t == current) - unlazy_fpu(t); - memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); - xfpu->fcs = regs->cs; - xfpu->fos = t->thread.ds; /* right? */ - return 1; -} - -#undef elf_check_arch -#define elf_check_arch(x) \ - ((x)->e_machine == EM_386) - -extern int force_personality32; - -#undef ELF_EXEC_PAGESIZE -#undef ELF_HWCAP -#undef ELF_PLATFORM -#undef SET_PERSONALITY -#define ELF_EXEC_PAGESIZE PAGE_SIZE -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) -#define ELF_PLATFORM ("i686") -#define SET_PERSONALITY(ex, ibcs2) \ -do { \ - unsigned long new_flags = 0; \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_IA32; \ - if ((current_thread_info()->flags & _TIF_IA32) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ - else \ - clear_thread_flag(TIF_ABI_PENDING); \ - /* XXX This overwrites the user set personality */ \ - current->personality |= force_personality32; \ -} while (0) - -/* Override some function names */ -#define elf_format elf32_format - -#define init_elf_binfmt init_elf32_binfmt -#define exit_elf_binfmt exit_elf32_binfmt - -#define load_elf_binary load_elf32_binary - -#undef ELF_PLAT_INIT -#define ELF_PLAT_INIT(r, load_addr) elf32_init(r) - -#undef start_thread -#define start_thread(regs,new_rip,new_rsp) do { \ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ - load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - (regs)->eflags = 0x200; \ - (regs)->cs = __USER32_CS; \ - (regs)->ss = __USER32_DS; \ - set_fs(USER_DS); \ -} while(0) - - -#include <linux/module.h> - -MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); -MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); - -#undef MODULE_DESCRIPTION -#undef MODULE_AUTHOR - -static void elf32_init(struct pt_regs *); - -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#define arch_setup_additional_pages syscall32_setup_pages -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); - -#include "../../../fs/binfmt_elf.c" - -static void elf32_init(struct pt_regs *regs) -{ - struct task_struct *me = current; - regs->rdi = 0; - regs->rsi = 0; - regs->rdx = 0; - regs->rcx = 0; - regs->rax = 0; - regs->rbx = 0; - regs->rbp = 0; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - me->thread.fs = 0; - me->thread.gs = 0; - me->thread.fsindex = 0; - me->thread.gsindex = 0; - me->thread.ds = __USER_DS; - me->thread.es = __USER_DS; -} - -#ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ -#include <linux/sysctl.h> - -static ctl_table abi_table2[] = { - { - .procname = "vsyscall32", - .data = &sysctl_vsyscall32, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table abi_root_table2[] = { - { - .ctl_name = CTL_ABI, - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - -static __init int ia32_binfmt_init(void) -{ - register_sysctl_table(abi_root_table2); - return 0; -} -__initcall(ia32_binfmt_init); -#endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 6ea19c25f90..1c0503bdfb1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -29,9 +29,8 @@ #include <asm/ia32_unistd.h> #include <asm/user32.h> #include <asm/sigcontext32.h> -#include <asm/fpu32.h> #include <asm/proto.h> -#include <asm/vsyscall32.h> +#include <asm/vdso.h> #define DEBUG_SIG 0 @@ -43,7 +42,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err; - if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; /* If you change siginfo_t structure, please make sure that @@ -53,16 +53,19 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 3 ints plus the relevant union member. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user((short)from->si_code, &to->si_code); if (from->si_code < 0) { err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); } else { - /* First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) */ - err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + err |= __put_user(from->_sifields._pad[0], + &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: break; @@ -76,14 +79,15 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); break; case __SI_POLL >> 16: - err |= __put_user(from->si_fd, &to->si_fd); + err |= __put_user(from->si_fd, &to->si_fd); break; case __SI_TIMER >> 16: - err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(ptr_to_compat(from->si_ptr), - &to->si_ptr); + &to->si_ptr); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ + /* This is not generated by the kernel as of now. */ + case __SI_RT >> 16: case __SI_MESGQ >> 16: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); @@ -97,7 +101,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { int err; u32 ptr32; - if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) return -EFAULT; err = __get_user(to->si_signo, &from->si_signo); @@ -112,8 +117,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) return err; } -asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask) +asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); @@ -128,36 +132,37 @@ sys32_sigsuspend(int history0, int history1, old_sigset_t mask) return -ERESTARTNOHAND; } -asmlinkage long -sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, - stack_ia32_t __user *uoss_ptr, - struct pt_regs *regs) +asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, + stack_ia32_t __user *uoss_ptr, + struct pt_regs *regs) { - stack_t uss,uoss; + stack_t uss, uoss; int ret; - mm_segment_t seg; - if (uss_ptr) { + mm_segment_t seg; + + if (uss_ptr) { u32 ptr; - memset(&uss,0,sizeof(stack_t)); - if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || + + memset(&uss, 0, sizeof(stack_t)); + if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) || __get_user(ptr, &uss_ptr->ss_sp) || __get_user(uss.ss_flags, &uss_ptr->ss_flags) || __get_user(uss.ss_size, &uss_ptr->ss_size)) return -EFAULT; uss.ss_sp = compat_ptr(ptr); } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); - set_fs(seg); + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); + set_fs(seg); if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || + if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) || __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || __put_user(uoss.ss_size, &uoss_ptr->ss_size)) ret = -EFAULT; - } - return ret; + } + return ret; } /* @@ -186,87 +191,85 @@ struct rt_sigframe char retcode[8]; }; -static int -ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) +#define COPY(x) { \ + unsigned int reg; \ + err |= __get_user(reg, &sc->x); \ + regs->x = reg; \ +} + +#define RELOAD_SEG(seg,mask) \ + { unsigned int cur; \ + unsigned short pre; \ + err |= __get_user(pre, &sc->seg); \ + asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ + pre |= mask; \ + if (pre != cur) loadsegment(seg, pre); } + +static int ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_ia32 __user *sc, + unsigned int *peax) { - unsigned int err = 0; - + unsigned int tmpflags, gs, oldgs, err = 0; + struct _fpstate_ia32 __user *buf; + u32 tmp; + /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; #if DEBUG_SIG - printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->eip, sc->cs, sc->eflags); + printk(KERN_DEBUG "SIG restore_sigcontext: " + "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", + sc, sc->err, sc->ip, sc->cs, sc->flags); #endif -#define COPY(x) { \ - unsigned int reg; \ - err |= __get_user(reg, &sc->e ##x); \ - regs->r ## x = reg; \ -} -#define RELOAD_SEG(seg,mask) \ - { unsigned int cur; \ - unsigned short pre; \ - err |= __get_user(pre, &sc->seg); \ - asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ - pre |= mask; \ - if (pre != cur) loadsegment(seg,pre); } - - /* Reload fs and gs if they have changed in the signal handler. - This does not handle long fs/gs base changes in the handler, but - does not clobber them at least in the normal case. */ - - { - unsigned gs, oldgs; - err |= __get_user(gs, &sc->gs); - gs |= 3; - asm("movl %%gs,%0" : "=r" (oldgs)); - if (gs != oldgs) - load_gs_index(gs); - } - RELOAD_SEG(fs,3); - RELOAD_SEG(ds,3); - RELOAD_SEG(es,3); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + err |= __get_user(gs, &sc->gs); + gs |= 3; + asm("movl %%gs,%0" : "=r" (oldgs)); + if (gs != oldgs) + load_gs_index(gs); + + RELOAD_SEG(fs, 3); + RELOAD_SEG(ds, 3); + RELOAD_SEG(es, 3); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); - /* Don't touch extended registers */ - - err |= __get_user(regs->cs, &sc->cs); - regs->cs |= 3; - err |= __get_user(regs->ss, &sc->ss); - regs->ss |= 3; - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } + /* Don't touch extended registers */ + + err |= __get_user(regs->cs, &sc->cs); + regs->cs |= 3; + err |= __get_user(regs->ss, &sc->ss); + regs->ss |= 3; + + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + /* disable syscall checks */ + regs->orig_ax = -1; + + err |= __get_user(tmp, &sc->fpstate); + buf = compat_ptr(tmp); + if (buf) { + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) + goto badframe; + err |= restore_i387_ia32(buf); + } else { + struct task_struct *me = current; - { - u32 tmp; - struct _fpstate_ia32 __user * buf; - err |= __get_user(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387_ia32(current, buf, 0); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } + if (used_math()) { + clear_fpu(me); + clear_used_math(); } } - { - u32 tmp; - err |= __get_user(tmp, &sc->eax); - *peax = tmp; - } + err |= __get_user(tmp, &sc->ax); + *peax = tmp; + return err; badframe: @@ -275,15 +278,16 @@ badframe: asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); sigset_t set; - unsigned int eax; + unsigned int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, + && __copy_from_user((((char *) &set.sig) + 4), + &frame->extramask, sizeof(frame->extramask)))) goto badframe; @@ -292,24 +296,24 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) + + if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: signal_fault(regs, frame, "32bit sigreturn"); return 0; -} +} asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned int eax; + unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->rsp - 4); + frame = (struct rt_sigframe __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -321,28 +325,28 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; tregs = *regs; if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) goto badframe; - return eax; + return ax; badframe: - signal_fault(regs,frame,"32bit rt sigreturn"); + signal_fault(regs, frame, "32bit rt sigreturn"); return 0; -} +} /* * Set up a signal frame. */ -static int -ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, - struct pt_regs *regs, unsigned int mask) +static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, + struct _fpstate_ia32 __user *fpstate, + struct pt_regs *regs, unsigned int mask) { int tmp, err = 0; @@ -356,26 +360,26 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __ __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->rdi, &sc->edi); - err |= __put_user((u32)regs->rsi, &sc->esi); - err |= __put_user((u32)regs->rbp, &sc->ebp); - err |= __put_user((u32)regs->rsp, &sc->esp); - err |= __put_user((u32)regs->rbx, &sc->ebx); - err |= __put_user((u32)regs->rdx, &sc->edx); - err |= __put_user((u32)regs->rcx, &sc->ecx); - err |= __put_user((u32)regs->rax, &sc->eax); + err |= __put_user((u32)regs->di, &sc->di); + err |= __put_user((u32)regs->si, &sc->si); + err |= __put_user((u32)regs->bp, &sc->bp); + err |= __put_user((u32)regs->sp, &sc->sp); + err |= __put_user((u32)regs->bx, &sc->bx); + err |= __put_user((u32)regs->dx, &sc->dx); + err |= __put_user((u32)regs->cx, &sc->cx); + err |= __put_user((u32)regs->ax, &sc->ax); err |= __put_user((u32)regs->cs, &sc->cs); err |= __put_user((u32)regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->rip, &sc->eip); - err |= __put_user((u32)regs->eflags, &sc->eflags); - err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); + err |= __put_user((u32)regs->ip, &sc->ip); + err |= __put_user((u32)regs->flags, &sc->flags); + err |= __put_user((u32)regs->sp, &sc->sp_at_signal); - tmp = save_i387_ia32(current, fpstate, regs, 0); + tmp = save_i387_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { + else { clear_used_math(); stts(); err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), @@ -392,40 +396,53 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __ /* * Determine which stack to use.. */ -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, + size_t frame_size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack */ - rsp = regs->rsp; + sp = regs->sp; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - rsp = (unsigned long) ka->sa.sa_restorer; - } + ka->sa.sa_restorer) + sp = (unsigned long) ka->sa.sa_restorer; - rsp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - rsp = ((rsp + 4) & -16ul) - 4; - return (void __user *) rsp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } int ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) + compat_sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; + void __user *restorer; int err = 0; + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + u16 pad; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + 0, + }; + frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) @@ -443,64 +460,53 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (_COMPAT_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); + if (err) + goto give_sigsegv; } - if (err) - goto give_sigsegv; - /* Return stub is in 32bit vsyscall page */ - { - void __user *restorer; + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* Return stub is in 32bit vsyscall page */ if (current->binfmt->hasvdso) - restorer = VSYSCALL32_SIGRETURN; + restorer = VDSO32_SYMBOL(current->mm->context.vdso, + sigreturn); else - restorer = (void *)&frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - /* These are actually not used anymore, but left because some - gdb versions depend on them as a marker. */ - { - /* copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u16 poplmovl; - u32 val; - u16 int80; - u16 pad; - } __attribute__((packed)) code = { - 0xb858, /* popl %eax ; movl $...,%eax */ - __NR_ia32_sigreturn, - 0x80cd, /* int $0x80 */ - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); + restorer = &frame->retcode; } + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + err |= __copy_to_user(frame->retcode, &code, 8); if (err) goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = 0; - regs->rcx = 0; + regs->ax = sig; + regs->dx = 0; + regs->cx = 0; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -511,25 +517,34 @@ give_sigsegv: } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) + compat_sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; + struct exec_domain *ed = current_thread_info()->exec_domain; + void __user *restorer; int err = 0; + /* __copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u16 pad; + u8 pad2; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - { - struct exec_domain *ed = current_thread_info()->exec_domain; - err |= __put_user((ed - && ed->signal_invmap - && sig < 32 - ? ed->signal_invmap[sig] - : sig), - &frame->sig); - } + err |= __put_user((ed && ed->signal_invmap && sig < 32 + ? ed->signal_invmap[sig] : sig), &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); @@ -540,73 +555,58 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); + regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto give_sigsegv; - - { - void __user *restorer = VSYSCALL32_RTSIGRETURN; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - - /* This is movl $,%eax ; int $0x80 */ - /* Not actually used anymore, but left because some gdb versions - need it. */ - { - /* __copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u8 movl; - u32 val; - u16 int80; - u16 pad; - u8 pad2; - } __attribute__((packed)) code = { - 0xb8, - __NR_ia32_rt_sigreturn, - 0x80cd, - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); - } + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + else + restorer = VDSO32_SYMBOL(current->mm->context.vdso, + rt_sigreturn); + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + err |= __copy_to_user(frame->retcode, &code, 8); if (err) goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; + + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index df588f0f76e..0db0a6291bb 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -12,7 +12,6 @@ #include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> -#include <asm/vsyscall32.h> #include <asm/irqflags.h> #include <linux/linkage.h> @@ -104,7 +103,7 @@ ENTRY(ia32_sysenter_target) pushfq CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ - movl $VSYSCALL32_SYSEXIT, %r10d + movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 pushq $__USER32_CS CFI_ADJUST_CFA_OFFSET 8 @@ -142,6 +141,8 @@ sysenter_do_call: andl $~TS_COMPAT,threadinfo_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) + movl RIP-R11(%rsp),%edx /* User %eip */ + CFI_REGISTER rip,rdx RESTORE_ARGS 1,24,1,1,1,1 popfq CFI_ADJUST_CFA_OFFSET -8 @@ -149,8 +150,6 @@ sysenter_do_call: popq %rcx /* User %esp */ CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rsp,rcx - movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ - CFI_REGISTER rip,rdx TRACE_IRQS_ON swapgs sti /* sti only takes effect after the next instruction */ @@ -644,8 +643,8 @@ ia32_sys_call_table: .quad compat_sys_futex /* 240 */ .quad compat_sys_sched_setaffinity .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area + .quad sys_set_thread_area + .quad sys_get_thread_area .quad compat_sys_io_setup /* 245 */ .quad sys_io_destroy .quad compat_sys_io_getevents diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 7b3342e5aab..d21991ce606 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -9,9 +9,8 @@ #include <linux/ipc.h> #include <linux/compat.h> -asmlinkage long -sys32_ipc(u32 call, int first, int second, int third, - compat_uptr_t ptr, u32 fifth) +asmlinkage long sys32_ipc(u32 call, int first, int second, int third, + compat_uptr_t ptr, u32 fifth) { int version; @@ -19,36 +18,35 @@ sys32_ipc(u32 call, int first, int second, int third, call &= 0xffff; switch (call) { - case SEMOP: + case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ return sys_semtimedop(first, compat_ptr(ptr), second, NULL); - case SEMTIMEDOP: + case SEMTIMEDOP: return compat_sys_semtimedop(first, compat_ptr(ptr), second, compat_ptr(fifth)); - case SEMGET: + case SEMGET: return sys_semget(first, second, third); - case SEMCTL: + case SEMCTL: return compat_sys_semctl(first, second, third, compat_ptr(ptr)); - case MSGSND: + case MSGSND: return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - case MSGRCV: + case MSGRCV: return compat_sys_msgrcv(first, second, fifth, third, version, compat_ptr(ptr)); - case MSGGET: + case MSGGET: return sys_msgget((key_t) first, second); - case MSGCTL: + case MSGCTL: return compat_sys_msgctl(first, second, compat_ptr(ptr)); - case SHMAT: + case SHMAT: return compat_sys_shmat(first, second, third, version, compat_ptr(ptr)); - break; - case SHMDT: + case SHMDT: return sys_shmdt(compat_ptr(ptr)); - case SHMGET: + case SHMGET: return sys_shmget(first, (unsigned)second, third); - case SHMCTL: + case SHMCTL: return compat_sys_shmctl(first, second, compat_ptr(ptr)); } return -ENOSYS; diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c deleted file mode 100644 index e4b84b4a417..00000000000 --- a/arch/x86/ia32/mmap32.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * linux/arch/x86_64/ia32/mm/mmap.c - * - * flexible mmap layout support - * - * Based on the i386 version which was - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline unsigned long mmap_base(struct mm_struct *mm) -{ - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; - - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void ia32_pick_mmap_layout(struct mm_struct *mm) -{ - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { - mm->mmap_base = mmap_base(mm); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -} diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c deleted file mode 100644 index 4a233ad6269..00000000000 --- a/arch/x86/ia32/ptrace32.c +++ /dev/null @@ -1,404 +0,0 @@ -/* - * 32bit ptrace for x86-64. - * - * Copyright 2001,2002 Andi Kleen, SuSE Labs. - * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier - * copyright. - * - * This allows to access 64bit processes too; but there is no way to see the extended - * register contents. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/syscalls.h> -#include <linux/unistd.h> -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/ptrace.h> -#include <asm/ptrace.h> -#include <asm/compat.h> -#include <asm/uaccess.h> -#include <asm/user32.h> -#include <asm/user.h> -#include <asm/errno.h> -#include <asm/debugreg.h> -#include <asm/i387.h> -#include <asm/fpu32.h> -#include <asm/ia32.h> - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break - -static int putreg32(struct task_struct *child, unsigned regno, u32 val) -{ - int i; - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - if (val && (val & 3) != 3) return -EIO; - child->thread.fsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.gs): - if (val && (val & 3) != 3) return -EIO; - child->thread.gsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.ds): - if (val && (val & 3) != 3) return -EIO; - child->thread.ds = val & 0xffff; - break; - case offsetof(struct user32, regs.es): - child->thread.es = val & 0xffff; - break; - case offsetof(struct user32, regs.ss): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; - break; - case offsetof(struct user32, regs.cs): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; - break; - - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(esp, rsp); - - case offsetof(struct user32, regs.eflags): { - __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; - val &= FLAG_MASK; - *flags = val | (*flags & ~FLAG_MASK); - break; - } - - case offsetof(struct user32, u_debugreg[4]): - case offsetof(struct user32, u_debugreg[5]): - return -EIO; - - case offsetof(struct user32, u_debugreg[0]): - child->thread.debugreg0 = val; - break; - - case offsetof(struct user32, u_debugreg[1]): - child->thread.debugreg1 = val; - break; - - case offsetof(struct user32, u_debugreg[2]): - child->thread.debugreg2 = val; - break; - - case offsetof(struct user32, u_debugreg[3]): - child->thread.debugreg3 = val; - break; - - case offsetof(struct user32, u_debugreg[6]): - child->thread.debugreg6 = val; - break; - - case offsetof(struct user32, u_debugreg[7]): - val &= ~DR_CONTROL_RESERVED; - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - for(i=0; i<4; i++) - if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = val; - if (val) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - break; - } - return 0; -} - -#undef R32 - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break - -static int getreg32(struct task_struct *child, unsigned regno, u32 *val) -{ - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - *val = child->thread.fsindex; - break; - case offsetof(struct user32, regs.gs): - *val = child->thread.gsindex; - break; - case offsetof(struct user32, regs.ds): - *val = child->thread.ds; - break; - case offsetof(struct user32, regs.es): - *val = child->thread.es; - break; - - R32(cs, cs); - R32(ss, ss); - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(eflags, eflags); - R32(esp, rsp); - - case offsetof(struct user32, u_debugreg[0]): - *val = child->thread.debugreg0; - break; - case offsetof(struct user32, u_debugreg[1]): - *val = child->thread.debugreg1; - break; - case offsetof(struct user32, u_debugreg[2]): - *val = child->thread.debugreg2; - break; - case offsetof(struct user32, u_debugreg[3]): - *val = child->thread.debugreg3; - break; - case offsetof(struct user32, u_debugreg[6]): - *val = child->thread.debugreg6; - break; - case offsetof(struct user32, u_debugreg[7]): - *val = child->thread.debugreg7; - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - *val = 0; - break; - } - return 0; -} - -#undef R32 - -static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) -{ - int ret; - compat_siginfo_t __user *si32 = compat_ptr(data); - siginfo_t ssi; - siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); - if (request == PTRACE_SETSIGINFO) { - memset(&ssi, 0, sizeof(siginfo_t)); - ret = copy_siginfo_from_user32(&ssi, si32); - if (ret) - return ret; - if (copy_to_user(si, &ssi, sizeof(siginfo_t))) - return -EFAULT; - } - ret = sys_ptrace(request, pid, addr, (unsigned long)si); - if (ret) - return ret; - if (request == PTRACE_GETSIGINFO) { - if (copy_from_user(&ssi, si, sizeof(siginfo_t))) - return -EFAULT; - ret = copy_siginfo_to_user32(si32, &ssi); - } - return ret; -} - -asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) -{ - struct task_struct *child; - struct pt_regs *childregs; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - - switch (request) { - case PTRACE_TRACEME: - case PTRACE_ATTACH: - case PTRACE_KILL: - case PTRACE_CONT: - case PTRACE_SINGLESTEP: - case PTRACE_DETACH: - case PTRACE_SYSCALL: - case PTRACE_OLDSETOPTIONS: - case PTRACE_SETOPTIONS: - case PTRACE_SET_THREAD_AREA: - case PTRACE_GET_THREAD_AREA: - return sys_ptrace(request, pid, addr, data); - - default: - return -EINVAL; - - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - case PTRACE_POKEUSR: - case PTRACE_PEEKUSR: - case PTRACE_GETREGS: - case PTRACE_SETREGS: - case PTRACE_SETFPREGS: - case PTRACE_GETFPREGS: - case PTRACE_SETFPXREGS: - case PTRACE_GETFPXREGS: - case PTRACE_GETEVENTMSG: - break; - - case PTRACE_SETSIGINFO: - case PTRACE_GETSIGINFO: - return ptrace32_siginfo(request, pid, addr, data); - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) - return PTR_ERR(child); - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out; - - childregs = task_pt_regs(child); - - switch (request) { - case PTRACE_PEEKDATA: - case PTRACE_PEEKTEXT: - ret = 0; - if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) - ret = -EIO; - else - ret = put_user(val, (unsigned int __user *)datap); - break; - - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) - ret = -EIO; - break; - - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - int i; - if (!access_ok(VERIFY_WRITE, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { - getreg32(child, i, &val); - ret |= __put_user(val,(u32 __user *)datap); - datap += sizeof(u32); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - int i; - if (!access_ok(VERIFY_READ, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4; i += sizeof(u32) ) { - ret |= __get_user(tmp, (u32 __user *)datap); - putreg32(child, i, tmp); - datap += sizeof(u32); - } - break; - } - - case PTRACE_GETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_READ, compat_ptr(data), - sizeof(struct user_i387_struct))) - break; - save_i387_ia32(child, datap, childregs, 1); - ret = 0; - break; - - case PTRACE_SETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) - break; - ret = 0; - /* don't check EFAULT to be bug-to-bug compatible to i386 */ - restore_i387_ia32(child, datap, 1); - break; - - case PTRACE_GETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - init_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) - break; - ret = -EFAULT; - if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) - break; - ret = __put_user(childregs->cs, &u->fcs); - ret |= __put_user(child->thread.ds, &u->fos); - break; - } - case PTRACE_SETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - unlazy_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_READ, u, sizeof(*u))) - break; - /* no checking to be bug-to-bug compatible with i386. */ - /* but silence warning */ - if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) - ; - set_stopped_child_used_math(child); - child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - ret = 0; - break; - } - - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); - break; - - default: - BUG(); - } - - out: - put_task_struct(child); - return ret; -} - diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index bee96d61443..abf71d26fc2 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -1,29 +1,29 @@ /* * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on - * sys_sparc32 + * sys_sparc32 * * Copyright (C) 2000 VA Linux Co * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> - * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) * Copyright (C) 2000 Hewlett-Packard Co. * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> - * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) * * These routines maintain argument size conversion between 32bit and 64bit - * environment. In 2.5 most of this should be moved to a generic directory. + * environment. In 2.5 most of this should be moved to a generic directory. * * This file assumes that there is a hole at the end of user address space. - * - * Some of the functions are LE specific currently. These are hopefully all marked. - * This should be fixed. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. */ #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/fs.h> -#include <linux/file.h> +#include <linux/fs.h> +#include <linux/file.h> #include <linux/signal.h> #include <linux/syscalls.h> #include <linux/resource.h> @@ -90,43 +90,44 @@ int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) return -EOVERFLOW; if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user (ino, &ubuf->st_ino) || - __put_user (kbuf->mode, &ubuf->st_mode) || - __put_user (kbuf->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user (kbuf->size, &ubuf->st_size) || - __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (kbuf->blksize, &ubuf->st_blksize) || - __put_user (kbuf->blocks, &ubuf->st_blocks)) + __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || + __put_user(ino, &ubuf->st_ino) || + __put_user(kbuf->mode, &ubuf->st_mode) || + __put_user(kbuf->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || + __put_user(kbuf->size, &ubuf->st_size) || + __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || + __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(kbuf->blksize, &ubuf->st_blksize) || + __put_user(kbuf->blocks, &ubuf->st_blocks)) return -EFAULT; return 0; } -asmlinkage long -sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) +asmlinkage long sys32_truncate64(char __user *filename, + unsigned long offset_low, + unsigned long offset_high) { return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); } -asmlinkage long -sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) +asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low, + unsigned long offset_high) { return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); } -/* Another set for IA32/LFS -- x86_64 struct stat is different due to - support for 64bit inode numbers. */ - -static int -cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. + */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) { typeof(ubuf->st_uid) uid = 0; typeof(ubuf->st_gid) gid = 0; @@ -134,38 +135,39 @@ cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) SET_GID(gid, stat->gid); if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || - __put_user (stat->ino, &ubuf->__st_ino) || - __put_user (stat->ino, &ubuf->st_ino) || - __put_user (stat->mode, &ubuf->st_mode) || - __put_user (stat->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || - __put_user (stat->size, &ubuf->st_size) || - __put_user (stat->atime.tv_sec, &ubuf->st_atime) || - __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (stat->blksize, &ubuf->st_blksize) || - __put_user (stat->blocks, &ubuf->st_blocks)) + __put_user(stat->ino, &ubuf->__st_ino) || + __put_user(stat->ino, &ubuf->st_ino) || + __put_user(stat->mode, &ubuf->st_mode) || + __put_user(stat->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || + __put_user(stat->size, &ubuf->st_size) || + __put_user(stat->atime.tv_sec, &ubuf->st_atime) || + __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(stat->blksize, &ubuf->st_blksize) || + __put_user(stat->blocks, &ubuf->st_blocks)) return -EFAULT; return 0; } -asmlinkage long -sys32_stat64(char __user * filename, struct stat64 __user *statbuf) +asmlinkage long sys32_stat64(char __user *filename, + struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_stat(filename, &stat); + if (!ret) ret = cp_stat64(statbuf, &stat); return ret; } -asmlinkage long -sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) +asmlinkage long sys32_lstat64(char __user *filename, + struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_lstat(filename, &stat); @@ -174,8 +176,7 @@ sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) return ret; } -asmlinkage long -sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_fstat(fd, &stat); @@ -184,9 +185,8 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long -sys32_fstatat(unsigned int dfd, char __user *filename, - struct stat64 __user* statbuf, int flag) +asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) { struct kstat stat; int error = -EINVAL; @@ -221,8 +221,7 @@ struct mmap_arg_struct { unsigned int offset; }; -asmlinkage long -sys32_mmap(struct mmap_arg_struct __user *arg) +asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; struct file *file = NULL; @@ -233,33 +232,33 @@ sys32_mmap(struct mmap_arg_struct __user *arg) return -EFAULT; if (a.offset & ~PAGE_MASK) - return -EINVAL; + return -EINVAL; if (!(a.flags & MAP_ANONYMOUS)) { file = fget(a.fd); if (!file) return -EBADF; } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); + + mm = current->mm; + down_write(&mm->mmap_sem); + retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + a.offset>>PAGE_SHIFT); if (file) fput(file); - up_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); return retval; } -asmlinkage long -sys32_mprotect(unsigned long start, size_t len, unsigned long prot) +asmlinkage long sys32_mprotect(unsigned long start, size_t len, + unsigned long prot) { - return sys_mprotect(start,len,prot); + return sys_mprotect(start, len, prot); } -asmlinkage long -sys32_pipe(int __user *fd) +asmlinkage long sys32_pipe(int __user *fd) { int retval; int fds[2]; @@ -269,13 +268,13 @@ sys32_pipe(int __user *fd) goto out; if (copy_to_user(fd, fds, sizeof(fds))) retval = -EFAULT; - out: +out: return retval; } -asmlinkage long -sys32_rt_sigaction(int sig, struct sigaction32 __user *act, - struct sigaction32 __user *oact, unsigned int sigsetsize) +asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, + struct sigaction32 __user *oact, + unsigned int sigsetsize) { struct k_sigaction new_ka, old_ka; int ret; @@ -291,12 +290,17 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, if (!access_ok(VERIFY_READ, act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer)|| - __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) + __get_user(restorer, &act->sa_restorer) || + __copy_from_user(&set32, &act->sa_mask, + sizeof(compat_sigset_t))) return -EFAULT; new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + + /* + * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= + * than _NSIG_WORDS << 1 + */ switch (_NSIG_WORDS) { case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] | (((long)set32.sig[7]) << 32); @@ -312,7 +316,10 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + /* + * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= + * than _NSIG_WORDS << 1 + */ switch (_NSIG_WORDS) { case 4: set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); @@ -328,23 +335,26 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, set32.sig[0] = old_ka.sa.sa_mask.sig[0]; } if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) + __copy_to_user(&oact->sa_mask, &set32, + sizeof(compat_sigset_t))) return -EFAULT; } return ret; } -asmlinkage long -sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) +asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, + struct old_sigaction32 __user *oact) { - struct k_sigaction new_ka, old_ka; - int ret; + struct k_sigaction new_ka, old_ka; + int ret; - if (act) { + if (act) { compat_old_sigset_t mask; compat_uptr_t handler, restorer; @@ -359,33 +369,35 @@ sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigacti new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); - } + } - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; - } + } return ret; } -asmlinkage long -sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, unsigned int sigsetsize) +asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, + compat_sigset_t __user *oset, + unsigned int sigsetsize) { sigset_t s; compat_sigset_t s32; int ret; mm_segment_t old_fs = get_fs(); - + if (set) { - if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) + if (copy_from_user(&s32, set, sizeof(compat_sigset_t))) return -EFAULT; switch (_NSIG_WORDS) { case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); @@ -394,13 +406,14 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); } } - set_fs (KERNEL_DS); + set_fs(KERNEL_DS); ret = sys_rt_sigprocmask(how, set ? (sigset_t __user *)&s : NULL, oset ? (sigset_t __user *)&s : NULL, - sigsetsize); - set_fs (old_fs); - if (ret) return ret; + sigsetsize); + set_fs(old_fs); + if (ret) + return ret; if (oset) { switch (_NSIG_WORDS) { case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; @@ -408,52 +421,49 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; } - if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) + if (copy_to_user(oset, &s32, sizeof(compat_sigset_t))) return -EFAULT; } return 0; } -static inline long -get_tv32(struct timeval *o, struct compat_timeval __user *i) +static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) { - int err = -EFAULT; - if (access_ok(VERIFY_READ, i, sizeof(*i))) { + int err = -EFAULT; + + if (access_ok(VERIFY_READ, i, sizeof(*i))) { err = __get_user(o->tv_sec, &i->tv_sec); err |= __get_user(o->tv_usec, &i->tv_usec); } - return err; + return err; } -static inline long -put_tv32(struct compat_timeval __user *o, struct timeval *i) +static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) { int err = -EFAULT; - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { + + if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { err = __put_user(i->tv_sec, &o->tv_sec); err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; + } + return err; } -extern unsigned int alarm_setitimer(unsigned int seconds); - -asmlinkage long -sys32_alarm(unsigned int seconds) +asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long -sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +/* + * Translations due to time_t size differences. Which affects all + * sorts of things, like timeval and itimerval. + */ +asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) { if (tv) { struct timeval ktv; + do_gettimeofday(&ktv); if (put_tv32(tv, &ktv)) return -EFAULT; @@ -465,14 +475,14 @@ sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) return 0; } -asmlinkage long -sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) { struct timeval ktv; struct timespec kts; struct timezone ktz; - if (tv) { + if (tv) { if (get_tv32(&ktv, tv)) return -EFAULT; kts.tv_sec = ktv.tv_sec; @@ -494,8 +504,7 @@ struct sel_arg_struct { unsigned int tvp; }; -asmlinkage long -sys32_old_select(struct sel_arg_struct __user *arg) +asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg) { struct sel_arg_struct a; @@ -505,50 +514,45 @@ sys32_old_select(struct sel_arg_struct __user *arg) compat_ptr(a.exp), compat_ptr(a.tvp)); } -extern asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, - struct compat_rusage *ru); - -asmlinkage long -sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, + int options) { return compat_sys_wait4(pid, stat_addr, options, NULL); } /* 32-bit timeval and related flotsam. */ -asmlinkage long -sys32_sysfs(int option, u32 arg1, u32 arg2) +asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2) { return sys_sysfs(option, arg1, arg2); } -asmlinkage long -sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) +asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) { struct timespec t; int ret; - mm_segment_t old_fs = get_fs (); - - set_fs (KERNEL_DS); + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs (old_fs); + set_fs(old_fs); if (put_compat_timespec(&t, interval)) return -EFAULT; return ret; } -asmlinkage long -sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) +asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set, + compat_size_t sigsetsize) { sigset_t s; compat_sigset_t s32; int ret; mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); + + set_fs(KERNEL_DS); ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); - set_fs (old_fs); + set_fs(old_fs); if (!ret) { switch (_NSIG_WORDS) { case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; @@ -556,30 +560,29 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; } - if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) + if (copy_to_user(set, &s32, sizeof(compat_sigset_t))) return -EFAULT; } return ret; } -asmlinkage long -sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) +asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, + compat_siginfo_t __user *uinfo) { siginfo_t info; int ret; mm_segment_t old_fs = get_fs(); - + if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; - set_fs (KERNEL_DS); + set_fs(KERNEL_DS); ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); - set_fs (old_fs); + set_fs(old_fs); return ret; } /* These are here just in case some old ia32 binary calls it. */ -asmlinkage long -sys32_pause(void) +asmlinkage long sys32_pause(void) { current->state = TASK_INTERRUPTIBLE; schedule(); @@ -599,25 +602,25 @@ struct sysctl_ia32 { }; -asmlinkage long -sys32_sysctl(struct sysctl_ia32 __user *args32) +asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32) { struct sysctl_ia32 a32; - mm_segment_t old_fs = get_fs (); + mm_segment_t old_fs = get_fs(); void __user *oldvalp, *newvalp; size_t oldlen; int __user *namep; long ret; - if (copy_from_user(&a32, args32, sizeof (a32))) + if (copy_from_user(&a32, args32, sizeof(a32))) return -EFAULT; /* - * We need to pre-validate these because we have to disable address checking - * before calling do_sysctl() because of OLDLEN but we can't run the risk of the - * user specifying bad addresses here. Well, since we're dealing with 32 bit - * addresses, we KNOW that access_ok() will always succeed, so this is an - * expensive NOP, but so what... + * We need to pre-validate these because we have to disable + * address checking before calling do_sysctl() because of + * OLDLEN but we can't run the risk of the user specifying bad + * addresses here. Well, since we're dealing with 32 bit + * addresses, we KNOW that access_ok() will always succeed, so + * this is an expensive NOP, but so what... */ namep = compat_ptr(a32.name); oldvalp = compat_ptr(a32.oldval); @@ -636,34 +639,34 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) unlock_kernel(); set_fs(old_fs); - if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) + if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) return -EFAULT; return ret; } #endif -/* warning: next two assume little endian */ -asmlinkage long -sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +/* warning: next two assume little endian */ +asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, + u32 poslo, u32 poshi) { return sys_pread64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long -sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, + u32 poslo, u32 poshi) { return sys_pwrite64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long -sys32_personality(unsigned long personality) +asmlinkage long sys32_personality(unsigned long personality) { int ret; - if (personality(current->personality) == PER_LINUX32 && + + if (personality(current->personality) == PER_LINUX32 && personality == PER_LINUX) personality = PER_LINUX32; ret = sys_personality(personality); @@ -672,34 +675,33 @@ sys32_personality(unsigned long personality) return ret; } -asmlinkage long -sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) +asmlinkage long sys32_sendfile(int out_fd, int in_fd, + compat_off_t __user *offset, s32 count) { mm_segment_t old_fs = get_fs(); int ret; off_t of; - + if (offset && get_user(of, offset)) return -EFAULT; - + set_fs(KERNEL_DS); ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, count); set_fs(old_fs); - + if (offset && put_user(of, offset)) return -EFAULT; - return ret; } asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct mm_struct *mm = current->mm; unsigned long error; - struct file * file = NULL; + struct file *file = NULL; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { @@ -717,36 +719,35 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, return error; } -asmlinkage long sys32_olduname(struct oldold_utsname __user * name) +asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { + char *arch = "x86_64"; int err; if (!name) return -EFAULT; if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname,&utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0,name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename,&utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0,name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release,&utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0,name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version,&utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0,name->version+__OLD_UTS_LEN); - { - char *arch = "x86_64"; - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch)+1); - } + + down_read(&uts_sem); + + err = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + err |= __put_user(0, name->sysname+__OLD_UTS_LEN); + err |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + err |= __put_user(0, name->nodename+__OLD_UTS_LEN); + err |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + err |= __put_user(0, name->release+__OLD_UTS_LEN); + err |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + err |= __put_user(0, name->version+__OLD_UTS_LEN); + + if (personality(current->personality) == PER_LINUX32) + arch = "i686"; + + err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); up_read(&uts_sem); @@ -755,17 +756,19 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) return err; } -long sys32_uname(struct old_utsname __user * name) +long sys32_uname(struct old_utsname __user *name) { int err; + if (!name) return -EFAULT; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) + if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); - return err?-EFAULT:0; + + return err ? -EFAULT : 0; } long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) @@ -773,27 +776,28 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) struct ustat u; mm_segment_t seg; int ret; - - seg = get_fs(); - set_fs(KERNEL_DS); + + seg = get_fs(); + set_fs(KERNEL_DS); ret = sys_ustat(dev, (struct ustat __user *)&u); set_fs(seg); - if (ret >= 0) { - if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || - __put_user((__u32) u.f_tfree, &u32p->f_tfree) || - __put_user((__u32) u.f_tinode, &u32p->f_tfree) || - __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || - __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) - ret = -EFAULT; - } + if (ret < 0) + return ret; + + if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) || + __put_user((__u32) u.f_tfree, &u32p->f_tfree) || + __put_user((__u32) u.f_tinode, &u32p->f_tfree) || + __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || + __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) + ret = -EFAULT; return ret; -} +} asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -812,18 +816,19 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, struct pt_regs *regs) { - void __user *parent_tid = (void __user *)regs->rdx; - void __user *child_tid = (void __user *)regs->rdi; + void __user *parent_tid = (void __user *)regs->dx; + void __user *child_tid = (void __user *)regs->di; + if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } /* - * Some system calls that need sign extended arguments. This could be done by a generic wrapper. - */ - -long sys32_lseek (unsigned int fd, int offset, unsigned int whence) + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +long sys32_lseek(unsigned int fd, int offset, unsigned int whence) { return sys_lseek(fd, offset, whence); } @@ -832,49 +837,52 @@ long sys32_kill(int pid, int sig) { return sys_kill(pid, sig); } - -long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, + +long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, __u32 len_low, __u32 len_high, int advice) -{ +{ return sys_fadvise64_64(fd, (((u64)offset_high)<<32) | offset_low, (((u64)len_high)<<32) | len_low, - advice); -} + advice); +} long sys32_vm86_warning(void) -{ +{ struct task_struct *me = current; static char lastcomm[sizeof(me->comm)]; + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); + compat_printk(KERN_INFO + "%s: vm86 mode not supported on 64 bit kernel\n", + me->comm); strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } + } return -ENOSYS; -} +} long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, - char __user * buf, size_t len) + char __user *buf, size_t len) { return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); } -asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) +asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, + size_t count) { return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, - unsigned n_low, unsigned n_hi, int flags) + unsigned n_low, unsigned n_hi, int flags) { return sys_sync_file_range(fd, ((u64)off_hi << 32) | off_low, ((u64)n_hi << 32) | n_low, flags); } -asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len, - int advice) +asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, + size_t len, int advice) { return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, len, advice); diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c deleted file mode 100644 index 15013bac181..00000000000 --- a/arch/x86/ia32/syscall32.c +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs */ - -/* vsyscall handling for 32bit processes. Map a stub page into it - on demand because 32bit cannot reach the kernel's fixmaps */ - -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/stringify.h> -#include <linux/security.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> -#include <asm/ia32_unistd.h> -#include <asm/vsyscall32.h> - -extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; -extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; -extern int sysctl_vsyscall32; - -static struct page *syscall32_pages[1]; -static int use_sysenter = -1; - -struct linux_binprm; - -/* Setup a VMA at program startup for the vsyscall page */ -int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) -{ - struct mm_struct *mm = current->mm; - int ret; - - down_write(&mm->mmap_sem); - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - /* Could randomize here */ - ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall32_pages); - up_write(&mm->mmap_sem); - return ret; -} - -static int __init init_syscall32(void) -{ - char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!syscall32_page) - panic("Cannot allocate syscall32 page"); - syscall32_pages[0] = virt_to_page(syscall32_page); - if (use_sysenter > 0) { - memcpy(syscall32_page, syscall32_sysenter, - syscall32_sysenter_end - syscall32_sysenter); - } else { - memcpy(syscall32_page, syscall32_syscall, - syscall32_syscall_end - syscall32_syscall); - } - return 0; -} - -__initcall(init_syscall32); - -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ - if (use_sysenter < 0) - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); - - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S deleted file mode 100644 index 933f0f08b1c..00000000000 --- a/arch/x86/ia32/syscall32_syscall.S +++ /dev/null @@ -1,17 +0,0 @@ -/* 32bit VDSOs mapped into user space. */ - - .section ".init.data","aw" - - .globl syscall32_syscall - .globl syscall32_syscall_end - -syscall32_syscall: - .incbin "arch/x86/ia32/vsyscall-syscall.so" -syscall32_syscall_end: - - .globl syscall32_sysenter - .globl syscall32_sysenter_end - -syscall32_sysenter: - .incbin "arch/x86/ia32/vsyscall-sysenter.so" -syscall32_sysenter_end: diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c deleted file mode 100644 index 1cc4340de3c..00000000000 --- a/arch/x86/ia32/tls32.c +++ /dev/null @@ -1,163 +0,0 @@ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/user.h> - -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/processor.h> -#include <asm/proto.h> - -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - * When you want addresses > 32bit use arch_prctl() - */ -int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - if (t == ¤t->thread) - load_TLS(t, cpu); - - put_cpu(); - return 0; -} - -asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) -{ - return do_set_thread_area(¤t->thread, u_info); -} - - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) -#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1) - -int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - memset(&info, 0, sizeof(struct user_desc)); - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - info.lm = GET_LONGMODE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) -{ - return do_get_thread_area(¤t->thread, u_info); -} - - -int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) -{ - struct n_desc_struct *desc; - struct user_desc info; - struct user_desc __user *cp; - int idx; - - cp = (void __user *)childregs->rsi; - if (copy_from_user(&info, cp, sizeof(info))) - return -EFAULT; - if (LDT_empty(&info)) - return -EINVAL; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - - return 0; -} diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S deleted file mode 100644 index b383be00bae..00000000000 --- a/arch/x86/ia32/vsyscall-sigreturn.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Common code for the sigreturn entry points on the vsyscall page. - * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) - * to enter the kernel. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The addresses we get for these entry points - * by doing ".balign 32" must match in both versions of the page. - */ - - .code32 - .section .text.sigreturn,"ax" - .balign 32 - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax - movl $__NR_ia32_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_sigreturn: - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .section .text.rtsigreturn,"ax" - .balign 32 - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_ia32_rt_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_rt_sigreturn: - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - - .section .eh_frame,"a",@progbits -.LSTARTFRAMES: - .long .LENDCIES-.LSTARTCIES -.LSTARTCIES: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIES: - - .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ -.LSTARTFDE2: - .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_esp+4) - do_expr(0, IA32_SIGCONTEXT_eax+4) - do_expr(1, IA32_SIGCONTEXT_ecx+4) - do_expr(2, IA32_SIGCONTEXT_edx+4) - do_expr(3, IA32_SIGCONTEXT_ebx+4) - do_expr(5, IA32_SIGCONTEXT_ebp+4) - do_expr(6, IA32_SIGCONTEXT_esi+4) - do_expr(7, IA32_SIGCONTEXT_edi+4) - do_expr(8, IA32_SIGCONTEXT_eip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_esp) - do_expr(0, IA32_SIGCONTEXT_eax) - do_expr(1, IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_SIGCONTEXT_edx) - do_expr(3, IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_SIGCONTEXT_esi) - do_expr(7, IA32_SIGCONTEXT_edi) - do_expr(8, IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE2: - - .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ -.LSTARTFDE3: - .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE3: - -#include "../../x86/kernel/vsyscall-note_32.S" - diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S deleted file mode 100644 index ae056e553d1..00000000000 --- a/arch/x86/ia32/vsyscall-sysenter.S +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the sysenter instruction. - */ - -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> - - .code32 - .text - .section .text.vsyscall,"ax" - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - .space 7,0x90 - jmp .Lenter_kernel - /* 16: System call normal return point is here! */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x12 /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDE1: - -#define SYSCALL_ENTER_KERNEL int $0x80 -#include "vsyscall-sigreturn.S" diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds deleted file mode 100644 index 1dc86ff5bcb..00000000000 --- a/arch/x86/ia32/vsyscall.lds +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address. This script controls its layout. - */ - -/* This must match <asm/fixmap.h>. */ -VSYSCALL_BASE = 0xffffe000; - -SECTIONS -{ - . = VSYSCALL_BASE + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; - - .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 - - /* This is an 32bit object and we cannot easily get the offsets - into the 64bit kernel. Just hardcode them here. This assumes - that all the stubs don't need more than 0x100 bytes. */ - . = VSYSCALL_BASE + 0x500; - - .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 - - . = VSYSCALL_BASE + 0x600; - - .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 38573340b14..6f813009d44 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -1,9 +1,91 @@ -ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/kernel/Makefile_32 -else -include ${srctree}/arch/x86/kernel/Makefile_64 +# +# Makefile for the linux kernel. +# + +extra-y := head_$(BITS).o init_task.o vmlinux.lds +extra-$(CONFIG_X86_64) += head64.o + +CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +CFLAGS_vsyscall_64.o := $(PROFILING) -g0 + +obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o +obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += setup_$(BITS).o i8259_$(BITS).o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o +obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o +obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o +obj-y += quirks.o i8237.o topology.o kdebugfs.o +obj-y += alternative.o i8253.o +obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o +obj-y += tsc_$(BITS).o io_delay.o rtc.o + +obj-y += i387.o +obj-y += ptrace.o +obj-y += ds.o +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_IA32_EMULATION) += tls.o +obj-y += step.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += cpu/ +obj-y += acpi/ +obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o +obj-$(CONFIG_X86_64) += reboot.o +obj-$(CONFIG_MCA) += mca_32.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_PCI) += early-quirks.o +obj-$(CONFIG_APM) += apm_32.o +obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o +obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o +obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o +obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o +obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o +obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o +obj-$(CONFIG_X86_VSMP) += vsmp_64.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_ACPI_SRAT) += srat_32.o +obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +obj-$(CONFIG_VM86) += vm86_32.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +obj-$(CONFIG_HPET_TIMER) += hpet.o + +obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o +obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o +obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o + +obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o + +ifdef CONFIG_INPUT_PCSPKR +obj-y += pcspeaker.o endif -# Workaround to delete .lds files with make clean -# The problem is that we do not enter Makefile_32 with make clean. -clean-files := vsyscall*.lds vsyscall*.so +obj-$(CONFIG_SCx200) += scx200_32.o + +### +# 64 bit specific files +ifeq ($(CONFIG_X86_64),y) + obj-y += genapic_64.o genapic_flat_64.o + obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o + obj-$(CONFIG_AUDIT) += audit_64.o + obj-$(CONFIG_PM) += suspend_64.o + obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o + + obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o + obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o +endif diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 deleted file mode 100644 index a7bc93c2766..00000000000 --- a/arch/x86/kernel/Makefile_32 +++ /dev/null @@ -1,88 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head_32.o init_task.o vmlinux.lds -CPPFLAGS_vmlinux.lds += -Ui386 - -obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ - ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ - pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ - quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-y += cpu/ -obj-y += acpi/ -obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o -obj-$(CONFIG_MCA) += mca_32.o -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o -obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o -obj-$(CONFIG_SMP) += smpcommon_32.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o -obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o -obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o -obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o -obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o -obj-$(CONFIG_KPROBES) += kprobes_32.o -obj-$(CONFIG_MODULES) += module_32.o -obj-y += sysenter_32.o vsyscall_32.o -obj-$(CONFIG_ACPI_SRAT) += srat_32.o -obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -obj-$(CONFIG_VM86) += vm86_32.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o - -obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o -obj-$(CONFIG_PARAVIRT) += paravirt_32.o -obj-y += pcspeaker.o - -obj-$(CONFIG_SCx200) += scx200_32.o - -# vsyscall_32.o contains the vsyscall DSO images as __initdata. -# We must build both images before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so -targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so) -targets += vsyscall-note_32.o vsyscall_32.lds - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ - -export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386 - -vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags) -SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags) - -$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vsyscall-syms.o -$(obj)/built-in.o: $(obj)/vsyscall-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o - -SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - - diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 deleted file mode 100644 index 5a88890d8ee..00000000000 --- a/arch/x86/kernel/Makefile_64 +++ /dev/null @@ -1,45 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head_64.o head64.o init_task.o vmlinux.lds -CPPFLAGS_vmlinux.lds += -Ux86_64 -EXTRA_AFLAGS := -traditional - -obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ - ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ - x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ - setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ - pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ - i8253.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-y += cpu/ -obj-y += acpi/ -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o -obj-y += apic_64.o nmi_64.o -obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o -obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o -obj-$(CONFIG_PM) += suspend_64.o -obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o -obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o -obj-$(CONFIG_KPROBES) += kprobes_64.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o -obj-$(CONFIG_X86_VSMP) += vsmp_64.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_AUDIT) += audit_64.o - -obj-$(CONFIG_MODULES) += module_64.o -obj-$(CONFIG_PCI) += early-quirks.o - -obj-y += topology.o -obj-y += pcspeaker.o - -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 1351c3982ee..19d3d6e9d09 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 00000000000..6bc815cd8cb --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,87 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> + +#include <asm/smp.h> + +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_realmode_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem(void) +{ + if (!acpi_wakeup_address) { + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + return -ENOMEM; + } + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. + */ +void __init acpi_reserve_bootmem(void) +{ + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +} + + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c index 10699489cfe..63fe5525e02 100644 --- a/arch/x86/kernel/acpi/sleep_32.c +++ b/arch/x86/kernel/acpi/sleep_32.c @@ -12,76 +12,6 @@ #include <asm/smp.h> -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - if (!acpi_wakeup_address) - return 1; - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. - */ -void __init acpi_reserve_bootmem(void) -{ - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if (!acpi_wakeup_address) - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - /* Ouch, we want to delete this. We already have better version in userspace, in s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c deleted file mode 100644 index da42de261ba..00000000000 --- a/arch/x86/kernel/acpi/sleep_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/acpi.h> -#include <linux/cpumask.h> - -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 1e931aaf2ef..f53e3277f8e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,4 +1,4 @@ -.text + .section .text.page_aligned #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5ed3bc5c61d..2e1b9e0d076 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -344,13 +344,13 @@ do_suspend_lowlevel: call save_processor_state movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -360,7 +360,7 @@ do_suspend_lowlevel: movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) movq $.L97, saved_rip(%rip) @@ -391,15 +391,15 @@ do_suspend_lowlevel: movq %rbx, %cr2 movq saved_context_cr0(%rax), %rbx movq %rbx, %cr0 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d6405e0842b..45d79ea890a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -273,6 +273,7 @@ struct smp_alt_module { }; static LIST_HEAD(smp_alt_modules); static DEFINE_SPINLOCK(smp_alt); +static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp) #ifdef CONFIG_LOCKDEP /* - * A not yet fixed binutils section handling bug prevents - * alternatives-replacement from working reliably, so turn - * it off: + * Older binutils section handling bug prevented + * alternatives-replacement from working reliably. + * + * If this still occurs then you should see a hang + * or crash shortly after this line: */ - printk("lockdep: not fixing up alternatives.\n"); - return; + printk("lockdep: fixing up alternatives.\n"); #endif if (noreplace_smp || smp_alt_once) @@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp) BUG_ON(!smp && (num_online_cpus() > 1)); spin_lock_irqsave(&smp_alt, flags); - if (smp) { + + /* + * Avoid unnecessary switches because it forces JIT based VMs to + * throw away all cached translations, which can be quite costly. + */ + if (smp == smp_mode) { + /* nothing */ + } else if (smp) { printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); - clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); } + smp_mode = smp; spin_unlock_irqrestore(&smp_alt, flags); } @@ -431,8 +441,9 @@ void __init alternative_instructions(void) if (smp_alt_once) { if (1 == num_possible_cpus()) { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } @@ -440,7 +451,10 @@ void __init alternative_instructions(void) alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); - alternatives_smp_switch(0); + + /* Only switch to UP mode if we don't immediately boot others */ + if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + alternatives_smp_switch(0); } #endif apply_paravirt(__parainstructions, __parainstructions_end); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5b6992799c9..608152a2a05 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,12 +1,12 @@ -/* +/* * Firmware replacement code. - * + * * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * * Copyright 2002 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0; int gart_iommu_aperture_allowed __initdata = 0; int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; +int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; @@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -static u32 __init allocate_aperture(void) +static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chance of finding a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); + printk(KERN_ERR + "Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); if (p) free_bootmem(__pa(p), aper_size); return 0; } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); + printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); insert_aperture_resource((u32)__pa(p), aper_size); - return (u32)__pa(p); + + return (u32)__pa(p); } static int __init aperture_valid(u64 aper_base, u32 aper_size) -{ - if (!aper_base) - return 0; - if (aper_size < 64*1024*1024) { - printk("Aperture too small (%d MB)\n", aper_size>>20); +{ + if (!aper_base) return 0; - } + if (aper_base + aper_size > 0x100000000UL) { - printk("Aperture beyond 4GB. Ignoring.\n"); - return 0; + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); + return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } + printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + if (aper_size < 64*1024*1024) { + printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); + return 0; + } + return 1; -} +} /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + u8 pos; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } + if (id == cap) + return pos; + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } return 0; -} +} /* Read a standard AGPv3 bridge header */ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ +{ u32 apsize; u32 apsizereg; int nbits; u32 aper_low, aper_hi; u64 aper; - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); + printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; } apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; + if (apsize & 0xff) + apsize |= 0xf00; nbits = hweight16(apsize); *order = 7 - nbits; if ((int)*order < 0) /* < 32MB */ *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); + + aper_low = read_pci_config(num, slot, func, 0x10); + aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. - - Do an PCI bus scan by hand because we're running before the PCI - subsystem. + return 0; + return (u32)aper; +} - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int num, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num,slot,func, + class = read_pci_config(num, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) - break; - - switch (class >> 16) { + break; + + switch (class >> 16) { case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + cap = find_cap(num, slot, func, + PCI_CAP_ID_AGP); if (!cap) break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - + *valid_agp = 1; + return read_agp(num, slot, func, cap, + order); + } + /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, + type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; - } - } + } + } } - printk("No AGP bridge found\n"); + printk(KERN_INFO "No AGP bridge found\n"); + return 0; } +static int gart_fix_e820 __initdata = 1; + +static int __init parse_gart_mem(char *p) +{ + if (!p) + return -EINVAL; + + if (!strncmp(p, "off", 3)) + gart_fix_e820 = 0; + else if (!strncmp(p, "on", 2)) + gart_fix_e820 = 1; + + return 0; +} +early_param("gart_fix_e820", parse_gart_mem); + +void __init early_gart_iommu_check(void) +{ + /* + * in case it is enabled before, esp for kexec/kdump, + * previous kernel already enable that. memset called + * by allocate_aperture/__alloc_bootmem_nopanic cause restart. + * or second kernel have different position for GART hole. and new + * kernel could use hole as RAM that is still used by GART set by + * first kernel + * or BIOS forget to put that in reserved. + * try to update e820 to make that region as reserved. + */ + int fix, num; + u32 ctl; + u32 aper_size = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base = 0, last_aper_base = 0; + int aper_enabled = 0, last_aper_enabled = 0; + + if (!early_pci_allowed()) + return; + + fix = 0; + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + aper_enabled = ctl & 1; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base) || + (last_aper_enabled && aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; + } + + if (!fix && !aper_enabled) + return; + + if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL) + fix = 1; + + if (gart_fix_e820 && !fix && aper_enabled) { + if (e820_any_mapped(aper_base, aper_base + aper_size, + E820_RAM)) { + /* reserved it, so we can resuse it in second kernel */ + printk(KERN_INFO "update e820 for GART\n"); + add_memory_region(aper_base, aper_size, E820_RESERVED); + update_e820(); + } + return; + } + + /* different nodes have different setting, disable them all at first*/ + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + ctl &= ~1; + write_pci_config(0, num, 3, 0x90, ctl); + } + +} + void __init gart_iommu_hole_init(void) -{ - int fix, num; +{ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int valid_agp = 0; + int fix, num, valid_agp = 0; + int node; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); fix = 0; - for (num = 24; num < 32; num++) { + node = 0; + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; iommu_detected = 1; gart_iommu_aperture = 1; - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; + aper_base <<= 25; + + printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); + node++; - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + fix = 1; + break; } if ((last_aper_order && aper_order != last_aper_order) || @@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void) } last_aper_order = aper_order; last_aper_base = aper_base; - } + } if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); } - return; + return; } if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { /* Got the aperture from the AGP bridge */ } else if (swiotlb && !valid_agp) { /* Do nothing */ } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || - fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + fallback_aper_force) { + printk(KERN_ERR + "Your BIOS doesn't leave a aperture memory hole\n"); + printk(KERN_ERR + "Please enable the IOMMU option in the BIOS setup\n"); + printk(KERN_ERR + "This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ panic("Not enough memory for aperture"); } - } else { - return; - } + } else { + return; + } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} + continue; + + /* + * Don't enable translation yet. That is done later. + * Assume this BIOS didn't initialise the GART so + * just overwrite all previous bits + */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a56c782653b..35a568ea840 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -43,12 +43,10 @@ #include <mach_apicdef.h> #include <mach_ipi.h> -#include "io_ports.h" - /* * Sanity check */ -#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) # error SPURIOUS_APIC_VECTOR definition error #endif @@ -57,7 +55,7 @@ * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; +static int enable_local_apic __initdata; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; @@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +static unsigned long apic_phys; + /* * Get the LAPIC version */ @@ -110,7 +110,7 @@ static inline int lapic_get_version(void) } /* - * Check, if the APIC is integrated or a seperate chip + * Check, if the APIC is integrated or a separate chip */ static inline int lapic_is_integrated(void) { @@ -135,9 +135,9 @@ void apic_wait_icr_idle(void) cpu_relax(); } -unsigned long safe_apic_wait_icr_idle(void) +u32 safe_apic_wait_icr_idle(void) { - unsigned long send_status; + u32 send_status; int timeout; timeout = 0; @@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void) /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ -void enable_NMI_through_LVT0 (void * dummy) +void __cpuinit enable_NMI_through_LVT0(void) { unsigned int v = APIC_DM_NMI; @@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void) */ if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; setup_APIC_timer(); + } return; } @@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void) "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ - res = (((u64) delta ) * pm_100ms); + res = (((u64) delta) * pm_100ms); do_div(res, deltapm); printk(KERN_INFO "APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long) res, delta); @@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void) local_apic_timer_verify_ok = 1; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void) return; } + /* + * the NMI deadlock-detector uses this. + */ per_cpu(irq_stat, cpu).apic_timer_irqs++; evt->event_handler(evt); @@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ - -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier) */ void clear_local_APIC(void) { - int maxlvt = lapic_get_maxlvt(); - unsigned long v; + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void) value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (integrated && !esr_disable) { /* !82489DX */ + if (integrated && !esr_disable) { + /* !82489DX */ maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void) /* * Detect and initialize APIC */ -static int __init detect_init_APIC (void) +static int __init detect_init_APIC(void) { u32 h, l, features; @@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void) printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } - set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ @@ -1104,8 +1127,6 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned long apic_phys; - /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1164,10 +1185,10 @@ fake_ioapic_page: * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int __init APIC_init_uniprocessor (void) +int __init APIC_init_uniprocessor(void) { if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); if (!smp_found_config && !cpu_has_apic) return -1; @@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void) } /* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); - -static int __init parse_nolapic(char *arg) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -early_param("nolapic", parse_nolapic); - -static int __init parse_disable_lapic_timer(char *arg) -{ - local_apic_timer_disabled = 1; - return 0; -} -early_param("nolapic_timer", parse_disable_lapic_timer); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - - -/* * Local APIC interrupts */ @@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value = apic_read(APIC_LVT0); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write_around(APIC_LVT0, value); @@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs); static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return 0; +} +early_param("nolapic", parse_nolapic); + +static int __init parse_disable_lapic_timer(char *arg) +{ + local_apic_timer_disabled = 1; + return 0; +} +early_param("nolapic_timer", parse_disable_lapic_timer); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; +} +__setup("apic=", apic_set_verbosity); + diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index fa6cdee6d30..d8d03e09dea 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -23,32 +23,37 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> -#include <linux/module.h> #include <linux/ioport.h> #include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> +#include <linux/module.h> #include <asm/atomic.h> #include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> +#include <asm/hpet.h> #include <asm/pgalloc.h> #include <asm/mach_apic.h> #include <asm/nmi.h> #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> -#include <asm/hpet.h> #include <asm/apic.h> -int apic_verbosity; int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; +int disable_apic; -/* Local APIC timer works in C2? */ +/* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -static struct resource *ioapic_resources; +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity; + static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, @@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); - static void lapic_timer_broadcast(cpumask_t mask); - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen); +static void apic_pm_activate(void); static struct clock_event_device lapic_clockevent = { .name = "lapic", @@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return 1; +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 safe_apic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void __cpuinit enable_NMI_through_LVT0(void) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + apic_write(APIC_LVT0, v); +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v, maxlvt; + + v = apic_read(APIC_LVR); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks); +} + +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} + +/* + * Program the next event, relative to now + */ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta, return 0; } +/* + * Setup the lapic timer in periodic or oneshot mode + */ static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt) { @@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask) #endif } -static void apic_pm_activate(void); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); -void apic_wait_icr_idle(void) + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static void __init calibrate_APIC_clock(void) { - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = result / HZ; } -unsigned int safe_apic_wait_icr_idle(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - unsigned int send_status; - int timeout; + /* + * The local apic timer can be disabled via the kernel commandline. + * Register the lapic timer as a dummy clock event source on SMP + * systems, so the broadcast mechanism is used. On UP systems simply + * ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + calibrate_APIC_clock(); - return send_status; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); + + setup_APIC_timer(); } -void enable_NMI_through_LVT0 (void * dummy) +/* + * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the + * C1E flag only in the secondary CPU, so when we detect the wreckage + * we already have enabled the boot CPU local apic timer. Check, if + * disable_apic_timer is set and the DUMMY flag is cleared. If yes, + * set the DUMMY flag again and force the broadcast mode in the + * clockevents layer. + */ +void __cpuinit check_boot_apic_timer_broadcast(void) { - unsigned int v; + if (!disable_apic_timer || + (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) + return; - /* unmask and set to NMI */ - v = APIC_DM_NMI; - apic_write(APIC_LVT0, v); + printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); + lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; + + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); + local_irq_disable(); } -int get_maxlvt(void) +void __cpuinit setup_secondary_APIC_clock(void) { - unsigned int v, maxlvt; + check_boot_apic_timer_broadcast(); + setup_APIC_timer(); +} - v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + evt->event_handler(evt); } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] */ -void ack_bad_irq(unsigned int irq) +void smp_apic_timer_interrupt(struct pt_regs *regs) { - printk("unexpected IRQ trap at vector %02x\n", irq); + struct pt_regs *old_regs = set_irq_regs(regs); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. */ - if (!disable_apic) - ack_APIC_irq(); + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; } + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ void clear_local_APIC(void) { - int maxlvt; - unsigned int v; + int maxlvt = lapic_get_maxlvt(); + u32 v; - maxlvt = get_maxlvt(); + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -233,45 +569,9 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void disconnect_bsp_APIC(int virt_wire_setup) -{ - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned int value; @@ -333,7 +633,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -355,18 +655,20 @@ int __init verify_local_APIC(void) * compatibility mode, but most boxes are anymore. */ reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); reg1 = apic_read(APIC_LVT1); apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ + if (modern_apic()) return; /* @@ -418,9 +720,12 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } -void __cpuinit setup_local_APIC (void) +/** + * setup_local_APIC - setup the local APIC + */ +void __cpuinit setup_local_APIC(void) { - unsigned int value, maxlvt; + unsigned int value; int i, j; value = apic_read(APIC_LVR); @@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void) else value = APIC_DM_NMI | APIC_LVT_MASKED; apic_write(APIC_LVT1, value); +} - { - unsigned oldvalue; - maxlvt = get_maxlvt(); - oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, - "ESR value after enabling vector: %08x, after %08x\n", - oldvalue, value); - } +void __cpuinit lapic_setup_esr(void) +{ + unsigned maxlvt = lapic_get_maxlvt(); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); +} +void __cpuinit end_local_APIC_setup(void) +{ + lapic_setup_esr(); nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); + + setup_local_APIC(); + + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + + end_local_APIC_setup(); + + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; + setup_boot_APIC_clock(); + check_nmi_watchdog(); + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + exit_idle(); + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + add_pda(irq_spurious_count, 1); + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + exit_idle(); + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +void disconnect_bsp_APIC(int virt_wire_setup) +{ + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +/* + * Power management + */ #ifdef CONFIG_PM static struct { @@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); @@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); @@ -645,8 +1137,8 @@ static struct sysdev_class lapic_sysclass = { }; static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, + .id = 0, + .cls = &lapic_sysclass, }; static void __cpuinit apic_pm_activate(void) @@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { int error; + if (!cpu_has_apic) return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + error = sysdev_class_register(&lapic_sysclass); if (!error) error = sysdev_register(&device_lapic); @@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ - -static int __init detect_init_APIC (void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_id = 0; - return 0; -} - -#ifdef CONFIG_X86_IO_APIC -static struct resource * __init ioapic_setup_resources(void) -{ -#define IOAPIC_RESOURCE_NAME_SIZE 11 - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - memset(mem, 0, n); - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} - -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk("IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - } else { - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } - } - } -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks); -} - -static void setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static void __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = result / HZ; -} - -void __init setup_boot_APIC_clock (void) -{ - /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=1!\n"); - - setup_APIC_timer(); -} - -/* - * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the - * C1E flag only in the secondary CPU, so when we detect the wreckage - * we already have enabled the boot CPU local apic timer. Check, if - * disable_apic_timer is set and the DUMMY flag is cleared. If yes, - * set the DUMMY flag again and force the broadcast mode in the - * clockevents layer. - */ -void __cpuinit check_boot_apic_timer_broadcast(void) -{ - if (!disable_apic_timer || - (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) - return; - - printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); - lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); - local_irq_disable(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - check_boot_apic_timer_broadcast(); - setup_APIC_timer(); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) -{ - unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -void smp_local_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - /* * apic_is_clustered_box() -- Check if we can expect good TSC * @@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + if (id != BAD_APICID) __set_bit(APIC_CLUSTERID(id), clustermap); } /* Problem: Partially populated chassis may not have CPUs in some of * the APIC clusters they have been allocated. Only present CPUs have - * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since - * clusters are allocated sequentially, count zeros only if they are - * bounded by ones. + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. */ clusters = 0; zeros = 0; @@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void) } /* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - add_pda(irq_spurious_count, 1); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture + * APIC command line parameters */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) +static int __init apic_set_verbosity(char *str) { - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", str); + return -EINVAL; } - verify_local_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); - - setup_local_APIC(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); return 0; } +early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index af045ca0f65..d4438ef296d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/dmi.h> #include <linux/suspend.h> #include <linux/kthread.h> +#include <linux/jiffies.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,8 +236,6 @@ #include <asm/paravirt.h> #include <asm/reboot.h> -#include "io_ports.h" - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif @@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int); /* * Ignore suspend events for this amount of time after a resume */ -#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) /* * Maximum number of events stored @@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int); */ struct apm_user { int magic; - struct apm_user * next; + struct apm_user *next; unsigned int suser: 1; unsigned int writer: 1; unsigned int reader: 1; @@ -372,44 +371,44 @@ struct apm_user { static struct { unsigned long offset; unsigned short segment; -} apm_bios_entry; -static int clock_slowed; -static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; -static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; -static int suspends_pending; -static int standbys_pending; -static int ignore_sys_suspend; -static int ignore_normal_resume; -static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; - -static int debug __read_mostly; -static int smp __read_mostly; -static int apm_disabled = -1; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int set_pm_idle; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static int debug __read_mostly; +static int smp __read_mostly; +static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static int power_off; #else -static int power_off = 1; +static int power_off = 1; #endif #ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; +static int realmode_power_off = 1; #else -static int realmode_power_off; +static int realmode_power_off; #endif #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static int allow_ints = 1; #else -static int allow_ints; +static int allow_ints; #endif -static int broken_psr; +static int broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); -static struct apm_user * user_list; +static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; +static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; -static const char driver_version[] = "1.16ac"; /* no spaces */ +static const char driver_version[] = "1.16ac"; /* no spaces */ static struct task_struct *kapmd_task; @@ -417,7 +416,7 @@ static struct task_struct *kapmd_task; * APM event names taken from the APM 1.2 specification. These are * the message codes that the BIOS uses to tell us about events */ -static const char * const apm_event_name[] = { +static const char * const apm_event_name[] = { "system standby", "system suspend", "normal resume", @@ -435,14 +434,14 @@ static const char * const apm_event_name[] = { typedef struct lookup_t { int key; - char * msg; + char *msg; } lookup_t; /* * The BIOS returns a set of standard error codes in AX when the * carry flag is set. */ - + static const lookup_t error_table[] = { /* N/A { APM_SUCCESS, "Operation succeeded" }, */ { APM_DISABLED, "Power management disabled" }, @@ -472,24 +471,25 @@ static const lookup_t error_table[] = { * Write a meaningful log entry to the kernel log in the event of * an APM error. */ - + static void apm_error(char *str, int err) { - int i; + int i; for (i = 0; i < ERROR_COUNT; i++) - if (error_table[i].key == err) break; + if (error_table[i].key == err) + break; if (i < ERROR_COUNT) printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); else printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", - str, err); + str, err); } /* * Lock APM functionality to physical CPU 0 */ - + #ifdef CONFIG_SMP static cpumask_t apm_save_cpus(void) @@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask) /* * No CPU lockdown needed on a uniprocessor */ - + #define apm_save_cpus() (current->cpus_allowed) #define apm_restore_cpus(x) (void)(x) @@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags) * code is returned in AH (bits 8-15 of eax) and this function * returns non-zero. */ - + static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) { @@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); - + return *eax & 0xff; } @@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) static int apm_driver_version(u_short *val) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) return (eax >> 8) & 0xff; @@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val) * that APM 1.2 is in use. If no messges are pending the value 0x80 * is returned (No power management events pending). */ - + static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) { - u32 eax; - u32 ebx; - u32 ecx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 dummy; if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, - &dummy, &dummy)) + &dummy, &dummy)) return (eax >> 8) & 0xff; *event = ebx; if (apm_info.connection_version < 0x0102) @@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) * The state holds the state to transition to, which may in fact * be an acceptance of a BIOS requested state change. */ - + static int set_power_state(u_short what, u_short state) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) return (eax >> 8) & 0xff; @@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state) * * Transition the entire system into a new APM power state. */ - + static int set_system_power_state(u_short state) { return set_power_state(APM_DEVICE_ALL, state); @@ -766,13 +766,13 @@ static int set_system_power_state(u_short state) * to handle the idle request. On a success the function returns 1 * if the BIOS did clock slowing or 0 otherwise. */ - + static int apm_do_idle(void) { - u32 eax; - u8 ret = 0; - int idled = 0; - int polling; + u32 eax; + u8 ret = 0; + int idled = 0; + int polling; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -799,10 +799,9 @@ static int apm_do_idle(void) /* This always fails on some SMP boards running UP kernels. * Only report the failure the first 5 times. */ - if (++t < 5) - { + if (++t < 5) { printk(KERN_DEBUG "apm_do_idle failed (%d)\n", - (eax >> 8) & 0xff); + (eax >> 8) & 0xff); t = jiffies; } return -1; @@ -814,15 +813,15 @@ static int apm_do_idle(void) /** * apm_do_busy - inform the BIOS the CPU is busy * - * Request that the BIOS brings the CPU back to full performance. + * Request that the BIOS brings the CPU back to full performance. */ - + static void apm_do_busy(void) { - u32 dummy; + u32 dummy; if (clock_slowed || ALWAYS_CALL_BUSY) { - (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); clock_slowed = 0; } } @@ -833,15 +832,15 @@ static void apm_do_busy(void) * power management - we probably want * to conserve power. */ -#define IDLE_CALC_LIMIT (HZ * 100) -#define IDLE_LEAKY_MAX 16 +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 static void (*original_pm_idle)(void) __read_mostly; /** * apm_cpu_idle - cpu idling for APM capable Linux * - * This is the idling function the kernel executes when APM is available. It + * This is the idling function the kernel executes when APM is available. It * tries to do BIOS powermanagement based on the average system idle time. * Furthermore it calls the system default idle routine. */ @@ -882,7 +881,8 @@ recalc: t = jiffies; switch (apm_do_idle()) { - case 0: apm_idle_done = 1; + case 0: + apm_idle_done = 1; if (t != jiffies) { if (bucket) { bucket = IDLE_LEAKY_MAX; @@ -893,7 +893,8 @@ recalc: continue; } break; - case 1: apm_idle_done = 1; + case 1: + apm_idle_done = 1; break; default: /* BIOS refused */ break; @@ -921,10 +922,10 @@ recalc: * the SMP call on CPU0 as some systems will only honour this call * on their first cpu. */ - + static void apm_power_off(void) { - unsigned char po_bios_call[] = { + unsigned char po_bios_call[] = { 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 0x8e, 0xd0, /* movw ax,ss */ 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ @@ -935,13 +936,12 @@ static void apm_power_off(void) }; /* Some bioses don't like being called from CPU != 0 */ - if (apm_info.realmode_power_off) - { + if (apm_info.realmode_power_off) { (void)apm_save_cpus(); machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } else { + (void)set_system_power_state(APM_STATE_OFF); } - else - (void) set_system_power_state(APM_STATE_OFF); } #ifdef CONFIG_APM_DO_ENABLE @@ -950,17 +950,17 @@ static void apm_power_off(void) * apm_enable_power_management - enable BIOS APM power management * @enable: enable yes/no * - * Enable or disable the APM BIOS power services. + * Enable or disable the APM BIOS power services. */ - + static int apm_enable_power_management(int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) return APM_NOT_ENGAGED; if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, - enable, &eax)) + enable, &eax)) return (eax >> 8) & 0xff; if (enable) apm_info.bios.flags &= ~APM_BIOS_DISABLED; @@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable) * if reported is a lifetime in secodnds/minutes at current powwer * consumption. */ - + static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 dummy; if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, - &eax, &ebx, &ecx, &edx, &dummy)) + &eax, &ebx, &ecx, &edx, &dummy)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) static int apm_get_battery_status(u_short which, u_short *status, u_short *bat, u_short *life, u_short *nbat) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; if (apm_info.connection_version < 0x0102) { /* pretend we only have one battery. */ @@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status, } if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) + &ebx, &ecx, &edx, &esi)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status, * Activate or deactive power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ - + static int apm_engage_power_management(u_short device, int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (device == APM_DEVICE_ALL) && (apm_info.bios.flags & APM_BIOS_DISABLED)) @@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable) * all video devices. Typically the BIOS will do laptop backlight and * monitor powerdown for us. */ - + static int apm_console_blank(int blank) { int error = APM_NOT_ENGAGED; /* silence gcc */ @@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as) static void queue_event(apm_event_t event, struct apm_user *sender) { - struct apm_user * as; + struct apm_user *as; spin_lock(&user_list_lock); if (user_list == NULL) @@ -1174,11 +1174,11 @@ static void reinit_timer(void) spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ - outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); - outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ udelay(10); - outb(LATCH >> 8, PIT_CH0); /* MSB */ + outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); spin_unlock_irqrestore(&i8253_lock, flags); #endif @@ -1186,7 +1186,7 @@ static void reinit_timer(void) static int suspend(int vetoable) { - int err; + int err; struct apm_user *as; if (pm_send_all(PM_SUSPEND, (void *)3)) { @@ -1239,7 +1239,7 @@ static int suspend(int vetoable) static void standby(void) { - int err; + int err; local_irq_disable(); device_power_down(PMSG_SUSPEND); @@ -1256,8 +1256,8 @@ static void standby(void) static apm_event_t get_event(void) { - int error; - apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ apm_eventinfo_t info; static int notified; @@ -1275,9 +1275,9 @@ static apm_event_t get_event(void) static void check_events(void) { - apm_event_t event; - static unsigned long last_resume; - static int ignore_bounce; + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; while ((event = get_event()) != 0) { if (debug) { @@ -1289,7 +1289,7 @@ static void check_events(void) "event 0x%02x\n", event); } if (ignore_bounce - && ((jiffies - last_resume) > bounce_interval)) + && (time_after(jiffies, last_resume + bounce_interval))) ignore_bounce = 0; switch (event) { @@ -1357,7 +1357,7 @@ static void check_events(void) /* * We are not allowed to reject a critical suspend. */ - (void) suspend(0); + (void)suspend(0); break; } } @@ -1365,12 +1365,12 @@ static void check_events(void) static void apm_event_handler(void) { - static int pending_count = 4; - int err; + static int pending_count = 4; + int err; if ((standbys_pending > 0) || (suspends_pending > 0)) { if ((apm_info.connection_version > 0x100) && - (pending_count-- <= 0)) { + (pending_count-- <= 0)) { pending_count = 4; if (debug) printk(KERN_DEBUG "apm: setting state busy\n"); @@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func) static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { - struct apm_user * as; - int i; - apm_event_t event; + struct apm_user *as; + int i; + apm_event_t event; as = fp->private_data; if (check_apm_user(as, "read")) @@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t * return 0; } -static unsigned int do_poll(struct file *fp, poll_table * wait) +static unsigned int do_poll(struct file *fp, poll_table *wait) { - struct apm_user * as; + struct apm_user *as; as = fp->private_data; if (check_apm_user(as, "poll")) @@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait) return 0; } -static int do_ioctl(struct inode * inode, struct file *filp, +static int do_ioctl(struct inode *inode, struct file *filp, u_int cmd, u_long arg) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "ioctl")) @@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp, return 0; } -static int do_release(struct inode * inode, struct file * filp) +static int do_release(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "release")) @@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp) if (suspends_pending <= 0) (void) suspend(1); } - spin_lock(&user_list_lock); + spin_lock(&user_list_lock); if (user_list == as) user_list = as->next; else { - struct apm_user * as1; + struct apm_user *as1; for (as1 = user_list; (as1 != NULL) && (as1->next != as); @@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp) return 0; } -static int do_open(struct inode * inode, struct file * filp) +static int do_open(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { @@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp) as->suspends_read = as->standbys_read = 0; /* * XXX - this is a tiny bit broken, when we consider BSD - * process accounting. If the device is opened by root, we + * process accounting. If the device is opened by root, we * instantly flag that we used superuser privs. Who knows, * we might close the device immediately without doing a * privileged operation -- cevans @@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v) 8) min = minutes; sec = seconds */ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", - driver_version, - (apm_info.bios.version >> 8) & 0xff, - apm_info.bios.version & 0xff, - apm_info.bios.flags, - ac_line_status, - battery_status, - battery_flag, - percentage, - time_units, - units); + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); return 0; } @@ -1684,8 +1684,8 @@ static int apm(void *unused) unsigned short cx; unsigned short dx; int error; - char * power_stat; - char * bat_stat; + char *power_stat; + char *bat_stat; #ifdef CONFIG_SMP /* 2002/08/01 - WT @@ -1744,23 +1744,41 @@ static int apm(void *unused) } } - if (debug && (num_online_cpus() == 1 || smp )) { + if (debug && (num_online_cpus() == 1 || smp)) { error = apm_get_power_status(&bx, &cx, &dx); if (error) printk(KERN_INFO "apm: power status not available\n"); else { switch ((bx >> 8) & 0xff) { - case 0: power_stat = "off line"; break; - case 1: power_stat = "on line"; break; - case 2: power_stat = "on backup power"; break; - default: power_stat = "unknown"; break; + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; } switch (bx & 0xff) { - case 0: bat_stat = "high"; break; - case 1: bat_stat = "low"; break; - case 2: bat_stat = "critical"; break; - case 3: bat_stat = "charging"; break; - default: bat_stat = "unknown"; break; + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; } printk(KERN_INFO "apm: AC %s, battery status %s, battery life ", @@ -1777,8 +1795,8 @@ static int apm(void *unused) printk("unknown\n"); else printk("%d %s\n", dx & 0x7fff, - (dx & 0x8000) ? - "minutes" : "seconds"); + (dx & 0x8000) ? + "minutes" : "seconds"); } } } @@ -1803,7 +1821,7 @@ static int apm(void *unused) #ifndef MODULE static int __init apm_setup(char *str) { - int invert; + int invert; while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "off", 3) == 0) @@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str) if ((strncmp(str, "power-off", 9) == 0) || (strncmp(str, "power_off", 9) == 0)) power_off = !invert; - if (strncmp(str, "smp", 3) == 0) - { + if (strncmp(str, "smp", 3) == 0) { smp = !invert; idle_threshold = 100; } if ((strncmp(str, "allow-ints", 10) == 0) || (strncmp(str, "allow_ints", 10) == 0)) - apm_info.allow_ints = !invert; + apm_info.allow_ints = !invert; if ((strncmp(str, "broken-psr", 10) == 0) || (strncmp(str, "broken_psr", 10) == 0)) apm_info.get_power_status_broken = !invert; @@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d) */ static int __init broken_ps2_resume(const struct dmi_system_id *d) { - printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); return 0; } @@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d) { if (apm_info.realmode_power_off == 0) { apm_info.realmode_power_off = 1; - printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); } return 0; } @@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d) { if (apm_info.allow_ints == 0) { apm_info.allow_ints = 1; - printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); } return 0; } @@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); } return 0; } @@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); printk(KERN_INFO "download from support.intel.com \n"); } @@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) { if (apm_info.forbid_idle == 0) { apm_info.forbid_idle = 1; - printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM idle calls.\n", d->ident); } return 0; } @@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) static int __init broken_apm_power(const struct dmi_system_id *d) { apm_info.get_power_status_broken = 1; - printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); return 0; } @@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d) static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) { apm_info.get_power_status_swabinminutes = 1; - printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); return 0; } @@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Dell Inspiron laptops*/ set_apm_ints, "Dell Inspiron", { @@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Dimension 4100", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), - DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Compaq Laptops*/ set_apm_ints, "Compaq 12XL125", { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, }, { /* Allow interrupts during APM or the clock goes slow */ set_apm_ints, "ASUSTeK", @@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Sharp PC-PJ/AX", { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), - DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), - DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, }, { /* APM crashes */ apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* APM idle hangs */ apm_likes_to_melt, "Jabil AMD", @@ -2203,11 +2228,11 @@ static int __init apm_init(void) return -ENODEV; } printk(KERN_INFO - "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", - ((apm_info.bios.version >> 8) & 0xff), - (apm_info.bios.version & 0xff), - apm_info.bios.flags, - driver_version); + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { printk(KERN_INFO "apm: no 32 bit BIOS support\n"); return -ENODEV; @@ -2312,9 +2337,9 @@ static int __init apm_init(void) } wake_up_process(kapmd_task); - if (num_online_cpus() > 1 && !smp ) { + if (num_online_cpus() > 1 && !smp) { printk(KERN_NOTICE - "apm: disabled - APM is not SMP safe (power off active).\n"); + "apm: disabled - APM is not SMP safe (power off active).\n"); return 0; } @@ -2339,7 +2364,7 @@ static int __init apm_init(void) static void __exit apm_exit(void) { - int error; + int error; if (set_pm_idle) { pm_idle = original_pm_idle; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 0e45981b2dd..afd84463b71 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -38,15 +38,15 @@ void foo(void); void foo(void) { - OFFSET(SIGCONTEXT_eax, sigcontext, eax); - OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); - OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); - OFFSET(SIGCONTEXT_edx, sigcontext, edx); - OFFSET(SIGCONTEXT_esi, sigcontext, esi); - OFFSET(SIGCONTEXT_edi, sigcontext, edi); - OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); - OFFSET(SIGCONTEXT_esp, sigcontext, esp); - OFFSET(SIGCONTEXT_eip, sigcontext, eip); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); BLANK(); OFFSET(CPUINFO_x86, cpuinfo_x86, x86); @@ -70,39 +70,38 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, Xgt_desc_struct, size); - OFFSET(GDS_address, Xgt_desc_struct, address); - OFFSET(GDS_pad, Xgt_desc_struct, pad); + OFFSET(GDS_size, desc_ptr, size); + OFFSET(GDS_address, desc_ptr, address); BLANK(); - OFFSET(PT_EBX, pt_regs, ebx); - OFFSET(PT_ECX, pt_regs, ecx); - OFFSET(PT_EDX, pt_regs, edx); - OFFSET(PT_ESI, pt_regs, esi); - OFFSET(PT_EDI, pt_regs, edi); - OFFSET(PT_EBP, pt_regs, ebp); - OFFSET(PT_EAX, pt_regs, eax); - OFFSET(PT_DS, pt_regs, xds); - OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_FS, pt_regs, xfs); - OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); - OFFSET(PT_EIP, pt_regs, eip); - OFFSET(PT_CS, pt_regs, xcs); - OFFSET(PT_EFLAGS, pt_regs, eflags); - OFFSET(PT_OLDESP, pt_regs, esp); - OFFSET(PT_OLDSS, pt_regs, xss); + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); - OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - + /* Offset from the sysenter stack to tss.sp0 */ + DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); @@ -111,8 +110,6 @@ void foo(void) DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); - DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); #ifdef CONFIG_PARAVIRT @@ -123,7 +120,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d1b6ed98774..494e1e096ee 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -38,7 +38,6 @@ int main(void) #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) ENTRY(state); ENTRY(flags); - ENTRY(thread); ENTRY(pid); BLANK(); #undef ENTRY @@ -47,6 +46,9 @@ int main(void) ENTRY(addr_limit); ENTRY(preempt_count); ENTRY(status); +#ifdef CONFIG_IA32_EMULATION + ENTRY(sysenter_return); +#endif BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) @@ -59,17 +61,31 @@ int main(void) ENTRY(data_offset); BLANK(); #undef ENTRY +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + + #ifdef CONFIG_IA32_EMULATION #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); + ENTRY(ax); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(si); + ENTRY(di); + ENTRY(bp); + ENTRY(sp); + ENTRY(ip); BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, @@ -81,14 +97,14 @@ int main(void) DEFINE(pbe_next, offsetof(struct pbe, next)); BLANK(); #define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) - ENTRY(rbx); - ENTRY(rbx); - ENTRY(rcx); - ENTRY(rdx); - ENTRY(rsp); - ENTRY(rbp); - ENTRY(rsi); - ENTRY(rdi); + ENTRY(bx); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); ENTRY(r8); ENTRY(r9); ENTRY(r10); @@ -97,7 +113,7 @@ int main(void) ENTRY(r13); ENTRY(r14); ENTRY(r15); - ENTRY(eflags); + ENTRY(flags); BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) @@ -108,7 +124,7 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); + DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); BLANK(); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 0b9860530a6..30f25a75fe2 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -1,8 +1,6 @@ /* * Implement 'Simple Boot Flag Specification 2.0' */ - - #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -14,40 +12,38 @@ #include <linux/mc146818rtc.h> - #define SBF_RESERVED (0x78) #define SBF_PNPOS (1<<0) #define SBF_BOOTING (1<<1) #define SBF_DIAG (1<<2) #define SBF_PARITY (1<<7) - int sbf_port __initdata = -1; /* set via acpi_boot_init() */ - static int __init parity(u8 v) { int x = 0; int i; - - for(i=0;i<8;i++) - { - x^=(v&1); - v>>=1; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; } + return x; } static void __init sbf_write(u8 v) { unsigned long flags; - if(sbf_port != -1) - { + + if (sbf_port != -1) { v &= ~SBF_PARITY; - if(!parity(v)) - v|=SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; - printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(v, sbf_port); @@ -57,33 +53,41 @@ static void __init sbf_write(u8 v) static u8 __init sbf_read(void) { - u8 v; unsigned long flags; - if(sbf_port == -1) + u8 v; + + if (sbf_port == -1) return 0; + spin_lock_irqsave(&rtc_lock, flags); v = CMOS_READ(sbf_port); spin_unlock_irqrestore(&rtc_lock, flags); + return v; } static int __init sbf_value_valid(u8 v) { - if(v&SBF_RESERVED) /* Reserved bits */ + if (v & SBF_RESERVED) /* Reserved bits */ return 0; - if(!parity(v)) + if (!parity(v)) return 0; + return 1; } static int __init sbf_init(void) { u8 v; - if(sbf_port == -1) + + if (sbf_port == -1) return 0; + v = sbf_read(); - if(!sbf_value_valid(v)) - printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } v &= ~SBF_RESERVED; v &= ~SBF_BOOTING; @@ -92,7 +96,7 @@ static int __init sbf_init(void) v |= SBF_PNPOS; #endif sbf_write(v); + return 0; } - module_init(sbf_init); diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c index 9a189cef640..8f520f93ffd 100644 --- a/arch/x86/kernel/bugs_64.c +++ b/arch/x86/kernel/bugs_64.c @@ -13,7 +13,6 @@ void __init check_bugs(void) { identify_cpu(&boot_cpu_data); - mtrr_bp_init(); #if !defined(CONFIG_SMP) printk("CPU: "); print_cpu_info(&boot_cpu_data); diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 3e91d3ee26e..238468ae199 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) - set_bit(cb->feature, c->x86_capability); + set_cpu_cap(c, cb->feature); } } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1ff88c7f45c..06fa159232f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void) int force_mwait __cpuinitdata; +void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + if (cpuid_eax(0x80000000) >= 0x80000007) { + c->x86_power = cpuid_edx(0x80000007); + if (c->x86_power & (1<<8)) + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); + } +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif + early_init_amd(c); + /* * FIXME: We should handle the K5 here. Set up the write * range and also turn on MSR 83 bits 4 and 31 (write alloc, @@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; } - if (cpuid_eax(0x80000000) >= 0x80000007) { - c->x86_power = cpuid_edx(0x80000007); - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - } - #ifdef CONFIG_X86_HT /* * On a AMD multi core setup the lower bits of the APIC id @@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) local_apic_timer_disabled = 1; #endif - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, c->x86_capability); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_bit(X86_FEATURE_MCE, c->x86_capability); + + if (cpu_has_xmm) + set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 205fd5ba57f..9b95edcfc6a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,7 @@ #include <linux/utsname.h> #include <asm/bugs.h> #include <asm/processor.h> +#include <asm/processor-flags.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/paravirt.h> @@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium); static int __init no_387(char *s) { boot_cpu_data.hard_math = 0; - write_cr0(0xE | read_cr0()); + write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0()); return 1; } @@ -153,7 +154,7 @@ static void __init check_config(void) * If we configured ourselves for a TSC, we'd better have one! */ #ifdef CONFIG_X86_TSC - if (!cpu_has_tsc && !tsc_disable) + if (!cpu_has_tsc) panic("Kernel compiled for Pentium+, requires TSC feature!"); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2fcf2051bd..db28aa9e2f6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,43 +22,48 @@ #include "cpu.h" DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + static int cachesize_override __cpuinitdata = -1; -static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; -static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern int disable_pse; - static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ @@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPUs to not use it... */ - disable_x86_fxsr = 1; - - /* - * ... and clear the bits early in the boot_cpu_data - * so that the bootup process doesn't try to do this - * either. - */ - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup); static int __init x86_sep_setup(char * s) { - disable_x86_sep = 1; + setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; } __setup("nosep", x86_sep_setup); @@ -281,6 +278,33 @@ void __init cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; } } +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + int ebx; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (have_cpuid_p()) { + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + } + +} /* Do minimum CPU detection early. Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. @@ -300,6 +324,17 @@ static void __init early_cpu_detect(void) cpu_detect(c); get_cpu_vendor(c, 1); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + + early_get_cap(c); } static void __cpuinit generic_identify(struct cpuinfo_x86 * c) @@ -357,8 +392,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) init_scattered_cpuid_features(c); } - early_intel_workaround(c); - #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif @@ -392,7 +425,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -418,20 +451,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { + if (this_cpu->c_identify) this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -453,23 +475,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * we do "generic changes." */ - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - /* SEP disabled? */ - if (disable_x86_sep) - clear_bit(X86_FEATURE_SEP, c->x86_capability); - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; @@ -482,13 +487,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -501,8 +499,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + /* Init Machine Check Exception if available. */ mcheck_init(c); + + select_idle_routine(c); } void __init identify_boot_cpu(void) @@ -510,7 +514,6 @@ void __init identify_boot_cpu(void) identify_cpu(&boot_cpu_data); sysenter_setup(); enable_sep_cpu(); - mtrr_bp_init(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -567,6 +570,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } #endif +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -590,6 +600,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) @@ -620,21 +641,13 @@ void __init early_cpu_init(void) nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. - */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif } /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } @@ -642,7 +655,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) * it's on the real one. */ void switch_to_new_gdt(void) { - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; @@ -672,12 +685,6 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } load_idt(&idt_descr); switch_to_new_gdt(); @@ -691,7 +698,7 @@ void __cpuinit cpu_init(void) BUG(); enter_lazy_tlb(&init_mm, curr); - load_esp0(t, thread); + load_sp0(t, thread); set_tss_desc(cpu,t); load_TR_desc(); load_LDT(&init_mm.context); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2f6432cef6f..ad6527a5beb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -24,5 +24,6 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void early_intel_workaround(struct cpuinfo_x86 *c); +extern void early_init_intel(struct cpuinfo_x86 *c); +extern void early_init_amd(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index fea0af0476b..a962dcb9c40 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -67,7 +67,8 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static struct acpi_cpufreq_data *drv_data[NR_CPUS]; +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); + /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask) if (unlikely(cpus_empty(mask))) return 0; - switch (drv_data[first_cpu(mask)]->cpu_feature) { + switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = drv_data[first_cpu(mask)]->acpi_data; + perf = per_cpu(drv_data, first_cpu(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = drv_data[cpu]->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed(current, saved_mask); @@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = drv_data[cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; dprintk("get_cur_freq_on_cpu (%d)\n", cpu); @@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; @@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = percpu_ptr(acpi_perf_data, cpu); - drv_data[cpu] = data; + per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -714,20 +715,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - drv_data[cpu] = NULL; + per_cpu(drv_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - drv_data[policy->cpu] = NULL; + per_cpu(drv_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 749d00cb2eb..06fcce516d5 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, if ( acpi_bus_get_device(obj_handle, &d) ) { return 0; } - *return_value = (void *)acpi_driver_data(d); + *return_value = acpi_driver_data(d); return 1; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 99e1ef9939b..a0522735dd9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -52,7 +52,7 @@ /* serialize freq changes */ static DEFINE_MUTEX(fidvid_mutex); -static struct powernow_k8_data *powernow_data[NR_CPUS]; +static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); static int cpu_family = CPU_OPTERON; @@ -1018,7 +1018,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask = CPU_MASK_ALL; - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; unsigned int newstate; @@ -1094,7 +1094,7 @@ err_out: /* Driver entry point to verify the policy and range of frequencies */ static int powernowk8_verify(struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1202,7 +1202,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); - powernow_data[pol->cpu] = data; + per_cpu(powernow_data, pol->cpu) = data; return 0; @@ -1216,7 +1216,7 @@ err_out: static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1237,7 +1237,7 @@ static unsigned int powernowk8_get (unsigned int cpu) cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; - data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; + data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu))); if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 88d66fb8411..404a6a2d401 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -5,6 +5,7 @@ #include <asm/dma.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -126,15 +127,12 @@ static void __cpuinit set_cx86_reorder(void) static void __cpuinit set_cx86_memwb(void) { - u32 cr0; - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ - cr0 = 0x20000000; - write_cr0(read_cr0() | cr0); + write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cc8c501b9f3..d1c372b018d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,6 +11,8 @@ #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> +#include <asm/ptrace.h> +#include <asm/ds.h> #include "cpu.h" @@ -27,13 +29,14 @@ struct movsl_mask movsl_mask __read_mostly; #endif -void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) +void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) - return; /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* @@ -113,6 +116,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned int l2 = 0; char *p = NULL; + early_init_intel(c); + #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs @@ -132,7 +137,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - select_idle_routine(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9 ) { unsigned eax = cpuid_eax(10); @@ -201,16 +205,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif + if (cpu_has_xmm2) + set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability); if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); @@ -219,6 +220,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } + + if (cpu_has_bts) + ds_init_intel(c); } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) @@ -342,5 +346,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) EXPORT_SYMBOL(cmpxchg_386_u32); #endif +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + // arch_initcall(intel_cpu_init); diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index eef63e3630c..e633c9c2b76 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For AMD Athlon/Duron */ -static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) +static void k7_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=1; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 1; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high&(1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); /* Clear it */ - wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); /* Serialize */ wmb(); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 81fb6e2d35f..ae9f628838f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ -extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 34c781eddee..a5182dcd94a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -22,13 +22,13 @@ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ -static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +static void unexpected_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 242e8668dbe..9a699ed0359 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); * separate MCEs from kernel messages to avoid bogus bug reports. */ -struct mce_log mcelog = { +static struct mce_log mcelog = { MCE_LOG_SIGNATURE, MCE_LOG_LEN, }; @@ -80,7 +80,7 @@ void mce_log(struct mce *mce) /* When the buffer fills up discard new entries. Assume that the earlier errors are the more interesting. */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip. */ @@ -110,12 +110,12 @@ static void print_mce(struct mce *m) KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { + if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->rip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); + print_symbol("{%s}", m->ip); printk("\n"); } printk(KERN_EMERG "TSC %Lx ", m->tsc); @@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c) static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; + m->ip = regs->ip; m->cs = regs->cs; } else { - m->rip = 0; + m->ip = 0; m->cs = 0; } if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); + rdmsrl(rip_msr, m->ip); m->cs = 0; } } @@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_inc(&mce_entry); - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 18, - SIGKILL); - if (!banks) + if ((regs + && notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + || !banks) goto out2; memset(&m, 0, sizeof(struct mce)); @@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.rip && (panicm.cs & 3); + user_space = panicm.ip && (panicm.cs & 3); /* * If we know that the error was in user space, send a @@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); + static DEFINE_MUTEX(mce_read_mutex); unsigned next; char __user *buf = ubuf; int i, err; @@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - down(&mce_read_sem); + mutex_lock(&mce_read_mutex); next = rcu_dereference(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return -EINVAL; } @@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return err ? -EFAULT : buf - ubuf; } @@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) return 0; } -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, - unsigned long arg) +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = { .release = mce_release, .read = mce_read, .poll = mce_poll, - .ioctl = mce_ioctl, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 753588755fe..32671da8184 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u8 lvt_off; u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { @@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif + lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0); + high &= ~MASK_LVTOFF_HI; - high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + high |= lvt_off << 20; wrmsr(address, low, high); - setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); - threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); } @@ -450,7 +450,8 @@ recurse: if (err) goto out_free; - kobject_uevent(&b->kobj, KOBJ_ADD); + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); return err; @@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) int err = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); if (err) @@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu) unsigned int bank; for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } } /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = { .notifier_call = threshold_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index be4dabfee1f..cb03345554a 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -fastcall void smp_thermal_interrupt(struct pt_regs *regs) +void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); vendor_thermal_interrupt(regs); @@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_EIP, r->eip, h); } -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; intel_get_extended_msrs(&dbg); - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" + "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" + "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + smp_processor_id(), dbg.eip, dbg.eflags, + dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, dbg.esi, dbg.edi, dbg.ebp, dbg.esp); } - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 94bc43d950c..a18310aaae0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine check handler for Pentium class Intel */ -static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) +static void pentium_machine_check(struct pt_regs * regs, long error_code) { u32 loaddr, hi, lotype; rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index deeae42ce19..74342604d30 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For PII/PIII */ -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 9e424b6c293..3d428d5afc5 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,7 +15,7 @@ #include "mce.h" /* Machine check handler for WinChip C6 */ -static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) +static void winchip_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 0949cdbf848..ee2331b0e58 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. <type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9964be3de2b..8e139c70f88 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -4,6 +4,7 @@ #include <asm/msr.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" int arr3_protected; @@ -142,7 +143,7 @@ static void prepare_set(void) /* Disable and flush caches. Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 992f08dfbb6..103d61a59b1 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,11 +9,12 @@ #include <asm/msr.h> #include <asm/system.h> #include <asm/cpufeature.h> +#include <asm/processor-flags.h> #include <asm/tlbflush.h> #include "mtrr.h" struct mtrr_state { - struct mtrr_var_range *var_ranges; + struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; unsigned char have_fixed; @@ -85,12 +86,6 @@ void __init get_mtrr_state(void) struct mtrr_var_range *vrs; unsigned lo, dummy; - if (!mtrr_state.var_ranges) { - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), - GFP_KERNEL); - if (!mtrr_state.var_ranges) - return; - } vrs = mtrr_state.var_ranges; rdmsr(MTRRcap_MSR, lo, dummy); @@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void) * \param changed pointer which indicates whether the MTRR needed to be changed * \param msrwords pointer to the MSR values which the MSR should have */ -static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { unsigned lo, hi; @@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); - *changed = TRUE; + *changed = true; } } @@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, static int set_fixed_ranges(mtrr_type * frs) { unsigned long long *saved = (unsigned long long *) frs; - int changed = FALSE; + bool changed = false; int block=-1, range; while (fixed_range_blocks[++block].ranges) @@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs) /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ -static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; - int changed = FALSE; + bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); - changed = TRUE; + changed = true; } rdmsr(MTRRphysMask_MSR(index), lo, hi); @@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); - changed = TRUE; + changed = true; } return changed; } @@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ - cr0 = read_cr0() | 0x40000000; /* set CD flag */ + cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); @@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. <type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index c7d8f175674..91e150acb46 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -11,10 +11,6 @@ #include <asm/mtrr.h> #include "mtrr.h" -/* RED-PEN: this is accessed without any locking */ -extern unsigned int *usage_table; - - #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = @@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x) static int mtrr_file_add(unsigned long base, unsigned long size, - unsigned int type, char increment, struct file *file, int page) + unsigned int type, bool increment, struct file *file, int page) { int reg, max; unsigned int *fcount = FILE_FCOUNT(file); @@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size, base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; } - reg = mtrr_add_page(base, size, type, 1); + reg = mtrr_add_page(base, size, type, true); if (reg >= 0) ++fcount[reg]; return reg; @@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size >>= PAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, - 1); + true); if (err < 0) return err; return len; @@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 0); break; case MTRRIOC_SET_ENTRY: @@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_ENTRY: #ifdef CONFIG_COMPAT @@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: @@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_PAGE_ENTRY: #ifdef CONFIG_COMPAT @@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); if (size == 0) - usage_table[i] = 0; + mtrr_usage_table[i] = 0; else { if (size < (0x100000 >> PAGE_SHIFT)) { /* less than 1MB */ @@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), usage_table[i]); + mtrr_attrib_to_str(type), mtrr_usage_table[i]); } } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index beb45c9c083..71591958265 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,8 +38,8 @@ #include <linux/cpu.h> #include <linux/mutex.h> +#include <asm/e820.h> #include <asm/mtrr.h> - #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> @@ -47,7 +47,7 @@ u32 num_var_ranges = 0; -unsigned int *usage_table; +unsigned int mtrr_usage_table[MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -121,13 +121,8 @@ static void __init init_table(void) int i, max; max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } for (i = 0; i < max; i++) - usage_table[i] = 1; + mtrr_usage_table[i] = 1; } struct set_mtrr_data { @@ -311,7 +306,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, */ int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) + unsigned int type, bool increment) { int i, replace, error; mtrr_type ltype; @@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, goto out; } if (increment) - ++usage_table[i]; + ++mtrr_usage_table[i]; error = i; goto out; } @@ -391,13 +386,15 @@ int mtrr_add_page(unsigned long base, unsigned long size, i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); - if (likely(replace < 0)) - usage_table[i] = 1; - else { - usage_table[i] = usage_table[replace] + !!increment; + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; if (unlikely(replace != i)) { set_mtrr(replace, 0, 0, 0); - usage_table[replace] = 0; + mtrr_usage_table[replace] = 0; } } } else @@ -460,7 +457,7 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) + bool increment) { if (mtrr_check(base, size)) return -EINVAL; @@ -527,11 +524,11 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); goto out; } - if (usage_table[reg] < 1) { + if (mtrr_usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) + if (--mtrr_usage_table[reg] < 1) set_mtrr(reg, 0, 0, 0); error = reg; out: @@ -591,16 +588,11 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value * mtrr_state; +static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { int i; - int size = num_var_ranges * sizeof(struct mtrr_value); - - mtrr_state = kzalloc(size,GFP_ATOMIC); - if (!mtrr_state) - return -ENOMEM; for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, @@ -622,7 +614,6 @@ static int mtrr_restore(struct sys_device * sysdev) mtrr_state[i].lsize, mtrr_state[i].ltype); } - kfree(mtrr_state); return 0; } @@ -633,6 +624,112 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +static __init int amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_addr = 0, def, dummy; + mtrr_type type; + u64 trim_start, trim_size; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + if (amd_special_default_mtrr()) + return 0; + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_WRBACK) + continue; + base <<= PAGE_SHIFT; + size <<= PAGE_SHIFT; + if (highest_addr < base + size) + highest_addr = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + if (!highest_addr) { + printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + return 0; + } + + if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %LdMB of RAM.\n", + (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr\n"); + trim_start = highest_addr; + trim_size = end_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + add_memory_region(trim_start, trim_size, E820_RESERVED); + update_e820(); + return 1; + } + + return 0; +} /** * mtrr_bp_init - initialize mtrrs on the boot CPU diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 289dfe6030e..fb74a2c2081 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -2,10 +2,8 @@ * local mtrr defines. */ -#ifndef TRUE -#define TRUE 1 -#define FALSE 0 -#endif +#include <linux/types.h> +#include <linux/stddef.h> #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff @@ -14,6 +12,7 @@ #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) #define NUM_FIXED_RANGES 88 +#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -34,6 +33,8 @@ an 8 bit field: */ typedef u8 mtrr_type; +extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; + struct mtrr_ops { u32 vendor; u32 use_intel_if; diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 49e20c2afcd..9f8ba923d1c 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -4,6 +4,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" @@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) /* Disable and flush caches. Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index c02541e6e65..9b838324b81 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 3900e46d66d..02821326014 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -188,7 +188,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { +const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index d387c770c51..dec66e45281 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -50,7 +50,7 @@ struct cpuid_command { static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; + struct cpuid_command *cmd = cmd_block; cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 40978af630e..a47798b59f0 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct desc_ptr gdt_desc = {0, 0}; unsigned long gdt, tss; store_gdt(&gdt_desc); @@ -33,14 +33,15 @@ static void doublefault_fn(void) printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct i386_hw_tss *t = (struct i386_hw_tss *)tss; + struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", + t->ip, t->sp); printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->eax, t->ebx, t->ecx, t->edx); + t->ax, t->bx, t->cx, t->dx); printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + t->si, t->di); } } @@ -50,15 +51,15 @@ static void doublefault_fn(void) struct tss_struct doublefault_tss __cacheline_aligned = { .x86_tss = { - .esp0 = STACK_START, + .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, + .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ - .eflags = X86_EFLAGS_SF | 0x2, - .esp = STACK_START, + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c new file mode 100644 index 00000000000..1c5ca4d1878 --- /dev/null +++ b/arch/x86/kernel/ds.c @@ -0,0 +1,464 @@ +/* + * Debug Store support + * + * This provides a low-level interface to the hardware's Debug Store + * feature that is used for last branch recording (LBR) and + * precise-event based sampling (PEBS). + * + * Different architectures use a different DS layout/pointer size. + * The below functions therefore work on a void*. + * + * + * Since there is no user for PEBS, yet, only LBR (or branch + * trace store, BTS) is supported. + * + * + * Copyright (C) 2007 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <asm/ds.h> + +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/slab.h> + + +/* + * Debug Store (DS) save area configuration (see Intel64 and IA32 + * Architectures Software Developer's Manual, section 18.5) + * + * The DS configuration consists of the following fields; different + * architetures vary in the size of those fields. + * - double-word aligned base linear address of the BTS buffer + * - write pointer into the BTS buffer + * - end linear address of the BTS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into BTS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - double-word aligned base linear address of the PEBS buffer + * - write pointer into the PEBS buffer + * - end linear address of the PEBS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into PEBS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - value to which counter is reset following counter overflow + * + * On later architectures, the last branch recording hardware uses + * 64bit pointers even in 32bit mode. + * + * + * Branch Trace Store (BTS) records store information about control + * flow changes. They at least provide the following information: + * - source linear address + * - destination linear address + * + * Netburst supported a predicated bit that had been dropped in later + * architectures. We do not suppor it. + * + * + * In order to abstract from the actual DS and BTS layout, we describe + * the access to the relevant fields. + * Thanks to Andi Kleen for proposing this design. + * + * The implementation, however, is not as general as it might seem. In + * order to stay somewhat simple and efficient, we assume an + * underlying unsigned type (mostly a pointer type) and we expect the + * field to be at least as big as that type. + */ + +/* + * A special from_ip address to indicate that the BTS record is an + * info record that needs to be interpreted or skipped. + */ +#define BTS_ESCAPE_ADDRESS (-1) + +/* + * A field access descriptor + */ +struct access_desc { + unsigned char offset; + unsigned char size; +}; + +/* + * The configuration for a particular DS/BTS hardware implementation. + */ +struct ds_configuration { + /* the DS configuration */ + unsigned char sizeof_ds; + struct access_desc bts_buffer_base; + struct access_desc bts_index; + struct access_desc bts_absolute_maximum; + struct access_desc bts_interrupt_threshold; + /* the BTS configuration */ + unsigned char sizeof_bts; + struct access_desc from_ip; + struct access_desc to_ip; + /* BTS variants used to store additional information like + timestamps */ + struct access_desc info_type; + struct access_desc info_data; + unsigned long debugctl_mask; +}; + +/* + * The global configuration used by the below accessor functions + */ +static struct ds_configuration ds_cfg; + +/* + * Accessor functions for some DS and BTS fields using the above + * global ptrace_bts_cfg. + */ +static inline unsigned long get_bts_buffer_base(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); +} +static inline void set_bts_buffer_base(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; +} +static inline unsigned long get_bts_index(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_index.offset); +} +static inline void set_bts_index(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; +} +static inline unsigned long get_bts_absolute_maximum(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); +} +static inline void set_bts_absolute_maximum(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; +} +static inline unsigned long get_bts_interrupt_threshold(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); +} +static inline void set_bts_interrupt_threshold(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; +} +static inline unsigned long get_from_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.from_ip.offset); +} +static inline void set_from_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; +} +static inline unsigned long get_to_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.to_ip.offset); +} +static inline void set_to_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; +} +static inline unsigned char get_info_type(char *base) +{ + return *(unsigned char *)(base + ds_cfg.info_type.offset); +} +static inline void set_info_type(char *base, unsigned char value) +{ + (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; +} +static inline unsigned long get_info_data(char *base) +{ + return *(unsigned long *)(base + ds_cfg.info_data.offset); +} +static inline void set_info_data(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; +} + + +int ds_allocate(void **dsp, size_t bts_size_in_bytes) +{ + size_t bts_size_in_records; + unsigned long bts; + void *ds; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (bts_size_in_bytes < 0) + return -EINVAL; + + bts_size_in_records = + bts_size_in_bytes / ds_cfg.sizeof_bts; + bts_size_in_bytes = + bts_size_in_records * ds_cfg.sizeof_bts; + + if (bts_size_in_bytes <= 0) + return -EINVAL; + + bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); + + if (!bts) + return -ENOMEM; + + ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + + if (!ds) { + kfree((void *)bts); + return -ENOMEM; + } + + set_bts_buffer_base(ds, bts); + set_bts_index(ds, bts); + set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); + set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + + *dsp = ds; + return 0; +} + +int ds_free(void **dsp) +{ + if (*dsp) + kfree((void *)get_bts_buffer_base(*dsp)); + kfree(*dsp); + *dsp = 0; + + return 0; +} + +int ds_get_bts_size(void *ds) +{ + int size_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (!ds) + return 0; + + size_in_bytes = + get_bts_absolute_maximum(ds) - + get_bts_buffer_base(ds); + return size_in_bytes; +} + +int ds_get_bts_end(void *ds) +{ + int size_in_bytes = ds_get_bts_size(ds); + + if (size_in_bytes <= 0) + return size_in_bytes; + + return size_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_get_bts_index(void *ds) +{ + int index_offset_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + index_offset_in_bytes = + get_bts_index(ds) - + get_bts_buffer_base(ds); + + return index_offset_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_set_overflow(void *ds, int method) +{ + switch (method) { + case DS_O_SIGNAL: + return -EOPNOTSUPP; + case DS_O_WRAP: + return 0; + default: + return -EINVAL; + } +} + +int ds_get_overflow(void *ds) +{ + return DS_O_WRAP; +} + +int ds_clear(void *ds) +{ + int bts_size = ds_get_bts_size(ds); + unsigned long bts_base; + + if (bts_size <= 0) + return bts_size; + + bts_base = get_bts_buffer_base(ds); + memset((void *)bts_base, 0, bts_size); + + set_bts_index(ds, bts_base); + return 0; +} + +int ds_read_bts(void *ds, int index, struct bts_struct *out) +{ + void *bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (index < 0) + return -EINVAL; + + if (index >= ds_get_bts_size(ds)) + return -EINVAL; + + bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + + memset(out, 0, sizeof(*out)); + if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { + out->qualifier = get_info_type(bts); + out->variant.jiffies = get_info_data(bts); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = get_from_ip(bts); + out->variant.lbr.to_ip = get_to_ip(bts); + } + + return sizeof(*out);; +} + +int ds_write_bts(void *ds, const struct bts_struct *in) +{ + unsigned long bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (ds_get_bts_size(ds) <= 0) + return -ENXIO; + + bts = get_bts_index(ds); + + memset((void *)bts, 0, ds_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; + + case BTS_BRANCH: + set_from_ip((void *)bts, in->variant.lbr.from_ip); + set_to_ip((void *)bts, in->variant.lbr.to_ip); + break; + + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); + set_info_type((void *)bts, in->qualifier); + set_info_data((void *)bts, in->variant.jiffies); + break; + + default: + return -EINVAL; + } + + bts = bts + ds_cfg.sizeof_bts; + if (bts >= get_bts_absolute_maximum(ds)) + bts = get_bts_buffer_base(ds); + set_bts_index(ds, bts); + + return ds_cfg.sizeof_bts; +} + +unsigned long ds_debugctl_mask(void) +{ + return ds_cfg.debugctl_mask; +} + +#ifdef __i386__ +static const struct ds_configuration ds_cfg_netburst = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<2)|(1<<3) +}; + +static const struct ds_configuration ds_cfg_pentium_m = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<6)|(1<<7) +}; +#endif /* _i386_ */ + +static const struct ds_configuration ds_cfg_core2 = { + .sizeof_ds = 9 * 8, + .bts_buffer_base = { 0, 8 }, + .bts_index = { 8, 8 }, + .bts_absolute_maximum = { 16, 8 }, + .bts_interrupt_threshold = { 24, 8 }, + .sizeof_bts = 3 * 8, + .from_ip = { 0, 8 }, + .to_ip = { 8, 8 }, + .info_type = { 8, 1 }, + .info_data = { 16, 8 }, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void +ds_configure(const struct ds_configuration *cfg) +{ + ds_cfg = *cfg; +} + +void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { +#ifdef __i386__ + case 0xD: + case 0xE: /* Pentium M */ + ds_configure(&ds_cfg_pentium_m); + break; +#endif /* _i386_ */ + case 0xF: /* Core2 */ + ds_configure(&ds_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { +#ifdef __i386__ + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + ds_configure(&ds_cfg_netburst); + break; +#endif /* _i386_ */ + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 18f500d185a..4e16ef4a265 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -7,7 +7,6 @@ #include <linux/kexec.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/suspend.h> @@ -17,11 +16,6 @@ #include <asm/e820.h> #include <asm/setup.h> -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - struct e820map e820; struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000; EXPORT_SYMBOL(pci_mem_start); #endif extern int user_defined_memmap; -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; static struct resource system_rom_resource = { .name = "System ROM", @@ -111,60 +85,6 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) @@ -260,10 +180,9 @@ static void __init probe_roms(void) * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource, } } -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, - &data_resource, &bss_resource); - else - legacy_init_iomem_resources(&code_resource, - &data_resource, &bss_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not @@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start, { int x; - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } + x = e820.nr_map; - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; } /* add_memory_region */ /* @@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) } /* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* * Find the highest page frame number we have available */ void __init find_max_pfn(void) @@ -628,11 +493,6 @@ void __init find_max_pfn(void) int i; max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; @@ -650,34 +510,12 @@ void __init find_max_pfn(void) } /* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* * Register fully available low RAM pages with the bootmem allocator. */ void __init register_bootmem_low_pages(unsigned long max_low_pfn) { int i; - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size; /* @@ -785,56 +623,12 @@ void __init print_memory_map(char *who) } } -static __init __always_inline void efi_limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - efi_memory_desc_t *md, *next_md; - void *p, *p1; - int i, j; - - j = 0; - p1 = memmap.map; - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - next_md = p1; - current_addr = md->phys_addr + - PFN_PHYS(md->num_pages); - if (is_available_memory(md)) { - if (md->phys_addr >= size) continue; - memcpy(next_md, md, memmap.desc_size); - if (current_addr >= size) { - next_md->num_pages -= - PFN_UP(current_addr-size); - } - p1 += memmap.desc_size; - next_md = p1; - j++; - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == - EFI_MEMORY_RUNTIME) { - /* In order to make runtime services - * available we have to include runtime - * memory regions in memory map */ - memcpy(next_md, md, memmap.desc_size); - p1 += memmap.desc_size; - next_md = p1; - j++; - } - } - memmap.nr_map = j; - memmap.map_end = memmap.map + - (memmap.nr_map * memmap.desc_size); -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; print_memory_map("limit_regions start"); - if (efi_enabled) { - efi_limit_regions(size); - return; - } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + print_memory_map("modified"); +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 04698e0b056..c617174e896 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -26,80 +26,87 @@ #include <asm/proto.h> #include <asm/setup.h> #include <asm/sections.h> +#include <asm/kdebug.h> struct e820map e820; -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); +unsigned long end_pfn; -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; -/* +/* * Last pfn which the user wants to use. */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource, bss_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image+ramdisk_size; - - if (last >= ramdisk_image && addr < ramdisk_end) { - *addrp = PAGE_ALIGN(ramdisk_end); - return 1; - } - } +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 + +struct early_res { + unsigned long start, end; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { + { 0, PAGE_SIZE }, /* BIOS data page */ +#ifdef CONFIG_SMP + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE }, #endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; + {} +}; + +void __init reserve_early(unsigned long start, unsigned long end) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %lx-%lx to %lx-%lx\n", + start, end, r->start, r->end); } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; +} - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; +void __init early_res_to_bootmem(void) +{ + int i; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + reserve_bootmem_generic(r->start, r->end - r->start); } +} -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; +/* Check for already reserved areas */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + int i; + unsigned long addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = r->end; + changed = 1; + goto again; + } } -#endif - /* XXX ramdisk image here? */ - return 0; -} + return changed; +} /* * This function checks if any part of the range <start,end> is mapped @@ -107,16 +114,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) */ int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +{ int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -127,11 +136,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) continue; /* is the region (part) in overlap with the current region ?*/ @@ -143,65 +155,73 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. */ -void __init e820_reserve_resources(void) +void __init e820_reserve_resources(struct resource *code_resource, + struct resource *data_resource, struct resource *bss_resource) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -219,13 +239,13 @@ void __init e820_reserve_resources(void) request_resource(&iomem_resource, res); if (e820.map[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); - request_resource(res, &bss_resource); + request_resource(res, code_resource); + request_resource(res, data_resource); + request_resource(res, bss_resource); #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) request_resource(res, &crashk_res); @@ -322,9 +342,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* +/* * Add a memory region to the kernel e820 map. - */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -349,9 +369,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -363,28 +381,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -void __init e820_print_map(char *who) +static void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -392,11 +413,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -416,7 +437,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... Sample memory map (w/overlaps): ____22__________________ @@ -458,22 +480,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) + for (i = 0; i < old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) return -1; /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) + for (i = 0; i < 2 * old_nr; i++) change_point[i] = &change_point_list[i]; /* record all known change-points (starting and ending addresses), omitting those that are for empty memory regions */ chgidx = 0; - for (i=0; i < old_nr; i++) { + for (i = 0; i < old_nr; i++) { if (biosmap[i].size != 0) { change_point[chgidx]->addr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -483,75 +506,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; i<overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -566,7 +620,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) @@ -583,18 +637,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); return 0; } -void early_panic(char *msg) +static void early_panic(char *msg) { early_printk(msg); panic(msg); } -void __init setup_memory_region(void) +/* We're not void only for x86 32-bit compat */ +char * __init machine_specific_memory_setup(void) { + char *who = "BIOS-e820"; /* * Try to copy the BIOS-supplied E820-map. * @@ -605,7 +661,10 @@ void __init setup_memory_region(void) if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); + e820_print_map(who); + + /* In case someone cares... */ + return who; } static int __init parse_memopt(char *p) @@ -613,9 +672,9 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; return 0; -} +} early_param("mem", parse_memopt); static int userdef __initdata; @@ -627,9 +686,9 @@ static int __init parse_memmap_opt(char *p) if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is * reset. */ e820_register_active_regions(0, 0, -1UL); @@ -646,6 +705,8 @@ static int __init parse_memmap_opt(char *p) mem_size = memparse(p, &p); if (p == oldp) return -EINVAL; + + userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); @@ -665,11 +726,29 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { + char nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + printk(KERN_INFO "user-defined physical RAM map:\n"); e820_print_map("user"); } } +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + unsigned long pci_mem_start = 0xaeedbabe; EXPORT_SYMBOL(pci_mem_start); @@ -713,8 +792,10 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -727,8 +808,9 @@ __init void e820_setup_gap(void) /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 88bb83ec895..9f51e1ea9e8 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -21,7 +21,33 @@ #include <asm/gart.h> #endif -static void __init via_bugs(void) +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) { #ifdef CONFIG_GART_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && @@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header) #endif /* CONFIG_X86_IO_APIC */ #endif /* CONFIG_ACPI */ -static void __init nvidia_bugs(void) +static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC @@ -72,7 +98,7 @@ static void __init nvidia_bugs(void) } -static void __init ati_bugs(void) +static void __init ati_bugs(int num, int slot, int func) { #ifdef CONFIG_X86_IO_APIC if (timer_over_8254 == 1) { @@ -83,18 +109,67 @@ static void __init ati_bugs(void) #endif } +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) struct chipset { - u16 vendor; - void (*f)(void); + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); }; static struct chipset early_qrk[] __initdata = { - { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, - { PCI_VENDOR_ID_VIA, via_bugs }, - { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, {} }; +static void __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return; + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + return; +} + void __init early_quirks(void) { int num, slot, func; @@ -103,36 +178,8 @@ void __init early_quirks(void) return; /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - int i; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - - for (i = 0; early_qrk[i].f; i++) - if (early_qrk[i].vendor == vendor) { - early_qrk[i].f(); - return; - } - - type = read_pci_config_byte(num, slot, func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } + for (num = 0; num < 32; num++) + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) + check_dev_quirk(num, slot, func); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c new file mode 100644 index 00000000000..1411324a625 --- /dev/null +++ b/arch/x86/kernel/efi.c @@ -0,0 +1,512 @@ +/* + * Common EFI (Extensible Firmware Interface) support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Copied from efi_32.c to eliminate the duplicated code between EFI + * 32/64 support code. --ying 2007-10-26 + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/efi.h> +#include <linux/bootmem.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/time.h> +#include <linux/io.h> +#include <linux/reboot.h> +#include <linux/bcd.h> + +#include <asm/setup.h> +#include <asm/efi.h> +#include <asm/time.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#define EFI_DEBUG 1 +#define PFX "EFI: " + +int efi_enabled; +EXPORT_SYMBOL(efi_enabled); + +struct efi efi; +EXPORT_SYMBOL(efi); + +struct efi_memory_map memmap; + +struct efi efi_phys __initdata; +static efi_system_table_t efi_systab __initdata; + +static int __init setup_noefi(char *arg) +{ + efi_enabled = 0; + return 0; +} +early_param("noefi", setup_noefi); + +static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + return efi_call_virt2(get_time, tm, tc); +} + +static efi_status_t virt_efi_set_time(efi_time_t *tm) +{ + return efi_call_virt1(set_time, tm); +} + +static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + return efi_call_virt3(get_wakeup_time, + enabled, pending, tm); +} + +static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + return efi_call_virt2(set_wakeup_time, + enabled, tm); +} + +static efi_status_t virt_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + return efi_call_virt5(get_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + return efi_call_virt3(get_next_variable, + name_size, name, vendor); +} + +static efi_status_t virt_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, + void *data) +{ + return efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) +{ + return efi_call_virt1(get_next_high_mono_count, count); +} + +static void virt_efi_reset_system(int reset_type, + efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_virt4(reset_system, reset_type, status, + data_size, data); +} + +static efi_status_t virt_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + return efi_call_virt4(set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); +} + +static efi_status_t __init phys_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys4(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + return -1; + } + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + eft.minute = real_minutes; + eft.second = real_seconds; + + status = efi.set_time(&eft); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't write time!\n"); + return -1; + } + return 0; +} + +unsigned long efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + void *p; + int i; + + for (p = memmap.map, i = 0; + p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i = 0; + void *tmp; + +#ifdef CONFIG_X86_32 + efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + memmap.phys_map = (void *) + (boot_params.efi_info.efi_memmap | + ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); +#endif + memmap.nr_map = boot_params.efi_info.efi_memmap_size / + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; + + efi.systab = early_ioremap((unsigned long)efi_phys.systab, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) + printk(KERN_ERR "Couldn't map the EFI system table!\n"); + memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); + early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR "EFI system table signature incorrect!\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_ERR "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater!\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* + * Show what we know for posterity + */ + c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + early_iounmap(tmp, 2); + + printk(KERN_INFO "EFI v%u.%.02u by %s \n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_ioremap( + efi.systab->tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + if (config_tables == NULL) + printk(KERN_ERR "Could not map EFI Configuration Table!\n"); + + printk(KERN_INFO); + for (i = 0; i < efi.systab->nr_tables; i++) { + if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { + efi.mps = config_tables[i].table; + printk(" MPS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_20_TABLE_GUID)) { + efi.acpi20 = config_tables[i].table; + printk(" ACPI 2.0=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_TABLE_GUID)) { + efi.acpi = config_tables[i].table; + printk(" ACPI=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + SMBIOS_TABLE_GUID)) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + HCDP_TABLE_GUID)) { + efi.hcdp = config_tables[i].table; + printk(" HCDP=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UGA_IO_PROTOCOL_GUID)) { + efi.uga = config_tables[i].table; + printk(" UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + runtime = early_ioremap((unsigned long)efi.systab->runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; + } else + printk(KERN_ERR "Could not map the EFI runtime service " + "table!\n"); + early_iounmap(runtime, sizeof(efi_runtime_services_t)); + + /* Map the EFI memory map */ + memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); + if (memmap.map == NULL) + printk(KERN_ERR "Could not map the EFI memory map!\n"); + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) + printk(KERN_WARNING "Kernel-defined memdesc" + "doesn't match the one from EFI!\n"); + + /* Setup for EFI runtime service */ + reboot_type = BOOT_EFI; + +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static void __init runtime_code_page_mkexec(void) +{ + efi_memory_desc_t *md; + unsigned long end; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if (md->type == EFI_RUNTIME_SERVICES_CODE && + (end >> PAGE_SHIFT) <= max_pfn_mapped) { + set_memory_x(md->virt_addr, md->num_pages); + set_memory_uc(md->virt_addr, md->num_pages); + } + } + __flush_tlb_all(); +} +#else +static inline void __init runtime_code_page_mkexec(void) { } +#endif + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + unsigned long end; + void *p; + + efi.systab = NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if ((md->attribute & EFI_MEMORY_WB) && + ((end >> PAGE_SHIFT) <= max_pfn_mapped)) + md->virt_addr = (unsigned long)__va(md->phys_addr); + else + md->virt_addr = (unsigned long) + efi_ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!md->virt_addr) + printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); + if ((md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < end)) + efi.systab = (efi_system_table_t *)(unsigned long) + (md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab); + } + + BUG_ON(!efi.systab); + + status = phys_efi_set_virtual_address_map( + memmap.desc_size * memmap.nr_map, + memmap.desc_size, + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk(KERN_ALERT "Unable to switch EFI into virtual mode " + "(status=%lx)!\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. + * + * Call EFI services through wrapper functions. + */ + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; + efi.set_wakeup_time = virt_efi_set_wakeup_time; + efi.get_variable = virt_efi_get_variable; + efi.get_next_variable = virt_efi_get_next_variable; + efi.set_variable = virt_efi_set_variable; + efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; + efi.reset_system = virt_efi_reset_system; + efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + runtime_code_page_mkexec(); + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; +} + +/* + * Convenience functions to obtain memory types and attributes + */ +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index e2be78f4939..cb91f985b4a 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -20,40 +20,15 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/types.h> -#include <linux/time.h> -#include <linux/spinlock.h> -#include <linux/bootmem.h> #include <linux/ioport.h> -#include <linux/module.h> #include <linux/efi.h> -#include <linux/kexec.h> -#include <asm/setup.h> #include <asm/io.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> #include <asm/tlbflush.h> -#define EFI_DEBUG 0 -#define PFX "EFI: " - -extern efi_status_t asmlinkage efi_call_phys(void *, ...); - -struct efi efi; -EXPORT_SYMBOL(efi); -static struct efi efi_phys; -struct efi_memory_map memmap; - -/* - * We require an early boot_ioremap mapping mechanism initially - */ -extern void * boot_ioremap(unsigned long, unsigned long); - /* * To make EFI call EFI runtime service in physical addressing mode we need * prelog/epilog before/after the invocation to disable interrupt, to @@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long); */ static unsigned long efi_rt_eflags; -static DEFINE_SPINLOCK(efi_rt_lock); static pgd_t efi_bak_pg_dir_pointer[2]; -static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) +void efi_call_phys_prelog(void) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; - spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); /* @@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) /* * After the lock is released, the original page table is restored. */ - local_flush_tlb(); + __flush_tlb_all(); gdt_descr.address = __pa(get_cpu_gdt_table(0)); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -static void efi_call_phys_epilog(void) __releases(efi_rt_lock) +void efi_call_phys_epilog(void) { unsigned long cr4; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); gdt_descr.size = GDT_SIZE - 1; @@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock) /* * After the lock is released, the original page table is restored. */ - local_flush_tlb(); + __flush_tlb_all(); local_irq_restore(efi_rt_eflags); - spin_unlock(&efi_rt_lock); -} - -static efi_status_t -phys_efi_set_virtual_address_map(unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t -phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -inline int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - spin_lock(&efi_rt_lock); - status = efi.get_time(&eft, &cap); - spin_unlock(&efi_rt_lock); - if (status != EFI_SUCCESS) - panic("Ooops, efitime: can't read time!\n"); - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - - eft.minute = real_minutes; - eft.second = real_seconds; - - if (status != EFI_SUCCESS) { - printk("Ooops: efitime: can't read time!\n"); - return -1; - } - return 0; -} -/* - * This is used during kernel init before runtime - * services have been remapped and also during suspend, therefore, - * we'll need to call both in physical and virtual modes. - */ -inline unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - if (efi.get_time) { - /* if we are in virtual mode use remapped function */ - status = efi.get_time(&eft, &cap); - } else { - /* we are in physical mode */ - status = phys_efi_get_time(&eft, &cap); - } - - if (status != EFI_SUCCESS) - printk("Oops: efitime: can't read time status: 0x%lx\n",status); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -int is_available_memory(efi_memory_desc_t * md) -{ - if (!(md->attribute & EFI_MEMORY_WB)) - return 0; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - return 1; - } - return 0; -} - -/* - * We need to map the EFI memory map again after paging_init(). - */ -void __init efi_map_memmap(void) -{ - memmap.map = NULL; - - memmap.map = bt_ioremap((unsigned long) memmap.phys_map, - (memmap.nr_map * memmap.desc_size)); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -/* - * Walks the EFI memory map and calls CALLBACK once for each EFI - * memory descriptor that has memory that is available for kernel use. - */ -void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) -{ - int prev_valid = 0; - struct range { - unsigned long start; - unsigned long end; - } uninitialized_var(prev), curr; - efi_memory_desc_t *md; - unsigned long start, end; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->num_pages == 0) || (!is_available_memory(md))) - continue; - - curr.start = md->phys_addr; - curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); - - if (!prev_valid) { - prev = curr; - prev_valid = 1; - } else { - if (curr.start < prev.start) - printk(KERN_INFO PFX "Unordered memory map\n"); - if (prev.end == curr.start) - prev.end = curr.end; - else { - start = - (unsigned long) (PAGE_ALIGN(prev.start)); - end = (unsigned long) (prev.end & PAGE_MASK); - if ((end > start) - && (*callback) (start, end, arg) < 0) - return; - prev = curr; - } - } - } - if (prev_valid) { - start = (unsigned long) PAGE_ALIGN(prev.start); - end = (unsigned long) (prev.end & PAGE_MASK); - if (end > start) - (*callback) (start, end, arg); - } -} - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - unsigned long num_config_tables; - int i = 0; - - memset(&efi, 0, sizeof(efi) ); - memset(&efi_phys, 0, sizeof(efi_phys)); - - efi_phys.systab = - (efi_system_table_t *)boot_params.efi_info.efi_systab; - memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size/ - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - - efi.systab = (efi_system_table_t *) - boot_ioremap((unsigned long) efi_phys.systab, - sizeof(efi_system_table_t)); - /* - * Verify the EFI Table - */ - if (efi.systab == NULL) - printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR PFX "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Grab some details from the system table - */ - num_config_tables = efi.systab->nr_tables; - config_tables = (efi_config_table_t *)efi.systab->tables; - runtime = efi.systab->runtime; - - /* - * Show what we know for posterity - */ - c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - - printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = (efi_config_table_t *) - boot_ioremap((unsigned long) config_tables, - num_config_tables * sizeof(efi_config_table_t)); - - if (config_tables == NULL) - printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); - - efi.mps = EFI_INVALID_TABLE_ADDR; - efi.acpi = EFI_INVALID_TABLE_ADDR; - efi.acpi20 = EFI_INVALID_TABLE_ADDR; - efi.smbios = EFI_INVALID_TABLE_ADDR; - efi.sal_systab = EFI_INVALID_TABLE_ADDR; - efi.boot_info = EFI_INVALID_TABLE_ADDR; - efi.hcdp = EFI_INVALID_TABLE_ADDR; - efi.uga = EFI_INVALID_TABLE_ADDR; - - for (i = 0; i < num_config_tables; i++) { - if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { - efi.mps = config_tables[i].table; - printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { - efi.acpi20 = config_tables[i].table; - printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { - efi.acpi = config_tables[i].table; - printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { - efi.smbios = config_tables[i].table; - printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { - efi.hcdp = config_tables[i].table; - printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { - efi.uga = config_tables[i].table; - printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - - runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) - runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *) runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - } else - printk(KERN_ERR PFX "Could not map the runtime service table!\n"); - - /* Map the EFI memory map for use until paging_init() */ - memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap, - boot_params.efi_info.efi_memmap_size); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static inline void __init check_range_for_systab(efi_memory_desc_t *md) -{ - if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < md->phys_addr + - ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { - unsigned long addr; - - addr = md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab; - efi.systab = (efi_system_table_t *)addr; - } -} - -/* - * Wrap all the virtual calls in a way that forces the parameters on the stack. - */ - -#define efi_call_virt(f, args...) \ - ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time (efi_time_t *tm) -{ - return efi_call_virt(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt(get_wakeup_time, enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled, - efi_time_t *tm) -{ - return efi_call_virt(set_wakeup_time, enabled, tm); -} - -static efi_status_t virt_efi_get_variable (efi_char16_t *name, - efi_guid_t *vendor, u32 *attr, - unsigned long *data_size, void *data) -{ - return efi_call_virt(get_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_variable (unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt(get_next_variable, name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable (efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, void *data) -{ - return efi_call_virt(set_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count (u32 *count) -{ - return efi_call_virt(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system (int reset_type, efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt(reset_system, reset_type, status, data_size, data); -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. - */ - -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - void *p; - - efi.systab = NULL; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - md->virt_addr = (unsigned long)ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!(unsigned long)md->virt_addr) { - printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", - (unsigned long)md->phys_addr); - } - /* update the virtual address of the EFI system table */ - check_range_for_systab(md); - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk (KERN_ALERT "You are screwed! " - "Unable to switch EFI into virtual mode " - "(status=%lx)\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - */ - - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; -} - -void __init -efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) -{ - struct resource *res; - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > - 0x100000000ULL) - continue; - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (md->type) { - case EFI_RESERVED_TYPE: - res->name = "Reserved Memory"; - break; - case EFI_LOADER_CODE: - res->name = "Loader Code"; - break; - case EFI_LOADER_DATA: - res->name = "Loader Data"; - break; - case EFI_BOOT_SERVICES_DATA: - res->name = "BootServices Data"; - break; - case EFI_BOOT_SERVICES_CODE: - res->name = "BootServices Code"; - break; - case EFI_RUNTIME_SERVICES_CODE: - res->name = "Runtime Service Code"; - break; - case EFI_RUNTIME_SERVICES_DATA: - res->name = "Runtime Service Data"; - break; - case EFI_CONVENTIONAL_MEMORY: - res->name = "Conventional Memory"; - break; - case EFI_UNUSABLE_MEMORY: - res->name = "Unusable Memory"; - break; - case EFI_ACPI_RECLAIM_MEMORY: - res->name = "ACPI Reclaim"; - break; - case EFI_ACPI_MEMORY_NVS: - res->name = "ACPI NVS"; - break; - case EFI_MEMORY_MAPPED_IO: - res->name = "Memory Mapped IO"; - break; - case EFI_MEMORY_MAPPED_IO_PORT_SPACE: - res->name = "Memory Mapped IO Port Space"; - break; - default: - res->name = "Reserved"; - break; - } - res->start = md->phys_addr; - res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) - printk(KERN_ERR PFX "Failed to allocate res %s : " - "0x%llx-0x%llx\n", res->name, - (unsigned long long)res->start, - (unsigned long long)res->end); - /* - * We don't know which region contains kernel data so we try - * it repeatedly and let the resource manager test it. - */ - if (md->type == EFI_CONVENTIONAL_MEMORY) { - request_resource(res, code_resource); - request_resource(res, data_resource); - request_resource(res, bss_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Convenience functions to obtain memory types and attributes - */ - -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->attribute; - } - return 0; } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c new file mode 100644 index 00000000000..4b73992c1e1 --- /dev/null +++ b/arch/x86/kernel/efi_64.c @@ -0,0 +1,134 @@ +/* + * x86_64 specific EFI support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Code to convert EFI to E820 map has been implemented in elilo bootloader + * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table + * is setup appropriately for EFI runtime code. + * - mouli 06/14/2007. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/reboot.h> + +#include <asm/setup.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm/efi.h> + +static pgd_t save_pgd __initdata; +static unsigned long efi_flags __initdata; + +static void __init early_mapping_set_exec(unsigned long start, + unsigned long end, + int executable) +{ + pte_t *kpte; + int level; + + while (start < end) { + kpte = lookup_address((unsigned long)__va(start), &level); + BUG_ON(!kpte); + if (executable) + set_pte(kpte, pte_mkexec(*kpte)); + else + set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ + __supported_pte_mask)); + if (level == 4) + start = (start + PMD_SIZE) & PMD_MASK; + else + start = (start + PAGE_SIZE) & PAGE_MASK; + } +} + +static void __init early_runtime_code_mapping_set_exec(int executable) +{ + efi_memory_desc_t *md; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (md->type == EFI_RUNTIME_SERVICES_CODE) { + unsigned long end; + end = md->phys_addr + (md->num_pages << PAGE_SHIFT); + early_mapping_set_exec(md->phys_addr, end, executable); + } + } +} + +void __init efi_call_phys_prelog(void) +{ + unsigned long vaddress; + + local_irq_save(efi_flags); + early_runtime_code_mapping_set_exec(1); + vaddress = (unsigned long)__va(0x0UL); + save_pgd = *pgd_offset_k(0x0UL); + set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + __flush_tlb_all(); +} + +void __init efi_call_phys_epilog(void) +{ + /* + * After the lock is released, the original page table is restored. + */ + set_pgd(pgd_offset_k(0x0UL), save_pgd); + early_runtime_code_mapping_set_exec(0); + __flush_tlb_all(); + local_irq_restore(efi_flags); +} + +void __init efi_reserve_bootmem(void) +{ + reserve_bootmem_generic((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); +} + +void __iomem * __init efi_ioremap(unsigned long offset, + unsigned long size) +{ + static unsigned pages_mapped; + unsigned long last_addr; + unsigned i, pages; + + last_addr = offset + size - 1; + offset &= PAGE_MASK; + pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + if (pages_mapped + pages > MAX_EFI_IO_PAGES) + return NULL; + + for (i = 0; i < pages; i++) { + __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, + offset, PAGE_KERNEL_EXEC_NOCACHE); + offset += PAGE_SIZE; + pages_mapped++; + } + + return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ + (pages_mapped - pages)); +} diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S new file mode 100644 index 00000000000..99b47d48c9f --- /dev/null +++ b/arch/x86/kernel/efi_stub_64.S @@ -0,0 +1,109 @@ +/* + * Function calling ABI conversion from Linux to EFI for x86_64 + * + * Copyright (C) 2007 Intel Corp + * Bibo Mao <bibo.mao@intel.com> + * Huang Ying <ying.huang@intel.com> + */ + +#include <linux/linkage.h> + +#define SAVE_XMM \ + mov %rsp, %rax; \ + subq $0x70, %rsp; \ + and $~0xf, %rsp; \ + mov %rax, (%rsp); \ + mov %cr0, %rax; \ + clts; \ + mov %rax, 0x8(%rsp); \ + movaps %xmm0, 0x60(%rsp); \ + movaps %xmm1, 0x50(%rsp); \ + movaps %xmm2, 0x40(%rsp); \ + movaps %xmm3, 0x30(%rsp); \ + movaps %xmm4, 0x20(%rsp); \ + movaps %xmm5, 0x10(%rsp) + +#define RESTORE_XMM \ + movaps 0x60(%rsp), %xmm0; \ + movaps 0x50(%rsp), %xmm1; \ + movaps 0x40(%rsp), %xmm2; \ + movaps 0x30(%rsp), %xmm3; \ + movaps 0x20(%rsp), %xmm4; \ + movaps 0x10(%rsp), %xmm5; \ + mov 0x8(%rsp), %rsi; \ + mov %rsi, %cr0; \ + mov (%rsp), %rsp + +ENTRY(efi_call0) + SAVE_XMM + subq $32, %rsp + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call1) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call2) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call3) + SAVE_XMM + subq $32, %rsp + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call4) + SAVE_XMM + subq $32, %rsp + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call5) + SAVE_XMM + subq $48, %rsp + mov %r9, 32(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call6) + SAVE_XMM + mov (%rsp), %rax + mov 8(%rax), %rax + subq $48, %rsp + mov %r9, 32(%rsp) + mov %rax, 40(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index dc7f938e501..be5c31d0488 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -58,7 +58,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). * * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -283,12 +283,12 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(sysenter_entry) +ENTRY(ia32_sysenter_target) CFI_STARTPROC simple CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_esp0(%esp),%esp + movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* * No need to follow this irqs on/off section: the syscall @@ -351,7 +351,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSEXIT + ENABLE_INTERRUPTS_SYSCALL_RET CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -360,7 +360,7 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection -ENDPROC(sysenter_entry) +ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) @@ -583,7 +583,7 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.data +.section .rodata,"a" ENTRY(interrupt) .text @@ -743,7 +743,7 @@ END(device_not_available) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -755,7 +755,7 @@ END(device_not_available) cmpw $__KERNEL_CS,4(%esp); \ jne ok; \ label: \ - movl TSS_sysenter_esp0+offset(%esp),%esp; \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ CFI_DEF_CFA esp, 0; \ CFI_UNDEFINED eip; \ pushfl; \ @@ -768,7 +768,7 @@ label: \ KPROBE_ENTRY(debug) RING0_INT_FRAME - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: @@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) + cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ @@ -882,10 +882,10 @@ ENTRY(native_iret) .previous END(native_iret) -ENTRY(native_irq_enable_sysexit) +ENTRY(native_irq_enable_syscall_ret) sti sysexit -END(native_irq_enable_sysexit) +END(native_irq_enable_syscall_ret) #endif KPROBE_ENTRY(int3) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e70f3881d7e..bea8474744f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -50,6 +50,7 @@ #include <asm/hw_irq.h> #include <asm/page.h> #include <asm/irqflags.h> +#include <asm/paravirt.h> .code64 @@ -57,6 +58,13 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_PARAVIRT +ENTRY(native_irq_enable_syscall_ret) + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq +#endif /* CONFIG_PARAVIRT */ + .macro TRACE_IRQS_IRETQ offset=ARGOFFSET #ifdef CONFIG_TRACE_IRQFLAGS @@ -216,14 +224,21 @@ ENTRY(system_call) CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ - swapgs + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. + */ +ENTRY(system_call_after_swapgs) + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight * and short: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -246,7 +261,7 @@ ret_from_sys_call: sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -260,9 +275,7 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ - movq %gs:pda_oldrsp,%rsp - swapgs - sysretq + ENABLE_INTERRUPTS_SYSCALL_RET CFI_RESTORE_STATE /* Handle reschedules */ @@ -271,7 +284,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -282,7 +295,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -295,7 +308,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -327,7 +340,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args @@ -349,20 +362,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -385,7 +398,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -506,7 +519,7 @@ END(stub_rt_sigreturn) CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f - swapgs + SWAPGS /* irqcount is used to check if a CPU is already on an interrupt stack or not. While this is essentially redundant with preempt_count it is a little cheaper to use a separate counter in the PDA @@ -527,7 +540,7 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq @@ -556,13 +569,13 @@ retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - swapgs + SWAPGS jmp restore_args retint_restore_args: /* return to kernel space */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: */ @@ -570,10 +583,14 @@ retint_restore_args: /* return to kernel space */ restore_args: RESTORE_ARGS 0,8,0 iret_label: +#ifdef CONFIG_PARAVIRT + INTERRUPT_RETURN +#endif +ENTRY(native_iret) iretq .section __ex_table,"a" - .quad iret_label,bad_iret + .quad native_iret, bad_iret .previous .section .fixup,"ax" /* force a signal here? this matches i386 behaviour */ @@ -581,24 +598,24 @@ iret_label: bad_iret: movq $11,%rdi /* SIGSEGV */ TRACE_IRQS_ON - sti - jmp do_exit - .previous - + ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + jmp do_exit + .previous + /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check @@ -606,14 +623,14 @@ retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -731,7 +748,7 @@ END(spurious_interrupt) rdmsr testl %edx,%edx js 1f - swapgs + SWAPGS xorl %ebx,%ebx 1: .if \ist @@ -747,7 +764,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -776,10 +793,10 @@ paranoid_swapgs\trace: .if \trace TRACE_IRQS_IRETQ 0 .endif - swapgs + SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - iretq + INTERRUPT_RETURN paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -794,11 +811,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -807,9 +824,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_ANY) call schedule - cli + DISABLE_INTERRUPTS(CLBR_ANY) .if \trace TRACE_IRQS_OFF .endif @@ -862,7 +879,7 @@ KPROBE_ENTRY(error_entry) testl $3,CS(%rsp) je error_kernelspace error_swapgs: - swapgs + SWAPGS error_sti: movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI @@ -874,7 +891,7 @@ error_sti: error_exit: movl %ebx,%eax RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax @@ -911,12 +928,12 @@ ENTRY(load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 - cli - swapgs + DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ - swapgs + SWAPGS popf CFI_ADJUST_CFA_OFFSET -8 ret @@ -930,7 +947,7 @@ ENDPROC(load_gs_index) .section .fixup,"ax" /* running with kernelgs */ bad_gs: - swapgs /* switch back to user gs */ + SWAPGS /* switch back to user gs */ xorl %eax,%eax movl %eax,%gs jmp 2b diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index ce703e21c91..4ae7b644026 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -24,18 +24,11 @@ #include <acpi/acpi_bus.h> #endif -/* - * which logical CPU number maps to which CPU (physical APIC ID) - * - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +void *x86_cpu_to_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); struct genapic __read_mostly *genapic = &apic_flat; diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index f12d8c5d980..9c7f7d39596 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -1,6 +1,7 @@ /* * AMD Geode southbridge support code * Copyright (C) 2006, Advanced Micro Devices, Inc. + * Copyright (C) 2007, Andres Salomon <dilinger@debian.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public License @@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base); /* === GPIO API === */ -void geode_gpio_set(unsigned int gpio, unsigned int reg) +void geode_gpio_set(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << gpio, base + reg); - else - outl(1 << (gpio - 16), base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl(gpio & 0xFFFF, base + reg); + /* high bank register */ + gpio >>= 16; + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_set); -void geode_gpio_clear(unsigned int gpio, unsigned int reg) +void geode_gpio_clear(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << (gpio + 16), base + reg); - else - outl(1 << gpio, base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl((gpio & 0xFFFF) << 16, base + reg); + /* high bank register */ + gpio &= (0xFFFF << 16); + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_clear); -int geode_gpio_isset(unsigned int gpio, unsigned int reg) +int geode_gpio_isset(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 val; if (!base) return 0; - if (gpio < 16) - return (inl(base + reg) & (1 << gpio)) ? 1 : 0; - else - return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; + /* low bank register */ + if (gpio & 0xFFFF) { + val = inl(base + reg) & (gpio & 0xFFFF); + if ((gpio & 0xFFFF) == val) + return 1; + } + /* high bank register */ + gpio >>= 16; + if (gpio) { + val = inl(base + 0x80 + reg) & gpio; + if (gpio == val) + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(geode_gpio_isset); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6b3469311e4..a317336cdea 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/percpu.h> +#include <linux/start_kernel.h> #include <asm/processor.h> #include <asm/proto.h> @@ -19,12 +20,14 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/e820.h> static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + __flush_tlb_all(); } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data) } } +#define EBDA_ADDR_POINTER 0x40E + +static __init void reserve_ebda(void) +{ + unsigned ebda_addr, ebda_size; + + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + + if (!ebda_addr) + return; + + ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; + + reserve_early(ebda_addr, ebda_addr + ebda_size); +} + void __init x86_64_start_kernel(char * real_mode_data) { int i; @@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); early_printk("Kernel alive\n"); @@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data) pda_init(0); copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif + + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end)); + + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end); + } + + reserve_ebda(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fbad51fce67..5d8c5730686 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -9,6 +9,7 @@ .text #include <linux/threads.h> +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> @@ -151,7 +152,9 @@ WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ ud2a -.data + + __INITDATA + subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ @@ -199,7 +202,6 @@ default_entry: addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ movl %eax, 4092(%edx) - xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -222,6 +224,8 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs +#endif /* CONFIG_SMP */ +3: /* * New page tables may be in 4Mbyte page mode and may @@ -268,12 +272,6 @@ ENTRY(startup_32_smp) wrmsr 6: - /* This is a secondary processor (AP) */ - xorl %ebx,%ebx - incl %ebx - -#endif /* CONFIG_SMP */ -3: /* * Enable paging @@ -297,7 +295,7 @@ ENTRY(startup_32_smp) popfl #ifdef CONFIG_SMP - andl %ebx,%ebx + cmpb $0, ready jz 1f /* Initial CPU cleans BSS */ jmp checkCPUtype 1: @@ -502,6 +500,7 @@ early_fault: call printk #endif #endif + call dump_stack hlt_loop: hlt jmp hlt_loop diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b6167fe3330..1d5a7a36120 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,6 +19,13 @@ #include <asm/msr.h> #include <asm/cache.h> +#ifdef CONFIG_PARAVIRT +#include <asm/asm-offsets.h> +#include <asm/paravirt.h> +#else +#define GET_CR2_INTO_RCX movq %cr2, %rcx +#endif + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE * because we need identity-mapped pages. * @@ -260,14 +267,43 @@ init_rsp: bad_address: jmp bad_address +#ifdef CONFIG_EARLY_PRINTK +.macro early_idt_tramp first, last + .ifgt \last-\first + early_idt_tramp \first, \last-1 + .endif + movl $\last,%esi + jmp early_idt_handler +.endm + + .globl early_idt_handlers +early_idt_handlers: + early_idt_tramp 0, 63 + early_idt_tramp 64, 127 + early_idt_tramp 128, 191 + early_idt_tramp 192, 255 +#endif + ENTRY(early_idt_handler) +#ifdef CONFIG_EARLY_PRINTK cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) + GET_CR2_INTO_RCX + movq %rcx,%r9 + xorl %r8d,%r8d # zero for error code + movl %esi,%ecx # get vector number + # Test %ecx against mask of vectors that push error code. + cmpl $31,%ecx + ja 0f + movl $1,%eax + salq %cl,%rax + testl $0x27d00,%eax + je 0f + popq %r8 # get error code +0: movq 0(%rsp),%rcx # get ip + movq 8(%rsp),%rdx # get cs xorl %eax,%eax - movq 8(%rsp),%rsi # get rip - movq (%rsp),%rdx - movq %cr2,%rcx leaq early_idt_msg(%rip),%rdi call early_printk cmpl $2,early_recursion_flag(%rip) @@ -278,15 +314,19 @@ ENTRY(early_idt_handler) movq 8(%rsp),%rsi # get rip again call __print_symbol #endif +#endif /* EARLY_PRINTK */ 1: hlt jmp 1b + +#ifdef CONFIG_EARLY_PRINTK early_recursion_flag: .long 0 early_idt_msg: - .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" + .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: .asciz "RIP %s\n" +#endif /* CONFIG_EARLY_PRINTK */ .balign PAGE_SIZE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2f99ee206b9..429d084e014 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> -#include <linux/delay.h> #include <asm/fixmap.h> #include <asm/hpet.h> @@ -16,7 +15,8 @@ #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 -/* FSEC = 10^-15 NSEC = 10^-9 */ +/* FSEC = 10^-15 + NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 /* @@ -107,6 +107,7 @@ int is_hpet_enabled(void) { return is_hpet_capable() && hpet_legacy_int_enabled; } +EXPORT_SYMBOL_GPL(is_hpet_enabled); /* * When the hpet driver (/dev/hpet) is enabled, we need to reserve @@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif - hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; hpet_alloc(&hd); - } #else static void hpet_reserve_platform_timers(unsigned long id) { } @@ -478,6 +476,7 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> +#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -492,6 +491,38 @@ static unsigned long hpet_default_delta; static unsigned long hpet_pie_delta; static unsigned long hpet_pie_limit; +static rtc_irq_handler irq_handler; + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + /* * Timer 1 for RTC emulation. We use one shot mode, as periodic mode * is not supported by all HPET implementations for timer 1. @@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void) return 1; } +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); /* * The functions below are called from rtc driver. @@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags &= ~bit_mask; return 1; } +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); int hpet_set_rtc_irq_bit(unsigned long bit_mask) { @@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) return 1; } +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) @@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min, return 1; } +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); int hpet_set_periodic_freq(unsigned long freq) { @@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq) } return 1; } +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); int hpet_rtc_dropped_irq(void) { return is_hpet_enabled(); } +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { @@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) unsigned long rtc_int_flag = 0; hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - rtc_get_rtc_time(&curr_time); + get_rtc_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { @@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); } return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); #endif diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 02112fcc0de..061627806a2 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(empty_zero_page); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c new file mode 100644 index 00000000000..26719bd2c77 --- /dev/null +++ b/arch/x86/kernel/i387.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/regset.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/math_emu.h> +#include <asm/sigcontext.h> +#include <asm/user.h> +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_X86_64 + +#include <asm/sigcontext32.h> +#include <asm/user32.h> + +#else + +#define save_i387_ia32 save_i387 +#define restore_i387_ia32 restore_i387 + +#define _fpstate_ia32 _fpstate +#define user_i387_ia32_struct user_i387_struct +#define user32_fxsr_struct user_fxsr_struct + +#endif + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + +void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + clts(); + if (cpu_has_fxsr) { + memset(¤t->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; + stts(); +} + +#ifdef CONFIG_X86_64 +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __cpuinit fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} +#endif /* CONFIG_X86_64 */ + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +void init_fpu(struct task_struct *tsk) +{ + if (tsk_used_math(tsk)) { + if (tsk == current) + unlazy_fpu(tsk); + return; + } + + if (cpu_has_fxsr) { + memset(&tsk->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + tsk->thread.i387.fxsave.cwd = 0x37f; + if (cpu_has_xmm) + tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT; + } else { + memset(&tsk->thread.i387.fsave, 0, + sizeof(struct i387_fsave_struct)); + tsk->thread.i387.fsave.cwd = 0xffff037fu; + tsk->thread.i387.fsave.swd = 0xffff0000u; + tsk->thread.i387.fsave.twd = 0xffffffffu; + tsk->thread.i387.fsave.fos = 0xffff0000u; + } + /* + * Only the device not available exception or ptrace can call init_fpu. + */ + set_stopped_child_used_math(tsk); +} + +int fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return tsk_used_math(target) ? regset->n : 0; +} + +int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +static void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + if (tsk == current) { + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + asm("mov %%ds,%0" : "=r" (env->fos)); + asm("mov %%cs,%0" : "=r" (env->fcs)); + } else { + struct pt_regs *regs = task_pt_regs(tsk); + env->fos = 0xffff0000 | tsk->thread.ds; + env->fcs = regs->cs; + } +#else + env->fip = fxsave->fip; + env->fcs = fxsave->fcs; + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +static void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct user_i387_ia32_struct env; + + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + return ret; +} + +/* + * Signal frame handlers. + */ + +static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + + unlazy_fpu(tsk); + tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; + if (__copy_to_user(buf, &tsk->thread.i387.fsave, + sizeof(struct i387_fsave_struct))) + return -1; + return 1; +} + +static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + int err = 0; + + unlazy_fpu(tsk); + + convert_from_fxsr(&env, tsk); + if (__copy_to_user(buf, &env, sizeof(env))) + return -1; + + err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + err |= __put_user(X86_FXSR_MAGIC, &buf->magic); + if (err) + return -1; + + if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + return 1; +} + +int save_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + if (!used_math()) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. + */ + clear_used_math(); + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + return save_i387_fxsave(buf); + } else { + return save_i387_fsave(buf); + } + } else { + return fpregs_soft_get(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) ? -1 : 1; + } +} + +static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + clear_fpu(tsk); + return __copy_from_user(&tsk->thread.i387.fsave, buf, + sizeof(struct i387_fsave_struct)); +} + +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + int err; + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + clear_fpu(tsk); + err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct)); + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + if (err || __copy_from_user(&env, buf, sizeof(env))) + return 1; + convert_to_fxsr(tsk, &env); + return 0; +} + +int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + int err; + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + err = restore_i387_fxsave(buf); + } else { + err = restore_i387_fsave(buf); + } + } else { + err = fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + } + set_used_math(); + return err; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = !!used_math(); + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + fpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c deleted file mode 100644 index 7d2e12f6c78..00000000000 --- a/arch/x86/kernel/i387_32.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/sched.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/math_emu.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -#ifdef CONFIG_MATH_EMULATION -#define HAVE_HWFP (boot_cpu_data.hard_math) -#else -#define HAVE_HWFP 1 -#endif - -static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - clts(); - if (cpu_has_fxsr) { - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. - */ -void init_fpu(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - tsk->thread.i387.fxsave.cwd = 0x37f; - if (cpu_has_xmm) - tsk->thread.i387.fxsave.mxcsr = 0x1f80; - } else { - memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); - tsk->thread.i387.fsave.cwd = 0xffff037fu; - tsk->thread.i387.fsave.swd = 0xffff0000u; - tsk->thread.i387.fsave.twd = 0xffffffffu; - tsk->thread.i387.fsave.fos = 0xffff0000u; - } - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(tsk); -} - -/* - * FPU lazy state save handling. - */ - -void kernel_fpu_begin(void) -{ - struct thread_info *thread = current_thread_info(); - - preempt_disable(); - if (thread->status & TS_USEDFPU) { - __save_init_fpu(thread->task); - return; - } - clts(); -} -EXPORT_SYMBOL_GPL(kernel_fpu_begin); - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000u; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for ( i = 0 ; i < 8 ; i++ ) { - if ( twd & 0x1 ) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch ( st->exponent & 0x7fff ) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if ( st->significand[3] & 0x8000 ) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - -/* - * FPU state interaction. - */ - -unsigned short get_fpu_cwd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.cwd; - } else { - return (unsigned short)tsk->thread.i387.fsave.cwd; - } -} - -unsigned short get_fpu_swd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.swd; - } else { - return (unsigned short)tsk->thread.i387.fsave.swd; - } -} - -#if 0 -unsigned short get_fpu_twd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.twd; - } else { - return (unsigned short)tsk->thread.i387.fsave.twd; - } -} -#endif /* 0 */ - -unsigned short get_fpu_mxcsr( struct task_struct *tsk ) -{ - if ( cpu_has_xmm ) { - return tsk->thread.i387.fxsave.mxcsr; - } else { - return 0x1f80; - } -} - -#if 0 - -void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.cwd = cwd; - } else { - tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u); - } -} - -void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.swd = swd; - } else { - tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u); - } -} - -void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); - } else { - tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u); - } -} - -#endif /* 0 */ - -/* - * FXSR floating point environment conversions. - */ - -static int convert_fxsr_to_user( struct _fpstate __user *buf, - struct i387_fxsave_struct *fxsave ) -{ - unsigned long env[7]; - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - - env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; - env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; - env[2] = twd_fxsr_to_i387(fxsave); - env[3] = fxsave->fip; - env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); - env[5] = fxsave->foo; - env[6] = fxsave->fos; - - if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) - return 1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long __user *t = (unsigned long __user *)to; - unsigned long *f = (unsigned long *)from; - - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; - } - return 0; -} - -static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, - struct _fpstate __user *buf ) -{ - unsigned long env[7]; - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - - if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) - return 1; - - fxsave->cwd = (unsigned short)(env[0] & 0xffff); - fxsave->swd = (unsigned short)(env[1] & 0xffff); - fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); - fxsave->fip = env[3]; - fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); - fxsave->fcs = (env[4] & 0xffff); - fxsave->foo = env[5]; - fxsave->fos = env[6]; - - to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long *t = (unsigned long *)to; - unsigned long __user *f = (unsigned long __user *)from; - - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; - } - return 0; -} - -/* - * Signal frame handlers. - */ - -static inline int save_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - - unlazy_fpu( tsk ); - tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; - if ( __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct i387_fsave_struct) ) ) - return -1; - return 1; -} - -static int save_i387_fxsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - int err = 0; - - unlazy_fpu( tsk ); - - if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) - return -1; - - err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); - err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); - if ( err ) - return -1; - - if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct) ) ) - return -1; - return 1; -} - -int save_i387( struct _fpstate __user *buf ) -{ - if ( !used_math() ) - return 0; - - /* This will cause a "finit" to be triggered by the next - * attempted FPU operation by the 'current' process. - */ - clear_used_math(); - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return save_i387_fxsave( buf ); - } else { - return save_i387_fsave( buf ); - } - } else { - return save_i387_soft( ¤t->thread.i387.soft, buf ); - } -} - -static inline int restore_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - clear_fpu( tsk ); - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct i387_fsave_struct) ); -} - -static int restore_i387_fxsave( struct _fpstate __user *buf ) -{ - int err; - struct task_struct *tsk = current; - clear_fpu( tsk ); - err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct) ); - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); -} - -int restore_i387( struct _fpstate __user *buf ) -{ - int err; - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - err = restore_i387_fxsave( buf ); - } else { - err = restore_i387_fsave( buf ); - } - } else { - err = restore_i387_soft( ¤t->thread.i387.soft, buf ); - } - set_used_math(); - return err; -} - -/* - * ptrace request handlers. - */ - -static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return convert_fxsr_to_user( (struct _fpstate __user *)buf, - &tsk->thread.i387.fxsave ); -} - -int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return get_fpregs_fxsave( buf, tsk ); - } else { - return get_fpregs_fsave( buf, tsk ); - } - } else { - return save_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -static inline int set_fpregs_fsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct user_i387_struct) ); -} - -static inline int set_fpregs_fxsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return convert_fxsr_from_user( &tsk->thread.i387.fxsave, - (struct _fpstate __user *)buf ); -} - -int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return set_fpregs_fxsave( tsk, buf ); - } else { - return set_fpregs_fsave( tsk, buf ); - } - } else { - return restore_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - if (__copy_to_user( buf, &tsk->thread.i387.fxsave, - sizeof(struct user_fxsr_struct) )) - return -EFAULT; - return 0; - } else { - return -EIO; - } -} - -int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) -{ - int ret = 0; - - if ( cpu_has_fxsr ) { - if (__copy_from_user( &tsk->thread.i387.fxsave, buf, - sizeof(struct user_fxsr_struct) )) - ret = -EFAULT; - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - } else { - ret = -EIO; - } - return ret; -} - -/* - * FPU state for core dumps. - */ - -static inline void copy_fpu_fsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - memcpy( fpu, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline void copy_fpu_fxsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - unsigned short *to; - unsigned short *from; - int i; - - memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); - - to = (unsigned short *)&fpu->st_space[0]; - from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; - for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { - memcpy( to, from, 5 * sizeof(unsigned short) ); - } -} - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - int fpvalid; - struct task_struct *tsk = current; - - fpvalid = !!used_math(); - if ( fpvalid ) { - unlazy_fpu( tsk ); - if ( cpu_has_fxsr ) { - copy_fpu_fxsave( tsk, fpu ); - } else { - copy_fpu_fsave( tsk, fpu ); - } - } - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - if (cpu_has_fxsr) - copy_fpu_fxsave(tsk, fpu); - else - copy_fpu_fsave(tsk, fpu); - } - return fpvalid; -} - -int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) -{ - int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); - } - return fpvalid; -} diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c deleted file mode 100644 index bfaff28fb13..00000000000 --- a/arch/x86/kernel/i387_64.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 rework 2002 Andi Kleen. - * Does direct fxsave in and out of user space now for signal handlers. - * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, - * the 64bit user space sees a FXSAVE frame directly. - */ - -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned int mask; - clts(); - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. - */ -void __cpuinit fpu_init(void) -{ - unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); - - write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ - - mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); -} - -void init_fpu(struct task_struct *child) -{ - if (tsk_used_math(child)) { - if (child == current) - unlazy_fpu(child); - return; - } - memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - child->thread.i387.fxsave.cwd = 0x37f; - child->thread.i387.fxsave.mxcsr = 0x1f80; - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(child); -} - -/* - * Signal frame handlers. - */ - -int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.i387.fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n",buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *)buf); - if (err) return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * ptrace request handlers. - */ - -int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) -{ - init_fpu(tsk); - return __copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct user_i387_struct)) ? -EFAULT : 0; -} - -int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) -{ - if (__copy_from_user(&tsk->thread.i387.fxsave, buf, - sizeof(struct user_i387_struct))) - return -EFAULT; - return 0; -} - -/* - * FPU state for core dumps. - */ - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); - return 1; -} - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); -} - return fpvalid; -} diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index a42c8074532..ef62b07b2b4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -13,10 +13,17 @@ #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> +#include <asm/hpet.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +#ifdef CONFIG_X86_32 +static void pit_disable_clocksource(void); +#else +static inline void pit_disable_clocksource(void) { } +#endif + /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); + spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */ + outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */ break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: if (evt->mode == CLOCK_EVT_MODE_PERIODIC || evt->mode == CLOCK_EVT_MODE_ONESHOT) { - outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); + outb_pit(0x30, PIT_MODE); + outb_pit(0, PIT_CH0); + outb_pit(0, PIT_CH0); } + pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - outb_p(0x38, PIT_MODE); + pit_disable_clocksource(); + outb_pit(0x38, PIT_MODE); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ break; } - spin_unlock_irqrestore(&i8253_lock, flags); + spin_unlock(&i8253_lock); } /* @@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(delta & 0xff , PIT_CH0); /* LSB */ - outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); + spin_lock(&i8253_lock); + outb_pit(delta & 0xff , PIT_CH0); /* LSB */ + outb_pit(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock(&i8253_lock); return 0; } @@ -148,15 +153,15 @@ static cycle_t pit_read(void) * count), it cannot be newer. */ jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; + outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_pit(PIT_CH0); /* read the latched count */ + count |= inb_pit(PIT_CH0) << 8; /* VIA686a test code... reset the latch if count > max + 1 */ if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff, PIT_CH0); + outb_pit(LATCH >> 8, PIT_CH0); count = LATCH - 1; } @@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = { .shift = 20, }; +static void pit_disable_clocksource(void) +{ + /* + * Use mult to check whether it is registered or not + */ + if (clocksource_pit.mult) { + clocksource_unregister(&clocksource_pit); + clocksource_pit.mult = 0; + } +} + static int __init init_pit_clocksource(void) { - if (num_possible_cpus() > 1) /* PIT does not scale! */ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index 5f3496d0198..2d25b77102f 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,8 +21,6 @@ #include <asm/arch_hooks.h> #include <asm/i8259.h> -#include <io_ports.h> - /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes. @@ -291,20 +289,20 @@ void init_8259A(int auto_eoi) outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ /* - * outb_p - this has to work on a wide range of PC hardware. + * outb_pic - this has to work on a wide range of PC hardware. */ - outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ - outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt @@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0,0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error((void __user *)get_irq_regs()->eip); + math_error((void __user *)get_irq_regs()->ip); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index ba6d57286f5..fa57a156850 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -21,6 +21,7 @@ #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> +#include <asm/i8259.h> /* * Common place to define all x86 IRQ vectors @@ -48,7 +49,7 @@ */ /* - * The IO-APIC gives us many more interrupt sources. Most of these + * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... * sometimes (mostly wrt. hw bugs) we get corrupted vectors all * across the spectrum, so we really want to be prepared to get all @@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) /* for the irq vectors */ -static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = { /* * This contains the irq mask for both 8259A irq controllers, */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) +unsigned int cached_irq_mask = 0xffff; /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) @@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) - ret = inb(0x20) & mask; + ret = inb(PIC_MASTER_CMD) & mask; else - ret = inb(0xA0) & (mask >> 8); + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); spin_unlock_irqrestore(&i8259A_lock, flags); return ret; @@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<<irq; if (irq < 8) { - outb(0x0B,0x20); /* ISR register */ - value = inb(0x20) & irqmask; - outb(0x0A,0x20); /* back to the IRR register */ + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ return value; } - outb(0x0B,0xA0); /* ISR register */ - value = inb(0xA0) & (irqmask >> 8); - outb(0x0A,0xA0); /* back to the IRR register */ + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ return value; } @@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq) handle_real_irq: if (irq & 8) { - inb(0xA1); /* DUMMY - (do we need this?) */ - outb(cached_A1,0xA1); - outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ - outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); } else { - inb(0x21); /* DUMMY - (do we need this?) */ - outb(cached_21,0x21); - outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -270,7 +270,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); @@ -283,51 +284,6 @@ spurious_8259A_irq: } } -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-2 */ - - /* - * outb_p - this has to work on a wide range of PC hardware. - */ - outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ - outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) - outb_p(0x03, 0x21); /* master does Auto EOI */ - else - outb_p(0x01, 0x21); /* master expects normal EOI */ - - outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ - outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ - outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode - is to be investigated) */ - - if (auto_eoi) - /* - * in AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_21, 0x21); /* restore master IRQ mask */ - outb(cached_A1, 0xA1); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ @@ -364,8 +320,8 @@ static int i8259A_shutdown(struct sys_device *dev) * the kernel initialization code can get it * out of. */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ return 0; } @@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + + + + /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -448,7 +456,9 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 468c9c43784..5b3ce793436 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ab77f190546..4ca548632c8 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -35,6 +35,7 @@ #include <linux/htirq.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #include <asm/io.h> #include <asm/smp.h> @@ -48,8 +49,6 @@ #include <mach_apic.h> #include <mach_apicdef.h> -#include "io_ports.h" - int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <asm/processor.h> /* kernel_thread() */ # include <linux/kernel_stat.h> /* kstat */ # include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> /* time_after() */ +# include <linux/timer.h> #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) @@ -727,7 +726,7 @@ late_initcall(balanced_irq_init); #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { unsigned int cfg; @@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; @@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = { .eoi = ack_apic, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -2093,7 +2092,7 @@ static void setup_nmi (void) */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(); apic_printk(APIC_VERBOSE, " done.\n"); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 23a3ac06a23..1627c0d53e0 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -32,9 +32,11 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/dmar.h> +#include <linux/jiffies.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif +#include <linux/bootmem.h> #include <asm/idle.h> #include <asm/io.h> @@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void) #endif /* 0 */ -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; @@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; } @@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq) if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; @@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq) int do_unmask_irq = 0; irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; @@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = { .end = end_lapic_irq, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -1578,7 +1580,7 @@ static void setup_nmi (void) */ printk(KERN_INFO "activating NMI Watchdog ..."); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); printk(" done.\n"); } @@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void) * * FIXME: really need to revamp this for modern platforms only. */ -static inline void check_timer(void) +static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; @@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck); void __init setup_IO_APIC(void) { - enable_IO_APIC(); + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ if (acpi_ioapic) io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ @@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void) } #endif +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); + diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 00000000000..bd49321034d --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,114 @@ +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/dmi.h> +#include <asm/io.h> + +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +EXPORT_SYMBOL_GPL(io_delay_type); + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", + id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c index 4ed48dc8df1..50e5e4a31c8 100644 --- a/arch/x86/kernel/ioport_32.c +++ b/arch/x86/kernel/ioport.c @@ -1,6 +1,6 @@ /* * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. + * by Linus. 32/64 bits code unification by Miguel Botón. */ #include <linux/sched.h> @@ -16,49 +16,27 @@ #include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) { - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? ~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } + unsigned int i; - if (length > 0) { - mask = ~(~0UL << length); + for (i = base; i < base + extent; i++) { if (new_value) - *bitmap_base++ |= mask; + __set_bit(i, bitmap); else - *bitmap_base++ &= ~mask; + __clear_bit(i, bitmap); } } - /* * this changes the io permissions bitmap in the current task. */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - unsigned long i, max_long, bytes, bytes_updated; struct thread_struct * t = ¤t->thread; struct tss_struct * tss; - unsigned long *bitmap; + unsigned int i, max_long, bytes, bytes_updated; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * this is why we delay this operation until now: */ if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) return -ENOMEM; @@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) if (t->io_bitmap_ptr[i] != ~0UL) max_long = i; - bytes = (max_long + 1) * sizeof(long); + bytes = (max_long + 1) * sizeof(unsigned long); bytes_updated = max(bytes, t->io_bitmap_max); t->io_bitmap_max = bytes; +#ifdef CONFIG_X86_32 /* * Sets the lazy trigger so that the next I/O operation will * reload the correct bitmap. @@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; +#else + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); +#endif put_cpu(); @@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. * - * Here we just change the eflags value on the stack: we allow + * Here we just change the flags value on the stack: we allow * only the super-user to do it. This depends on the stack-layout * on system-call entry - see also fork() and the signal handling * code. */ - -asmlinkage long sys_iopl(unsigned long unused) +static int do_iopl(unsigned int level, struct pt_regs *regs) { - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - unsigned int old = (regs->eflags >> 12) & 3; - struct thread_struct *t = ¤t->thread; + unsigned int old = (regs->flags >> 12) & 3; if (level > 3) return -EINVAL; @@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); + + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage long sys_iopl(unsigned long regsp) +{ + struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, regs); + if (rc < 0) + goto out; + t->iopl = level << 12; - regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; set_iopl_mask(t->iopl); - return 0; +out: + return rc; +} +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + return do_iopl(level, regs); } +#endif diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c deleted file mode 100644 index 5f62fad64da..00000000000 --- a/arch/x86/kernel/ioport_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - unsigned int i, max_long, bytes, bytes_updated; - struct thread_struct * t = ¤t->thread; - struct tss_struct * tss; - unsigned long *bitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - } - - /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - /* - * Search for a (possibly new) maximum. This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); - - put_cpu(); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - unsigned int old = (regs->eflags >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); - return 0; -} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d3fde94f734..cef054b09d2 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b5c730d67b..3aac15466a9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,26 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. -AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_rax; + unsigned vector = ~regs->orig_ax; unsigned irq; exit_idle(); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 00000000000..73354302fda --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,65 @@ +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * + * This file is released under the GPLv2. + */ + +#include <linux/debugfs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <asm/setup.h> + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + int error; + struct dentry *dbp, *version, *data; + + dbp = debugfs_create_dir("boot_params", NULL); + if (!dbp) { + error = -ENOMEM; + goto err_return; + } + version = debugfs_create_x16("version", S_IRUGO, dbp, + &boot_params.hdr.version); + if (!version) { + error = -ENOMEM; + goto err_dir; + } + data = debugfs_create_blob("data", S_IRUGO, dbp, + &boot_params_blob); + if (!data) { + error = -ENOMEM; + goto err_version; + } + return 0; +err_version: + debugfs_remove(version); +err_dir: + debugfs_remove(dbp); +err_return: + return error; +} +#endif + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} + +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c new file mode 100644 index 00000000000..a99e764fd66 --- /dev/null +++ b/arch/x86/kernel/kprobes.c @@ -0,0 +1,1066 @@ +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> adapted for x86_64 from i386. + * 2005-Mar Roland McGrath <roland@redhat.com> + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. + * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven + * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> + * unified x86 kprobes code. + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> + +void jprobe_return_end(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#ifdef CONFIG_X86_64 +#define stack_addr(regs) ((unsigned long *)regs->sp) +#else +/* + * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs + * don't save the ss and esp registers if the CPU is already in kernel + * mode when it traps. So for kprobes, regs->sp and regs->ss are not + * the [nonexistent] saved stack pointer and ss register, but rather + * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to + * point to the top of the pre-int3 stack. + */ +#define stack_addr(regs) ((unsigned long *)®s->sp) +#endif + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost. + */ +static const u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 onebyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ + W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ + W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 twobyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ + W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ + W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ + W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes set_jmp_op(void *from, void *to) +{ + struct __arch_jmp_op { + char op; + s32 raddr; + } __attribute__((packed)) * jop; + jop = (struct __arch_jmp_op *)from; + jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); + jop->op = RELATIVEJUMP_INSTRUCTION; +} + +/* + * Check for the REX prefix which can only exist on X86_64 + * X86_32 always returns 0 + */ +static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +{ +#ifdef CONFIG_X86_64 + if ((*insn & 0xf0) == 0x40) + return 1; +#endif + return 0; +} + +/* + * Returns non-zero if opcode is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +static int __kprobes can_boost(kprobe_opcode_t *opcodes) +{ + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; + +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, + (unsigned long *)twobyte_is_boostable); + } + + switch (opcode & 0xf0) { +#ifdef CONFIG_X86_64 + case 0x40: + goto retry; /* REX prefix is boostable */ +#endif + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); + case 0x70: + return 0; /* can't boost conditional jump */ + case 0xc0: + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; + case 0xd0: + /* can boost AA* and XLAT */ + return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); + case 0xe0: + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); + case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ + /* clear and set flags are boostable */ + return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + default: + /* segment override prefixes are boostable */ + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* CS override prefix and call are not boostable */ + return (opcode != 0x2e && opcode != 0x9a); + } +} + +/* + * Returns non-zero if opcode modifies the interrupt flag. + */ +static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + /* + * on X86_64, 0x40-0x4f are REX prefixes so we need to look + * at the next byte instead.. but of course not recurse infinitely + */ + if (is_REX_prefix(insn)) + return is_IF_modifier(++insn); + + return 0; +} + +/* + * Adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * If it does, Return the address of the 32-bit displacement word. + * If not, return null. + * Only applicable to 64-bit x86. + */ +static void __kprobes fix_riprel(struct kprobe *p) +{ +#ifdef CONFIG_X86_64 + u8 *insn = p->ainsn.insn; + s64 disp; + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if (is_REX_prefix(insn)) + ++insn; + + if (*insn == 0x0f) { + /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, + (unsigned long *)twobyte_has_modrm); + } else + /* One-byte opcode. */ + need_modrm = test_bit(*insn, + (unsigned long *)onebyte_has_modrm); + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { + /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + ++insn; + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + disp = (u8 *) p->addr + *((s32 *) insn) - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *(s32 *)insn = (s32) disp; + } + } +#endif +} + +static void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + + fix_riprel(p); + + if (can_boost(p->addr)) + p->ainsn.boostable = 0; + else + p->ainsn.boostable = -1; + + p->opcode = *p->addr; +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + arch_copy_kprobe(p); + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + mutex_lock(&kprobe_mutex); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + mutex_unlock(&kprobe_mutex); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + if (is_IF_modifier(p->ainsn.insn)) + kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; +} + +static void __kprobes clear_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); +} + +static void __kprobes restore_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr); +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + unsigned long *sara = stack_addr(regs); + + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} + +static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (p->ainsn.boostable == 1 && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); + regs->ip = (unsigned long)p->ainsn.insn; + preempt_enable_no_resched(); + return; + } +#endif + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; +} + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: +#ifdef CONFIG_X86_64 + /* TODO: Provide re-entrancy from post_kprobes_handler() and + * avoid exception stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->ip = (unsigned long)p->addr; + reset_current_kprobe(); + preempt_enable_no_resched(); + break; +#endif + case KPROBE_HIT_ACTIVE: + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + break; + case KPROBE_HIT_SS: + if (p == kprobe_running()) { + regs->flags &= ~TF_MASK; + regs->flags |= kcb->kprobe_saved_flags; + return 0; + } else { + /* A probe has been hit in the codepath leading up + * to, or just after, single-stepping of a probed + * instruction. This entire codepath should strictly + * reside in .kprobes.text section. Raise a warning + * to highlight this peculiar case. + */ + } + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + return 1; + } + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing. We conditionally + * re-enable preemption at the end of this function, + * and also in reenter_kprobe() and setup_singlestep(). + */ + preempt_disable(); + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it prepped + * for calling the break_handler below on re-entry + * for jprobe processing, so get out doing nothing + * more here. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb); + return 1; + } + } else if (kprobe_running()) { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + setup_singlestep(p, regs, kcb); + return 1; + } + } /* else: not a kprobe fault; let the kernel handle it */ + + preempt_enable_no_resched(); + return 0; +} + +/* + * When a retprobed function returns, this code saves registers and + * calls trampoline_handler() runs, which calls the kretprobe's handler. + */ +void __kprobes kretprobe_trampoline_holder(void) +{ + asm volatile ( + ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + /* + * Skip cs, ip, orig_ax. + * trampoline_handler() will plug in these values + */ + " subq $24, %rsp\n" + " pushq %rdi\n" + " pushq %rsi\n" + " pushq %rdx\n" + " pushq %rcx\n" + " pushq %rax\n" + " pushq %r8\n" + " pushq %r9\n" + " pushq %r10\n" + " pushq %r11\n" + " pushq %rbx\n" + " pushq %rbp\n" + " pushq %r12\n" + " pushq %r13\n" + " pushq %r14\n" + " pushq %r15\n" + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + " popq %r15\n" + " popq %r14\n" + " popq %r13\n" + " popq %r12\n" + " popq %rbp\n" + " popq %rbx\n" + " popq %r11\n" + " popq %r10\n" + " popq %r9\n" + " popq %r8\n" + " popq %rax\n" + " popq %rcx\n" + " popq %rdx\n" + " popq %rsi\n" + " popq %rdi\n" + /* Skip orig_ax, ip, cs */ + " addq $24, %rsp\n" + " popfq\n" +#else + " pushf\n" + /* + * Skip cs, ip, orig_ax. + * trampoline_handler() will plug in these values + */ + " subl $12, %esp\n" + " pushl %fs\n" + " pushl %ds\n" + " pushl %es\n" + " pushl %eax\n" + " pushl %ebp\n" + " pushl %edi\n" + " pushl %esi\n" + " pushl %edx\n" + " pushl %ecx\n" + " pushl %ebx\n" + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 52(%esp), %edx\n" + " movl %edx, 48(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 52(%esp)\n" + " popl %ebx\n" + " popl %ecx\n" + " popl %edx\n" + " popl %esi\n" + " popl %edi\n" + " popl %ebp\n" + " popl %eax\n" + /* Skip ip, orig_ax, es, ds, fs */ + " addl $20, %esp\n" + " popf\n" +#endif + " ret\n"); +} + +/* + * Called from kretprobe_trampoline + */ +void * __kprobes trampoline_handler(struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + /* fixup registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); +#endif + regs->ip = trampoline_address; + regs->orig_ax = ~0UL; + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more then one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) { + __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->rp->handler(ri, regs); + __get_cpu_var(current_kprobe) = NULL; + } + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + return (void *)orig_ret_address; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new ip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed flags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * If this is the first time we've single-stepped the instruction at + * this probepoint, and the instruction is boostable, boost it: add a + * jump instruction after the copied instruction, that jumps to the next + * instruction after the probepoint. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = stack_addr(regs); + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (is_REX_prefix(insn)) + insn++; + + regs->flags &= ~X86_EFLAGS_TF; + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); + *tos |= kcb->kprobe_old_flags; + break; + case 0xc2: /* iret/ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.boostable = 1; + goto no_change; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_ip + (*tos - copy_ip); + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; +#endif + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; + } else if (((insn[1] & 0x31) == 0x20) || + ((insn[1] & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. And this is boostable + */ + p->ainsn.boostable = 1; + goto no_change; + } + default: + break; + } + + if (p->ainsn.boostable == 0) { + if ((regs->ip > copy_ip) && + (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + set_jmp_op((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); + p->ainsn.boostable = 1; + } else { + p->ainsn.boostable = -1; + } + } + + regs->ip += orig_ip - copy_ip; + +no_change: + restore_btf(); +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. + */ +static int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + regs->flags |= kcb->kprobe_saved_flags; + trace_hardirqs_fixup_flags(regs->flags); + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + + /* + * fixup routine could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode_vm(args->regs)) + return ret; + + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_GPF: + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_sp = stack_addr(regs); + addr = (unsigned long)(kcb->jprobe_saved_sp); + + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->flags &= ~X86_EFLAGS_IF; + trace_hardirqs_off(); + regs->ip = (unsigned long)(jp->entry); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile ( +#ifdef CONFIG_X86_64 + " xchg %%rbx,%%rsp \n" +#else + " xchgl %%ebx,%%esp \n" +#endif + " int3 \n" + " .globl jprobe_return_end\n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_sp):"memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->ip - 1); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && + (addr < (u8 *) jprobe_return_end)) { + if (stack_addr(regs) != kcb->jprobe_saved_sp) { + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; + printk(KERN_ERR + "current sp %p does not match saved sp %p\n", + stack_addr(regs), kcb->jprobe_saved_sp); + printk(KERN_ERR "Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk(KERN_ERR "Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), + kcb->jprobes_stack, + MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c deleted file mode 100644 index 3a020f79f82..00000000000 --- a/arch/x86/kernel/kprobes_32.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston - * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> added function-return probes. - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/preempt.h> -#include <linux/kdebug.h> -#include <asm/cacheflush.h> -#include <asm/desc.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* insert a jmp code */ -static __always_inline void set_jmp_op(void *from, void *to) -{ - struct __arch_jmp_op { - char op; - long raddr; - } __attribute__((packed)) *jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (long)(to) - ((long)(from) + 5); - jop->op = RELATIVEJUMP_INSTRUCTION; -} - -/* - * returns non-zero if opcodes can be boosted. - */ -static __always_inline int can_boost(kprobe_opcode_t *opcodes) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 32)) - /* - * Undefined/reserved opcodes, conditional jump, Opcode Extension - * Groups, and some special opcodes can not be boost. - */ - static const unsigned long twobyte_is_boostable[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ - W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ - W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ - W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ - W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ - W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ - W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ - W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ - W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, twobyte_is_boostable); - } - - switch (opcode & 0xf0) { - case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ - /* clear and set flags can be boost */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); - default: - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ - /* can't boost CS override and call */ - return (opcode != 0x2e && opcode != 0x9a); - } -} - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t opcode) -{ - switch (opcode) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on i386. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) - return -ENOMEM; - - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - p->opcode = *p->addr; - if (can_boost(p->addr)) { - p->ainsn.boostable = 0; - } else { - p->ainsn.boostable = -1; - } - return 0; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags; - kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags; - kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->opcode)) - kcb->kprobe_saved_eflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->eip = (unsigned long)p->addr; - else - regs->eip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)®s->esp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. - */ -static int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr; - struct kprobe_ctlblk *kcb; - - addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_eflags; - goto no_kprobe; - } - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the handler. - * We here save the original kprobes variables and - * just single step on the instruction of the new probe - * without calling any user handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) - if (p->ainsn.boostable == 1 && !p->post_handler){ - /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); - regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); - return 1; - } -#endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void __kprobes kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - " pushf\n" - /* skip cs, eip, orig_eax */ - " subl $12, %esp\n" - " pushl %fs\n" - " pushl %ds\n" - " pushl %es\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* move eflags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" - /* save true return address on eflags */ - " movl %eax, 52(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* skip eip, orig_eax, es, ds, fs */ - " addl $20, %esp\n" - " popf\n" - " ret\n"); -} - -/* - * Called from kretprobe_trampoline - */ -fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - /* fixup registers */ - regs->xcs = __KERNEL_CS | get_kernel_rpl(); - regs->eip = trampoline_address; - regs->orig_eax = 0xffffffff; - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler){ - __get_cpu_var(current_kprobe) = &ri->rp->kp; - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void*)orig_ret_address; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new eip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - * - * This function also checks instruction size for preparing direct execution. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)®s->esp; - unsigned long copy_eip = (unsigned long)p->ainsn.insn; - unsigned long orig_eip = (unsigned long)p->addr; - - regs->eflags &= ~TF_MASK; - switch (p->ainsn.insn[0]) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_eflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- eip is correct */ - /* eip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_eip + (*tos - copy_eip); - break; - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - case 0xff: - if ((p->ainsn.insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; eip is correct. - * But this is not boostable - */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* eip is correct. And this is boostable */ - p->ainsn.boostable = 1; - goto no_change; - } - default: - break; - } - - if (p->ainsn.boostable == 0) { - if ((regs->eip > copy_eip) && - (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - set_jmp_op((void *)regs->eip, - (void *)orig_eip + (regs->eip - copy_eip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - - regs->eip = orig_eip + (regs->eip - copy_eip); - -no_change: - return; -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. - */ -static int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_eflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /*Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the eip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->eip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_eflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs)) - return 1; - - /* - * fixup_exception() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine to for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode_vm(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_esp = ®s->esp; - addr = (unsigned long)(kcb->jprobe_saved_esp); - - /* - * TBD: As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->eip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchgl %%ebx,%%esp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_esp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->eip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (®s->esp != kcb->jprobe_saved_esp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current esp %p does not match saved esp %p\n", - ®s->esp, kcb->jprobe_saved_esp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - return 0; -} - -int __init arch_init_kprobes(void) -{ - return 0; -} diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c deleted file mode 100644 index 5df19a9f923..00000000000 --- a/arch/x86/kernel/kprobes_64.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> adapted for x86_64 - * 2005-Mar Roland McGrath <roland@redhat.com> - * Fixed to handle %rip-relative addressing mode correctly. - * 2005-May Rusty Lynch <rusty.lynch@intel.com> - * Added function return probes functionality - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/preempt.h> -#include <linux/module.h> -#include <linux/kdebug.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); -static void __kprobes arch_copy_kprobe(struct kprobe *p); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) -{ - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) - return 1; - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on x86_64. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) { - return -ENOMEM; - } - arch_copy_kprobe(p); - return 0; -} - -/* - * Determine if the instruction uses the %rip-relative addressing mode. - * If it does, return the address of the 32-bit displacement word. - * If not, return null. - */ -static s32 __kprobes *is_riprel(u8 *insn) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 64)) - static const u64 onebyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ - W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ - W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ - W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ - W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ - W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ - W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ - W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ - W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ - W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ - W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ - W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ - W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ - W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; - static const u64 twobyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ - W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ - W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ - W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ - W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ - W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ - W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ - W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ - W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ - W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ - W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } - - /* Skip REX instruction prefix. */ - if ((*insn & 0xf0) == 0x40) - ++insn; - - if (*insn == 0x0f) { /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, twobyte_has_modrm); - } else { /* One-byte opcode. */ - need_modrm = test_bit(*insn, onebyte_has_modrm); - } - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - return (s32 *) ++insn; - } - } - - /* No %rip-relative addressing mode here. */ - return NULL; -} - -static void __kprobes arch_copy_kprobe(struct kprobe *p) -{ - s32 *ripdisp; - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); - ripdisp = is_riprel(p->ainsn.insn); - if (ripdisp) { - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *ripdisp = disp; - } - p->opcode = *p->addr; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; - kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; - kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kcb->kprobe_saved_rflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; - else - regs->rip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)regs->rsp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); - struct kprobe_ctlblk *kcb; - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_rflags; - goto no_kprobe; - } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* TODO: Provide re-entrancy from - * post_kprobes_handler() and avoid exception - * stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - reset_current_kprobe(); - ret = 1; - } else { - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the - * handler. We here save the original kprobe - * variables and just single step on instruction - * of the new probe without calling any user - * handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->rip = (unsigned long)addr; - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->rip = (unsigned long)addr; - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - "nop\n"); - } - -/* - * Called when we hit the probe point at kretprobe_trampoline - */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - regs->rip = orig_ret_address; - - reset_current_kprobe(); - spin_unlock_irqrestore(&kretprobe_lock, flags); - preempt_enable_no_resched(); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long copy_rip = (unsigned long)p->ainsn.insn; - unsigned long orig_rip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /*skip the REX prefix*/ - if (*insn >= 0x40 && *insn <= 0x4f) - insn++; - - regs->eflags &= ~TF_MASK; - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_rflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_rip + (*tos - copy_rip); - break; - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* call absolute, indirect */ - /* Fix return addr; ip is correct. */ - *tos = orig_rip + (*tos - copy_rip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* ip is correct. */ - goto no_change; - } - default: - break; - } - - regs->rip = orig_rip + (regs->rip - copy_rip); -no_change: - - return; -} - -int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_rflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /* Restore the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - const struct exception_table_entry *fixup; - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the rip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->rip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_rflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - - /* - * fixup() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)(kcb->jprobe_saved_rsp); - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->rip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchg %%rbx,%%rsp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_rsp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, kcb->jprobe_saved_rsp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - -int __init arch_init_kprobes(void) -{ - return register_kprobe(&trampoline_p); -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) - return 1; - - return 0; -} diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c index 9ff90a27c45..8a7660c8394 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt.c @@ -1,6 +1,9 @@ /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. */ #include <linux/errno.h> @@ -9,7 +12,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -17,7 +19,7 @@ #include <asm/desc.h> #include <asm/mmu_context.h> -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +#ifdef CONFIG_SMP static void flush_ldt(void *null) { if (current->active_mm) @@ -27,26 +29,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; + void *oldldt, *newldt; int oldsize; if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL); if (!newldt) return -ENOMEM; if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif pc->ldt = newldt; wmb(); pc->size = mincount; @@ -55,6 +62,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -66,10 +74,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else - kfree(oldldt); + put_page(virt_to_page(oldldt)); } return 0; } @@ -77,9 +85,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); return 0; } @@ -89,7 +98,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - struct mm_struct * old_mm; + struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); @@ -105,33 +114,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) /* * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. */ void destroy_context(struct mm_struct *mm) { if (mm->context.size) { +#ifdef CONFIG_X86_32 + /* CHECKME: Can this ever happen ? */ if (mm == current->active_mm) clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) +#endif + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else - kfree(mm->context.ldt); + put_page(virt_to_page(mm->context.ldt)); mm->context.size = 0; } } -static int read_ldt(void __user * ptr, unsigned long bytecount) +static int read_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; if (!mm->context.size) return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; + size = mm->context.size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -143,7 +157,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) goto error_return; if (size != bytecount) { /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { + if (clear_user(ptr + size, bytecount - size) != 0) { err = -EFAULT; goto error_return; } @@ -153,34 +167,32 @@ error_return: return err; } -static int read_default_ldt(void __user * ptr, unsigned long bytecount) +static int read_default_ldt(void __user *ptr, unsigned long bytecount) { - int err; - unsigned long size; - - err = 0; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; } -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2; + struct mm_struct *mm = current->mm; + struct desc_struct ldt; int error; struct user_desc ldt_info; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; - error = -EFAULT; + error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; @@ -196,28 +208,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); if (error < 0) goto out_unlock; } - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; + memset(&ldt, 0, sizeof(ldt)); goto install; } } - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); + fill_ldt(&ldt, &ldt_info); if (oldmode) - entry_2 &= ~(1 << 20); + ldt.avl = 0; /* Install the new entry ... */ install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); error = 0; out_unlock: @@ -226,7 +237,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) { int ret = -ENOSYS; diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c deleted file mode 100644 index 60e57abb8e9..00000000000 --- a/arch/x86/kernel/ldt_64.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); - load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 11b935f4f88..c1cfd60639d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { - struct Xgt_desc_struct curidt; + struct desc_ptr curidt; /* ia32 supports unaliged loads & stores */ curidt.size = limit; @@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit) static void set_gdt(void *newgdt, __u16 limit) { - struct Xgt_desc_struct curgdt; + struct desc_ptr curgdt; /* ia32 supports unaligned loads & stores */ curgdt.size = limit; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index aa3d2c8f773..a1fef42f8cd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(init_level4_pgt); - -#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3960ab7e149..219f86eb612 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s) } __setup("nomfgpt", mfgpt_disable); +/* Reset the MFGPT timers. This is required by some broken BIOSes which already + * do the same and leave the system in an unstable state. TinyBIOS 0.98 is + * affected at least (0.99 is OK with MFGPT workaround left to off). + */ +static int __init mfgpt_fix(char *s) +{ + u32 val, dummy; + + /* The following udocumented bit resets the MFGPT timers */ + val = 0xFF; dummy = 0; + wrmsr(0x5140002B, val, dummy); + return 1; +} +__setup("mfgptfix", mfgpt_fix); + /* * Check whether any MFGPTs are available for the kernel to use. In most * cases, firmware that uses AMD's VSA code will claim all timers during diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 40cfd548871..6ff447f9fda 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc) return 0; /* check extended signature checksum */ for (i = 0; i < ext_sigcount; i++) { - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; sum = orig_sum - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); @@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu) if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) return 0; - ext_header = (struct extended_sigtable *)(mc + - get_datasize(mc_header) + MC_HEADER_SIZE); + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { if (microcode_update_match(cpu, mc_header, ext_sig->sig, ext_sig->pf)) @@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu) pr_debug("ucode data file %s load failed\n", name); return error; } - buf = (void *)firmware->data; + buf = firmware->data; size = firmware->size; while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0) { diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 7a05a7f6099..67009cdd5ec 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -unsigned int __cpuinitdata num_processors; +unsigned int num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", @@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) mps_oem_check(mpc, oem, str); - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + printk("APIC at: 0x%X\n", mpc->mpc_lapic); - /* + /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. */ @@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) unsigned long *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); @@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) || (mpf->mpf_specification == 4)) ) { smp_found_config = 1; - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); if (mpf->mpf_physptr) { /* @@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) */ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + + mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); } void __init @@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void) } #define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; + static int pci_irq = IRQ_COMPRESSION_START; /* - * Mapping between Global System Interrups, which + * Mapping between Global System Interrupts, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ @@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi_to_irq[gsi]; + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { /* * For PCI devices assign IRQs in order, avoiding gaps * due to unused I/O APIC pins. diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index ef4aab12358..72ab1403fed 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U; EXPORT_SYMBOL(boot_cpu_id); /* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; +unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata + = { [0 ... NR_CPUS-1] = BAD_APICID }; +void *x86_bios_cpu_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* @@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* - * bios_cpu_apicid is required to have processors listed + * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; } - bios_cpu_apicid[cpu] = m->mpc_apicid; - /* - * We get called early in the the start_kernel initialization - * process when the per_cpu data area is not yet setup, so we - * use a static array that is removed after the per_cpu data - * area is created. - */ - if (x86_cpu_to_apicid_ptr) { - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* are we being called early in kernel startup? */ + if (x86_cpu_to_apicid_early_ptr) { + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + + cpu_to_apicid[cpu] = m->mpc_apicid; + bios_cpu_apicid[cpu] = m->mpc_apicid; } else { per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; } cpu_set(cpu, cpu_possible_map); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 4f4bfd3a88b..edd413650b3 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); static int endflag __initdata = 0; +#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. */ static __init void nmi_cpu_busy(void *data) { -#ifdef CONFIG_SMP local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data) care if they get somewhat less cycles. */ while (endflag == 0) mb(); -#endif } +#endif static int __init check_nmi_watchdog(void) { @@ -87,11 +87,13 @@ static int __init check_nmi_watchdog(void) printk(KERN_INFO "Testing NMI watchdog ... "); +#ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; + prev_nmi_count[cpu] = nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks @@ -237,10 +239,10 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } -void setup_apic_nmi_watchdog (void *unused) +void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) - return; + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ @@ -329,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index c3d1476b6a1..fb99484d21c 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -int panic_on_timeout; +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; @@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data) } #endif -int __init check_nmi_watchdog (void) +int __init check_nmi_watchdog(void) { - int *counts; + int *prev_nmi_count; int cpu; - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) return 0; if (!atomic_read(&nmi_active)) return 0; - counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!counts) + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) return -1; - printk(KERN_INFO "testing NMI watchdog ... "); + printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) @@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void) #endif for (cpu = 0; cpu < NR_CPUS; cpu++) - counts[cpu] = cpu_pda(cpu)->__nmi_count; + prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { + if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", - cpu, - counts[cpu], - cpu_pda(cpu)->__nmi_count); + cpu, + prev_nmi_count[cpu], + cpu_pda(cpu)->__nmi_count); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } + endflag = 1; if (!atomic_read(&nmi_active)) { - kfree(counts); + kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - endflag = 1; return -1; } - endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to @@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) nmi_hz = lapic_adjust_nmi_hz(1); - kfree(counts); + kfree(prev_nmi_count); return 0; } -int __init setup_nmi_watchdog(char *str) +static int __init setup_nmi_watchdog(char *str) { int nmi; @@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -217,7 +188,7 @@ static struct sysdev_class nmi_sysclass = { }; static struct sys_device device_lapic_nmi = { - .id = 0, + .id = 0, .cls = &nmi_sysclass, }; @@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + void setup_apic_nmi_watchdog(void *unused) { - if (__get_cpu_var(wd_enabled) == 1) + if (__get_cpu_var(wd_enabled)) return; /* cheap hack to support suspend/resume */ @@ -311,8 +310,9 @@ void touch_nmi_watchdog(void) } } - touch_softlockup_watchdog(); + touch_softlockup_watchdog(); } +EXPORT_SYMBOL(touch_nmi_watchdog); int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { @@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 9000d82c6dc..e65281b1634 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void) { if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } return 0; } diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c index f5000799f8e..075962cc75a 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt.c @@ -14,7 +14,10 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ + #include <linux/errno.h> #include <linux/module.h> #include <linux/efi.h> @@ -55,59 +58,9 @@ char *memory_setup(void) extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") -DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); -DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); -DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); -DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); -DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len) -{ - const unsigned char *start, *end; - unsigned ret; - - switch(type) { -#define SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - start = start_##ops##_##x; \ - end = end_##ops##_##x; \ - goto patch_site - - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, restore_fl); - SITE(pv_irq_ops, save_fl); - SITE(pv_cpu_ops, iret); - SITE(pv_cpu_ops, irq_enable_sysexit); - SITE(pv_mmu_ops, read_cr2); - SITE(pv_mmu_ops, read_cr3); - SITE(pv_mmu_ops, write_cr3); - SITE(pv_cpu_ops, clts); - SITE(pv_cpu_ops, read_tsc); -#undef SITE - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; - - default: - ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); - break; - } - - return ret; -} - unsigned paravirt_patch_nop(void) { return 0; @@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); +extern void native_irq_enable_syscall_ret(void); static int __init print_banner(void) { @@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, mode); + __get_cpu_var(paravirt_lazy_mode) = mode; } void paravirt_leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } void paravirt_enter_lazy_mmu(void) @@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return x86_read_percpu(paravirt_lazy_mode); + return __get_cpu_var(paravirt_lazy_mode); } struct pv_info pv_info = { @@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = { .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = native_read_cr8, + .write_cr8 = native_write_cr8, +#endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, + .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - .write_ldt_entry = write_dt_entry, - .write_gdt_entry = write_dt_entry, - .write_idt_entry = write_dt_entry, - .load_esp0 = native_load_esp0, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_sp0 = native_load_sp0, - .irq_enable_sysexit = native_irq_enable_sysexit, + .irq_enable_syscall_ret = native_irq_enable_syscall_ret, .iret = native_iret, + .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, @@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = { }; struct pv_mmu_ops pv_mmu_ops = { +#ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, @@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = { .kmap_atomic_pte = kmap_atomic, #endif +#if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, - .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, - +#endif + .set_pud = native_set_pud, .pmd_val = native_pmd_val, .make_pmd = native_make_pmd, + +#if PAGETABLE_LEVELS == 4 + .pud_val = native_pud_val, + .make_pud = native_make_pud, + .set_pgd = native_set_pgd, #endif +#endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, .pgd_val = native_pgd_val, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c new file mode 100644 index 00000000000..82fc5fcab4f --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -0,0 +1,49 @@ +#include <asm/paravirt.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_cpu_ops, read_tsc); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c new file mode 100644 index 00000000000..7d904e138d7 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -0,0 +1,57 @@ +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> +#include <linux/stringify.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); +DEF_NATIVE(pv_cpu_ops, iret, "iretq"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +/* the three commands give us more control to how to return from a syscall */ +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_cpu_ops, swapgs); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6bf1f716909..21f34db2c03 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -30,7 +30,6 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/dma-mapping.h> -#include <linux/init.h> #include <linux/bitops.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -183,7 +182,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG -int debugging __read_mostly = 1; +static int debugging = 1; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) @@ -202,7 +201,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, return ~0UL; } #else /* debugging is disabled */ -int debugging __read_mostly = 0; +static int debugging; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index 5552d23d23c..a82473d192a 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -13,7 +13,6 @@ #include <asm/calgary.h> int iommu_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_merge); dma_addr_t bad_dma_address __read_mostly; EXPORT_SYMBOL(bad_dma_address); @@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask); * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter * documentation. */ -__init int iommu_setup(char *p) +static __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 06bcba53604..4d5cc718198 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -1,12 +1,12 @@ /* * Dynamic DMA mapping support for AMD Hammer. - * + * * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. + * with more than 4GB. * * See Documentation/DMA-mapping.txt for the interface specification. - * + * * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU General Public License v2 only. */ @@ -37,23 +37,26 @@ #include <asm/k8.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_size; /* size of remapping area bytes */ static unsigned long iommu_pages; /* .. and in pages */ -static u32 *iommu_gatt_base; /* Remapping table */ +static u32 *iommu_gatt_base; /* Remapping table */ -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ int iommu_fullflush = 1; -/* Allocation bitmap for the remapping area */ +/* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; -static u32 gart_unmapped_entry; +static u32 gart_unmapped_entry; #define GPTE_VALID 1 #define GPTE_COHERENT 2 @@ -61,10 +64,10 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr,size) \ +#define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define EMERGENCY_PAGES 32 /* = 128KB */ +#define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP #define AGPEXTERN extern @@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static int need_flush; /* global flush state. set for each gart wrap */ -static unsigned long alloc_iommu(int size) -{ +static unsigned long alloc_iommu(int size) +{ unsigned long offset, flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap, next_bit, + iommu_pages, size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + offset = find_next_zero_string(iommu_gart_bitmap, 0, + iommu_pages, size); } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { next_bit = 0; need_flush = 1; - } - } + } + } if (iommu_fullflush) need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; -} +} static void free_iommu(unsigned long offset, int size) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} -/* +/* * Use global flush state to avoid races with multiple flushers. */ static void flush_gart(void) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); need_flush = 0; - } + } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} #ifdef CONFIG_IOMMU_LEAK -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; +#define SET_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0);\ + } while (0) + +#define CLEAR_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; \ + } while (0) /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; +static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; + static void dump_leak(void) { int i; - static int dump; - if (dump || !iommu_leak_tab) return; + static int dump; + + if (dump || !iommu_leak_tab) + return; dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); + show_stack(NULL, NULL); + + /* Very crude. dump some from the end of the table too */ + printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", + iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i += 2) { + printk(KERN_DEBUG "%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk(KERN_DEBUG "\n"); } #else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) +# define SET_LEAK(x) +# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) { - /* + /* * Ran out of IOMMU space for this operation. This is very bad. * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and + * Return some non mapped prereserved space in the aperture and * let the Northbridge deal with it. This will result in garbage * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed + * memory corruption will occur or random memory will be DMAed * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); - } - + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } #ifdef CONFIG_IOMMU_LEAK - dump_leak(); + dump_leak(); #endif -} +} -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - if (force_iommu) - mmu = 1; - return mmu; + + if (force_iommu) + mmu = 1; + + return mmu; } -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - return mmu; + + return mmu; } /* Map a single continuous physical area into the IOMMU. @@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) -{ +{ unsigned long npages = to_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(npages); int i; + if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); @@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t gart_map_simple(struct device *dev, char *buf, - size_t size, int dir) +static dma_addr_t +gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; } /* Map a single area into the IOMMU */ -static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t +gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; if (!dev) dev = &fallback_dev; - phys_mem = virt_to_phys(addr); + phys_mem = virt_to_phys(addr); if (!need_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; bus = gart_map_simple(dev, addr, size, dir); - return bus; + + return bus; } /* * Free a DMA mapping. */ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) + size_t size, int direction) { unsigned long iommu_page; int npages; @@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || dma_addr >= iommu_bus_base + iommu_size) return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; npages = to_pages(dma_addr, size); for (i = 0; i < npages; i++) { @@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void +gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { struct scatterlist *s; int i; @@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, for_each_sg(sg, s, nents, i) { unsigned long addr = sg_phys(s); - if (nonforced_iommu(dev, addr, s->length)) { + + if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); - if (addr == bad_dma_address) { - if (i > 0) + if (addr == bad_dma_address) { + if (i > 0) gart_unmap_sg(dev, sg, i, dir); - nents = 0; + nents = 0; sg[0].dma_length = 0; break; } @@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } flush_gart(); + return nents; } /* Map multiple scatterlist entries continuous into the first. */ static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) + struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; + unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - + BUG_ON(s != start && s->offset); if (s == start) { sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; sout->dma_length = s->length; - } else { - sout->dma_length += s->length; + } else { + sout->dma_length += s->length; } addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } - } - BUG_ON(iommu_page - iommu_start != pages); + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; } -static inline int dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, - unsigned long pages, int need) +static inline int +dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, + unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems, } return __dma_map_cont(start, nelems, sout, pages); } - + /* * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. + * Merge chunks that have page aligned sizes into a continuous mapping. */ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - int dir) +static int +gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; - if (nents == 0) + if (nents == 0) return 0; if (!dev) @@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); + s->dma_address = addr; - BUG_ON(s->length == 0); + BUG_ON(s->length == 0); - nextneed = need_iommu(dev, addr, s->length); + nextneed = need_iommu(dev, addr, s->length); /* Handle the previous not yet processed entries */ if (i > start) { - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ if (!iommu_merge || !nextneed || !need || s->offset || (ps->offset + ps->length) % PAGE_SIZE) { if (dma_map_cont(start_sg, i - start, sgmap, @@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); gart_unmap_sg(dev, sg, out, dir); + /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { out = dma_map_sg_nonforce(dev, sg, nents, dir); @@ -444,64 +481,68 @@ error: } if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) s->dma_address = bad_dma_address; return 0; -} +} static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - if (iommu_size < 64*1024*1024) + if (iommu_size < 64*1024*1024) { printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - + "PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + return iommu_size; -} +} -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - unsigned aper_order; - pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x94, &aper_base_32); pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; + aper_order = (aper_order >> 1) & 7; - aper_base = aper_base_32 & 0x7fff; + aper_base = aper_base_32 & 0x7fff; aper_base <<= 25; - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) aper_base = 0; *size = aper_size; return aper_base; -} +} -/* +/* * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. + * AGP driver for some reason. */ static __init int init_k8_gatt(struct agp_kern_info *info) -{ +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; int i; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); @@ -509,75 +550,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { dev = k8_northbridges[i]; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { aper_size = new_aper_size; aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) + } + if (aper_size != new_aper_size || aper_base != new_aper_base) goto nommu; } if (!aper_base) - goto nommu; + goto nommu; info->aper_base = aper_base; - info->aper_size = aper_size>>20; + info->aper_size = aper_size >> 20; - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) panic("Cannot allocate GATT table"); - if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - global_flush_tlb(); - memset(gatt, 0, gatt_size); + memset(gatt, 0, gatt_size); agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - u32 gatt_reg; + u32 gatt_reg; + u32 ctl; dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, 0x90, &ctl); ctl |= 1; ctl &= ~((1<<4) | (1<<5)); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, 0x90, ctl); } flush_gart(); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + + printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); return 0; nommu: - /* Should not happen anymore */ + /* Should not happen anymore */ printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); - return -1; -} + return -1; +} extern int agp_amd64_init(void); static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, - .map_single = gart_map_single, - .map_simple = gart_map_simple, - .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, }; void gart_iommu_shutdown(void) @@ -588,23 +629,23 @@ void gart_iommu_shutdown(void) if (no_agp && (dma_ops != &gart_dma_ops)) return; - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; - dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); - ctl &= ~1; + ctl &= ~1; - pci_write_config_dword(dev, 0x90, ctl); - } + pci_write_config_dword(dev, 0x90, ctl); + } } void __init gart_iommu_init(void) -{ +{ struct agp_kern_info info; - unsigned long aper_size; unsigned long iommu_start; + unsigned long aper_size; unsigned long scratch; long i; @@ -614,14 +655,14 @@ void __init gart_iommu_init(void) } #ifndef CONFIG_AGP_AMD64 - no_agp = 1; + no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || + no_agp = no_agp || + (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); -#endif +#endif if (swiotlb) return; @@ -643,77 +684,78 @@ void __init gart_iommu_init(void) } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } + printk(KERN_DEBUG + "PCI-DMA: Cannot allocate leak trace area\n"); + } #endif - /* + /* * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; + agp_memory_reserved = iommu_size; printk(KERN_INFO "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); + iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; bad_dma_address = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - /* + /* * Unmap the IOMMU part of the GART. The alias of the page is * always mapped with cache enabled and there is no full cache * coherency across the GART remapping. The unmapping avoids * automatic prefetches from the CPU allocating cache lines in * there. All CPU accesses are done via the direct mapping to * the backing memory. The GART address is only used by PCI - * devices. + * devices. */ clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. * Any prefetches that hit unmapped entries won't get an bus abort * then. */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) panic("Cannot allocate iommu scratch page"); gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; flush_gart(); dma_ops = &gart_dma_ops; -} +} void __init gart_parse_options(char *p) { int arg; #ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { + if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; if (*p == '=') ++p; @@ -723,18 +765,18 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush",8)) + if (!strncmp(p, "fullflush", 8)) iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) + if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; - if (!strncmp(p,"noagp",5)) + if (!strncmp(p, "noagp", 5)) no_agp = 1; - if (!strncmp(p, "noaperture",10)) + if (!strncmp(p, "noaperture", 10)) fix_aperture = 0; /* duplicated from pci-dma.c */ - if (!strncmp(p,"force",5)) + if (!strncmp(p, "force", 5)) gart_iommu_aperture_allowed = 1; - if (!strncmp(p,"allowed",7)) + if (!strncmp(p, "allowed", 7)) gart_iommu_aperture_allowed = 1; if (!strncmp(p, "memaper", 7)) { fallback_aper_force = 1; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 102866d729a..82a0a674a00 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -10,7 +10,6 @@ #include <asm/dma.h> int swiotlb __read_mostly; -EXPORT_SYMBOL(swiotlb); const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c index ae8f91214f1..b112406f199 100644 --- a/arch/x86/kernel/pmtimer_64.c +++ b/arch/x86/kernel/pmtimer_64.c @@ -19,13 +19,13 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/cpumask.h> +#include <linux/acpi_pmtmr.h> + #include <asm/io.h> #include <asm/proto.h> #include <asm/msr.h> #include <asm/vsyscall.h> -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - static inline u32 cyc2us(u32 cycles) { /* The Power Management Timer ticks at 3.579545 ticks per microsecond. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 46d391d49de..968371ab223 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> +#include <asm/kdebug.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - return ((unsigned long *)tsk->thread.esp)[3]; + return ((unsigned long *)tsk->thread.sp)[3]; } /* @@ -113,10 +114,19 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { /* loop is done by the caller */ @@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { cpu_relax(); } @@ -188,6 +198,9 @@ void cpu_idle(void) rmb(); idle = pm_idle; + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, 0); + if (!idle) idle = default_idle; @@ -255,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -272,19 +285,37 @@ static void mwait_idle(void) mwait_idle_with_hints(0, 0); } +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_MWAIT)) { - printk("monitor/mwait feature present.\n"); + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } static int __init idle_setup(char *str) @@ -292,10 +323,6 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; -#ifdef CONFIG_X86_SMP - if (smp_num_siblings > 1) - printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); -#endif } else if (!strcmp(str, "mwait")) force_mwait = 1; else @@ -310,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (®s->esp); + sp = (unsigned long) (®s->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -331,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all) init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -369,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, ®s->esp); + show_trace(NULL, regs, ®s->sp, regs->bp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -388,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); @@ -435,7 +462,12 @@ void flush_thread(void) { struct task_struct *tsk = current; - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* @@ -460,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -470,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs,p->thread.gs); + savesegment(gs, p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -491,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, set_tsk_thread_flag(p, TIF_IO_BITMAP); } + err = 0; + /* * Set a new TLS for the child thread? */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); - err = 0; - out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; @@ -529,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, */ void dump_thread(struct pt_regs * regs, struct user * dump) { - int i; + u16 gs; /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs,gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; dump->u_fpvalid = dump_fpu (regs, &dump->i387); } EXPORT_SYMBOL(dump_thread); -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - #ifdef CONFIG_SECCOMP -void hard_disable_TSC(void) +static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } @@ -599,7 +604,7 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } -void hard_enable_TSC(void) +static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } @@ -609,18 +614,32 @@ static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { - struct thread_struct *next; + struct thread_struct *prev, *next; + unsigned long debugctl; + prev = &prev_p->thread; next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + } + + if (next->debugctlmsr != debugctl) + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); } #ifdef CONFIG_SECCOMP @@ -634,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } #endif + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); + + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -687,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * More important, however, is the fact that this allows us much * more flexibility. * - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -710,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Reload esp0. */ - load_esp0(tss, next); + load_sp0(tss, next); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -774,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -783,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs) unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); } @@ -804,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs) */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } /* @@ -815,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs) int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, ®s); if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); } @@ -840,145 +863,37 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = ¤t->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - memset(&info, 0, sizeof(info)); - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ab79e1dfa02..137a86171c3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -3,7 +3,7 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * + * * X86-64 port * Andi Kleen. * @@ -19,19 +19,19 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/fs.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/module.h> #include <linux/a.out.h> #include <linux/interrupt.h> +#include <linux/utsname.h> #include <linux/delay.h> +#include <linux/module.h> #include <linux/ptrace.h> -#include <linux/utsname.h> #include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); void enter_idle(void) { @@ -106,7 +99,7 @@ void exit_idle(void) * We use this if we don't have any better * idle routine.. */ -static void default_idle(void) +void default_idle(void) { current_thread_info()->status &= ~TS_POLLING; /* @@ -116,11 +109,18 @@ static void default_idle(void) smp_mb(); local_irq_disable(); if (!need_resched()) { - /* Enables interrupts one instruction before HLT. - x86 special cases this so there is no race. */ - safe_halt(); - } else - local_irq_enable(); + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -129,54 +129,12 @@ static void default_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); cpu_relax(); } -static void do_nothing(void *unused) -{ -} - -void cpu_idle_wait(void) -{ - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); @@ -207,19 +165,18 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - tick_nohz_stop_sched_tick(); - rmb(); idle = pm_idle; if (!idle) @@ -247,6 +204,47 @@ void cpu_idle (void) } } +static void do_nothing(void *unused) +{ +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map, tmp = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + /* + * We waited 1 sec, if a CPU still did not call idle + * it may be because it is in idle and not waking up + * because it has nothing to do. + * Give all the remaining CPUS a kick. + */ + smp_call_function_mask(map, do_nothing, 0, 0); + } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. @@ -257,13 +255,13 @@ void cpu_idle (void) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -282,25 +280,41 @@ static void mwait_idle(void) } } + +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - static int printed; - if (cpu_has(c, X86_FEATURE_MWAIT)) { + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - if (!printed) { - printk(KERN_INFO "using mwait in idle threads.\n"); - printed = 1; - } + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); @@ -315,13 +329,13 @@ static int __init idle_setup (char *str) } early_param("idle", idle_setup); -/* Prints also some state that isn't saved in the pt_regs */ +/* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; printk("\n"); print_modules(); @@ -330,16 +344,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -379,7 +393,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } /* @@ -390,7 +404,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); kfree(t->io_bitmap_ptr); @@ -426,7 +440,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. */ @@ -449,26 +463,21 @@ void release_thread(struct task_struct *dead_task) static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) { - struct user_desc ud = { + struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; + struct desc_struct *desc = t->thread.tls_array; desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); + fill_ldt(desc, &ud); } static inline u32 read_32bit_tls(struct task_struct *t, int tls) { - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); + return get_desc_base(&t->thread.tls_array[tls]); } /* @@ -480,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -492,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); @@ -520,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); - } + } /* * Set a new TLS for the child thread? @@ -528,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -547,17 +557,30 @@ out: /* * This special macro can be used to load a debugging register */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p, - struct tss_struct *tss) + struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *prev, *next; + unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + } + + if (next->debugctlmsr != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); @@ -581,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); } /* * switch_to(x,y) should switch tasks from x to y. * - * This could still be optimized: + * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * @@ -597,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); /* we're going to use this soon, after a few expensive things */ @@ -607,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0, LDT and the page table pointer: */ - tss->rsp0 = next->rsp0; + load_sp0(tss, next); /* * Switch DS and ES. @@ -666,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); + prev->usersp = read_pda(oldrsp); + write_pda(oldrsp, next->usersp); write_pda(pcurrent, next_p); write_pda(kernelstack, @@ -684,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full @@ -700,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * sys_execve() executes a new program. */ -asmlinkage +asmlinkage long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs regs) { @@ -712,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv, if (IS_ERR(filename)) return error; error = do_execve(filename, argv, envp, ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } putname(filename); return error; } @@ -726,18 +750,18 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_IA32); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that - 32bit childs are affected again. */ + 32bit childs are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -745,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -761,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; - fp = *(u64 *)(p->thread.rsp); + fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; @@ -824,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* Not strictly needed for fs, but do it for symmetry with gs */ if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. */ - if (addr <= 0xffffffff) { + if (addr <= 0xffffffff) { set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); + if (doit) { + load_TLS(&task->thread, cpu); asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; - } else { + } else { task->thread.fsindex = 0; task->thread.fs = addr; if (doit) { @@ -848,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } put_cpu(); break; - case ARCH_GET_FS: { - unsigned long base; + case ARCH_GET_FS: { + unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); else if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; + ret = put_user(base, (unsigned long __user *)addr); + break; } - case ARCH_GET_GS: { + case ARCH_GET_GS: { unsigned long base; unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else @@ -873,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } else base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)addr); break; } default: ret = -EINVAL; break; - } + } - return ret; -} + return ret; +} long sys_arch_prctl(int code, unsigned long addr) { return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; } unsigned long arch_align_stack(unsigned long sp) @@ -914,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 00000000000..96286df1bb8 --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1545 @@ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * BTS tracing + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/regset.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/prctl.h> +#include <asm/proto.h> +#include <asm/ds.h> + +#include "tls.h" + +enum x86_regset { + REGSET_GENERAL, + REGSET_FP, + REGSET_XFP, + REGSET_TLS, +}; + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + */ +#define FLAG_MASK_32 ((unsigned long) \ + (X86_EFLAGS_CF | X86_EFLAGS_PF | \ + X86_EFLAGS_AF | X86_EFLAGS_ZF | \ + X86_EFLAGS_SF | X86_EFLAGS_TF | \ + X86_EFLAGS_DF | X86_EFLAGS_OF | \ + X86_EFLAGS_RF | X86_EFLAGS_AC)) + +/* + * Determines whether a value may be installed in a segment register. + */ +static inline bool invalid_selector(u16 value) +{ + return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); +} + +#ifdef CONFIG_X86_32 + +#define FLAG_MASK FLAG_MASK_32 + +static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); + regno >>= 2; + if (regno > FS) + --regno; + return ®s->bx + regno; +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int retval; + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { + retval = task->thread.gs; + if (task == current) + savesegment(gs, retval); + } + return retval; +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + if (offset != offsetof(struct user_regs_struct, gs)) + *pt_regs_access(task_pt_regs(task), offset) = value; + else { + task->thread.gs = value; + if (task == current) + /* + * The user-mode %gs is not affected by + * kernel entry, so we must update the CPU. + */ + loadsegment(gs, value); + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ + return TASK_SIZE - 3; +} + +#else /* CONFIG_X86_64 */ + +#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); + return ®s->r15 + (offset / sizeof(regs->r15)); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int seg; + + switch (offset) { + case offsetof(struct user_regs_struct, fs): + if (task == current) { + /* Older gas can't assemble movq %?s,%r?? */ + asm("movl %%fs,%0" : "=r" (seg)); + return seg; + } + return task->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + if (task == current) { + asm("movl %%gs,%0" : "=r" (seg)); + return seg; + } + return task->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + if (task == current) { + asm("movl %%ds,%0" : "=r" (seg)); + return seg; + } + return task->thread.ds; + case offsetof(struct user_regs_struct, es): + if (task == current) { + asm("movl %%es,%0" : "=r" (seg)); + return seg; + } + return task->thread.es; + + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + break; + } + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + switch (offset) { + case offsetof(struct user_regs_struct,fs): + /* + * If this is setting fs as for normal 64-bit use but + * setting fs_base has implicitly changed it, leave it. + */ + if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && + task->thread.fs != 0) || + (value == 0 && task->thread.fsindex == FS_TLS_SEL && + task->thread.fs == 0)) + break; + task->thread.fsindex = value; + if (task == current) + loadsegment(fs, task->thread.fsindex); + break; + case offsetof(struct user_regs_struct,gs): + /* + * If this is setting gs as for normal 64-bit use but + * setting gs_base has implicitly changed it, leave it. + */ + if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && + task->thread.gs != 0) || + (value == 0 && task->thread.gsindex == GS_TLS_SEL && + task->thread.gs == 0)) + break; + task->thread.gsindex = value; + if (task == current) + load_gs_index(task->thread.gsindex); + break; + case offsetof(struct user_regs_struct,ds): + task->thread.ds = value; + if (task == current) + loadsegment(ds, task->thread.ds); + break; + case offsetof(struct user_regs_struct,es): + task->thread.es = value; + if (task == current) + loadsegment(es, task->thread.es); + break; + + /* + * Can't actually change these in 64-bit mode. + */ + case offsetof(struct user_regs_struct,cs): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->cs = value; +#endif + break; + case offsetof(struct user_regs_struct,ss): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->ss = value; +#endif + break; + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + return IA32_PAGE_OFFSET - 3; +#endif + return TASK_SIZE64 - 7; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + /* + * When changing the segment base, use do_arch_prctl + * to set either thread.fs or thread.fsindex and the + * corresponding GDT slot. + */ + if (child->thread.fs != value) + return do_arch_prctl(child, ARCH_SET_FS, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ + if (value >= TASK_SIZE_OF(child)) + return -EIO; + if (child->thread.gs != value) + return do_arch_prctl(child, ARCH_SET_GS, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): { + /* + * do_arch_prctl may have used a GDT slot instead of + * the MSR. To userland, it appears the same either + * way, except the %fs segment selector might not be 0. + */ + unsigned int seg = task->thread.fsindex; + if (task->thread.fs != 0) + return task->thread.fs; + if (task == current) + asm("movl %%fs,%0" : "=r" (seg)); + if (seg != FS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[FS_TLS]); + } + case offsetof(struct user_regs_struct, gs_base): { + /* + * Exactly the same here as the %fs handling above. + */ + unsigned int seg = task->thread.gsindex; + if (task->thread.gs != 0) + return task->thread.gs; + if (task == current) + asm("movl %%gs,%0" : "=r" (seg)); + if (seg != GS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[GS_TLS]); + } +#endif + } + + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + unsigned long *k = kbuf; + while (count > 0) { + *k++ = getreg(target, pos); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + unsigned long __user *u = ubuf; + while (count > 0) { + if (__put_user(getreg(target, pos), u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const unsigned long *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count > 0 && !ret) { + unsigned long word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +/* + * This function is trivial and will be inlined by the compiler. + * Having it separates the implementation details of debug + * registers from the interface details of ptrace. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +{ + switch (n) { + case 0: return child->thread.debugreg0; + case 1: return child->thread.debugreg1; + case 2: return child->thread.debugreg2; + case 3: return child->thread.debugreg3; + case 6: return child->thread.debugreg6; + case 7: return child->thread.debugreg7; + } + return 0; +} + +static int ptrace_set_debugreg(struct task_struct *child, + int n, unsigned long data) +{ + int i; + + if (unlikely(n == 4 || n == 5)) + return -EIO; + + if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) + return -EIO; + + switch (n) { + case 0: child->thread.debugreg0 = data; break; + case 1: child->thread.debugreg1 = data; break; + case 2: child->thread.debugreg2 = data; break; + case 3: child->thread.debugreg3 = data; break; + + case 6: + if ((data & ~0xffffffffUL) != 0) + return -EIO; + child->thread.debugreg6 = data; + break; + + case 7: + /* + * Sanity-check data. Take one half-byte at once with + * check = (val >> (16 + 4*i)) & 0xf. It contains the + * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits + * 2 and 3 are LENi. Given a list of invalid values, + * we do mask |= 1 << invalid_value, so that + * (mask >> check) & 1 is a correct test for invalid + * values. + * + * R/Wi contains the type of the breakpoint / + * watchpoint, LENi contains the length of the watched + * data in the watchpoint case. + * + * The invalid values are: + * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] + * - R/Wi == 0x10 (break on I/O reads or writes), so + * mask |= 0x4444. + * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= + * 0x1110. + * + * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. + * + * See the Intel Manual "System Programming Guide", + * 15.2.4 + * + * Note that LENi == 0x10 is defined on x86_64 in long + * mode (i.e. even for 32-bit userspace software, but + * 64-bit kernel), so the x86_64 mask value is 0x5454. + * See the AMD manual no. 24593 (AMD64 System Programming) + */ +#ifdef CONFIG_X86_32 +#define DR7_MASK 0x5f54 +#else +#define DR7_MASK 0x5554 +#endif + data &= ~DR_CONTROL_RESERVED; + for (i = 0; i < 4; i++) + if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); + break; + } + + return 0; +} + +static int ptrace_bts_get_size(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_get_bts_index((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_read_record(struct task_struct *child, + long index, + struct bts_struct __user *out) +{ + struct bts_struct ret; + int retval; + int bts_end; + int bts_index; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + if (index < 0) + return -EINVAL; + + bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); + if (bts_end <= index) + return -EINVAL; + + /* translate the ptrace bts index into the ds bts index */ + bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); + bts_index -= (index + 1); + if (bts_index < 0) + bts_index += bts_end; + + retval = ds_read_bts((void *)child->thread.ds_area_msr, + bts_index, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + + return sizeof(ret); +} + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_clear(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_clear((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_drain(struct task_struct *child, + long size, + struct bts_struct __user *out) +{ + int end, i; + void *ds = (void *)child->thread.ds_area_msr; + + if (!ds) + return -ENXIO; + + end = ds_get_bts_index(ds); + if (end <= 0) + return end; + + if (size < (end * sizeof(struct bts_struct))) + return -EIO; + + for (i = 0; i < end; i++, out++) { + struct bts_struct ret; + int retval; + + retval = ds_read_bts(ds, i, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + } + + ds_clear(ds); + + return end; +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if (ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + +static int ptrace_bts_config(struct task_struct *child, + long cfg_size, + const struct ptrace_bts_config __user *ucfg) +{ + struct ptrace_bts_config cfg; + int bts_size, ret = 0; + void *ds; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + if (copy_from_user(&cfg, ucfg, sizeof(cfg))) + return -EFAULT; + + if ((int)cfg.size < 0) + return -EINVAL; + + bts_size = 0; + ds = (void *)child->thread.ds_area_msr; + if (ds) { + bts_size = ds_get_bts_size(ds); + if (bts_size < 0) + return bts_size; + } + cfg.size = PAGE_ALIGN(cfg.size); + + if (bts_size != cfg.size) { + ret = ptrace_bts_realloc(child, cfg.size, + cfg.flags & PTRACE_BTS_O_CUT_SIZE); + if (ret < 0) + goto errout; + + ds = (void *)child->thread.ds_area_msr; + } + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) + ret = ds_set_overflow(ds, DS_O_SIGNAL); + else + ret = ds_set_overflow(ds, DS_O_WRAP); + if (ret < 0) + goto errout; + + if (cfg.flags & PTRACE_BTS_O_TRACE) + child->thread.debugctlmsr |= ds_debugctl_mask(); + else + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + + if (cfg.flags & PTRACE_BTS_O_SCHED) + set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + else + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + + ret = sizeof(cfg); + +out: + if (child->thread.debugctlmsr) + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + return ret; + +errout: + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + goto out; +} + +static int ptrace_bts_status(struct task_struct *child, + long cfg_size, + struct ptrace_bts_config __user *ucfg) +{ + void *ds = (void *)child->thread.ds_area_msr; + struct ptrace_bts_config cfg; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + memset(&cfg, 0, sizeof(cfg)); + + if (ds) { + cfg.size = ds_get_bts_size(ds); + + if (ds_get_overflow(ds) == DS_O_SIGNAL) + cfg.flags |= PTRACE_BTS_O_SIGNAL; + + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & ds_debugctl_mask()) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; + } + + cfg.bts_size = sizeof(struct bts_struct); + + if (copy_to_user(ucfg, &cfg, sizeof(cfg))) + return -EFAULT; + + return sizeof(cfg); +} + +void ptrace_bts_take_timestamp(struct task_struct *tsk, + enum bts_qualifier qualifier) +{ + struct bts_struct rec = { + .qualifier = qualifier, + .variant.jiffies = jiffies_64 + }; + + ptrace_bts_write_record(tsk, &rec); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif + if (child->thread.ds_area_msr) { + ptrace_bts_realloc(child, 0, 0); + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. */ +#endif + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *) data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; +#endif + + case PTRACE_BTS_CONFIG: + ret = ptrace_bts_config + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_STATUS: + ret = ptrace_bts_status + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_SIZE: + ret = ptrace_bts_get_size(child); + break; + + case PTRACE_BTS_GET: + ret = ptrace_bts_read_record + (child, data, (struct bts_struct __user *) addr); + break; + + case PTRACE_BTS_CLEAR: + ret = ptrace_bts_clear(child); + break; + + case PTRACE_BTS_DRAIN: + ret = ptrace_bts_drain + (child, data, (struct bts_struct __user *) addr); + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <asm/ia32.h> +#include <asm/user32.h> + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count > 0) { + getreg32(target, pos, k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count > 0) { + compat_ulong_t word; + getreg32(target, pos, &word); + if (__put_user(word, u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ + siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); + compat_siginfo_t __user *si32 = compat_ptr(data); + siginfo_t ssi; + int ret; + + if (request == PTRACE_SETSIGINFO) { + memset(&ssi, 0, sizeof(siginfo_t)); + ret = copy_siginfo_from_user32(&ssi, si32); + if (ret) + return ret; + if (copy_to_user(si, &ssi, sizeof(siginfo_t))) + return -EFAULT; + } + ret = sys_ptrace(request, pid, addr, (unsigned long)si); + if (ret) + return ret; + if (request == PTRACE_GETSIGINFO) { + if (copy_from_user(&ssi, si, sizeof(siginfo_t))) + return -EFAULT; + ret = copy_siginfo_to_user32(si32, &ssi); + } + return ret; +} + +asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) +{ + struct task_struct *child; + struct pt_regs *childregs; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_TRACEME: + case PTRACE_ATTACH: + case PTRACE_KILL: + case PTRACE_CONT: + case PTRACE_SINGLESTEP: + case PTRACE_SINGLEBLOCK: + case PTRACE_DETACH: + case PTRACE_SYSCALL: + case PTRACE_OLDSETOPTIONS: + case PTRACE_SETOPTIONS: + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: + return sys_ptrace(request, pid, addr, data); + + default: + return -EINVAL; + + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + case PTRACE_POKEDATA: + case PTRACE_POKETEXT: + case PTRACE_POKEUSR: + case PTRACE_PEEKUSR: + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_SETFPREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPXREGS: + case PTRACE_GETFPXREGS: + case PTRACE_GETEVENTMSG: + break; + + case PTRACE_SETSIGINFO: + case PTRACE_GETSIGINFO: + return ptrace32_siginfo(request, pid, addr, data); + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) + return PTR_ERR(child); + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out; + + childregs = task_pt_regs(child); + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + out: + put_task_struct(child); + return ret; +} + +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_64 + +static const struct user_regset x86_64_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .get = genregs_get, .set = genregs_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset x86_32_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .get = genregs32_get, .set = genregs32_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + }, + [REGSET_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .get = regset_tls_get, .set = regset_tls_set + }, +}; + +static const struct user_regset_view user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +#ifdef CONFIG_X86_32 + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +{ + struct siginfo info; + + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGTRAP; + info.si_code = TRAP_BRKPT; + + /* User-mode ip? */ + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; + + /* Send us the fake SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + +/* notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +__attribute__((regparm(3))) +int do_syscall_trace(struct pt_regs *regs, int entryexit) +{ + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + /* + * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception + */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + int ret = 0; + + /* do the secure computing check first */ + if (!entryexit) + secure_computing(regs->orig_ax); + + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), + regs->ax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } + + if (!(current->ptrace & PT_PTRACED)) + goto out; + + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; + + /* Fake a debug trap */ + if (is_singlestep) + send_sigtrap(current, regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) + goto out; + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + ret = is_sysemu; +out: + if (unlikely(current->audit_context) && !entryexit) + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, + regs->bx, regs->cx, regs->dx, regs->si); + if (ret == 0) + return 0; + + regs->orig_ax = -1; /* force skip of syscall restarting */ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + return 1; +} + +#else /* CONFIG_X86_64 */ + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_ax); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); + + if (unlikely(current->audit_context)) { + if (test_thread_flag(TIF_IA32)) { + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); + } else { + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); + } + } +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c deleted file mode 100644 index ff5431cc03e..00000000000 --- a/arch/x86/kernel/ptrace_32.c +++ /dev/null @@ -1,717 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x00050dd5 - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100 - -/* - * Offset of eflags on child stack.. - */ -#define EFL_OFFSET offsetof(struct pt_regs, eflags) - -static inline struct pt_regs *get_child_regs(struct task_struct *task) -{ - void *stack_top = (void *)task->thread.esp0; - return stack_top - sizeof(struct pt_regs); -} - -/* - * This routine will get a word off of the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - return (*((int *)stack)); -} - -/* - * This routine will put a word on the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - switch (regno >> 2) { - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; - case DS: - case ES: - case FS: - if (value && (value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case SS: - case CS: - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case EFL: - value &= FLAG_MASK; - value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; - break; - } - if (regno > FS*4) - regno -= 1*4; - put_stack_long(child, regno, value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, - unsigned long regno) -{ - unsigned long retval = ~0UL; - - switch (regno >> 2) { - case GS: - retval = child->thread.gs; - break; - case DS: - case ES: - case FS: - case SS: - case CS: - retval = 0xffff; - /* fall through */ - default: - if (regno > FS*4) - regno -= 1*4; - retval &= get_stack_long(child, regno); - } - return retval; -} - -#define LDT_SEGMENT 4 - -static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->eip; - seg = regs->xcs & 0xffff; - if (regs->eflags & VM_MASK) { - addr = (addr & 0xffff) + (seg << 4); - return addr; - } - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - return addr; -} - -static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_eip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: - continue; - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = get_child_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = get_child_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -} - -/* - * Perform get_thread_area on behalf of the traced child. - */ -static int -ptrace_get_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(user_desc, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - -/* - * Perform set_thread_area on behalf of the traced child. - */ -static int -ptrace_set_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - - if (copy_from_user(&info, user_desc, sizeof(info))) - return -EFAULT; - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - - return 0; -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - struct user * dummy = NULL; - int i, ret; - unsigned long __user *datap = (unsigned long __user *)data; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - tmp = 0; /* Default return condition */ - if(addr < FRAME_SIZE*sizeof(long)) - tmp = getreg(child, addr); - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - addr -= (long) &dummy->u_debugreg[0]; - addr = addr >> 2; - tmp = child->thread.debugreg[addr]; - } - ret = put_user(tmp, datap); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - if (addr < FRAME_SIZE*sizeof(long)) { - ret = putreg(child, addr, data); - break; - } - /* We need to be very careful here. We implicitly - want to modify a portion of the task_struct, and we - have to be selective about what portions we allow someone - to modify. */ - - ret = -EIO; - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - - if(addr == (long) &dummy->u_debugreg[4]) break; - if(addr == (long) &dummy->u_debugreg[5]) break; - if(addr < (long) &dummy->u_debugreg[4] && - ((unsigned long) data) >= TASK_SIZE-3) break; - - /* Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System - * Programming)*/ - - if(addr == (long) &dummy->u_debugreg[7]) { - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - goto out_tsk; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - } - addr -= (long) &dummy->u_debugreg; - addr = addr >> 2; - child->thread.debugreg[addr] = data; - ret = 0; - } - break; - - case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSEMU) { - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } else if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - } else { - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - - if (request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __put_user(getreg(child, i), datap); - datap++; - } - ret = 0; - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __get_user(tmp, datap); - putreg(child, i, tmp); - datap++; - } - ret = 0; - break; - } - - case PTRACE_GETFPREGS: { /* Get the child FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = 0; - if (!tsk_used_math(child)) - init_fpu(child); - get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - set_fpregs(child, (struct user_i387_struct __user *)data); - ret = 0; - break; - } - - case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - if (!tsk_used_math(child)) - init_fpu(child); - ret = get_fpxregs((struct user_fxsr_struct __user *)data, child); - break; - } - - case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data); - break; - } - - case PTRACE_GET_THREAD_AREA: - ret = ptrace_get_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - case PTRACE_SET_THREAD_AREA: - ret = ptrace_set_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - out_tsk: - return ret; -} - -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) -{ - struct siginfo info; - - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; - - /* User-mode eip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; - - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); -} - -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -__attribute__((regparm(3))) -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_eax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), - regs->eax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, - regs->ebx, regs->ecx, regs->edx, regs->esi); - if (ret == 0) - return 0; - - regs->orig_eax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); - return 1; -} diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c deleted file mode 100644 index 607085f3f08..00000000000 --- a/arch/x86/kernel/ptrace_64.c +++ /dev/null @@ -1,621 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 port 2000-2002 Andi Kleen - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (63-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100UL - -/* - * eflags and offset of eflags on child stack.. - */ -#define EFLAGS offsetof(struct pt_regs, eflags) -#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline unsigned long get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.rsp0; - stack += offset; - return (*((unsigned long *)stack)); -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *) task->thread.rsp0; - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -#define LDT_SEGMENT 4 - -unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->rip; - seg = regs->cs & 0xffff; - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - - return addr; -} - -static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_rip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - - /* CHECKME: 64 65 */ - - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf2: case 0xf3: - continue; - - case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) - /* 32-bit mode: register increment */ - return 0; - /* 64-bit mode: REX prefix */ - continue; - - /* CHECKME: f2, f3 */ - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = task_pt_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = task_pt_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - unsigned long tmp; - - switch (regno) { - case offsetof(struct user_regs_struct,fs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.fsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,gs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.gsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ds): - if (value && (value & 3) != 3) - return -EIO; - child->thread.ds = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,es): - if (value && (value & 3) != 3) - return -EIO; - child->thread.es = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ss): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): - value &= FLAG_MASK; - tmp = get_stack_long(child, EFL_OFFSET); - tmp &= ~FLAG_MASK; - value |= tmp; - break; - case offsetof(struct user_regs_struct,cs): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long val; - switch (regno) { - case offsetof(struct user_regs_struct, fs): - return child->thread.fsindex; - case offsetof(struct user_regs_struct, gs): - return child->thread.gsindex; - case offsetof(struct user_regs_struct, ds): - return child->thread.ds; - case offsetof(struct user_regs_struct, es): - return child->thread.es; - case offsetof(struct user_regs_struct, fs_base): - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); - if (test_tsk_thread_flag(child, TIF_IA32)) - val &= 0xffffffff; - return val; - } - -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - long i, ret; - unsigned ui; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): - tmp = child->thread.debugreg0; - break; - case offsetof(struct user, u_debugreg[1]): - tmp = child->thread.debugreg1; - break; - case offsetof(struct user, u_debugreg[2]): - tmp = child->thread.debugreg2; - break; - case offsetof(struct user, u_debugreg[3]): - tmp = child->thread.debugreg3; - break; - case offsetof(struct user, u_debugreg[6]): - tmp = child->thread.debugreg6; - break; - case offsetof(struct user, u_debugreg[7]): - tmp = child->thread.debugreg7; - break; - default: - tmp = 0; - break; - } - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - { - int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ - case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg0 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg1 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg2 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg3 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[6]): - if (data >> 32) - break; - child->thread.debugreg6 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[7]): - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - break; - if (i == 4) { - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - ret = 0; - } - break; - } - break; - } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -#ifdef CONFIG_IA32_EMULATION - /* This makes only sense with 32bit programs. Allow a - 64bit debugger to fully examine them too. Better - don't use it against 64bit processes, use - PTRACE_ARCH_PRCTL instead. */ - case PTRACE_SET_THREAD_AREA: { - struct user_desc __user *p; - int old; - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_set_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - case PTRACE_GET_THREAD_AREA: - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_get_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - } -#endif - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); - data += sizeof(long); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret = __get_user(tmp, (unsigned long __user *) data); - if (ret) - break; - ret = putreg(child, ui, tmp); - if (ret) - break; - data += sizeof(long); - } - break; - } - - case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpregs(child, (struct user_i387_struct __user *)data); - break; - } - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - return ret; -} - -static void syscall_trace(struct pt_regs *regs) -{ - -#if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ - /* do the secure computing check first */ - secure_computing(regs->orig_rax); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); - - if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); - } else { - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); - } - } -} - -asmlinkage void syscall_trace_leave(struct pt_regs *regs) -{ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); - - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index fab30e13483..150ba29a0d3 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -162,6 +162,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); static struct pci_dev *cached_dev; diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c index bb1a0f889c5..5818dc28167 100644 --- a/arch/x86/kernel/reboot_32.c +++ b/arch/x86/kernel/reboot.c @@ -1,64 +1,94 @@ -#include <linux/mm.h> #include <linux/module.h> -#include <linux/delay.h> #include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/efi.h> -#include <linux/dmi.h> -#include <linux/ctype.h> -#include <linux/pm.h> #include <linux/reboot.h> -#include <asm/uaccess.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/efi.h> +#include <acpi/reboot.h> +#include <asm/io.h> #include <asm/apic.h> -#include <asm/hpet.h> #include <asm/desc.h> -#include "mach_reboot.h" +#include <asm/hpet.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#ifdef CONFIG_X86_32 +# include <linux/dmi.h> +# include <linux/ctype.h> +# include <linux/mc146818rtc.h> +# include <asm/pgtable.h> +#else +# include <asm/iommu.h> +#endif + /* * Power off function, if any */ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); +static long no_idt[3]; static int reboot_mode; -static int reboot_thru_bios; +enum reboot_type reboot_type = BOOT_KBD; +int reboot_force; -#ifdef CONFIG_SMP +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + bios Reboot by jumping through the BIOS (only for X86_32) + smp Reboot by executing reset on BSP or other CPU (only for X86_32) + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + acpi Use the RESET_REG in the FADT + efi Use efi reset_system runtime service + force Avoid anything that could hang. + */ static int __init reboot_setup(char *str) { - while(1) { + for (;;) { switch (*str) { - case 'w': /* "warm" reboot (no memory testing etc) */ + case 'w': reboot_mode = 0x1234; break; - case 'c': /* "cold" reboot (with memory testing etc) */ - reboot_mode = 0x0; - break; - case 'b': /* "bios" reboot by jumping through the BIOS */ - reboot_thru_bios = 1; - break; - case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ - reboot_thru_bios = 0; + + case 'c': + reboot_mode = 0; break; + +#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP - case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + case 's': if (isdigit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + /* we will leave sorting out the final value + when we are ready to reboot, since we might not + have set up boot_cpu_id or smp_num_cpu */ break; +#endif /* CONFIG_SMP */ + + case 'b': #endif + case 'a': + case 'k': + case 't': + case 'e': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; } - if((str = strchr(str,',')) != NULL) + + str = strchr(str, ','); + if (str) str++; else break; @@ -68,18 +98,21 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); + +#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) */ /* - * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. + * Some machines require the "reboot=b" commandline option, + * this quirk makes that automatic. */ static int __init set_bios_reboot(const struct dmi_system_id *d) { - if (!reboot_thru_bios) { - reboot_thru_bios = 1; + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); } return 0; @@ -143,7 +176,6 @@ static int __init reboot_init(void) dmi_check_system(reboot_dmi_table); return 0; } - core_initcall(reboot_init); /* The following code and data reboots the machine by switching to real @@ -152,7 +184,6 @@ core_initcall(reboot_init); controller to pulse the CPU reset line, which is more thorough, but doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. */ - static unsigned long long real_mode_gdt_entries [3] = { @@ -161,11 +192,9 @@ real_mode_gdt_entries [3] = 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct Xgt_desc_struct +static struct desc_ptr real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }, -no_idt = { 0, 0 }; - +real_mode_idt = { 0x3ff, 0 }; /* This is 16-bit protected mode code to disable paging and the cache, switch to real mode and jump to the BIOS reset code. @@ -185,7 +214,6 @@ no_idt = { 0, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ - static unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ @@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length) `outb_p' is needed instead of just `outb'. Use it to be on the safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */ - spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); @@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* * Use `swapper_pg_dir' as our page directory. @@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length) boot)". This seems like a fairly standard thing that gets set by REBOOT.COM programs, and the previous reset routine did this too. */ - *((unsigned short *)0x472) = reboot_mode; /* For the switch to real mode, copy some code to low memory. It has @@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length) has to have the same physical and virtual address, because it turns off paging. Copy it near the end of the first page, out of the way of BIOS variables. */ - - memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100), + memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); - memcpy ((void *) (0x1000 - 100), code, length); + memcpy((void *)(0x1000 - 100), code, length); /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); /* Set up a GDT from which we can load segment descriptors for real mode. The GDT is not used in real mode; it is just needed here to prepare the descriptors. */ - load_gdt(&real_mode_gdt); /* Load the data segment registers, and thus the descriptors ready for @@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length) selector value being loaded here. This is so that the segment registers don't have to be reloaded after switching to real mode: the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" @@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length) /* Jump to the 16-bit code that we copied earlier. It disables paging and the cache, switches to real mode, and jumps to the BIOS reset entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" : - : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); + : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -static void native_machine_shutdown(void) +#endif /* CONFIG_X86_32 */ + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +void machine_emergency_restart(void) +{ + int i; + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0xfe, 0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + load_idt((const struct desc_ptr *)&no_idt); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + +#ifdef CONFIG_X86_32 + case BOOT_BIOS: + machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + + reboot_type = BOOT_KBD; + break; +#endif + + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + + + case BOOT_EFI: + if (efi_enabled) + efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + EFI_SUCCESS, 0, NULL); + + reboot_type = BOOT_KBD; + break; + } + } +} + +void machine_shutdown(void) { + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ reboot_cpu_id = 0; +#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_isset(reboot_cpu, cpu_online_map)) { + cpu_isset(reboot_cpu, cpu_online_map)) reboot_cpu_id = reboot_cpu; - } +#endif - /* Make certain the cpu I'm rebooting on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) reboot_cpu_id = smp_processor_id(); - } /* Make certain I only run on the appropriate processor */ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - /* O.K. Now that I'm on the appropriate processor, stop - * all of the others, and disable their local APICs. + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. */ - smp_send_stop(); -#endif /* CONFIG_SMP */ +#endif lapic_shutdown(); #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif + #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif -} -void __attribute__((weak)) mach_reboot_fixups(void) -{ +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif } -static void native_machine_emergency_restart(void) +void machine_restart(char *__unused) { - if (!reboot_thru_bios) { - if (efi_enabled) { - efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - /* rebooting needs to touch the page at absolute addr 0 */ - *((unsigned short *)__va(0x472)) = reboot_mode; - for (;;) { - mach_reboot_fixups(); /* for board specific fixups */ - mach_reboot(); - /* That didn't work - force a triple fault.. */ - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - } - if (efi_enabled) - efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); + printk("machine restart\n"); - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); -} - -static void native_machine_restart(char * __unused) -{ - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); machine_emergency_restart(); } -static void native_machine_halt(void) +void machine_halt(void) { } -static void native_machine_power_off(void) +void machine_power_off(void) { if (pm_power_off) { - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); pm_power_off(); } } - struct machine_ops machine_ops = { - .power_off = native_machine_power_off, - .shutdown = native_machine_shutdown, - .emergency_restart = native_machine_emergency_restart, - .restart = native_machine_restart, - .halt = native_machine_halt, + .power_off = machine_power_off, + .shutdown = machine_shutdown, + .emergency_restart = machine_emergency_restart, + .restart = machine_restart, + .halt = machine_halt }; - -void machine_power_off(void) -{ - machine_ops.power_off(); -} - -void machine_shutdown(void) -{ - machine_ops.shutdown(); -} - -void machine_emergency_restart(void) -{ - machine_ops.emergency_restart(); -} - -void machine_restart(char *cmd) -{ - machine_ops.restart(cmd); -} - -void machine_halt(void) -{ - machine_ops.halt(); -} diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c deleted file mode 100644 index 53620a92a8f..00000000000 --- a/arch/x86/kernel/reboot_64.c +++ /dev/null @@ -1,176 +0,0 @@ -/* Various gunk just to reboot the machine. */ -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/pm.h> -#include <linux/kdebug.h> -#include <linux/sched.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/hw_irq.h> -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/apic.h> -#include <asm/hpet.h> -#include <asm/gart.h> - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -static long no_idt[3]; -static enum { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k' -} reboot_type = BOOT_KBD; -static int reboot_mode = 0; -int reboot_force; - -/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - - case 't': - case 'b': - case 'k': - reboot_type = *str; - break; - case 'f': - reboot_force = 1; - break; - } - if((str = strchr(str,',')) != NULL) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - unsigned long flags; - - /* Stop the cpus and apics */ -#ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); - } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. - */ - smp_send_stop(); -#endif - - local_irq_save(flags); - -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - - disable_IO_APIC(); - -#ifdef CONFIG_HPET_TIMER - hpet_disable(); -#endif - local_irq_restore(flags); - - pci_iommu_shutdown(); -} - -void machine_emergency_restart(void) -{ - int i; - - /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; - - for (;;) { - /* Could also try the reset bit in the Hammer NB */ - switch (reboot_type) { - case BOOT_KBD: - for (i=0; i<10; i++) { - kb_wait(); - udelay(50); - outb(0xfe,0x64); /* pulse reset low */ - udelay(50); - } - - case BOOT_TRIPLE: - load_idt((const struct desc_ptr *)&no_idt); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - } - } -} - -void machine_restart(char * __unused) -{ - printk("machine restart\n"); - - if (!reboot_force) { - machine_shutdown(); - } - machine_emergency_restart(); -} - -void machine_halt(void) -{ -} - -void machine_power_off(void) -{ - if (pm_power_off) { - if (!reboot_force) { - machine_shutdown(); - } - pm_power_off(); - } -} - diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index f452726c0fe..dec0b5ec25c 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev) udelay(50); /* shouldn't get here but be safe and spin a while */ } +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + struct device_fixup { unsigned int vendor; unsigned int device; @@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, }; /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 00000000000..eb9b1a198f5 --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,204 @@ +/* + * RTC related functions + */ +#include <linux/acpi.h> +#include <linux/bcd.h> +#include <linux/mc146818rtc.h> + +#include <asm/time.h> +#include <asm/vsyscall.h> + +#ifdef CONFIG_X86_32 +# define CMOS_YEARS_OFFS 1900 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); +#else +/* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ +# define CMOS_YEARS_OFFS 2000 +#endif + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + * + * BUG: This routine does not handle hour overflow properly; it just + * sets the minutes. Usually you'll only notice that after reboot! + */ +int mach_set_rtc_mmss(unsigned long nowtime) +{ + int retval = 0; + int real_seconds, real_minutes, cmos_minutes; + unsigned char save_control, save_freq_select; + + /* tell the clock it's being set */ + save_control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + + /* stop and reset prescaler */ + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + cmos_minutes = CMOS_READ(RTC_MINUTES); + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + BCD_TO_BIN(cmos_minutes); + + /* + * since we're only adjusting minutes and seconds, + * don't interfere with hour overflow. This avoids + * messing with unknown time zones but requires your + * RTC not to be off by more than 15 minutes + */ + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + /* correct for half hour time zone */ + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + if (abs(real_minutes - cmos_minutes) < 30) { + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); + } + CMOS_WRITE(real_seconds,RTC_SECONDS); + CMOS_WRITE(real_minutes,RTC_MINUTES); + } else { + printk(KERN_WARNING + "set_rtc_mmss: can't update from %d to %d\n", + cmos_minutes, real_minutes); + retval = -1; + } + + /* The following flags have to be released exactly in this order, + * otherwise the DS12887 (popular MC146818A clone with integrated + * battery and quartz) will not reset the oscillator and will not + * update precisely 500 ms later. You won't find this mentioned in + * the Dallas Semiconductor data sheets, but who believes data + * sheets anyway ... -- Markus Kuhn + */ + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + + return retval; +} + +unsigned long mach_get_cmos_time(void) +{ + unsigned int year, mon, day, hour, min, sec, century = 0; + + /* + * If UIP is clear, then we have >= 244 microseconds before + * RTC registers will be updated. Spec sheet says that this + * is the reliable way to read RTC - registers. If UIP is set + * then the register access might be invalid. + */ + while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64) + /* CHECKME: Is this really 64bit only ??? */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + acpi_gbl_FADT.century) + century = CMOS_READ(acpi_gbl_FADT.century); +#endif + + if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) { + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + } + + if (century) { + BCD_TO_BIN(century); + year += century * 100; + printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); + } else { + year += CMOS_YEARS_OFFS; + if (year < 1970) + year += 100; + } + + return mktime(year, mon, day, hour, min, sec); +} + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = set_wallclock(nowtime); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +/* not static: needed by APM */ +unsigned long read_persistent_clock(void) +{ + unsigned long retval, flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = get_wallclock(); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} + +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 3558ac78c92..309366f8f60 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -24,7 +24,11 @@ #include <asm/sections.h> #include <asm/setup.h> +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; @@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + static int do_not_nx __cpuinitdata = 0; /* noexec=on|off @@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ +static void __init setup_per_cpu_maps(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { +#ifdef CONFIG_SMP + if (per_cpu_offset(cpu)) { +#endif + per_cpu(x86_cpu_to_apicid, cpu) = + x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = + x86_bios_cpu_apicid_init[cpu]; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + x86_cpu_to_node_map_init[cpu]; +#endif +#ifdef CONFIG_SMP + } + else + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", + cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + x86_cpu_to_apicid_early_ptr = NULL; + x86_bios_cpu_apicid_early_ptr = NULL; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = NULL; +#endif +} + +/* * Great future plan: * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. * Always point %gs to its beginning @@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void) for_each_cpu_mask (i, cpu_possible_map) { char *ptr; - if (!NODE_DATA(cpu_to_node(i))) { + if (!NODE_DATA(early_cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); cpu_pda(i)->data_offset = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } + + /* setup percpu data maps early */ + setup_per_cpu_maps(); } void pda_init(int cpu) @@ -169,7 +215,8 @@ void syscall_init(void) #endif /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } void __cpuinit check_efer(void) @@ -227,7 +274,7 @@ void __cpuinit cpu_init (void) * and set up the GDT descriptor: */ if (cpu) - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); cpu_gdt_descr[cpu].size = GDT_SIZE; load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); @@ -257,10 +304,10 @@ void __cpuinit cpu_init (void) v, cpu); } estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9c24b45b513..62adc5f20be 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -44,9 +44,12 @@ #include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> +#include <linux/pci.h> +#include <linux/init_ohci1394_dma.h> #include <video/edid.h> +#include <asm/mtrr.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> @@ -67,14 +70,83 @@ address, and must not be in the .bss segment! */ unsigned long init_pg_tables_end __initdata = ~0UL; -int disable_pse __cpuinitdata = 0; - /* * Machine setup.. */ -extern struct resource code_resource; -extern struct resource data_resource; -extern struct resource bss_resource; +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -116,13 +188,17 @@ extern int root_mountflags; unsigned long saved_videomode; -#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 +#define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -166,8 +242,7 @@ static int __init parse_mem(char *arg) return -EINVAL; if (strcmp(arg, "nopentium") == 0) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; + setup_clear_cpu_cap(X86_FEATURE_PSE); } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to @@ -176,7 +251,7 @@ static int __init parse_mem(char *arg) * trim the existing memory map. */ unsigned long long mem_size; - + mem_size = memparse(arg, &arg); limit_regions(mem_size); user_defined_memmap = 1; @@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void) unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void) {} #endif +#ifdef CONFIG_BLK_DEV_INITRD + +static bool do_relocate_initrd = false; + +static void __init reserve_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + + initrd_start = 0; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + if (ramdisk_end < ramdisk_image) { + printk(KERN_ERR "initrd wraps around end of memory, " + "disabling initrd\n"); + return; + } + if (ramdisk_size >= end_of_lowmem/2) { + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + reserve_bootmem(ramdisk_image, ramdisk_size); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + return; + } + + /* We need to move the initrd down into lowmem */ + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + + /* Note: this includes all the lowmem currently occupied by + the initrd, we rely on that fact to keep the data intact. */ + reserve_bootmem(ramdisk_here, ramdisk_size); + initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + + do_relocate_initrd = true; +} + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +static void __init relocate_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + unsigned long slop, clen, mapaddr; + char *p, *q; + + if (!do_relocate_initrd) + return; + + ramdisk_here = initrd_start - PAGE_OFFSET; + + q = (char *)initrd_start; + + /* Copy any lowmem portion of the initrd */ + if (ramdisk_image < end_of_lowmem) { + clen = end_of_lowmem - ramdisk_image; + p = (char *)__va(ramdisk_image); + memcpy(q, p, clen); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + /* Copy the highmem portion of the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_ioremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } +} + +#endif /* CONFIG_BLK_DEV_INITRD */ + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - - if (ramdisk_end <= end_of_lowmem) { - reserve_bootmem(ramdisk_image, ramdisk_size); - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; - } else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; - } - } + reserve_initrd(); #endif + numa_kva_reserve(); reserve_crashkernel(); } @@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); + early_ioremap_init(); - /* - * FIXME: This isn't an official loader_type right - * now but does currently work with elilo. - * If we were configured as an EFI kernel, check to make - * sure that we were loaded correctly from elilo and that - * the system table is valid. If not, then initialize normally. - */ #ifdef CONFIG_EFI - if ((boot_params.hdr.type_of_loader == 0x50) && - boot_params.efi_info.efi_systab) + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) efi_enabled = 1; #endif @@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p) rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif ARCH_SETUP - if (efi_enabled) - efi_init(); - else { - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); - } + + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); copy_edd(); @@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + if (efi_enabled) + efi_init(); + max_low_pfn = setup_memory(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_low_pfn = setup_memory(); + #ifdef CONFIG_VMI /* * Must be after max_low_pfn is determined, and before kernel @@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p) smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + remapped_pgdat_init(); sparse_init(); zone_sizes_init(); @@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ +#ifdef CONFIG_BLK_DEV_INITRD + relocate_initrd(); +#endif + paravirt_post_allocator_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); -#endif - if (efi_enabled) - efi_map_memmap(); +#endif #ifdef CONFIG_ACPI /* @@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif -#ifdef CONFIG_PCI early_quirks(); -#endif #ifdef CONFIG_ACPI acpi_boot_init(); @@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + printk(KERN_INFO "Setting up standard PCI resources\n"); + init_iomem_resources(&code_resource, &data_resource, &bss_resource); + + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 30d94d1d5f5..77fb87bf6e5 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/pci.h> +#include <linux/efi.h> #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> @@ -39,10 +40,13 @@ #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> +#include <linux/uaccess.h> +#include <linux/init_ohci1394_dma.h> #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> +#include <asm/vsyscall.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/msr.h> @@ -50,6 +54,7 @@ #include <video/edid.h> #include <asm/e820.h> #include <asm/dma.h> +#include <asm/gart.h> #include <asm/mpspec.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -59,6 +64,15 @@ #include <asm/sections.h> #include <asm/dmi.h> #include <asm/cacheflush.h> +#include <asm/mce.h> +#include <asm/ds.h> +#include <asm/topology.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ARCH_SETUP +#endif /* * Machine setup.. @@ -67,6 +81,8 @@ struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + unsigned long mmu_cr4_features; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ @@ -76,7 +92,7 @@ unsigned long saved_video_mode; int force_mwait __cpuinitdata; -/* +/* * Early DMI memory */ int dmi_alloc_index; @@ -122,25 +138,27 @@ struct resource standard_io_resources[] = { #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) -struct resource data_resource = { +static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource code_resource = { +static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource bss_resource = { +static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -166,12 +184,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); + panic("Cannot find bootmem map of size %ld\n", bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); -} +} #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -205,7 +223,8 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + free_mem = + ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; ret = parse_crashkernel(boot_command_line, free_mem, &crash_size, &crash_base); @@ -229,33 +248,21 @@ static inline void __init reserve_crashkernel(void) {} #endif -#define EBDA_ADDR_POINTER 0x40E - -unsigned __initdata ebda_addr; -unsigned __initdata ebda_size; - -static void discover_ebda(void) +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) __init memory_setup(void) { - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - ebda_size = *(unsigned short *)__va(ebda_addr); - - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; + machine_specific_memory_setup(); } +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ void __init setup_arch(char **cmdline_p) { + unsigned i; + printk(KERN_INFO "Command line: %s\n", boot_command_line); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -269,7 +276,15 @@ void __init setup_arch(char **cmdline_p) rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif - setup_memory_region(); +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) + efi_enabled = 1; +#endif + + ARCH_SETUP + + memory_setup(); copy_edd(); if (!boot_params.hdr.root_flags) @@ -293,27 +308,47 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + finish_e820_parsing(); + early_gart_iommu_check(); + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(end_pfn)) { + e820_register_active_regions(0, 0, -1UL); + end_pfn = e820_end_of_ram(); + } + num_physpages = end_pfn; check_efer(); - discover_ebda(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + if (efi_enabled) + efi_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_SMP - /* setup to use the static apicid table during kernel startup */ - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; + /* setup to use the early static init tables during kernel startup */ + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; +#endif #endif #ifdef CONFIG_ACPI @@ -340,48 +375,26 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); + numa_initmem_init(0, end_pfn); #else contig_initmem_init(0, end_pfn); #endif - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - reserve_bootmem_generic(__pa_symbol(&_text), - __pa_symbol(&_end) - __pa_symbol(&_text)); + early_res_to_bootmem(); +#ifdef CONFIG_ACPI_SLEEP /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. + * Reserve low memory region for sleep support. */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - if (ebda_addr) - reserve_bootmem_generic(ebda_addr, ebda_size); -#ifdef CONFIG_NUMA - /* reserve nodemap region */ - if (nodemap_addr) - reserve_bootmem_generic(nodemap_addr, nodemap_size); + acpi_reserve_bootmem(); #endif -#ifdef CONFIG_SMP - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); -#endif + if (efi_enabled) + efi_reserve_bootmem(); -#ifdef CONFIG_ACPI_SLEEP /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif - /* - * Find and reserve possible boot-time SMP configuration: - */ + * Find and reserve possible boot-time SMP configuration: + */ find_smp_config(); #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { @@ -395,6 +408,8 @@ void __init setup_arch(char **cmdline_p) initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; } else { + /* Assumes everything on node 0 */ + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", ramdisk_end, end_of_mem); @@ -404,17 +419,10 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); paging_init(); + map_vsyscall(); -#ifdef CONFIG_PCI early_quirks(); -#endif - /* - * set this early, so we dont allocate cpu0 - * if MADT list doesnt list BSP first - * mpparse.c/MP_processor_info() allocates logical cpu numbers. - */ - cpu_set(0, cpu_present_map); #ifdef CONFIG_ACPI /* * Read APIC and some other early information from ACPI tables. @@ -430,25 +438,24 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); init_apic_mappings(); + ioapic_init_mappings(); /* * We trust e820 completely. No explicit ROM probing in memory. - */ - e820_reserve_resources(); + */ + e820_reserve_resources(&code_resource, &data_resource, &bss_resource); e820_mark_nosave_regions(); - { - unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); - } e820_setup_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif @@ -479,9 +486,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; } @@ -495,11 +503,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } @@ -508,14 +513,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA static int nearby_node(int apicid) { - int i; + int i, node; + for (i = apicid - 1; i >= 0; i--) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -527,7 +533,7 @@ static int nearby_node(int apicid) * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned bits; @@ -536,7 +542,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int node = 0; unsigned apicid = hard_smp_processor_id(); #endif - unsigned ecx = cpuid_ecx(0x80000008); + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + c->phys_proc_id = phys_pkg_id(bits); + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); c->x86_max_cores = (ecx & 0xff) + 1; @@ -549,37 +602,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) bits++; } - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); + c->x86_coreid_bits = bits; - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif #endif } @@ -595,8 +619,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ static __cpuinit int amd_apic_timer_broken(void) { - u32 lo, hi; - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + switch (eax & CPUID_XFAM) { case CPUID_XFAM_K8: if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) @@ -614,6 +638,15 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -624,7 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 - * + * * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ @@ -637,35 +670,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - + clear_bit(0*32+31, (unsigned long *)&c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); level = get_model_name(c); if (!level) { - switch (c->x86) { + switch (c->x86) { case 15: /* Should distinguish Models here, but this is only a fallback anyways. */ strcpy(c->x86_model_id, "Hammer"); - break; - } - } + break; + } + } display_cacheinfo(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); @@ -677,41 +707,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_K8, &c->x86_capability); - - /* RDTSC can be speculated around */ - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_K8); - /* Family 10 doesn't support C states in MWAIT so don't use it */ - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); if (amd_apic_timer_broken()) disable_apic_timer = 1; } -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; cpuid(1, &eax, &ebx, &ecx, &edx); if (!cpu_has(c, X86_FEATURE_HT)) return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { + } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } @@ -721,7 +748,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = smp_num_siblings / c->x86_max_cores; - index_msb = get_count_order(smp_num_siblings) ; + index_msb = get_count_order(smp_num_siblings); core_bits = get_count_order(c->x86_max_cores); @@ -730,8 +757,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } out: if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); } #endif @@ -773,28 +802,39 @@ static void srat_detect_node(void) #endif } +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; init_intel_cacheinfo(c); - if (c->cpuid_level > 9 ) { + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } if (cpu_has_ds) { unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) - set_bit(X86_FEATURE_BTS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) - set_bit(X86_FEATURE_PEBS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_PEBS); } + + if (cpu_has_bts) + ds_init_intel(c); + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -811,14 +851,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); if (c->x86 == 6) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 15) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - c->x86_max_cores = intel_num_cpu_cores(c); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -835,18 +872,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { - u32 tfms; + u32 tfms, xlvl; c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; @@ -857,6 +888,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_max_cores = 1; + c->x86_coreid_bits = 0; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -865,7 +897,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - + get_cpu_vendor(c); /* Initialize the standard set of capabilities */ @@ -883,7 +915,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) + if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ @@ -893,18 +925,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -925,6 +945,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + early_identify_cpu(c); + init_scattered_cpuid_features(c); c->apicid = phys_pkg_id(0); @@ -954,8 +998,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) break; } - select_idle_routine(c); - detect_ht(c); + detect_ht(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -965,32 +1008,56 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + select_idle_routine(c); + if (c != &boot_cpu_data) mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + +} + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; } - +__setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); + printk(KERN_INFO "%s", c->x86_model_id); - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + /* * Get CPU information for use by the procfs. */ @@ -998,9 +1065,9 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - int cpu = 0; + int cpu = 0, i; - /* + /* * These flag bits must match the definitions in <asm/cpufeature.h>. * NULL means this bit is undefined or reserved; either way it doesn't * have meaning as far as Linux is concerned. Note that it's important @@ -1010,10 +1077,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) */ static const char *const x86_cap_flags[] = { /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", /* AMD-defined */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1080,34 +1147,35 @@ static int show_cpuinfo(struct seq_file *m, void *v) cpu = c->cpu_index; #endif - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)cpu, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { + + if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get((unsigned)cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + freq / 1000, (freq % 1000)); } /* Cache size */ - if (c->x86_cache_size >= 0) + if (c->x86_cache_size >= 0) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - + #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); @@ -1116,48 +1184,43 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } -#endif +#endif seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", c->cpuid_level); - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - } - + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); - if (c->x86_tlbsize > 0) + if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0]?" ":"", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } } seq_printf(m, "\n\n"); @@ -1184,8 +1247,8 @@ static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { - .start =c_start, +const struct seq_operations cpuinfo_op = { + .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 20f29e4c1d3..caee1f002fe 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -23,6 +23,7 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/vdso.h> #include "sigframe_32.h" #define DEBUG_SIG 0 @@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } asmlinkage int -sys_sigaltstack(unsigned long ebx) +sys_sigaltstack(unsigned long bx) { /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ - struct pt_regs *regs = (struct pt_regs *)&ebx; - const stack_t __user *uss = (const stack_t __user *)ebx; - stack_t __user *uoss = (stack_t __user *)regs->ecx; + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; - return do_sigaltstack(uss, uoss, regs->esp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax #define COPY_SEG(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } + regs->seg = tmp; } #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } + regs->seg = tmp|3; } #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - COPY(ecx); - COPY(eip); + COPY(di); + COPY(si); + COPY(bp); + COPY(sp); + COPY(bx); + COPY(dx); + COPY(cx); + COPY(ip); COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_eax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax } } - err |= __get_user(*peax, &sc->eax); + err |= __get_user(*peax, &sc->ax); return err; badframe: @@ -174,9 +175,9 @@ badframe: asmlinkage int sys_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->sc, &eax)) + if (restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) - printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" - " esp:%lx oeax:%lx\n", + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx" + " sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->eip, - regs->esp, regs->orig_eax); + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return 0; @@ -211,9 +215,9 @@ badframe: asmlinkage int sys_rt_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); + struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: force_sig(SIGSEGV, current); @@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); - err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); tmp = save_i387(fpstate); if (tmp < 0) @@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) { - unsigned long esp; + unsigned long sp; /* Default to using normal stack */ - esp = regs->esp; + sp = regs->sp; + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) + return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ - else if ((regs->xss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; + sp = (unsigned long) ka->sa.sa_restorer; } - esp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - esp = ((esp + 4) & -16ul) - 4; - return (void __user *) esp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } /* These symbols are defined with the addresses in the vsyscall page. @@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka, } if (current->binfmt->hasvdso) - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); else - restorer = (void *)&frame->retcode; + restorer = &frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) sig; - regs->edx = (unsigned long) 0; - regs->ecx = (unsigned long) 0; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) sig; + regs->dx = (unsigned long) 0; + regs->cx = (unsigned long) 0; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, @@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); /* - * This is movl $,%eax ; int $0x80 + * This is movl $,%ax ; int $0x80 * * WE DO NOT USE IT ANY MORE! It's only left here for historical * reasons and because gdb uses it as a signature to notice @@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) usig; - regs->edx = (unsigned long) &frame->info; - regs->ecx = (unsigned long) &frame->uc; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) usig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->eax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so - * that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK) - && likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) @@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -static void fastcall do_signal(struct pt_regs *regs) +static void do_signal(struct pt_regs *regs) { siginfo_t info; int signr; @@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs) * have been cleared if the watchpoint triggered * inside the kernel. */ - if (unlikely(current->thread.debugreg[7])) - set_debugreg(current->thread.debugreg[7], 7); + if (unlikely(current->thread.debugreg7)) + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { @@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->eax = __NR_restart_syscall; - regs->eip -= 2; + regs->ax = __NR_restart_syscall; + regs->ip -= 2; break; } } @@ -651,7 +660,7 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, { /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= TF_MASK; clear_thread_flag(TIF_SINGLESTEP); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 38d806467c0..7347bb14e30 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -39,7 +39,7 @@ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { - return do_sigaltstack(uss, uoss, regs->rsp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned #define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); COPY(r8); COPY(r9); COPY(r10); @@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned } } - err |= __get_user(*prax, &sc->rax); + err |= __get_user(*prax, &sc->ax); return err; badframe: @@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned long eax; + unsigned long ax; - frame = (struct rt_sigframe __user *)(regs->rsp - 8); + frame = (struct rt_sigframe __user *)(regs->sp - 8); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { goto badframe; } @@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); + printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax); #endif - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: signal_fault(regs,frame,"sigreturn"); @@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(regs->r8, &sc->r8); err |= __put_user(regs->r9, &sc->r9); err |= __put_user(regs->r10, &sc->r10); @@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(regs->r15, &sc->r15); err |= __put_user(me->thread.trap_no, &sc->trapno); err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo static void __user * get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; + sp = regs->sp - 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(rsp - size, 16); + return (void __user *)round_down(sp - size, 16); } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); @@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; #ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); + printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax); #endif /* Set up registers for signal handler */ - regs->rdi = sig; + regs->di = sig; /* In case the signal handler was declared without prototypes */ - regs->rax = 0; + regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; - regs->rsp = (unsigned long)frame; + regs->sp = (unsigned long)frame; /* Set up the CS register to run signal handlers in 64-bit mode, even if the handler happens to be interrupting 32-bit code. */ @@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; #ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", + printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n", current->pid, sig, - regs->rip, regs->rsp, regs); + regs->ip, regs->sp, regs); #endif /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->rax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->rax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF - * flag so that register information in the sigcontext is - * correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK)) { - if (likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { @@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - long res = regs->rax; + long res = regs->ax; switch (res) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->rax = test_thread_flag(TIF_IA32) ? + regs->ax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; - regs->rip -= 2; + regs->ip -= 2; break; } } @@ -461,13 +457,13 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); + printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n", + thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current)); #endif /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= X86_EFLAGS_TF; clear_thread_flag(TIF_SINGLESTEP); } @@ -488,9 +484,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (show_unhandled_signals && printk_ratelimit()) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index fcaa026eb80..dc0cde9d16f 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) apic_write_around(APIC_ICR, cfg); } -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector); } @@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { + for_each_possible_cpu(query_cpu) { if (cpu_isset(query_cpu, mask)) { __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); @@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock); * We need to reload %cr3 since the page tables may be going * away from under us.. */ -void leave_mm(unsigned long cpu) +void leave_mm(int cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu) * 2) Leave the mm if we are in the lazy tlb mode. */ -fastcall void smp_invalidate_interrupt(struct pt_regs *regs) +void smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -638,13 +639,13 @@ static void native_smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __get_cpu_var(irq_stat).irq_resched_count++; } -fastcall void smp_call_function_interrupt(struct pt_regs *regs) +void smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id) { int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { if (per_cpu(x86_cpu_to_apicid, i) == apic_id) return i; } diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index 03fa6ed559c..2fd74b06db6 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -29,7 +29,7 @@ #include <asm/idle.h> /* - * Smarter SMP flushing macros. + * Smarter SMP flushing macros. * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -37,15 +37,15 @@ * * Optimizations Manfred Spraul <manfred@colorfullife.com> * - * More scalable flush, from Andi Kleen + * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into * the right per cpu variable for the flush data. * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. * In future when interrupts are split into per CPU domains this could be * fixed, at the cost of triggering multiple IPIs in some cases. */ @@ -55,7 +55,6 @@ union smp_flush_state { cpumask_t flush_cpumask; struct mm_struct *flush_mm; unsigned long flush_va; -#define FLUSH_ALL -1ULL spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; @@ -67,16 +66,17 @@ union smp_flush_state { static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm(int cpu) +void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -85,25 +85,25 @@ static inline void leave_mm(int cpu) * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 * was in lazy tlb mode. * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. + * Now cpu0 accepts tlb flushes for the new mm. * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. + * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. * 1b) thread switch without mm change * cpu active_mm is correct, cpu0 already handles * flush ipis. * 1b1) set cpu mmu_state to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. * 1b3) if the bit was 0: leave_mm was called, flush the tlb. * 2) switch %%esp, ie current * @@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) goto out; - /* + /* * This was a BUG() but until someone can quote me the * line from the intel manual that guarantees an IPI to * multiple CPUs is retried _only_ on the erroring CPUs @@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * * BUG(); */ - + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == FLUSH_ALL) + if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(f->flush_va); @@ -166,19 +166,22 @@ out: add_pda(irq_tlb_count, 1); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { int sender; union smp_flush_state *f; + cpumask_t cpumask = *cpumaskp; /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); - /* Could avoid this lock when - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - probably not worth checking this for a cache-hot lock. */ + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ spin_lock(&f->tlbstate_lock); f->flush_mm = mm; @@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, int __cpuinit init_smp_flush(void) { int i; + for_each_cpu_mask(i, cpu_possible_map) { spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } - core_initcall(init_smp_flush); - + void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; @@ -221,10 +224,9 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) @@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void) * this function sends a 'generic call function' IPI to all other CPU * of the system defined in the mask. */ - -static int -__smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +static int __smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int nonatomic, int wait) { /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); + int ret, me = get_cpu(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); disable_local_APIC(); - for (;;) + for (;;) halt(); -} +} void smp_send_stop(void) { diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 4ea80cbe52e..5787a0c3e29 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask; DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -/* - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ +/* which logical CPU number maps to which CPU (physical APIC ID) */ u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; +void *x86_cpu_to_apicid_early_ptr; DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); @@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID]; extern const unsigned char trampoline_data []; extern const unsigned char trampoline_end []; static unsigned char *trampoline_base; -static int trampoline_exec; static void map_cpu_to_logical_apicid(void); @@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void) */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. */ if (__pa(trampoline_base) >= 0x9F000) BUG(); - /* - * Make the SMP trampoline executable: - */ - trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); } /* @@ -405,7 +394,7 @@ static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } /* @@ -448,38 +437,38 @@ void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, - * basically just the stack pointer and the eip. + * basically just the stack pointer and the ip. */ asm volatile( "movl %0,%%esp\n\t" "jmp *%1" : - :"m" (current->thread.esp),"m" (current->thread.eip)); + :"m" (current->thread.sp),"m" (current->thread.ip)); } /* Static state in head.S used to set up a CPU */ extern struct { - void * esp; + void * sp; unsigned short ss; } stack_start; #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -EXPORT_SYMBOL(node_2_cpu_mask); +EXPORT_SYMBOL(node_to_cpumask_map); /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_2_node); +int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; +EXPORT_SYMBOL(cpu_to_node_map); /* set up a mapping between cpu and node. */ static inline void map_cpu_to_node(int cpu, int node) { printk("Mapping cpu %d to node %d\n", cpu, node); - cpu_set(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = node; + cpu_set(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = node; } /* undo a mapping between cpu and node. */ @@ -489,8 +478,8 @@ static inline void unmap_cpu_to_node(int cpu) printk("Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node ++) - cpu_clear(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = 0; + cpu_clear(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = 0; } #else /* !CONFIG_NUMA */ @@ -668,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long) stack_start.esp); + (unsigned long) stack_start.sp); /* * Run STARTUP IPI loop. @@ -754,7 +743,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu) /* initialize thread_struct. we really want to avoid destroy * idle tread */ - idle->thread.esp = (unsigned long)task_pt_regs(idle); + idle->thread.sp = (unsigned long)task_pt_regs(idle); init_idle(idle, cpu); return idle; } @@ -799,7 +788,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); - idle->thread.eip = (unsigned long) start_secondary; + idle->thread.ip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -807,9 +796,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) alternatives_smp_switch(1); /* So we see what's up */ - printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); + printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *) idle->thread.sp; irq_ctx_init(cpu); @@ -1091,7 +1080,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * Allow the user to impress friends. */ Dprintk("Before bogomips.\n"); - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; printk(KERN_INFO @@ -1122,7 +1111,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * construct cpu_sibling_map, so that we can tell sibling CPUs * efficiently. */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); } @@ -1296,12 +1285,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); -#ifndef CONFIG_HOTPLUG_CPU - /* - * Disable executability of the SMP trampoline: - */ - set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); -#endif } void __init smp_intr_init(void) diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index aaf4e129121..cc64b8085c2 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -65,7 +65,7 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); - cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); @@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; * a new thread. Also avoids complicated thread destroy functionality * for idle threads. */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is + * removed after init for !CONFIG_HOTPLUG_CPU. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); +#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) +#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p)) +#else struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; - #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) +#endif + /* * Currently trivial. Write the real->protected mode @@ -212,6 +220,7 @@ void __cpuinit smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); + end_local_APIC_setup(); /* * Get our bogomips. @@ -338,7 +347,7 @@ void __cpuinit start_secondary(void) if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } @@ -370,7 +379,7 @@ void __cpuinit start_secondary(void) unlock_ipi_call_lock(); - setup_secondary_APIC_clock(); + setup_secondary_clock(); cpu_idle(); } @@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid) unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; int timeout; - unsigned int status; + u32 status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk("a previous APIC delivery may have failed\n"); + printk(KERN_CONT + "a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk(KERN_CONT "%08x\n", status); break; default: - printk("failed\n"); + printk(KERN_CONT "failed\n"); } } } @@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { - c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); init_idle(c_idle.idle, cpu); goto do_rest; @@ -613,8 +623,8 @@ do_rest: start_rip = setup_trampoline(); - init_rsp = c_idle.idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; + init_rsp = c_idle.idle->thread.sp; + load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); initial_code = start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); @@ -691,7 +701,7 @@ do_rest: } if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); @@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus) return 0; } -/* - * Copy apicid's found by MP_processor_info from initial array to the per cpu - * data area. The x86_cpu_to_apicid_init array is then expendable and the - * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no - * longer available. - */ -void __init smp_set_apicids(void) +static void __init smp_cpu_index_default(void) { - int cpu; + int i; + struct cpuinfo_x86 *c; - for_each_cpu_mask(cpu, cpu_possible_map) { - if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_apicid, cpu) = - x86_cpu_to_apicid_init[cpu]; + for_each_cpu_mask(i, cpu_possible_map) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = NR_CPUS; } - - /* indicate the static array will be going away soon */ - x86_cpu_to_apicid_ptr = NULL; } /* @@ -868,9 +870,9 @@ void __init smp_set_apicids(void) void __init smp_prepare_cpus(unsigned int max_cpus) { nmi_watchdog_default(); + smp_cpu_index_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - smp_set_apicids(); set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { @@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + /* + * Enable IO APIC before setting up error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + end_local_APIC_setup(); + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); @@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - setup_boot_APIC_clock(); + setup_boot_clock(); } /* @@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - cpu_set(me, cpu_online_map); + /* already set me in cpu_online_map in boot_cpu_init() */ cpu_set(me, cpu_callout_map); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1016,7 +1025,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); } diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c index bbfe85a0f69..8bc38af29ae 100644 --- a/arch/x86/kernel/smpcommon_32.c +++ b/arch/x86/kernel/smpcommon_32.c @@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, - (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + pack_descriptor(&gdt[GDT_ENTRY_PERCPU], __per_cpu_offset[cpu], 0xFFFFF, - 0x80 | DESCTYPE_S | 0x2, 0x8); + 0x2 | DESCTYPE_S, 0x8); + + gdt[GDT_ENTRY_PERCPU].s = 1; per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 2a8713ec0f9..2bf6903cb44 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; static int num_memory_chunks; /* total number of memory chunks */ static u8 __initdata apicid_to_pxm[MAX_APICID]; -extern void * boot_ioremap(unsigned long, unsigned long); - /* Identify CPU proximity domains */ static void __init parse_cpu_affinity_structure(char *p) { @@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void) } rsdt = (struct acpi_table_rsdt *) - boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); + early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); if (!rsdt) { printk(KERN_WARNING @@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void) for (i = 0; i < tables; i++) { /* Map in header, then map in full table length. */ header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); + early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); if (!header) break; header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); + early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); if (!header) break; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 55771fd7e54..02f0f61f5b1 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -22,9 +22,9 @@ static int save_stack_stack(void *data, char *name) return -1; } -static void save_stack_address(void *data, unsigned long addr) +static void save_stack_address(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; if (trace->skip > 0) { trace->skip--; return; @@ -33,7 +33,8 @@ static void save_stack_address(void *data, unsigned long addr) trace->entries[trace->nr_entries++] = addr; } -static void save_stack_address_nosched(void *data, unsigned long addr) +static void +save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; if (in_sched_functions(addr)) @@ -65,15 +66,14 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } -EXPORT_SYMBOL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 00000000000..2ef1a5f8d67 --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,203 @@ +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/ptrace.h> + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs & 0xffff; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + u32 *desc; + unsigned long base; + + seg &= ~7UL; + + mutex_lock(&child->mm->context.lock); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_setting_trap_flag(child, regs)) + return 0; + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +/* + * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. + */ +static void write_debugctlmsr(struct task_struct *child, unsigned long val) +{ + child->thread.debugctlmsr = val; + + if (child != current) + return; + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So noone should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) { + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + write_debugctlmsr(child, + child->thread.debugctlmsr | DEBUGCTLMSR_BTF); + } else { + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + } +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. + */ + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 2e5efaaf880..09199511c25 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -17,9 +17,26 @@ /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; +static void fix_processor_context(void); + struct saved_context saved_context; -void __save_processor_state(struct saved_context *ctxt) +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -69,7 +86,12 @@ static void do_fpu_end(void) kernel_fpu_end(); } -void __restore_processor_state(struct saved_context *ctxt) +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers @@ -113,14 +135,14 @@ void restore_processor_state(void) __restore_processor_state(&saved_context); } -void fix_processor_context(void) +static void fix_processor_context(void) { int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; syscall_init(); /* This sets MSR_*STAR and related */ load_TR_desc(); /* This does ltr */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 72f952103e5..aeb9a4d7681 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -18,13 +18,13 @@ ENTRY(swsusp_arch_suspend) movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend) movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) /* save the address of restore_registers */ movq $restore_registers, %rax @@ -115,13 +115,13 @@ ENTRY(restore_registers) /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 @@ -130,7 +130,7 @@ ENTRY(restore_registers) movq pt_regs_r13(%rax), %r13 movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq xorq %rax, %rax diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 907942ee6e7..bd802a5e1aa 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/utsname.h> #include <linux/personality.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/ia32.h> @@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, of playground for now. -AK */ *begin = 0x40000000; *end = 0x80000000; + if (current->flags & PF_RANDOMIZE) { + new_begin = randomize_range(*begin, *begin + 0x02000000, 0); + if (new_begin) + *begin = new_begin; + } } else { *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; @@ -143,6 +150,97 @@ full_search: } } + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legact mmap base */ + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr > len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + if (mm->mmap_base < len) + goto bottomup; + + addr = mm->mmap_base-len; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len < vma->vm_start); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} + + asmlinkage long sys_uname(struct new_utsname __user * name) { int err; diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c new file mode 100644 index 00000000000..6d7ef11e797 --- /dev/null +++ b/arch/x86/kernel/test_nx.c @@ -0,0 +1,176 @@ +/* + * test_nx.c: functional test for NX functionality + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/uaccess.h> + +extern int rodata_test_data; + +/* + * This file checks 4 things: + * 1) Check if the stack is not executable + * 2) Check if kmalloc memory is not executable + * 3) Check if the .rodata section is not executable + * 4) Check if the .data section of a module is not executable + * + * To do this, the test code tries to execute memory in stack/kmalloc/etc, + * and then checks if the expected trap happens. + * + * Sadly, this implies having a dynamic exception handling table entry. + * ... which can be done (and will make Rusty cry)... but it can only + * be done in a stand-alone module with only 1 entry total. + * (otherwise we'd have to sort and that's just too messy) + */ + + + +/* + * We want to set up an exception handling point on our stack, + * which means a variable value. This function is rather dirty + * and walks the exception table of the module, looking for a magic + * marker and replaces it with a specific function. + */ +static void fudze_exception_table(void *marker, void *new) +{ + struct module *mod = THIS_MODULE; + struct exception_table_entry *extable; + + /* + * Note: This module has only 1 exception table entry, + * so searching and sorting is not needed. If that changes, + * this would be the place to search and re-sort the exception + * table. + */ + if (mod->num_exentries > 1) { + printk(KERN_ERR "test_nx: too many exception table entries!\n"); + printk(KERN_ERR "test_nx: test results are not reliable.\n"); + return; + } + extable = (struct exception_table_entry *)mod->extable; + extable[0].insn = (unsigned long)new; +} + + +/* + * exception tables get their symbols translated so we need + * to use a fake function to put in there, which we can then + * replace at runtime. + */ +void foo_label(void); + +/* + * returns 0 for not-executable, negative for executable + * + * Note: we cannot allow this function to be inlined, because + * that would give us more than 1 exception table entry. + * This in turn would break the assumptions above. + */ +static noinline int test_address(void *address) +{ + unsigned long result; + + /* Set up an exception table entry for our address */ + fudze_exception_table(&foo_label, address); + result = 1; + asm volatile( + "foo_label:\n" + "0: call *%[fake_code]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: mov %[zero], %[rslt]\n" + " ret\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 0b\n" + " .quad 2b\n" + ".previous\n" + : [rslt] "=r" (result) + : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) + ); + /* change the exception table back for the next round */ + fudze_exception_table(address, &foo_label); + + if (result) + return -ENODEV; + return 0; +} + +static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ + +static int test_NX(void) +{ + int ret = 0; + /* 0xC3 is the opcode for "ret" */ + char stackcode[] = {0xC3, 0x90, 0 }; + char *heap; + + test_data = 0xC3; + + printk(KERN_INFO "Testing NX protection\n"); + + /* Test 1: check if the stack is not executable */ + if (test_address(&stackcode)) { + printk(KERN_ERR "test_nx: stack was executable\n"); + ret = -ENODEV; + } + + + /* Test 2: Check if the heap is executable */ + heap = kmalloc(64, GFP_KERNEL); + if (!heap) + return -ENOMEM; + heap[0] = 0xC3; /* opcode for "ret" */ + + if (test_address(heap)) { + printk(KERN_ERR "test_nx: heap was executable\n"); + ret = -ENODEV; + } + kfree(heap); + + /* + * The following 2 tests currently fail, this needs to get fixed + * Until then, don't run them to avoid too many people getting scared + * by the error message + */ +#if 0 + +#ifdef CONFIG_DEBUG_RODATA + /* Test 3: Check if the .rodata section is executable */ + if (rodata_test_data != 0xC3) { + printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); + ret = -ENODEV; + } else if (test_address(&rodata_test_data)) { + printk(KERN_ERR "test_nx: .rodata section is executable\n"); + ret = -ENODEV; + } +#endif + + /* Test 4: Check if the .data section of a module is executable */ + if (test_address(&test_data)) { + printk(KERN_ERR "test_nx: .data section is executable\n"); + ret = -ENODEV; + } + +#endif + return 0; +} + +static void test_exit(void) +{ +} + +module_init(test_NX); +module_exit(test_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the NX infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c new file mode 100644 index 00000000000..4c163772000 --- /dev/null +++ b/arch/x86/kernel/test_rodata.c @@ -0,0 +1,86 @@ +/* + * test_rodata.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/module.h> +#include <asm/sections.h> +extern int rodata_test_data; + +int rodata_test(void) +{ + unsigned long result; + unsigned long start, end; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); + return -ENODEV; + } + + /* test 2: write to the variable; this should fault */ + /* + * If this test fails, we managed to overwrite the data + * + * This is written in assembly to be able to catch the + * exception that is supposed to happen in the correct + * case + */ + + result = 1; + asm volatile( + "0: mov %[zero],(%[rodata_test])\n" + " mov %[zero], %[rslt]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: jmp 1b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 16\n" +#ifdef CONFIG_X86_32 + " .long 0b,2b\n" +#else + " .quad 0b,2b\n" +#endif + ".previous" + : [rslt] "=r" (result) + : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) + ); + + + if (!result) { + printk(KERN_ERR "rodata_test: test data was not read only\n"); + return -ENODEV; + } + + /* test 3: check the value hasn't changed */ + /* If this test fails, we managed to overwrite the data */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); + return -ENODEV; + } + /* test 4: check if the rodata section is 4Kb aligned */ + start = (unsigned long)__start_rodata; + end = (unsigned long)__end_rodata; + if (start & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); + return -ENODEV; + } + if (end & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); + return -ENODEV; + } + + return 0; +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 8a322c96bc2..1a89e93f3f1 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,98 +28,20 @@ * serialize accesses to xtime/lost_ticks). */ -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/time.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/efi.h> #include <linux/mca.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/timer.h> -#include <asm/time.h> - -#include "mach_time.h" - -#include <linux/timex.h> - -#include <asm/hpet.h> - #include <asm/arch_hooks.h> - -#include "io_ports.h" - -#include <asm/i8259.h> +#include <asm/hpet.h> +#include <asm/time.h> #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); - -/* - * This is a special lock that is owned by the CPU and holds the index - * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. - */ -volatile unsigned long cmos_lock = 0; -EXPORT_SYMBOL(cmos_lock); - -/* Routines for accessing the CMOS RAM/RTC. */ -unsigned char rtc_cmos_read(unsigned char addr) -{ - unsigned char val; - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - val = inb_p(RTC_PORT(1)); - lock_cmos_suffix(addr); - return val; -} -EXPORT_SYMBOL(rtc_cmos_read); - -void rtc_cmos_write(unsigned char val, unsigned char addr) -{ - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - outb_p(val, RTC_PORT(1)); - lock_cmos_suffix(addr); -} -EXPORT_SYMBOL(rtc_cmos_write); - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval; - unsigned long flags; - - /* gets recalled with irq locally disabled */ - /* XXX - does irqsave resolve this? -johnstul */ - spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) @@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->ebp + 4); + return *(unsigned long *)(regs->bp + 4); #else - unsigned long *sp = (unsigned long *)®s->esp; + unsigned long *sp = (unsigned long *)®s->sp; /* Return address is either directly at stack pointer - or above a saved eflags. Eflags has bits 22-31 zero, + or above a saved flags. Eflags has bits 22-31 zero, kernel addresses don't. */ - if (sp[0] >> 22) + if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; @@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* not static: needed by APM */ -unsigned long read_persistent_clock(void) -{ - unsigned long retval; - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - - retval = get_wallclock(); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ void __init hpet_time_init(void) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 368b1942b39..0380795121a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -11,43 +11,18 @@ * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> +#include <linux/clockchips.h> #include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/time.h> -#include <linux/ioport.h> +#include <linux/interrupt.h> #include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/kallsyms.h> -#include <linux/acpi.h> -#include <linux/clockchips.h> +#include <linux/time.h> -#ifdef CONFIG_ACPI -#include <acpi/achware.h> /* for PM timer frequency */ -#include <acpi/acpi_bus.h> -#endif #include <asm/i8253.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/proto.h> -#include <asm/hpet.h> -#include <asm/sections.h> -#include <linux/hpet.h> -#include <asm/apic.h> #include <asm/hpet.h> -#include <asm/mpspec.h> #include <asm/nmi.h> #include <asm/vgtod.h> - -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); +#include <asm/time.h> +#include <asm/timer.h> volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); /* Assume the lock function has either no stack frame or a copy - of eflags from PUSHF + of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - unsigned long *sp = (unsigned long *)regs->rsp; + unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) @@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - */ - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char control, freq_select; - unsigned long flags; - -/* - * set_rtc_mmss is called when irqs are enabled, so disable irqs here - */ - spin_lock_irqsave(&rtc_lock, flags); -/* - * Tell the clock it's being set and stop it. - */ - control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE(control | RTC_SET, RTC_CONTROL); - - freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); - -/* - * since we're only adjusting minutes and seconds, don't interfere with hour - * overflow. This avoids messing with unknown time zones but requires your RTC - * not to be off by more than 15 minutes. Since we're calling it only when - * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) >= 30) { - printk(KERN_WARNING "time.c: can't update CMOS clock " - "from %d to %d\n", cmos_minutes, real_minutes); - retval = -1; - } else { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } - -/* - * The following flags have to be released exactly in this order, otherwise the - * DS12887 (popular MC146818A clone with integrated battery and quartz) will - * not reset the oscillator and will not update precisely 500 ms later. You - * won't find this mentioned in the Dallas Semiconductor data sheets, but who - * believes data sheets anyway ... -- Markus Kuhn - */ - - CMOS_WRITE(control, RTC_CONTROL); - CMOS_WRITE(freq_select, RTC_FREQ_SELECT); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - static irqreturn_t timer_event_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); @@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -unsigned long read_persistent_clock(void) -{ - unsigned int year, mon, day, hour, min, sec; - unsigned long flags; - unsigned century = 0; - - spin_lock_irqsave(&rtc_lock, flags); - /* - * if UIP is clear, then we have >= 244 microseconds before RTC - * registers will be updated. Spec sheet says that this is the - * reliable way to read RTC - registers invalid (off bus) during update - */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - - /* now read all RTC registers while stable with interrupts disabled */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - spin_unlock_irqrestore(&rtc_lock, flags); - - /* - * We know that x86-64 always uses BCD format, no need to check the - * config register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - - if (century) { - BCD_TO_BIN(century); - year += century * 100; - printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { - /* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ - year += 2000; - } - - return mktime(year, mon, day, hour, min, sec); -} - /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -static unsigned int __init tsc_calibrate_cpu_khz(void) +unsigned long __init native_calculate_cpu_khz(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void) rdtscl(tsc_start); do { rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); + tsc_now = get_cycles(); } while ((tsc_now - tsc_start) < TICK_COUNT); local_irq_restore(flags); @@ -264,20 +106,22 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init time_init(void) +void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); setup_irq(0, &irq0); +} +void __init time_init(void) +{ tsc_calibrate(); cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 16) - cpu_khz = tsc_calibrate_cpu_khz(); + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calculate_cpu_khz(); if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); @@ -290,4 +134,5 @@ void __init time_init(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); + late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c new file mode 100644 index 00000000000..6dfd4e76661 --- /dev/null +++ b/arch/x86/kernel/tls.c @@ -0,0 +1,213 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/user.h> +#include <linux/regset.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/proto.h> + +#include "tls.h" + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(&t->tls_array[idx])) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info)) + desc->a = desc->b = 0; + else + fill_ldt(desc, info); + ++info; + ++desc; + } + + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); +} + +/* + * Set a given TLS descriptor: + */ +int do_set_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info, + int can_allocate) +{ + struct user_desc info; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + if (idx == -1) + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1 && can_allocate) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + set_tls_desc(p, idx, &info, 1); + + return 0; +} + +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + return do_set_thread_area(current, -1, u_info, 1); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +static void fill_user_desc(struct user_desc *info, int idx, + const struct desc_struct *desc) + +{ + memset(info, 0, sizeof(*info)); + info->entry_number = idx; + info->base_addr = get_desc_base(desc); + info->limit = get_desc_limit(desc); + info->seg_32bit = desc->d; + info->contents = desc->type >> 2; + info->read_exec_only = !(desc->type & 2); + info->limit_in_pages = desc->g; + info->seg_not_present = !desc->p; + info->useable = desc->avl; +#ifdef CONFIG_X86_64 + info->lm = desc->l; +#endif +} + +int do_get_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info) +{ + struct user_desc info; + + if (idx == -1 && get_user(idx, &u_info->entry_number)) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + fill_user_desc(&info, idx, + &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + return do_get_thread_area(current, -1, u_info); +} + +int regset_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n - 1])) + --n; + return n; +} + +int regset_tls_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + const struct desc_struct *tls; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + pos /= sizeof(struct user_desc); + count /= sizeof(struct user_desc); + + tls = &target->thread.tls_array[pos]; + + if (kbuf) { + struct user_desc *info = kbuf; + while (count-- > 0) + fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, + tls++); + } else { + struct user_desc __user *u_info = ubuf; + while (count-- > 0) { + struct user_desc info; + fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); + if (__copy_to_user(u_info++, &info, sizeof(info))) + return -EFAULT; + } + } + + return 0; +} + +int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + if (kbuf) + info = kbuf; + else if (__copy_from_user(infobuf, ubuf, count)) + return -EFAULT; + else + info = infobuf; + + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), + info, count / sizeof(struct user_desc)); + + return 0; +} diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h new file mode 100644 index 00000000000..2f083a2fe21 --- /dev/null +++ b/arch/x86/kernel/tls.h @@ -0,0 +1,21 @@ +/* + * Internal declarations for x86 TLS implementation functions. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + */ + +#ifndef _ARCH_X86_KERNEL_TLS_H + +#include <linux/regset.h> + +extern user_regset_active_fn regset_tls_active; +extern user_regset_get_fn regset_tls_get; +extern user_regset_set_fn regset_tls_set; + +#endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e16d675eb8..78cbb655aa7 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,9 +31,10 @@ #include <linux/mmzone.h> #include <asm/cpu.h> -static struct i386_cpu cpu_devices[NR_CPUS]; +static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); -int __cpuinit arch_register_cpu(int num) +#ifdef CONFIG_HOTPLUG_CPU +int arch_register_cpu(int num) { /* * CPU0 cannot be offlined due to several @@ -44,21 +45,23 @@ int __cpuinit arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. */ -#ifdef CONFIG_HOTPLUG_CPU if (num) - cpu_devices[num].cpu.hotpluggable = 1; -#endif - - return register_cpu(&cpu_devices[num].cpu, num); + per_cpu(cpu_devices, num).cpu.hotpluggable = 1; + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } +EXPORT_SYMBOL(arch_register_cpu); -#ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - return unregister_cpu(&cpu_devices[num].cpu); + return unregister_cpu(&per_cpu(cpu_devices, num).cpu); } -EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); +#else +int arch_register_cpu(int num) +{ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +EXPORT_SYMBOL(arch_register_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ static int __init topology_init(void) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 02d1e1e58e8..3cf72977d01 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -76,7 +76,8 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -101,6 +102,34 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? "); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) { return p > (void *)tinfo && @@ -114,48 +143,35 @@ struct stack_frame { }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)ebp; - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { - struct stack_frame *next; - unsigned long addr; + struct stack_frame *frame = (struct stack_frame *)bp; - addr = frame->return_address; - ops->address(data, addr); - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function). Also, - * we can never allow a frame pointer to - * move downwards! - */ - next = frame->next_frame; - if (next <= frame) - break; - frame = next; - } -#else while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { unsigned long addr; - addr = *stack++; - if (__kernel_text_address(addr)) - ops->address(data, addr); + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; } -#endif - return ebp; + return bp; } #define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long ebp = 0; - if (!task) task = current; @@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long dummy; stack = &dummy; if (task != current) - stack = (unsigned long *)task->thread.esp; + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER - if (!ebp) { + if (!bp) { if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); + /* Grab bp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (bp) : ); } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; } } #endif @@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data); /* Should be after the line below, but somewhere in early boot context comes out corrupted and we can't reference it -AK */ @@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? "); print_symbol("%s\n", addr); touch_nmi_watchdog(); } @@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack) + unsigned long *stack, unsigned long bp) { - show_trace_log_lvl(task, regs, stack, ""); + show_trace_log_lvl(task, regs, stack, bp, ""); } static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *esp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; - if (esp == NULL) { + if (sp == NULL) { if (task) - esp = (unsigned long*)task->thread.esp; + sp = (unsigned long*)task->thread.sp; else - esp = (unsigned long *)&esp; + sp = (unsigned long *)&sp; } - stack = esp; + stack = sp; for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; @@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, regs, esp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *sp) { printk(" "); - show_stack_log_lvl(task, NULL, esp, ""); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp) void dump_stack(void) { unsigned long stack; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(current, NULL, &stack); + show_trace(current, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (!user_mode_vm(regs)) { - u8 *eip; + u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - code_prologue; - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { /* try starting at EIP */ - eip = (u8 *)regs->eip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_len; i++, eip++) { - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (ip == (u8 *)regs->ip) printk("<%02x> ", c); else printk("%02x ", c); @@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs) printk("\n"); } -int is_valid_bugaddr(unsigned long eip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (eip < PAGE_OFFSET) + if (ip < PAGE_OFFSET) return 0; - if (probe_kernel_address((unsigned short *)eip, ud2)) + if (probe_kernel_address((unsigned short *)ip, ud2)) return 0; return ud2 == 0x0b0f; } +static int die_counter; + +int __kprobes __die(const char * str, struct pt_regs * regs, long err) +{ + unsigned long sp; + unsigned short ss; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != + NOTIFY_STOP) { + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; + } else { + return 1; + } +} + /* * This is gone through when something in the kernel has done something bad and * is about to be terminated. @@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err) .lock_owner = -1, .lock_owner_depth = 0 }; - static int die_counter; unsigned long flags; oops_enter(); @@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err) raw_local_irq_save(flags); if (++die.lock_owner_depth < 3) { - unsigned long esp; - unsigned short ss; + report_bug(regs->ip, regs); - report_bug(regs->eip, regs); - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, - ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != - NOTIFY_STOP) { - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - esp = (unsigned long) (®s->esp); - savesegment(ss, ss); - if (user_mode(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); - print_symbol("%s", regs->eip); - printk(" SS:ESP %04x:%08lx\n", ss, esp); - } - else + if (__die(str, regs, err)) regs = NULL; - } else + } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } bust_spinlocks(0); die.lock_owner = -1; @@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, { struct task_struct *tsk = current; - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } #define DO_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ if (irq) \ @@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -534,7 +566,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -548,13 +580,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) @@ -562,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) -fastcall void __kprobes do_general_protection(struct pt_regs * regs, +void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { int cpu = get_cpu(); @@ -596,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -605,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", current->comm, task_pid_nr(current), - regs->eip, regs->esp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return; @@ -705,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); @@ -763,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) static int ignore_nmis; -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +__kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -792,7 +827,7 @@ void restart_nmi(void) } #ifdef CONFIG_KPROBES -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) +void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); @@ -828,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) +void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -837,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) + if (!tsk->thread.debugreg7) goto clear_dr7; } - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; + tsk->thread.debugreg6 = condition; /* * Single-stepping through TF: make sure we ignore any events in @@ -886,7 +927,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; return; } @@ -895,7 +936,7 @@ clear_TF_reenable: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *eip) +void math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -911,7 +952,7 @@ void math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -954,13 +995,13 @@ void math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; - math_error((void __user *)regs->eip); + math_error((void __user *)regs->ip); } -static void simd_math_error(void __user *eip) +static void simd_math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -976,7 +1017,7 @@ static void simd_math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1008,19 +1049,19 @@ static void simd_math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_simd_coprocessor_error(struct pt_regs * regs, +void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->eip); + simd_math_error((void __user *)regs->ip); } else { /* * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. */ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; @@ -1032,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, } } -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, +void do_spurious_interrupt_bug(struct pt_regs * regs, long error_code) { #if 0 @@ -1041,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } -fastcall unsigned long patch_espfix_desc(unsigned long uesp, +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; @@ -1095,51 +1136,17 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -void set_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); -} - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_trap_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); -} - -static void __init set_system_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); -} - void __init trap_init(void) { int i; #ifdef CONFIG_EISA - void __iomem *p = ioremap(0x0FFFD9, 4); + void __iomem *p = early_ioremap(0x0FFFD9, 4); if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { EISA_bus = 1; } - iounmap(p); + early_iounmap(p, 4); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index cc68b92316c..efc66df728b 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -74,22 +74,24 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +static unsigned int code_bytes = 64; + static inline void conditional_sti(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_sti(struct pt_regs *regs) { preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_cli(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ @@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) int kstack_depth_to_print = 12; -#ifdef CONFIG_KALLSYMS -void printk_address(unsigned long address) +void printk_address(unsigned long address, int reliable) { +#ifdef CONFIG_KALLSYMS unsigned long offset = 0, symsize; const char *symname; char *modname; char *delim = ":"; - char namebuf[128]; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -113,17 +116,17 @@ void printk_address(unsigned long address) printk(" [<%016lx>]\n", address); return; } + if (!reliable) + strcpy(reliab, "? "); + if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", - address, delim, modname, delim, symname, offset, symsize); -} + modname = delim = ""; + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); #else -void printk_address(unsigned long address) -{ printk(" [<%016lx>]\n", address); -} #endif +} static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + + +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { - void *t = (void *)tinfo; - return p > t && p < t + THREAD_SIZE - 3; + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; } void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (!tsk) tsk = current; + tinfo = task_thread_info(tsk); if (!stack) { unsigned long dummy; stack = &dummy; if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.rsp; + stack = (unsigned long *)tsk->thread.sp; } - /* - * Print function call entries within a stack. 'cond' is the - * "end of stackframe" condition, that the 'stack++' - * iteration will eventually trigger. - */ -#define HANDLE_STACK(cond) \ - do while (cond) { \ - unsigned long addr = *stack++; \ - /* Use unlocked access here because except for NMIs \ - we should be already protected against module unloads */ \ - if (__kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. \ - */ \ - ops->address(data, addr); \ - } \ - } while (0) +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (tsk == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp):); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) tsk->thread.sp; + } + } +#endif + + /* * Print function call entries in all stacks, starting at the @@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (estack_end) { if (ops->stack(data, id) < 0) break; - HANDLE_STACK (stack < estack_end); + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (stack >= irqstack && stack < irqstack_end) { if (ops->stack(data, "IRQ") < 0) break; - HANDLE_STACK (stack < irqstack_end); + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); /* * We link to the next stack (which would be * the process stack normally) the last @@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, /* * This handles the process stack: */ - tinfo = task_thread_info(tsk); - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); -#undef HANDLE_STACK + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name) return 0; } -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk_address(addr); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { @@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = { }; void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, + unsigned long bp) { unsigned long *stack; int i; @@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) // debugging aid: "show_stack(NULL, NULL);" prints the // back trace for this cpu. - if (rsp == NULL) { + if (sp == NULL) { if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; + sp = (unsigned long *)tsk->thread.sp; else - rsp = (unsigned long *)&rsp; + sp = (unsigned long *)&sp; } - stack = rsp; + stack = sp; for(i=0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { @@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, rsp); + show_trace(tsk, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +void show_stack(struct task_struct *tsk, unsigned long * sp) { - _show_stack(tsk, NULL, rsp); + _show_stack(tsk, NULL, sp, 0); } /* @@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy); + show_trace(NULL, NULL, &dummy, bp); } EXPORT_SYMBOL(dump_stack); @@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; + unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; + u8 *ip; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; - rsp = regs->rsp; + sp = regs->sp; + ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs) * When in-kernel, we also print out the stack and code at the * time of the fault.. */ - if (in_kernel) { + if (!user_mode(regs)) { + unsigned char c; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); - - printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) - goto bad; - - for (i=0; i<20; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + printk("\n"); + + printk(KERN_EMERG "Code: "); + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *) regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad RIP value."); break; } - printk("%02x ", c); + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); } -int is_valid_bugaddr(unsigned long rip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) return 0; return ud2 == 0x0b0f; } -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -EXPORT_SYMBOL(out_of_line_bug); -#endif - static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void) return flags; } -void __kprobes oops_end(unsigned long flags) +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { die_owner = -1; bust_spinlocks(0); @@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } if (panic_on_oops) panic("Fatal exception"); oops_exit(); + do_exit(signr); } -void __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); @@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); if (kexec_should_crash(current)) crash_kexec(regs); + return 0; } void die(const char * str, struct pt_regs * regs, long err) @@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip, regs); + report_bug(regs->ip, regs); - __die(str, regs, err); - oops_end(flags); - do_exit(SIGSEGV); + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); } void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags); + oops_end(flags, NULL, SIGBUS); nmi_exit(); local_irq_enable(); - do_exit(SIGSEGV); + do_exit(SIGBUS); } static void __kprobes do_trap(int trapnr, int signr, char *str, @@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.trap_no = trapnr; if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } if (info) force_sig_info(signr, info, tsk); @@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) - regs->rip = fixup->fixup; - else { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); } + return; } #define DO_ERROR(trapnr, signr, str, name) \ @@ -643,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -694,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, tsk); return; } - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } + if (fixup_exception(regs)) + return; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); } static __kprobes void @@ -832,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) + if (eregs == (struct pt_regs *)eregs->sp) ; /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. Move to kernel process stack. */ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) *regs = *eregs; return regs; @@ -858,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; @@ -873,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, tsk->thread.debugreg6 = condition; - /* Mask out spurious TF errors due to lazy TF clearing */ + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ if (!user_mode(regs)) goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } } /* Ok, finally something we can handle */ @@ -902,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: @@ -912,18 +948,15 @@ clear_dr7: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; + if (fixup_exception(regs)) return 1; - } + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); /* Illegal floating point operation in the kernel */ current->thread.trap_no = trapnr; @@ -938,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) */ asmlinkage void do_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short cwd, swd; @@ -958,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -1007,7 +1040,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short mxcsr; @@ -1027,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1089,6 +1122,7 @@ asmlinkage void math_state_restore(void) task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { @@ -1144,3 +1178,14 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); + + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 9ebc0dab66b..43517e324be 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -5,6 +5,7 @@ #include <linux/jiffies.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/percpu.h> #include <asm/delay.h> #include <asm/tsc.h> @@ -23,8 +24,6 @@ static int tsc_enabled; unsigned int tsc_khz; EXPORT_SYMBOL_GPL(tsc_khz); -int tsc_disable; - #ifdef CONFIG_X86_TSC static int __init tsc_setup(char *str) { @@ -39,8 +38,7 @@ static int __init tsc_setup(char *str) */ static int __init tsc_setup(char *str) { - tsc_disable = 1; - + setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif @@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long cpu_khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + /* + * Start smoothly with the new frequency: + */ + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } /* @@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz); + preempt_disable(); + set_cyc2ns_scale(cpu_khz, smp_processor_id()); + preempt_enable(); /* * TSC based sched_clock turns * to junk w/ cpufreq @@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void) { if (!cpu_has_tsc || tsc_unstable) return 1; + + /* Anything with constant TSC should be synchronized */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { - if (!cpu_has_tsc || tsc_disable) + int cpu; + + if (!cpu_has_tsc) goto out_no_tsc; cpu_khz = calculate_cpu_khz(); @@ -380,7 +405,15 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); - set_cyc2ns_scale(cpu_khz); + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + use_tsc_delay(); /* Check and install the TSC clocksource */ @@ -403,10 +436,5 @@ void __init tsc_init(void) return; out_no_tsc: - /* - * Set the tsc_disable flag if there's no TSC support, this - * makes it a fast flag for the kernel to see whether it - * should be using the TSC. - */ - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9c70af45b42..947554ddabb 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -10,6 +10,7 @@ #include <asm/hpet.h> #include <asm/timex.h> +#include <asm/timer.h> static int notsc __initdata = 0; @@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz); unsigned int tsc_khz; EXPORT_SYMBOL(tsc_khz); -static unsigned int cyc2ns_scale __read_mostly; +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; -} + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; -static unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long a = 0; @@ -44,12 +77,27 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + + static int tsc_unstable; -inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref); + preempt_disable(); + set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); + preempt_enable(); return 0; } @@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, int i; for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles_sync(); + t1 = get_cycles(); if (hpet) *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *pm = acpi_pm_read_early(); - t2 = get_cycles_sync(); + t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; } @@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, void __init tsc_calibrate(void) { unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); + int hpet = is_hpet_enabled(), cpu; local_irq_save(flags); @@ -162,9 +212,9 @@ void __init tsc_calibrate(void) outb(0xb0, 0x43); outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles_sync(); + tr1 = get_cycles(); while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles_sync(); + tr2 = get_cycles(); tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); @@ -206,7 +256,9 @@ void __init tsc_calibrate(void) } tsc_khz = tsc2 / tsc1; - set_cyc2ns_scale(tsc_khz); + + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu); } /* @@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void) if (apic_is_clustered_box()) return 1; #endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && - acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; - } /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; @@ -250,13 +294,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ static cycle_t read_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)get_cycles(); return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)vget_cycles(); return ret; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9125efe66a0..0577825cf89 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void) cycles_t start, now, prev, end; int i; - start = get_cycles_sync(); + start = get_cycles(); /* * The measurement runs for 20 msecs: */ @@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void) */ __raw_spin_lock(&sync_lock); prev = last_tsc; - now = get_cycles_sync(); + now = get_cycles(); last_tsc = now; __raw_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether - * measurement is done [we also insert a 100 million + * measurement is done [we also insert a 10 million * loops safety exit, so we dont lock up in case the * TSC readout is totally broken]): */ if (unlikely(!(i & 7))) { - if (now > end || i > 100000000) + if (now > end || i > 10000000) break; cpu_relax(); touch_nmi_watchdog(); @@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void) nr_warps++; __raw_spin_unlock(&sync_lock); } - + } + if (!(now-start)) { + printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + now-start, end-start); + WARN_ON(1); } } @@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - if (nr_warps) { printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; } else { printk(" passed.\n"); } /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + + /* * Let the target continue with the bootup: */ atomic_inc(&stop_count); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 157e4bedd3c..738c2104df3 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -70,10 +70,10 @@ /* * 8- and 16-bit register defines.. */ -#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) -#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) /* * virtual flags (16 and 32-bit versions) @@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xgs, so copy everything up to + /* kernel_vm86_regs is missing gs, so copy everything up to (but not including) orig_eax, and then rest including orig_eax. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax)); + offsetof(struct kernel_vm86_regs, pt.orig_ax)); return ret; } @@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - /* copy eax-xfs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - /* copy orig_eax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, + /* copy ax-fs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + /* copy orig_ax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax) + + offsetof(struct kernel_vm86_regs, pt.orig_ax) + extra); return ret; } -struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); -struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) +struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { struct tss_struct *tss; struct pt_regs *ret; @@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { @@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) } tss = &per_cpu(init_tss, get_cpu()); - current->thread.esp0 = current->thread.saved_esp0; + current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; - load_esp0(tss, ¤t->thread); - current->thread.saved_esp0 = 0; + load_sp0(tss, ¤t->thread); + current->thread.saved_sp0 = 0; put_cpu(); ret = KVM86->regs32; - ret->xfs = current->thread.saved_fs; + ret->fs = current->thread.saved_fs; loadsegment(gs, current->thread.saved_gs); return ret; @@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk asmlinkage int sys_vm86old(struct pt_regs regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs) int tmp, ret = -EPERM; tsk = current; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, vm86plus) - @@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs.ebx) { + switch (regs.bx) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); + ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); goto out; case VM86_PLUS_INSTALL_CHECK: /* NOTE: on old vm86 stuff this will return the error @@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs.ecx; + v86 = (struct vm86plus_struct __user *)regs.cx; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); @@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.pt.xds = 0; - info->regs.pt.xes = 0; - info->regs.pt.xfs = 0; + info->regs.pt.ds = 0; + info->regs.pt.es = 0; + info->regs.pt.fs = 0; /* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ /* - * The eflags register is also special: we cannot trust that the user + * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.pt.eflags; - info->regs.pt.eflags &= SAFE_MASK; - info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.pt.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.flags; + info->regs.pt.flags &= SAFE_MASK; + info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; + info->regs.pt.flags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%eax) to 0 + * Save old state, set default return value (%ax) to 0 */ - info->regs32->eax = 0; - tsk->thread.saved_esp0 = tsk->thread.esp0; - tsk->thread.saved_fs = info->regs32->xfs; + info->regs32->ax = 0; + tsk->thread.saved_sp0 = tsk->thread.sp0; + tsk->thread.saved_fs = info->regs32->fs; savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); - tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; - load_esp0(tss, &tsk->thread); + load_sp0(tss, &tsk->thread); put_cpu(); tsk->thread.screen_bitmap = info->screen_bitmap; @@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) struct pt_regs * regs32; regs32 = save_v86_state(regs16); - regs32->eax = retval; + regs32->ax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" "jmp resume_userspace" @@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~TF_MASK; + regs->pt.flags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~AC_MASK; + regs->pt.flags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* * functions. However someone forgot to call clear_IF(regs) * in the opposite case. * After the command sequence CLI PUSHF STI POPF you should - * end up with interrups disabled, but you ended up with + * end up with interrupts disabled, but you ended up with * interrupts enabled. * ( I was testing my own changes, but the only bug I * could find was in a function I had not changed. ) * [KD] */ -static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->pt.eflags, eflags, SAFE_MASK); - if (eflags & IF_MASK) + set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & IF_MASK) set_IF(regs); else clear_IF(regs); @@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->pt.eflags, flags, SAFE_MASK); + set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->pt.eflags & RETURN_MASK; + unsigned long flags = regs->pt.flags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->pt.xcs == BIOSSEG) + if (regs->pt.cs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->pt.xcs, cannot_handle); + pushw(ssp, sp, regs->pt.cs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->pt.xcs = segoffs >> 16; + regs->pt.cs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->pt.eflags; + orig_flags = *(unsigned short *)®s->pt.flags; - csp = (unsigned char __user *) (regs->pt.xcs << 4); - ssp = (unsigned char __user *) (regs->pt.xss << 4); + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); sp = SP(regs); ip = IP(regs); @@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->pt.xcs = newcs; + regs->pt.cs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index f02bad68aba..4525bc2c2e1 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -62,7 +62,10 @@ static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); - void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*write_idt_entry)(struct desc_struct *, int, u32, u32); + void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); + void (*write_ldt_entry)(struct desc_struct *, int, u32, u32); + void (*set_kernel_stack)(u32 selector, u32 sp0); void (*allocate_page)(u32, u32, u32, u32, u32); void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); @@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops; #define IRQ_PATCH_DISABLE 5 static inline void patch_offset(void *insnbuf, - unsigned long eip, unsigned long dest) + unsigned long ip, unsigned long dest) { - *(unsigned long *)(insnbuf+1) = dest-eip-5; + *(unsigned long *)(insnbuf+1) = dest-ip-5; } static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long eip) + unsigned long ip) { u64 reloc; struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; @@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, case VMI_RELOCATION_CALL_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_JUMP_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_NOP: @@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, * sequence. The callee does nop padding for us. */ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long eip, unsigned len) + unsigned long ip, unsigned len) { switch (type) { case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); + return patch_internal(VMI_CALL_IRET, len, insns, ip); + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): + return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); default: break; } @@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, } /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void vmi_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { int override = 0; - if (*eax == 1) + if (*ax == 1) override = 1; asm volatile ("call *%6" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); if (override) { if (disable_pse) - *edx &= ~X86_FEATURE_PSE; + *dx &= ~X86_FEATURE_PSE; if (disable_pge) - *edx &= ~X86_FEATURE_PGE; + *dx &= ~X86_FEATURE_PGE; if (disable_sep) - *edx &= ~X86_FEATURE_SEP; + *dx &= ~X86_FEATURE_SEP; if (disable_tsc) - *edx &= ~X86_FEATURE_TSC; + *dx &= ~X86_FEATURE_TSC; if (disable_mtrr) - *edx &= ~X86_FEATURE_MTRR; + *dx &= ~X86_FEATURE_MTRR; } } static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) { if (gdt[nr].a != new->a || gdt[nr].b != new->b) - write_gdt_entry(gdt, nr, new->a, new->b); + write_gdt_entry(gdt, nr, new, 0); } static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) @@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) static void vmi_set_ldt(const void *addr, unsigned entries) { unsigned cpu = smp_processor_id(); - u32 low, high; + struct desc_struct desc; - pack_descriptor(&low, &high, (unsigned long)addr, + pack_descriptor(&desc, (unsigned long)addr, entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + DESC_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); } @@ -214,17 +217,37 @@ static void vmi_set_tr(void) vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); } -static void vmi_load_esp0(struct tss_struct *tss, +static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) +{ + u32 *idt_entry = (u32 *)g; + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); +} + +static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + u32 *gdt_entry = (u32 *)desc; + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); +} + +static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, + const void *desc) +{ + u32 *ldt_entry = (u32 *)desc; + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); +} + +static void vmi_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->x86_tss.esp0 = thread->esp0; + tss->x86_tss.sp0 = thread->sp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); } static void vmi_flush_tlb_user(void) @@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(u32 pfn) +static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. @@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE - const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + const pte_t pte = { .pte = pmdval.pmd }; vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; @@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ - const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + const pte_t pte = { .pte = pudval.pgd.pgd }; vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } @@ -790,10 +813,13 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.store_idt, GetIDT); para_fill(pv_cpu_ops.store_tr, GetTR); pv_cpu_ops.load_tls = vmi_load_tls; - para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); - para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); - para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); - para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, + write_ldt_entry, WriteLDTEntry); + para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, + write_gdt_entry, WriteGDTEntry); + para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, + write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); @@ -870,7 +896,7 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP @@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg) return -EINVAL; if (!strcmp(arg, "disable_pge")) { - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); disable_pge = 1; } else if (!strcmp(arg, "disable_pse")) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); disable_pse = 1; } else if (!strcmp(arg, "disable_sep")) { - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); disable_sep = 1; } else if (!strcmp(arg, "disable_tsc")) { - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); disable_tsc = 1; } else if (!strcmp(arg, "disable_mtrr")) { - clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); disable_mtrr = 1; } else if (!strcmp(arg, "disable_timer")) { disable_vmi_timer = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index b1b5ab08b26..a2b030780aa 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -35,7 +35,6 @@ #include <asm/i8253.h> #include <irq_vectors.h> -#include "io_ports.h" #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) @@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 84c913f38f9..f1148ac8abe 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -8,12 +8,6 @@ * put it inside the section definition. */ -/* Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ #define LOAD_OFFSET __PAGE_OFFSET #include <asm-generic/vmlinux.lds.h> @@ -44,6 +38,8 @@ SECTIONS /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + . = ALIGN(4096); /* not really needed, already page aligned */ + *(.text.page_aligned) TEXT_TEXT SCHED_TEXT LOCK_TEXT diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ea5386944e6..0992b9946c6 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -37,16 +37,15 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } :text = 0x9090 - /* out-of-line lock text */ - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } - - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ + } :text = 0x9090 . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } NOTES :text :note @@ -179,6 +178,14 @@ SECTIONS } __con_initcall_end = .; SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + . = ALIGN(8); __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 414caf0c5f9..d971210a6d3 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -25,21 +25,24 @@ static int __init vsmp_init(void) return 0; /* Check if we are running on a ScaleMP vSMP box */ - if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || - (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) + if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != + PCI_VENDOR_ID_SCALEMP) || + (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != + PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) return 0; /* set vSMP magic bits to indicate vSMP capable kernel */ address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); cap = readl(address); ctl = readl(address + 4); - printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", + cap, ctl); if (cap & ctl & (1 << 4)) { /* Turn on vSMP IRQ fastpath handling (see system.h) */ ctl &= ~(1 << 4); writel(ctl, address + 4); ctl = readl(address + 4); - printk("vSMP CTL: control set to:0x%08x\n", ctl); + printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } iounmap(address); diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S deleted file mode 100644 index a5ab3dc4fd2..00000000000 --- a/arch/x86/kernel/vsyscall_32.S +++ /dev/null @@ -1,15 +0,0 @@ -#include <linux/init.h> - -__INITDATA - - .globl vsyscall_int80_start, vsyscall_int80_end -vsyscall_int80_start: - .incbin "arch/x86/kernel/vsyscall-int80_32.so" -vsyscall_int80_end: - - .globl vsyscall_sysenter_start, vsyscall_sysenter_end -vsyscall_sysenter_start: - .incbin "arch/x86/kernel/vsyscall-sysenter_32.so" -vsyscall_sysenter_end: - -__FINIT diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S deleted file mode 100644 index 4a8b0ed9b8f..00000000000 --- a/arch/x86/kernel/vsyscall_32.lds.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. - */ -#include <asm/asm-offsets.h> - -SECTIONS -{ - . = VDSO_PRELINK_asm + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK_asm + 0x400; - - .text : { *(.text) } :text =0x90909090 - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ad4005c6d4a..3f824277458 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -43,7 +43,7 @@ #include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" +#define __syscall_clobber "r11","cx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ extern char __vsyscall_0; \ @@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t) long __vsyscall(2) vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - unsigned int dummy, p; + unsigned int p; unsigned long j = 0; /* Fast cache - only recompute value once per jiffies and avoid @@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); @@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu) /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); *d = 0x0f40000000000ULL; *d |= cpu; *d |= (node & 0xf) << 12; @@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) return NOTIFY_DONE; } -static void __init map_vsyscall(void) +void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); @@ -335,7 +335,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 77c25b30763..a66e9c1a053 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/desc.h> EXPORT_SYMBOL(kernel_thread); @@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt); EXPORT_SYMBOL(load_gs_index); EXPORT_SYMBOL(_proxy_pda); + +#ifdef CONFIG_PARAVIRT +/* Virtualized guests may want to use it */ +EXPORT_SYMBOL_GPL(cpu_gdt_descr); +#endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig new file mode 100644 index 00000000000..c83e1c9b512 --- /dev/null +++ b/arch/x86/kvm/Kconfig @@ -0,0 +1,57 @@ +# +# KVM configuration +# +config HAVE_KVM + bool + +menuconfig VIRTUALIZATION + bool "Virtualization" + depends on HAVE_KVM || X86 + default y + ---help--- + Say Y here to get to see options for using your Linux host to run other + operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled. + +if VIRTUALIZATION + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on HAVE_KVM && EXPERIMENTAL + select PREEMPT_NOTIFIERS + select ANON_INODES + ---help--- + Support hosting fully virtualized guest machines using hardware + virtualization extensions. You will need a fairly recent + processor equipped with virtualization extensions. You will also + need to select one or more of the processor modules below. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + To compile this as a module, choose M here: the module + will be called kvm. + + If unsure, say N. + +config KVM_INTEL + tristate "KVM for Intel processors support" + depends on KVM + ---help--- + Provides support for KVM on Intel processors equipped with the VT + extensions. + +config KVM_AMD + tristate "KVM for AMD processors support" + depends on KVM + ---help--- + Provides support for KVM on AMD processors equipped with the AMD-V + (SVM) extensions. + +# OK, it's a little counter-intuitive to do this, but it puts it neatly under +# the virtualization menu. +source drivers/lguest/Kconfig + +endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile new file mode 100644 index 00000000000..ffdd0b31078 --- /dev/null +++ b/arch/x86/kvm/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) + +EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm + +kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o +obj-$(CONFIG_KVM) += kvm.o +kvm-intel-objs = vmx.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +kvm-amd-objs = svm.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c new file mode 100644 index 00000000000..ab29cf2def4 --- /dev/null +++ b/arch/x86/kvm/i8259.c @@ -0,0 +1,450 @@ +/* + * 8259 interrupt controller emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2007 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * Port from Qemu. + */ +#include <linux/mm.h> +#include "irq.h" + +#include <linux/kvm_host.h> + +/* + * set irq level. If an edge is detected, then the IRR is set to 1 + */ +static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) +{ + int mask; + mask = 1 << irq; + if (s->elcr & mask) /* level triggered */ + if (level) { + s->irr |= mask; + s->last_irr |= mask; + } else { + s->irr &= ~mask; + s->last_irr &= ~mask; + } + else /* edge triggered */ + if (level) { + if ((s->last_irr & mask) == 0) + s->irr |= mask; + s->last_irr |= mask; + } else + s->last_irr &= ~mask; +} + +/* + * return the highest priority found in mask (highest = smallest + * number). Return 8 if no irq + */ +static inline int get_priority(struct kvm_kpic_state *s, int mask) +{ + int priority; + if (mask == 0) + return 8; + priority = 0; + while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) + priority++; + return priority; +} + +/* + * return the pic wanted interrupt. return -1 if none + */ +static int pic_get_irq(struct kvm_kpic_state *s) +{ + int mask, cur_priority, priority; + + mask = s->irr & ~s->imr; + priority = get_priority(s, mask); + if (priority == 8) + return -1; + /* + * compute current priority. If special fully nested mode on the + * master, the IRQ coming from the slave is not taken into account + * for the priority computation. + */ + mask = s->isr; + if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) + mask &= ~(1 << 2); + cur_priority = get_priority(s, mask); + if (priority < cur_priority) + /* + * higher priority found: an irq should be generated + */ + return (priority + s->priority_add) & 7; + else + return -1; +} + +/* + * raise irq to CPU if necessary. must be called every time the active + * irq may change + */ +static void pic_update_irq(struct kvm_pic *s) +{ + int irq2, irq; + + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) { + /* + * if irq request by slave pic, signal master PIC + */ + pic_set_irq1(&s->pics[0], 2, 1); + pic_set_irq1(&s->pics[0], 2, 0); + } + irq = pic_get_irq(&s->pics[0]); + if (irq >= 0) + s->irq_request(s->irq_request_opaque, 1); + else + s->irq_request(s->irq_request_opaque, 0); +} + +void kvm_pic_update_irq(struct kvm_pic *s) +{ + pic_update_irq(s); +} + +void kvm_pic_set_irq(void *opaque, int irq, int level) +{ + struct kvm_pic *s = opaque; + + pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + pic_update_irq(s); +} + +/* + * acknowledge interrupt 'irq' + */ +static inline void pic_intack(struct kvm_kpic_state *s, int irq) +{ + if (s->auto_eoi) { + if (s->rotate_on_auto_eoi) + s->priority_add = (irq + 1) & 7; + } else + s->isr |= (1 << irq); + /* + * We don't clear a level sensitive interrupt here + */ + if (!(s->elcr & (1 << irq))) + s->irr &= ~(1 << irq); +} + +int kvm_pic_read_irq(struct kvm_pic *s) +{ + int irq, irq2, intno; + + irq = pic_get_irq(&s->pics[0]); + if (irq >= 0) { + pic_intack(&s->pics[0], irq); + if (irq == 2) { + irq2 = pic_get_irq(&s->pics[1]); + if (irq2 >= 0) + pic_intack(&s->pics[1], irq2); + else + /* + * spurious IRQ on slave controller + */ + irq2 = 7; + intno = s->pics[1].irq_base + irq2; + irq = irq2 + 8; + } else + intno = s->pics[0].irq_base + irq; + } else { + /* + * spurious IRQ on host controller + */ + irq = 7; + intno = s->pics[0].irq_base + irq; + } + pic_update_irq(s); + + return intno; +} + +void kvm_pic_reset(struct kvm_kpic_state *s) +{ + s->last_irr = 0; + s->irr = 0; + s->imr = 0; + s->isr = 0; + s->priority_add = 0; + s->irq_base = 0; + s->read_reg_select = 0; + s->poll = 0; + s->special_mask = 0; + s->init_state = 0; + s->auto_eoi = 0; + s->rotate_on_auto_eoi = 0; + s->special_fully_nested_mode = 0; + s->init4 = 0; +} + +static void pic_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + int priority, cmd, irq; + + addr &= 1; + if (addr == 0) { + if (val & 0x10) { + kvm_pic_reset(s); /* init */ + /* + * deassert a pending interrupt + */ + s->pics_state->irq_request(s->pics_state-> + irq_request_opaque, 0); + s->init_state = 1; + s->init4 = val & 1; + if (val & 0x02) + printk(KERN_ERR "single mode not supported"); + if (val & 0x08) + printk(KERN_ERR + "level sensitive irq not supported"); + } else if (val & 0x08) { + if (val & 0x04) + s->poll = 1; + if (val & 0x02) + s->read_reg_select = val & 1; + if (val & 0x40) + s->special_mask = (val >> 5) & 1; + } else { + cmd = val >> 5; + switch (cmd) { + case 0: + case 4: + s->rotate_on_auto_eoi = cmd >> 2; + break; + case 1: /* end of interrupt */ + case 5: + priority = get_priority(s, s->isr); + if (priority != 8) { + irq = (priority + s->priority_add) & 7; + s->isr &= ~(1 << irq); + if (cmd == 5) + s->priority_add = (irq + 1) & 7; + pic_update_irq(s->pics_state); + } + break; + case 3: + irq = val & 7; + s->isr &= ~(1 << irq); + pic_update_irq(s->pics_state); + break; + case 6: + s->priority_add = (val + 1) & 7; + pic_update_irq(s->pics_state); + break; + case 7: + irq = val & 7; + s->isr &= ~(1 << irq); + s->priority_add = (irq + 1) & 7; + pic_update_irq(s->pics_state); + break; + default: + break; /* no operation */ + } + } + } else + switch (s->init_state) { + case 0: /* normal mode */ + s->imr = val; + pic_update_irq(s->pics_state); + break; + case 1: + s->irq_base = val & 0xf8; + s->init_state = 2; + break; + case 2: + if (s->init4) + s->init_state = 3; + else + s->init_state = 0; + break; + case 3: + s->special_fully_nested_mode = (val >> 4) & 1; + s->auto_eoi = (val >> 1) & 1; + s->init_state = 0; + break; + } +} + +static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) +{ + int ret; + + ret = pic_get_irq(s); + if (ret >= 0) { + if (addr1 >> 7) { + s->pics_state->pics[0].isr &= ~(1 << 2); + s->pics_state->pics[0].irr &= ~(1 << 2); + } + s->irr &= ~(1 << ret); + s->isr &= ~(1 << ret); + if (addr1 >> 7 || ret != 2) + pic_update_irq(s->pics_state); + } else { + ret = 0x07; + pic_update_irq(s->pics_state); + } + + return ret; +} + +static u32 pic_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + unsigned int addr; + int ret; + + addr = addr1; + addr &= 1; + if (s->poll) { + ret = pic_poll_read(s, addr1); + s->poll = 0; + } else + if (addr == 0) + if (s->read_reg_select) + ret = s->isr; + else + ret = s->irr; + else + ret = s->imr; + return ret; +} + +static void elcr_ioport_write(void *opaque, u32 addr, u32 val) +{ + struct kvm_kpic_state *s = opaque; + s->elcr = val & s->elcr_mask; +} + +static u32 elcr_ioport_read(void *opaque, u32 addr1) +{ + struct kvm_kpic_state *s = opaque; + return s->elcr; +} + +static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) +{ + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + case 0x4d0: + case 0x4d1: + return 1; + default: + return 0; + } +} + +static void picdev_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *val) +{ + struct kvm_pic *s = this->private; + unsigned char data = *(unsigned char *)val; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte write\n"); + return; + } + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + pic_ioport_write(&s->pics[addr >> 7], addr, data); + break; + case 0x4d0: + case 0x4d1: + elcr_ioport_write(&s->pics[addr & 1], addr, data); + break; + } +} + +static void picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) +{ + struct kvm_pic *s = this->private; + unsigned char data = 0; + + if (len != 1) { + if (printk_ratelimit()) + printk(KERN_ERR "PIC: non byte read\n"); + return; + } + switch (addr) { + case 0x20: + case 0x21: + case 0xa0: + case 0xa1: + data = pic_ioport_read(&s->pics[addr >> 7], addr); + break; + case 0x4d0: + case 0x4d1: + data = elcr_ioport_read(&s->pics[addr & 1], addr); + break; + } + *(unsigned char *)val = data; +} + +/* + * callback when PIC0 irq status changed + */ +static void pic_irq_request(void *opaque, int level) +{ + struct kvm *kvm = opaque; + struct kvm_vcpu *vcpu = kvm->vcpus[0]; + + pic_irqchip(kvm)->output = level; + if (vcpu) + kvm_vcpu_kick(vcpu); +} + +struct kvm_pic *kvm_create_pic(struct kvm *kvm) +{ + struct kvm_pic *s; + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + if (!s) + return NULL; + s->pics[0].elcr_mask = 0xf8; + s->pics[1].elcr_mask = 0xde; + s->irq_request = pic_irq_request; + s->irq_request_opaque = kvm; + s->pics[0].pics_state = s; + s->pics[1].pics_state = s; + + /* + * Initialize PIO device + */ + s->dev.read = picdev_read; + s->dev.write = picdev_write; + s->dev.in_range = picdev_in_range; + s->dev.private = s; + kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); + return s; +} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c new file mode 100644 index 00000000000..e5714759e97 --- /dev/null +++ b/arch/x86/kvm/irq.c @@ -0,0 +1,78 @@ +/* + * irq.c: API for in kernel interrupt controller + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#include <linux/module.h> +#include <linux/kvm_host.h> + +#include "irq.h" + +/* + * check if there is pending interrupt without + * intack. + */ +int kvm_cpu_has_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + + if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); /* PIC */ + return s->output; + } else + return 0; + } + return 1; +} +EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); + +/* + * Read pending interrupt vector and intack. + */ +int kvm_cpu_get_interrupt(struct kvm_vcpu *v) +{ + struct kvm_pic *s; + int vector; + + vector = kvm_get_apic_interrupt(v); /* APIC */ + if (vector == -1) { + if (kvm_apic_accept_pic_intr(v)) { + s = pic_irqchip(v->kvm); + s->output = 0; /* PIC */ + vector = kvm_pic_read_irq(s); + } + } + return vector; +} +EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); + +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) +{ + kvm_inject_apic_timer_irqs(vcpu); + /* TODO: PIT, RTC etc. */ +} +EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); + +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) +{ + kvm_apic_timer_intr_post(vcpu, vec); + /* TODO: PIT, RTC etc. */ +} +EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h new file mode 100644 index 00000000000..fa5ed5d59b5 --- /dev/null +++ b/arch/x86/kvm/irq.h @@ -0,0 +1,88 @@ +/* + * irq.h: in kernel interrupt controller related definitions + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * Authors: + * Yaozu (Eddie) Dong <Eddie.dong@intel.com> + * + */ + +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/mm_types.h> +#include <linux/hrtimer.h> +#include <linux/kvm_host.h> + +#include "iodev.h" +#include "ioapic.h" +#include "lapic.h" + +struct kvm; +struct kvm_vcpu; + +typedef void irq_request_func(void *opaque, int level); + +struct kvm_kpic_state { + u8 last_irr; /* edge detection */ + u8 irr; /* interrupt request register */ + u8 imr; /* interrupt mask register */ + u8 isr; /* interrupt service register */ + u8 priority_add; /* highest irq priority */ + u8 irq_base; + u8 read_reg_select; + u8 poll; + u8 special_mask; + u8 init_state; + u8 auto_eoi; + u8 rotate_on_auto_eoi; + u8 special_fully_nested_mode; + u8 init4; /* true if 4 byte init */ + u8 elcr; /* PIIX edge/trigger selection */ + u8 elcr_mask; + struct kvm_pic *pics_state; +}; + +struct kvm_pic { + struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ + irq_request_func *irq_request; + void *irq_request_opaque; + int output; /* intr from master PIC */ + struct kvm_io_device dev; +}; + +struct kvm_pic *kvm_create_pic(struct kvm *kvm); +void kvm_pic_set_irq(void *opaque, int irq, int level); +int kvm_pic_read_irq(struct kvm_pic *s); +void kvm_pic_update_irq(struct kvm_pic *s); + +static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) +{ + return kvm->arch.vpic; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + return pic_irqchip(kvm) != NULL; +} + +void kvm_pic_reset(struct kvm_kpic_state *s); + +void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h new file mode 100644 index 00000000000..ecdfe97e463 --- /dev/null +++ b/arch/x86/kvm/kvm_svm.h @@ -0,0 +1,45 @@ +#ifndef __KVM_SVM_H +#define __KVM_SVM_H + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/kvm_host.h> +#include <asm/msr.h> + +#include "svm.h" + +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define NUM_DB_REGS 4 + +struct kvm_vcpu; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + + unsigned long db_regs[NUM_DB_REGS]; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + u64 host_gs_base; + unsigned long host_cr2; + unsigned long host_db_regs[NUM_DB_REGS]; + unsigned long host_dr6; + unsigned long host_dr7; +}; + +#endif + diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c new file mode 100644 index 00000000000..2cbee9479ce --- /dev/null +++ b/arch/x86/kvm/lapic.c @@ -0,0 +1,1154 @@ + +/* + * Local APIC virtualization + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2007 Novell + * Copyright (C) 2007 Intel + * + * Authors: + * Dor Laor <dor.laor@qumranet.com> + * Gregory Haskins <ghaskins@novell.com> + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + * + * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/smp.h> +#include <linux/hrtimer.h> +#include <linux/io.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/apicdef.h> +#include <asm/atomic.h> +#include <asm/div64.h> +#include "irq.h" + +#define PRId64 "d" +#define PRIx64 "llx" +#define PRIu64 "u" +#define PRIo64 "o" + +#define APIC_BUS_CYCLE_NS 1 + +/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ +#define apic_debug(fmt, arg...) + +#define APIC_LVT_NUM 6 +/* 14 is the version for Xeon and Pentium 8.4.8*/ +#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) +#define LAPIC_MMIO_LENGTH (1 << 12) +/* followed define is not in apicdef.h */ +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 +#define MAX_APIC_VECTOR 256 + +#define VEC_POS(v) ((v) & (32 - 1)) +#define REG_POS(v) (((v) >> 5) << 4) + +static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) +{ + return *((u32 *) (apic->regs + reg_off)); +} + +static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) +{ + *((u32 *) (apic->regs + reg_off)) = val; +} + +static inline int apic_test_and_set_vector(int vec, void *bitmap) +{ + return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_test_and_clear_vector(int vec, void *bitmap) +{ + return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_set_vector(int vec, void *bitmap) +{ + set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline void apic_clear_vector(int vec, void *bitmap) +{ + clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int apic_hw_enabled(struct kvm_lapic *apic) +{ + return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; +} + +static inline int apic_sw_enabled(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; +} + +static inline int apic_enabled(struct kvm_lapic *apic) +{ + return apic_sw_enabled(apic) && apic_hw_enabled(apic); +} + +#define LVT_MASK \ + (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) + +#define LINT_MASK \ + (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ + APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) + +static inline int kvm_apic_id(struct kvm_lapic *apic) +{ + return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; +} + +static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) +{ + return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); +} + +static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) +{ + return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; +} + +static inline int apic_lvtt_period(struct kvm_lapic *apic) +{ + return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; +} + +static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { + LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ + LVT_MASK | APIC_MODE_MASK, /* LVTPC */ + LINT_MASK, LINT_MASK, /* LVT0-1 */ + LVT_MASK /* LVTERR */ +}; + +static int find_highest_vector(void *bitmap) +{ + u32 *word = bitmap; + int word_offset = MAX_APIC_VECTOR >> 5; + + while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) + continue; + + if (likely(!word_offset && !word[0])) + return -1; + else + return fls(word[word_offset << 2]) - 1 + (word_offset << 5); +} + +static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) +{ + return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); +} + +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic_clear_vector(vec, apic->regs + APIC_IRR); +} + +static inline int apic_find_highest_irr(struct kvm_lapic *apic) +{ + int result; + + result = find_highest_vector(apic->regs + APIC_IRR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int highest_irr; + + if (!apic) + return 0; + highest_irr = apic_find_highest_irr(apic); + + return highest_irr; +} +EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); + +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic_test_and_set_irr(vec, apic)) { + /* a new pending irq is set in IRR */ + if (trig) + apic_set_vector(vec, apic->regs + APIC_TMR); + else + apic_clear_vector(vec, apic->regs + APIC_TMR); + kvm_vcpu_kick(apic->vcpu); + return 1; + } + return 0; +} + +static inline int apic_find_highest_isr(struct kvm_lapic *apic) +{ + int result; + + result = find_highest_vector(apic->regs + APIC_ISR); + ASSERT(result == -1 || result >= 16); + + return result; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 tpr, isrv, ppr; + int isr; + + tpr = apic_get_reg(apic, APIC_TASKPRI); + isr = apic_find_highest_isr(apic); + isrv = (isr != -1) ? isr : 0; + + if ((tpr & 0xf0) >= (isrv & 0xf0)) + ppr = tpr & 0xff; + else + ppr = isrv & 0xf0; + + apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", + apic, ppr, isr, isrv); + + apic_set_reg(apic, APIC_PROCPRI, ppr); +} + +static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) +{ + apic_set_reg(apic, APIC_TASKPRI, tpr); + apic_update_ppr(apic); +} + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) +{ + return kvm_apic_id(apic) == dest; +} + +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) +{ + int result = 0; + u8 logical_id; + + logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); + + switch (apic_get_reg(apic, APIC_DFR)) { + case APIC_DFR_FLAT: + if (logical_id & mda) + result = 1; + break; + case APIC_DFR_CLUSTER: + if (((logical_id >> 4) == (mda >> 0x4)) + && (logical_id & mda & 0xf)) + result = 1; + break; + default: + printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + break; + } + + return result; +} + +static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, + int short_hand, int dest, int dest_mode) +{ + int result = 0; + struct kvm_lapic *target = vcpu->arch.apic; + + apic_debug("target %p, source %p, dest 0x%x, " + "dest_mode 0x%x, short_hand 0x%x", + target, source, dest, dest_mode, short_hand); + + ASSERT(!target); + switch (short_hand) { + case APIC_DEST_NOSHORT: + if (dest_mode == 0) { + /* Physical mode. */ + if ((dest == 0xFF) || (dest == kvm_apic_id(target))) + result = 1; + } else + /* Logical mode. */ + result = kvm_apic_match_logical_addr(target, dest); + break; + case APIC_DEST_SELF: + if (target == source) + result = 1; + break; + case APIC_DEST_ALLINC: + result = 1; + break; + case APIC_DEST_ALLBUT: + if (target != source) + result = 1; + break; + default: + printk(KERN_WARNING "Bad dest shorthand value %x\n", + short_hand); + break; + } + + return result; +} + +/* + * Add a pending IRQ into lapic. + * Return 1 if successfully added and 0 if discarded. + */ +static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, + int vector, int level, int trig_mode) +{ + int orig_irr, result = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + + switch (delivery_mode) { + case APIC_DM_FIXED: + case APIC_DM_LOWEST: + /* FIXME add logic for vcpu on reset */ + if (unlikely(!apic_enabled(apic))) + break; + + orig_irr = apic_test_and_set_irr(vector, apic); + if (orig_irr && trig_mode) { + apic_debug("level trig mode repeatedly for vector %d", + vector); + break; + } + + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + + if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + kvm_vcpu_kick(vcpu); + else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + } + + result = (orig_irr == 0); + break; + + case APIC_DM_REMRD: + printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + break; + + case APIC_DM_SMI: + printk(KERN_DEBUG "Ignoring guest SMI\n"); + break; + case APIC_DM_NMI: + printk(KERN_DEBUG "Ignoring guest NMI\n"); + break; + + case APIC_DM_INIT: + if (level) { + if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + printk(KERN_DEBUG + "INIT on a runnable vcpu %d\n", + vcpu->vcpu_id); + vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; + kvm_vcpu_kick(vcpu); + } else { + printk(KERN_DEBUG + "Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); + } + + break; + + case APIC_DM_STARTUP: + printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); + if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + vcpu->arch.sipi_vector = vector; + vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + if (waitqueue_active(&vcpu->wq)) + wake_up_interruptible(&vcpu->wq); + } + break; + + default: + printk(KERN_ERR "TODO: unsupported delivery mode %x\n", + delivery_mode); + break; + } + return result; +} + +static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, + unsigned long bitmap) +{ + int last; + int next; + struct kvm_lapic *apic = NULL; + + last = kvm->arch.round_robin_prev_vcpu; + next = last; + + do { + if (++next == KVM_MAX_VCPUS) + next = 0; + if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) + continue; + apic = kvm->vcpus[next]->arch.apic; + if (apic && apic_enabled(apic)) + break; + apic = NULL; + } while (next != last); + kvm->arch.round_robin_prev_vcpu = next; + + if (!apic) + printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); + + return apic; +} + +struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, + unsigned long bitmap) +{ + struct kvm_lapic *apic; + + apic = kvm_apic_round_robin(kvm, vector, bitmap); + if (apic) + return apic->vcpu; + return NULL; +} + +static void apic_set_eoi(struct kvm_lapic *apic) +{ + int vector = apic_find_highest_isr(apic); + + /* + * Not every write EOI will has corresponding ISR, + * one example is when Kernel check timer on setup_IO_APIC + */ + if (vector == -1) + return; + + apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + + if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); +} + +static void apic_send_ipi(struct kvm_lapic *apic) +{ + u32 icr_low = apic_get_reg(apic, APIC_ICR); + u32 icr_high = apic_get_reg(apic, APIC_ICR2); + + unsigned int dest = GET_APIC_DEST_FIELD(icr_high); + unsigned int short_hand = icr_low & APIC_SHORT_MASK; + unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; + unsigned int level = icr_low & APIC_INT_ASSERT; + unsigned int dest_mode = icr_low & APIC_DEST_MASK; + unsigned int delivery_mode = icr_low & APIC_MODE_MASK; + unsigned int vector = icr_low & APIC_VECTOR_MASK; + + struct kvm_vcpu *target; + struct kvm_vcpu *vcpu; + unsigned long lpr_map = 0; + int i; + + apic_debug("icr_high 0x%x, icr_low 0x%x, " + "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + icr_high, icr_low, short_hand, dest, + trig_mode, level, dest_mode, delivery_mode, vector); + + for (i = 0; i < KVM_MAX_VCPUS; i++) { + vcpu = apic->vcpu->kvm->vcpus[i]; + if (!vcpu) + continue; + + if (vcpu->arch.apic && + apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { + if (delivery_mode == APIC_DM_LOWEST) + set_bit(vcpu->vcpu_id, &lpr_map); + else + __apic_accept_irq(vcpu->arch.apic, delivery_mode, + vector, level, trig_mode); + } + } + + if (delivery_mode == APIC_DM_LOWEST) { + target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); + if (target != NULL) + __apic_accept_irq(target->arch.apic, delivery_mode, + vector, level, trig_mode); + } +} + +static u32 apic_get_tmcct(struct kvm_lapic *apic) +{ + u64 counter_passed; + ktime_t passed, now; + u32 tmcct; + + ASSERT(apic != NULL); + + now = apic->timer.dev.base->get_time(); + tmcct = apic_get_reg(apic, APIC_TMICT); + + /* if initial count is 0, current count should also be 0 */ + if (tmcct == 0) + return 0; + + if (unlikely(ktime_to_ns(now) <= + ktime_to_ns(apic->timer.last_update))) { + /* Wrap around */ + passed = ktime_add(( { + (ktime_t) { + .tv64 = KTIME_MAX - + (apic->timer.last_update).tv64}; } + ), now); + apic_debug("time elapsed\n"); + } else + passed = ktime_sub(now, apic->timer.last_update); + + counter_passed = div64_64(ktime_to_ns(passed), + (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); + + if (counter_passed > tmcct) { + if (unlikely(!apic_lvtt_period(apic))) { + /* one-shot timers stick at 0 until reset */ + tmcct = 0; + } else { + /* + * periodic timers reset to APIC_TMICT when they + * hit 0. The while loop simulates this happening N + * times. (counter_passed %= tmcct) would also work, + * but might be slower or not work on 32-bit?? + */ + while (counter_passed > tmcct) + counter_passed -= tmcct; + tmcct -= counter_passed; + } + } else { + tmcct -= counter_passed; + } + + return tmcct; +} + +static void __report_tpr_access(struct kvm_lapic *apic, bool write) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct kvm_run *run = vcpu->run; + + set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_x86_ops->cache_regs(vcpu); + run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.is_write = write; +} + +static inline void report_tpr_access(struct kvm_lapic *apic, bool write) +{ + if (apic->vcpu->arch.tpr_access_reporting) + __report_tpr_access(apic, write); +} + +static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) +{ + u32 val = 0; + + if (offset >= LAPIC_MMIO_LENGTH) + return 0; + + switch (offset) { + case APIC_ARBPRI: + printk(KERN_WARNING "Access APIC ARBPRI register " + "which is for P6\n"); + break; + + case APIC_TMCCT: /* Timer CCR */ + val = apic_get_tmcct(apic); + break; + + case APIC_TASKPRI: + report_tpr_access(apic, false); + /* fall thru */ + default: + apic_update_ppr(apic); + val = apic_get_reg(apic, offset); + break; + } + + return val; +} + +static void apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0xf; + u32 result; + + if ((alignment + len) > 4) { + printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", + (unsigned long)address, len); + return; + } + result = __apic_read(apic, offset & ~0xf); + + switch (len) { + case 1: + case 2: + case 4: + memcpy(data, (char *)&result + alignment, len); + break; + default: + printk(KERN_ERR "Local APIC read with len = %x, " + "should be 1,2, or 4 instead\n", len); + break; + } +} + +static void update_divide_count(struct kvm_lapic *apic) +{ + u32 tmp1, tmp2, tdcr; + + tdcr = apic_get_reg(apic, APIC_TDCR); + tmp1 = tdcr & 0xf; + tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; + apic->timer.divide_count = 0x1 << (tmp2 & 0x7); + + apic_debug("timer divide count is 0x%x\n", + apic->timer.divide_count); +} + +static void start_apic_timer(struct kvm_lapic *apic) +{ + ktime_t now = apic->timer.dev.base->get_time(); + + apic->timer.last_update = now; + + apic->timer.period = apic_get_reg(apic, APIC_TMICT) * + APIC_BUS_CYCLE_NS * apic->timer.divide_count; + atomic_set(&apic->timer.pending, 0); + hrtimer_start(&apic->timer.dev, + ktime_add_ns(now, apic->timer.period), + HRTIMER_MODE_ABS); + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + apic_get_reg(apic, APIC_TMICT), + apic->timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->timer.period))); +} + +static void apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + unsigned int offset = address - apic->base_address; + unsigned char alignment = offset & 0xf; + u32 val; + + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || alignment) { + if (printk_ratelimit()) + printk(KERN_ERR "apic write: bad size=%d %lx\n", + len, (long)address); + return; + } + + val = *(u32 *) data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __FUNCTION__, offset, len, val); + + offset &= 0xff0; + + switch (offset) { + case APIC_ID: /* Local APIC ID */ + apic_set_reg(apic, APIC_ID, val); + break; + + case APIC_TASKPRI: + report_tpr_access(apic, true); + apic_set_tpr(apic, val & 0xff); + break; + + case APIC_EOI: + apic_set_eoi(apic); + break; + + case APIC_LDR: + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + break; + + case APIC_DFR: + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + break; + + case APIC_SPIV: + apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + if (!(val & APIC_SPIV_APIC_ENABLED)) { + int i; + u32 lvt_val; + + for (i = 0; i < APIC_LVT_NUM; i++) { + lvt_val = apic_get_reg(apic, + APIC_LVTT + 0x10 * i); + apic_set_reg(apic, APIC_LVTT + 0x10 * i, + lvt_val | APIC_LVT_MASKED); + } + atomic_set(&apic->timer.pending, 0); + + } + break; + + case APIC_ICR: + /* No delay here, so we always clear the pending bit */ + apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); + apic_send_ipi(apic); + break; + + case APIC_ICR2: + apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + break; + + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + /* TODO: Check vector */ + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + + val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; + apic_set_reg(apic, offset, val); + + break; + + case APIC_TMICT: + hrtimer_cancel(&apic->timer.dev); + apic_set_reg(apic, APIC_TMICT, val); + start_apic_timer(apic); + return; + + case APIC_TDCR: + if (val & 4) + printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_set_reg(apic, APIC_TDCR, val); + update_divide_count(apic); + break; + + default: + apic_debug("Local APIC Write to read-only register %x\n", + offset); + break; + } + +} + +static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) +{ + struct kvm_lapic *apic = (struct kvm_lapic *)this->private; + int ret = 0; + + + if (apic_hw_enabled(apic) && + (addr >= apic->base_address) && + (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) + ret = 1; + + return ret; +} + +void kvm_free_lapic(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apic) + return; + + hrtimer_cancel(&vcpu->arch.apic->timer.dev); + + if (vcpu->arch.apic->regs_page) + __free_page(vcpu->arch.apic->regs_page); + + kfree(vcpu->arch.apic); +} + +/* + *---------------------------------------------------------------------- + * LAPIC interface + *---------------------------------------------------------------------- + */ + +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic) + return; + apic_set_tpr(apic, ((cr8 & 0x0f) << 4) + | (apic_get_reg(apic, APIC_TASKPRI) & 4)); +} + +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u64 tpr; + + if (!apic) + return 0; + tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); + + return (tpr & 0xf0) >> 4; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); + +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic) { + value |= MSR_IA32_APICBASE_BSP; + vcpu->arch.apic_base = value; + return; + } + if (apic->vcpu->vcpu_id) + value &= ~MSR_IA32_APICBASE_BSP; + + vcpu->arch.apic_base = value; + apic->base_address = apic->vcpu->arch.apic_base & + MSR_IA32_APICBASE_BASE; + + /* with FSB delivery interrupt, we can restart APIC functionality */ + apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " + "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); + +} + +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic_base; +} +EXPORT_SYMBOL_GPL(kvm_lapic_get_base); + +void kvm_lapic_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + int i; + + apic_debug("%s\n", __FUNCTION__); + + ASSERT(vcpu); + apic = vcpu->arch.apic; + ASSERT(apic != NULL); + + /* Stop the timer in case it's a reset to an active apic */ + hrtimer_cancel(&apic->timer.dev); + + apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); + apic_set_reg(apic, APIC_LVR, APIC_VERSION); + + for (i = 0; i < APIC_LVT_NUM; i++) + apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + + apic_set_reg(apic, APIC_DFR, 0xffffffffU); + apic_set_reg(apic, APIC_SPIV, 0xff); + apic_set_reg(apic, APIC_TASKPRI, 0); + apic_set_reg(apic, APIC_LDR, 0); + apic_set_reg(apic, APIC_ESR, 0); + apic_set_reg(apic, APIC_ICR, 0); + apic_set_reg(apic, APIC_ICR2, 0); + apic_set_reg(apic, APIC_TDCR, 0); + apic_set_reg(apic, APIC_TMICT, 0); + for (i = 0; i < 8; i++) { + apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); + apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); + apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); + } + update_divide_count(apic); + atomic_set(&apic->timer.pending, 0); + if (vcpu->vcpu_id == 0) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + apic_update_ppr(apic); + + apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, + vcpu, kvm_apic_id(apic), + vcpu->arch.apic_base, apic->base_address); +} +EXPORT_SYMBOL_GPL(kvm_lapic_reset); + +int kvm_lapic_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int ret = 0; + + if (!apic) + return 0; + ret = apic_enabled(apic); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_lapic_enabled); + +/* + *---------------------------------------------------------------------- + * timer interface + *---------------------------------------------------------------------- + */ + +/* TODO: make sure __apic_timer_fn runs in current pCPU */ +static int __apic_timer_fn(struct kvm_lapic *apic) +{ + int result = 0; + wait_queue_head_t *q = &apic->vcpu->wq; + + atomic_inc(&apic->timer.pending); + if (waitqueue_active(q)) { + apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + wake_up_interruptible(q); + } + if (apic_lvtt_period(apic)) { + result = 1; + apic->timer.dev.expires = ktime_add_ns( + apic->timer.dev.expires, + apic->timer.period); + } + return result; +} + +static int __inject_apic_timer_irq(struct kvm_lapic *apic) +{ + int vector; + + vector = apic_lvt_vector(apic, APIC_LVTT); + return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); +} + +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) +{ + struct kvm_lapic *apic; + int restart_timer = 0; + + apic = container_of(data, struct kvm_lapic, timer.dev); + + restart_timer = __apic_timer_fn(apic); + + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + +int kvm_create_lapic(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic; + + ASSERT(vcpu != NULL); + apic_debug("apic_init %d\n", vcpu->vcpu_id); + + apic = kzalloc(sizeof(*apic), GFP_KERNEL); + if (!apic) + goto nomem; + + vcpu->arch.apic = apic; + + apic->regs_page = alloc_page(GFP_KERNEL); + if (apic->regs_page == NULL) { + printk(KERN_ERR "malloc apic regs error for vcpu %x\n", + vcpu->vcpu_id); + goto nomem_free_apic; + } + apic->regs = page_address(apic->regs_page); + memset(apic->regs, 0, PAGE_SIZE); + apic->vcpu = vcpu; + + hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + apic->timer.dev.function = apic_timer_fn; + apic->base_address = APIC_DEFAULT_PHYS_BASE; + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; + + kvm_lapic_reset(vcpu); + apic->dev.read = apic_mmio_read; + apic->dev.write = apic_mmio_write; + apic->dev.in_range = apic_mmio_range; + apic->dev.private = apic; + + return 0; +nomem_free_apic: + kfree(apic); +nomem: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(kvm_create_lapic); + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + int highest_irr; + + if (!apic || !apic_enabled(apic)) + return -1; + + apic_update_ppr(apic); + highest_irr = apic_find_highest_irr(apic); + if ((highest_irr == -1) || + ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) + return -1; + return highest_irr; +} + +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) +{ + u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); + int r = 0; + + if (vcpu->vcpu_id == 0) { + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; + } + return r; +} + +void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic && apic_lvt_enabled(apic, APIC_LVTT) && + atomic_read(&apic->timer.pending) > 0) { + if (__inject_apic_timer_irq(apic)) + atomic_dec(&apic->timer.pending); + } +} + +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) + apic->timer.last_update = ktime_add_ns( + apic->timer.last_update, + apic->timer.period); +} + +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +{ + int vector = kvm_apic_has_interrupt(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + + if (vector == -1) + return -1; + + apic_set_vector(vector, apic->regs + APIC_ISR); + apic_update_ppr(apic); + apic_clear_irr(vector, apic); + return vector; +} + +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + apic->base_address = vcpu->arch.apic_base & + MSR_IA32_APICBASE_BASE; + apic_set_reg(apic, APIC_LVR, APIC_VERSION); + apic_update_ppr(apic); + hrtimer_cancel(&apic->timer.dev); + update_divide_count(apic); + start_apic_timer(apic); +} + +void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct hrtimer *timer; + + if (!apic) + return; + + timer = &apic->timer.dev; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} + +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) +{ + u32 data; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); + kunmap_atomic(vapic, KM_USER0); + + apic_set_tpr(vcpu->arch.apic, data & 0xff); +} + +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) +{ + u32 data, tpr; + int max_irr, max_isr; + struct kvm_lapic *apic; + void *vapic; + + if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + return; + + apic = vcpu->arch.apic; + tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; + max_irr = apic_find_highest_irr(apic); + if (max_irr < 0) + max_irr = 0; + max_isr = apic_find_highest_isr(apic); + if (max_isr < 0) + max_isr = 0; + data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); + + vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); + *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; + kunmap_atomic(vapic, KM_USER0); +} + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +{ + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + vcpu->arch.apic->vapic_addr = vapic_addr; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h new file mode 100644 index 00000000000..676c396c9ce --- /dev/null +++ b/arch/x86/kvm/lapic.h @@ -0,0 +1,50 @@ +#ifndef __KVM_X86_LAPIC_H +#define __KVM_X86_LAPIC_H + +#include "iodev.h" + +#include <linux/kvm_host.h> + +struct kvm_lapic { + unsigned long base_address; + struct kvm_io_device dev; + struct { + atomic_t pending; + s64 period; /* unit: ns */ + u32 divide_count; + ktime_t last_update; + struct hrtimer dev; + } timer; + struct kvm_vcpu *vcpu; + struct page *regs_page; + void *regs; + gpa_t vapic_addr; + struct page *vapic_page; +}; +int kvm_create_lapic(struct kvm_vcpu *vcpu); +void kvm_free_lapic(struct kvm_vcpu *vcpu); + +int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); +int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu); +u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); +void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); + +int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); +int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); +int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); +int kvm_lapic_enabled(struct kvm_vcpu *vcpu); +int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); + +void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); +void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c new file mode 100644 index 00000000000..8efdcdbebb0 --- /dev/null +++ b/arch/x86/kvm/mmu.c @@ -0,0 +1,1885 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "vmx.h" +#include "mmu.h" + +#include <linux/kvm_host.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/swap.h> + +#include <asm/page.h> +#include <asm/cmpxchg.h> +#include <asm/io.h> + +#undef MMU_DEBUG + +#undef AUDIT + +#ifdef AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} +#endif + +#ifdef MMU_DEBUG + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) + +#else + +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) do { } while (0) + +#endif + +#if defined(MMU_DEBUG) || defined(AUDIT) +static int dbg = 1; +#endif + +#ifndef MMU_DEBUG +#define ASSERT(x) do { } while (0) +#else +#define ASSERT(x) \ + if (!(x)) { \ + printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ + __FILE__, __LINE__, #x); \ + } +#endif + +#define PT64_PT_BITS 9 +#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) +#define PT32_PT_BITS 10 +#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) + +#define PT_WRITABLE_SHIFT 1 + +#define PT_PRESENT_MASK (1ULL << 0) +#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) +#define PT_USER_MASK (1ULL << 2) +#define PT_PWT_MASK (1ULL << 3) +#define PT_PCD_MASK (1ULL << 4) +#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_DIRTY_MASK (1ULL << 6) +#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAT_MASK (1ULL << 7) +#define PT_GLOBAL_MASK (1ULL << 8) +#define PT64_NX_SHIFT 63 +#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) + +#define PT_PAT_SHIFT 7 +#define PT_DIR_PAT_SHIFT 12 +#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) + +#define PT32_DIR_PSE36_SIZE 4 +#define PT32_DIR_PSE36_SHIFT 13 +#define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) + + +#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT64_SECOND_AVAIL_BITS_SHIFT 52 + +#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) + +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_LEVEL_MASK(level) \ + (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) + + +#define PT32_LEVEL_BITS 10 + +#define PT32_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) + +#define PT32_LEVEL_MASK(level) \ + (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) + +#define PT32_INDEX(address, level)\ + (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) + + +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_DIR_BASE_ADDR_MASK \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) + +#define PT32_BASE_ADDR_MASK PAGE_MASK +#define PT32_DIR_BASE_ADDR_MASK \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ + | PT64_NX_MASK) + +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT64_ROOT_LEVEL 4 +#define PT32_ROOT_LEVEL 2 +#define PT32E_ROOT_LEVEL 3 + +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + +#define RMAP_EXT 4 + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +struct kvm_rmap_desc { + u64 *shadow_ptes[RMAP_EXT]; + struct kvm_rmap_desc *more; +}; + +static struct kmem_cache *pte_chain_cache; +static struct kmem_cache *rmap_desc_cache; +static struct kmem_cache *mmu_page_header_cache; + +static u64 __read_mostly shadow_trap_nonpresent_pte; +static u64 __read_mostly shadow_notrap_nonpresent_pte; + +void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) +{ + shadow_trap_nonpresent_pte = trap_pte; + shadow_notrap_nonpresent_pte = notrap_pte; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); + +static int is_write_protection(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr0 & X86_CR0_WP; +} + +static int is_cpuid_PSE36(void) +{ + return 1; +} + +static int is_nx(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shadow_efer & EFER_NX; +} + +static int is_present_pte(unsigned long pte) +{ + return pte & PT_PRESENT_MASK; +} + +static int is_shadow_present_pte(u64 pte) +{ + pte &= ~PT_SHADOW_IO_MARK; + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static int is_writeble_pte(unsigned long pte) +{ + return pte & PT_WRITABLE_MASK; +} + +static int is_dirty_pte(unsigned long pte) +{ + return pte & PT_DIRTY_MASK; +} + +static int is_io_pte(unsigned long pte) +{ + return pte & PT_SHADOW_IO_MARK; +} + +static int is_rmap_pte(u64 pte) +{ + return pte != shadow_trap_nonpresent_pte + && pte != shadow_notrap_nonpresent_pte; +} + +static gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} + +static void set_shadow_pte(u64 *sptep, u64 spte) +{ +#ifdef CONFIG_X86_64 + set_64bit((unsigned long *)sptep, spte); +#else + set_64bit((unsigned long long *)sptep, spte); +#endif +} + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + struct kmem_cache *base_cache, int min) +{ + void *obj; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + if (!obj) + return -ENOMEM; + cache->objects[cache->nobjs++] = obj; + } + return 0; +} + +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + kfree(mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, + int min) +{ + struct page *page; + + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < ARRAY_SIZE(cache->objects)) { + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + set_page_private(page, 0); + cache->objects[cache->nobjs++] = page_address(page); + } + return 0; +} + +static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); +} + +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, + pte_chain_cache, 4); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, + rmap_desc_cache, 1); + if (r) + goto out; + r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + if (r) + goto out; + r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache, 4); +out: + return r; +} + +static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) +{ + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); +} + +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, + size_t size) +{ + void *p; + + BUG_ON(!mc->nobjs); + p = mc->objects[--mc->nobjs]; + memset(p, 0, size); + return p; +} + +static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, + sizeof(struct kvm_pte_chain)); +} + +static void mmu_free_pte_chain(struct kvm_pte_chain *pc) +{ + kfree(pc); +} + +static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) +{ + return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, + sizeof(struct kvm_rmap_desc)); +} + +static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) +{ + kfree(rd); +} + +/* + * Take gfn and return the reverse mapping to it. + * Note: gfn must be unaliased before this function get called + */ + +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return &slot->rmap[gfn - slot->base_gfn]; +} + +/* + * Reverse mapping data structures: + * + * If rmapp bit zero is zero, then rmapp point to the shadw page table entry + * that points to page_address(page). + * + * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc + * containing more mappings. + */ +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + struct kvm_rmap_desc *desc; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + gfn = unalias_gfn(vcpu->kvm, gfn); + sp = page_header(__pa(spte)); + sp->gfns[spte - sp->spt] = gfn; + rmapp = gfn_to_rmap(vcpu->kvm, gfn); + if (!*rmapp) { + rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); + *rmapp = (unsigned long)spte; + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); + desc = mmu_alloc_rmap_desc(vcpu); + desc->shadow_ptes[0] = (u64 *)*rmapp; + desc->shadow_ptes[1] = spte; + *rmapp = (unsigned long)desc | 1; + } else { + rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) + desc = desc->more; + if (desc->shadow_ptes[RMAP_EXT-1]) { + desc->more = mmu_alloc_rmap_desc(vcpu); + desc = desc->more; + } + for (i = 0; desc->shadow_ptes[i]; ++i) + ; + desc->shadow_ptes[i] = spte; + } +} + +static void rmap_desc_remove_entry(unsigned long *rmapp, + struct kvm_rmap_desc *desc, + int i, + struct kvm_rmap_desc *prev_desc) +{ + int j; + + for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + ; + desc->shadow_ptes[i] = desc->shadow_ptes[j]; + desc->shadow_ptes[j] = NULL; + if (j != 0) + return; + if (!prev_desc && !desc->more) + *rmapp = (unsigned long)desc->shadow_ptes[0]; + else + if (prev_desc) + prev_desc->more = desc->more; + else + *rmapp = (unsigned long)desc->more | 1; + mmu_free_rmap_desc(desc); +} + +static void rmap_remove(struct kvm *kvm, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + struct kvm_mmu_page *sp; + struct page *page; + unsigned long *rmapp; + int i; + + if (!is_rmap_pte(*spte)) + return; + sp = page_header(__pa(spte)); + page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + mark_page_accessed(page); + if (is_writeble_pte(*spte)) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); + if (!*rmapp) { + printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + BUG(); + } else if (!(*rmapp & 1)) { + rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + if ((u64 *)*rmapp != spte) { + printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", + spte, *spte); + BUG(); + } + *rmapp = 0; + } else { + rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) + if (desc->shadow_ptes[i] == spte) { + rmap_desc_remove_entry(rmapp, + desc, i, + prev_desc); + return; + } + prev_desc = desc; + desc = desc->more; + } + BUG(); + } +} + +static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +{ + struct kvm_rmap_desc *desc; + struct kvm_rmap_desc *prev_desc; + u64 *prev_spte; + int i; + + if (!*rmapp) + return NULL; + else if (!(*rmapp & 1)) { + if (!spte) + return (u64 *)*rmapp; + return NULL; + } + desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + prev_desc = NULL; + prev_spte = NULL; + while (desc) { + for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + if (prev_spte == spte) + return desc->shadow_ptes[i]; + prev_spte = desc->shadow_ptes[i]; + } + desc = desc->more; + } + return NULL; +} + +static void rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + unsigned long *rmapp; + u64 *spte; + int write_protected = 0; + + gfn = unalias_gfn(kvm, gfn); + rmapp = gfn_to_rmap(kvm, gfn); + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + if (is_writeble_pte(*spte)) { + set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); + } + if (write_protected) + kvm_flush_remote_tlbs(kvm); +} + +#ifdef MMU_DEBUG +static int is_empty_shadow_page(u64 *spt) +{ + u64 *pos; + u64 *end; + + for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) + if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { + printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + pos, *pos); + return 0; + } + return 1; +} +#endif + +static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ASSERT(is_empty_shadow_page(sp->spt)); + list_del(&sp->link); + __free_page(virt_to_page(sp->spt)); + __free_page(virt_to_page(sp->gfns)); + kfree(sp); + ++kvm->arch.n_free_mmu_pages; +} + +static unsigned kvm_page_table_hashfn(gfn_t gfn) +{ + return gfn; +} + +static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, + u64 *parent_pte) +{ + struct kvm_mmu_page *sp; + + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + ASSERT(is_empty_shadow_page(sp->spt)); + sp->slot_bitmap = 0; + sp->multimapped = 0; + sp->parent_pte = parent_pte; + --vcpu->kvm->arch.n_free_mmu_pages; + return sp; +} + +static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!parent_pte) + return; + if (!sp->multimapped) { + u64 *old = sp->parent_pte; + + if (!old) { + sp->parent_pte = parent_pte; + return; + } + sp->multimapped = 1; + pte_chain = mmu_alloc_pte_chain(vcpu); + INIT_HLIST_HEAD(&sp->parent_ptes); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = old; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { + if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) + continue; + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) + if (!pte_chain->parent_ptes[i]) { + pte_chain->parent_ptes[i] = parent_pte; + return; + } + } + pte_chain = mmu_alloc_pte_chain(vcpu); + BUG_ON(!pte_chain); + hlist_add_head(&pte_chain->link, &sp->parent_ptes); + pte_chain->parent_ptes[0] = parent_pte; +} + +static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, + u64 *parent_pte) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->multimapped) { + BUG_ON(sp->parent_pte != parent_pte); + sp->parent_pte = NULL; + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + if (pte_chain->parent_ptes[i] != parent_pte) + continue; + while (i + 1 < NR_PTE_CHAIN_ENTRIES + && pte_chain->parent_ptes[i + 1]) { + pte_chain->parent_ptes[i] + = pte_chain->parent_ptes[i + 1]; + ++i; + } + pte_chain->parent_ptes[i] = NULL; + if (i == 0) { + hlist_del(&pte_chain->link); + mmu_free_pte_chain(pte_chain); + if (hlist_empty(&sp->parent_ptes)) { + sp->multimapped = 0; + sp->parent_pte = NULL; + } + } + return; + } + BUG(); +} + +static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: found role %x\n", + __FUNCTION__, sp->role.word); + return sp; + } + return NULL; +} + +static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + gva_t gaddr, + unsigned level, + int metaphysical, + unsigned access, + u64 *parent_pte, + bool *new_page) +{ + union kvm_mmu_page_role role; + unsigned index; + unsigned quadrant; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node; + + role.word = 0; + role.glevels = vcpu->arch.mmu.root_level; + role.level = level; + role.metaphysical = metaphysical; + role.access = access; + if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); + quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; + role.quadrant = quadrant; + } + pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + gfn, role.word); + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry(sp, node, bucket, hash_link) + if (sp->gfn == gfn && sp->role.word == role.word) { + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + pgprintk("%s: found\n", __FUNCTION__); + return sp; + } + ++vcpu->kvm->stat.mmu_cache_miss; + sp = kvm_mmu_alloc_page(vcpu, parent_pte); + if (!sp) + return sp; + pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + sp->gfn = gfn; + sp->role = role; + hlist_add_head(&sp->hash_link, bucket); + vcpu->arch.mmu.prefetch_page(vcpu, sp); + if (!metaphysical) + rmap_write_protect(vcpu->kvm, gfn); + if (new_page) + *new_page = 1; + return sp; +} + +static void kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp) +{ + unsigned i; + u64 *pt; + u64 ent; + + pt = sp->spt; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (is_shadow_present_pte(pt[i])) + rmap_remove(kvm, &pt[i]); + pt[i] = shadow_trap_nonpresent_pte; + } + kvm_flush_remote_tlbs(kvm); + return; + } + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + ent = pt[i]; + + pt[i] = shadow_trap_nonpresent_pte; + if (!is_shadow_present_pte(ent)) + continue; + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), &pt[i]); + } + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) +{ + mmu_page_remove_parent_pte(sp, parent_pte); +} + +static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm->vcpus[i]->arch.last_pte_updated = NULL; +} + +static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + u64 *parent_pte; + + ++kvm->stat.mmu_shadow_zapped; + while (sp->multimapped || sp->parent_pte) { + if (!sp->multimapped) + parent_pte = sp->parent_pte; + else { + struct kvm_pte_chain *chain; + + chain = container_of(sp->parent_ptes.first, + struct kvm_pte_chain, link); + parent_pte = chain->parent_ptes[0]; + } + BUG_ON(!parent_pte); + kvm_mmu_put_page(sp, parent_pte); + set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + } + kvm_mmu_page_unlink_children(kvm, sp); + if (!sp->root_count) { + hlist_del(&sp->hash_link); + kvm_mmu_free_page(kvm, sp); + } else + list_move(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mmu_reset_last_pte_updated(kvm); +} + +/* + * Changing the number of mmu pages allocated to the vm + * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + */ +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +{ + /* + * If we set the number of mmu pages to be smaller be than the + * number of actived pages , we must to free some mmu pages before we + * change the value + */ + + if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > + kvm_nr_mmu_pages) { + int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages + - kvm->arch.n_free_mmu_pages; + + while (n_used_mmu_pages > kvm_nr_mmu_pages) { + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(kvm, page); + n_used_mmu_pages--; + } + kvm->arch.n_free_mmu_pages = 0; + } + else + kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages + - kvm->arch.n_alloc_mmu_pages; + + kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; +} + +static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + int r; + + pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + r = 0; + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) + if (sp->gfn == gfn && !sp->role.metaphysical) { + pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + sp->role.word); + kvm_mmu_zap_page(kvm, sp); + r = 1; + } + return r; +} + +static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_mmu_page *sp; + + while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { + pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); + kvm_mmu_zap_page(kvm, sp); + } +} + +static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) +{ + int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + struct kvm_mmu_page *sp = page_header(__pa(pte)); + + __set_bit(slot, &sp->slot_bitmap); +} + +struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + + if (gpa == UNMAPPED_GVA) + return NULL; + return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); +} + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, gfn_t gfn, struct page *page) +{ + u64 spte; + int was_rmapped = is_rmap_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __FUNCTION__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + /* + * We don't set the accessed bit, since we sometimes want to see + * whether the guest actually used the pte (in order to detect + * demand paging). + */ + spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + if (!dirty) + pte_access &= ~ACC_WRITE_MASK; + if (!(pte_access & ACC_EXEC_MASK)) + spte |= PT64_NX_MASK; + + spte |= PT_PRESENT_MASK; + if (pte_access & ACC_USER_MASK) + spte |= PT_USER_MASK; + + if (is_error_page(page)) { + set_shadow_pte(shadow_pte, + shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); + kvm_release_page_clean(page); + return; + } + + spte |= page_to_phys(page); + + if ((pte_access & ACC_WRITE_MASK) + || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + struct kvm_mmu_page *shadow; + + spte |= PT_WRITABLE_MASK; + if (user_fault) { + mmu_unshadow(vcpu->kvm, gfn); + goto unshadowed; + } + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + pgprintk("%s: found shadow page for %lx, marking ro\n", + __FUNCTION__, gfn); + pte_access &= ~ACC_WRITE_MASK; + if (is_writeble_pte(spte)) { + spte &= ~PT_WRITABLE_MASK; + kvm_x86_ops->tlb_flush(vcpu); + } + if (write_fault) + *ptwrite = 1; + } + } + +unshadowed: + + if (pte_access & ACC_WRITE_MASK) + mark_page_dirty(vcpu->kvm, gfn); + + pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); + set_shadow_pte(shadow_pte, spte); + page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + if (!was_rmapped) { + rmap_add(vcpu, shadow_pte, gfn); + if (!is_rmap_pte(*shadow_pte)) + kvm_release_page_clean(page); + } else { + if (was_writeble) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); + } + if (!ptwrite || !*ptwrite) + vcpu->arch.last_pte_updated = shadow_pte; +} + +static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) +{ +} + +static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, + gfn_t gfn, struct page *page) +{ + int level = PT32E_ROOT_LEVEL; + hpa_t table_addr = vcpu->arch.mmu.root_hpa; + int pt_write = 0; + + for (; ; level--) { + u32 index = PT64_INDEX(v, level); + u64 *table; + + ASSERT(VALID_PAGE(table_addr)); + table = __va(table_addr); + + if (level == 1) { + mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, gfn, page); + return pt_write || is_io_pte(table[index]); + } + + if (table[index] == shadow_trap_nonpresent_pte) { + struct kvm_mmu_page *new_table; + gfn_t pseudo_gfn; + + pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) + >> PAGE_SHIFT; + new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, + v, level - 1, + 1, ACC_ALL, &table[index], + NULL); + if (!new_table) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_page_clean(page); + return -ENOMEM; + } + + table[index] = __pa(new_table->spt) | PT_PRESENT_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + } + table_addr = table[index] & PT64_BASE_ADDR_MASK; + } +} + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +{ + int r; + + struct page *page; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, gfn); + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = __nonpaging_map(vcpu, v, write, gfn, page); + spin_unlock(&vcpu->kvm->mmu_lock); + + up_read(¤t->mm->mmap_sem); + + return r; +} + + +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + +static void mmu_free_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + spin_lock(&vcpu->kvm->mmu_lock); +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + --sp->root_count; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + spin_unlock(&vcpu->kvm->mmu_lock); + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + --sp->root_count; + } + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + } + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; +} + +static void mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + int i; + gfn_t root_gfn; + struct kvm_mmu_page *sp; + + root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + ASSERT(!VALID_PAGE(root)); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.root_hpa = root; + return; + } +#endif + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + if (!is_present_pte(vcpu->arch.pdptrs[i])) { + vcpu->arch.mmu.pae_root[i] = 0; + continue; + } + root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + } else if (vcpu->arch.mmu.root_level == 0) + root_gfn = 0; + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, + PT32_ROOT_LEVEL, !is_paging(vcpu), + ACC_ALL, NULL, NULL); + root = __pa(sp->spt); + ++sp->root_count; + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); +} + +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + return vaddr; +} + +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) +{ + gfn_t gfn; + int r; + + pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + gfn = gva >> PAGE_SHIFT; + + return nonpaging_map(vcpu, gva & PAGE_MASK, + error_code & PFERR_WRITE_MASK, gfn); +} + +static void nonpaging_free(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static int nonpaging_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = nonpaging_page_fault; + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->root_level = 0; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu); +} + +static void paging_new_cr3(struct kvm_vcpu *vcpu) +{ + pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); +} + +static void inject_page_fault(struct kvm_vcpu *vcpu, + u64 addr, + u32 err_code) +{ + kvm_inject_page_fault(vcpu, addr, err_code); +} + +static void paging_free(struct kvm_vcpu *vcpu) +{ + nonpaging_free(vcpu); +} + +#define PTTYPE 64 +#include "paging_tmpl.h" +#undef PTTYPE + +#define PTTYPE 32 +#include "paging_tmpl.h" +#undef PTTYPE + +static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + ASSERT(is_pae(vcpu)); + context->new_cr3 = paging_new_cr3; + context->page_fault = paging64_page_fault; + context->gva_to_gpa = paging64_gva_to_gpa; + context->prefetch_page = paging64_prefetch_page; + context->free = paging_free; + context->root_level = level; + context->shadow_root_level = level; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging64_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); +} + +static int paging32_init_context(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = paging_new_cr3; + context->page_fault = paging32_page_fault; + context->gva_to_gpa = paging32_gva_to_gpa; + context->free = paging_free; + context->prefetch_page = paging32_prefetch_page; + context->root_level = PT32_ROOT_LEVEL; + context->shadow_root_level = PT32E_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + return 0; +} + +static int paging32E_init_context(struct kvm_vcpu *vcpu) +{ + return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); +} + +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + if (!is_paging(vcpu)) + return nonpaging_init_context(vcpu); + else if (is_long_mode(vcpu)) + return paging64_init_context(vcpu); + else if (is_pae(vcpu)) + return paging32E_init_context(vcpu); + else + return paging32_init_context(vcpu); +} + +static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + vcpu->arch.mmu.free(vcpu); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + } +} + +int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) +{ + destroy_kvm_mmu(vcpu); + return init_kvm_mmu(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); + +int kvm_mmu_load(struct kvm_vcpu *vcpu) +{ + int r; + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + mmu_alloc_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_flush_tlb(vcpu); +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_load); + +void kvm_mmu_unload(struct kvm_vcpu *vcpu) +{ + mmu_free_roots(vcpu); +} + +static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_shadow_present_pte(pte)) { + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu->kvm, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, spte); + } + } + set_shadow_pte(spte, shadow_trap_nonpresent_pte); +} + +static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *spte, + const void *new, int bytes, + int offset_in_pte) +{ + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + + ++vcpu->kvm->stat.mmu_pte_updated; + if (sp->role.glevels == PT32_ROOT_LEVEL) + paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + else + paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); +} + +static bool need_remote_flush(u64 old, u64 new) +{ + if (!is_shadow_present_pte(old)) + return false; + if (!is_shadow_present_pte(new)) + return true; + if ((old ^ new) & PT64_BASE_ADDR_MASK) + return true; + old ^= PT64_NX_MASK; + new ^= PT64_NX_MASK; + return (old & ~new & PT64_PERM_MASK) != 0; +} + +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +{ + if (need_remote_flush(old, new)) + kvm_flush_remote_tlbs(vcpu->kvm); + else + kvm_mmu_flush_tlb(vcpu); +} + +static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + return !!(spte && (*spte & PT_ACCESSED_MASK)); +} + +static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn; + int r; + u64 gpte = 0; + + if (bytes != 4 && bytes != 8) + return; + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if (is_pae(vcpu)) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if ((bytes == 4) && (gpa % 4 == 0)) { + r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); + if (r) + return; + memcpy((void *)&gpte + (gpa % 8), new, 4); + } else if ((bytes == 8) && (gpa % 8 == 0)) { + memcpy((void *)&gpte, new, 8); + } + } else { + if ((bytes == 4) && (gpa % 4 == 0)) + memcpy((void *)&gpte, new, 4); + } + if (!is_present_pte(gpte)) + return; + gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + vcpu->arch.update_pte.gfn = gfn; + vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + struct kvm_mmu_page *sp; + struct hlist_node *node, *n; + struct hlist_head *bucket; + unsigned index; + u64 entry; + u64 *spte; + unsigned offset = offset_in_page(gpa); + unsigned pte_size; + unsigned page_offset; + unsigned misaligned; + unsigned quadrant; + int level; + int flooded = 0; + int npte; + + pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + ++vcpu->kvm->stat.mmu_pte_write; + kvm_mmu_audit(vcpu, "pre pte write"); + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } + index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { + if (sp->gfn != gfn || sp->role.metaphysical) + continue; + pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + if (misaligned || flooded) { + /* + * Misaligned accesses are too much trouble to fix + * up; also, they usually indicate a page is not used + * as a page table. + * + * If we're seeing too many writes to a page, + * it may no longer be a page table, or we may be + * forking, in which case it is better to unmap the + * page. + */ + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_flooded; + continue; + } + page_offset = offset; + level = sp->role.level; + npte = 1; + if (sp->role.glevels == PT32_ROOT_LEVEL) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + npte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + continue; + } + spte = &sp->spt[page_offset / sizeof(*spte)]; + while (npte--) { + entry = *spte; + mmu_pte_write_zap_pte(vcpu, sp, spte); + mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, + page_offset & (pte_size - 1)); + mmu_pte_write_flush_tlb(vcpu, entry, *spte); + ++spte; + } + } + kvm_mmu_audit(vcpu, "post pte write"); + spin_unlock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.update_pte.page) { + kvm_release_page_clean(vcpu->arch.update_pte.page); + vcpu->arch.update_pte.page = NULL; + } +} + +int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + int r; + + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + up_read(¤t->mm->mmap_sem); + + spin_lock(&vcpu->kvm->mmu_lock); + r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + spin_unlock(&vcpu->kvm->mmu_lock); + return r; +} + +void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + struct kvm_mmu_page *sp; + + sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + ++vcpu->kvm->stat.mmu_recycled; + } +} + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) +{ + int r; + enum emulation_result er; + + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); + if (r < 0) + goto out; + + if (!r) { + r = 1; + goto out; + } + + r = mmu_topup_memory_caches(vcpu); + if (r) + goto out; + + er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); + + switch (er) { + case EMULATE_DONE: + return 1; + case EMULATE_DO_MMIO: + ++vcpu->stat.mmio_exits; + return 0; + case EMULATE_FAIL: + kvm_report_emulation_failure(vcpu, "pagetable"); + return 1; + default: + BUG(); + } +out: + return r; +} +EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); + +static void free_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + + while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { + sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(vcpu->kvm, sp); + } + free_page((unsigned long)vcpu->arch.mmu.pae_root); +} + +static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +{ + struct page *page; + int i; + + ASSERT(vcpu); + + if (vcpu->kvm->arch.n_requested_mmu_pages) + vcpu->kvm->arch.n_free_mmu_pages = + vcpu->kvm->arch.n_requested_mmu_pages; + else + vcpu->kvm->arch.n_free_mmu_pages = + vcpu->kvm->arch.n_alloc_mmu_pages; + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. + * Therefore we need to allocate shadow page tables in the first + * 4GB of memory, which happens to fit the DMA32 zone. + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32); + if (!page) + goto error_1; + vcpu->arch.mmu.pae_root = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + + return 0; + +error_1: + free_mmu_pages(vcpu); + return -ENOMEM; +} + +int kvm_mmu_create(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return alloc_mmu_pages(vcpu); +} + +int kvm_mmu_setup(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + return init_kvm_mmu(vcpu); +} + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { + int i; + u64 *pt; + + if (!test_bit(slot, &sp->slot_bitmap)) + continue; + + pt = sp->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + /* avoid RMW */ + if (pt[i] & PT_WRITABLE_MASK) + pt[i] &= ~PT_WRITABLE_MASK; + } +} + +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + + spin_lock(&kvm->mmu_lock); + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + kvm_mmu_zap_page(kvm, sp); + spin_unlock(&kvm->mmu_lock); + + kvm_flush_remote_tlbs(kvm); +} + +void kvm_mmu_module_exit(void) +{ + if (pte_chain_cache) + kmem_cache_destroy(pte_chain_cache); + if (rmap_desc_cache) + kmem_cache_destroy(rmap_desc_cache); + if (mmu_page_header_cache) + kmem_cache_destroy(mmu_page_header_cache); +} + +int kvm_mmu_module_init(void) +{ + pte_chain_cache = kmem_cache_create("kvm_pte_chain", + sizeof(struct kvm_pte_chain), + 0, 0, NULL); + if (!pte_chain_cache) + goto nomem; + rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", + sizeof(struct kvm_rmap_desc), + 0, 0, NULL); + if (!rmap_desc_cache) + goto nomem; + + mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", + sizeof(struct kvm_mmu_page), + 0, 0, NULL); + if (!mmu_page_header_cache) + goto nomem; + + return 0; + +nomem: + kvm_mmu_module_exit(); + return -ENOMEM; +} + +/* + * Caculate mmu pages needed for kvm. + */ +unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) +{ + int i; + unsigned int nr_mmu_pages; + unsigned int nr_pages = 0; + + for (i = 0; i < kvm->nmemslots; i++) + nr_pages += kvm->memslots[i].npages; + + nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; + nr_mmu_pages = max(nr_mmu_pages, + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + + return nr_mmu_pages; +} + +#ifdef AUDIT + +static const char *audit_msg; + +static gva_t canonicalize(gva_t gva) +{ +#ifdef CONFIG_X86_64 + gva = (long long)(gva << 16) >> 16; +#endif + return gva; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 ent = pt[i]; + + if (ent == shadow_trap_nonpresent_pte) + continue; + + va = canonicalize(va); + if (level > 1) { + if (ent == shadow_notrap_nonpresent_pte) + printk(KERN_ERR "audit: (%s) nontrapping pte" + " in nonleaf level: levels %d gva %lx" + " level %d pte %llx\n", audit_msg, + vcpu->arch.mmu.root_level, va, level, ent); + + audit_mappings_page(vcpu, ent, va, level - 1); + } else { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + struct page *page = gpa_to_page(vcpu, gpa); + hpa_t hpa = page_to_phys(page); + + if (is_shadow_present_pte(ent) + && (ent & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx gpa %llx hpa %llx ent %llx %d\n", + audit_msg, vcpu->arch.mmu.root_level, + va, gpa, hpa, ent, + is_shadow_present_pte(ent)); + else if (ent == shadow_notrap_nonpresent_pte + && !is_error_hpa(hpa)) + printk(KERN_ERR "audit: (%s) notrap shadow," + " valid guest gva %lx\n", audit_msg, va); + kvm_release_page_clean(page); + + } + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); +} + +static int count_rmaps(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + int i, j, k; + + for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { + struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_rmap_desc *d; + + for (j = 0; j < m->npages; ++j) { + unsigned long *rmapp = &m->rmap[j]; + + if (!*rmapp) + continue; + if (!(*rmapp & 1)) { + ++nmaps; + continue; + } + d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); + while (d) { + for (k = 0; k < RMAP_EXT; ++k) + if (d->shadow_ptes[k]) + ++nmaps; + else + break; + d = d->more; + } + } + } + return nmaps; +} + +static int count_writable_mappings(struct kvm_vcpu *vcpu) +{ + int nmaps = 0; + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = pt[i]; + + if (!(ent & PT_PRESENT_MASK)) + continue; + if (!(ent & PT_WRITABLE_MASK)) + continue; + ++nmaps; + } + } + return nmaps; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + int n_rmap = count_rmaps(vcpu); + int n_actual = count_writable_mappings(vcpu); + + if (n_rmap != n_actual) + printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", + __FUNCTION__, audit_msg, n_rmap, n_actual); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct kvm_memory_slot *slot; + unsigned long *rmapp; + gfn_t gfn; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.metaphysical) + continue; + + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); + gfn = unalias_gfn(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[gfn - slot->base_gfn]; + if (*rmapp) + printk(KERN_ERR "%s: (%s) shadow page has writable" + " mappings: gfn %lx role %x\n", + __FUNCTION__, audit_msg, sp->gfn, + sp->role.word); + } +} + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +{ + int olddbg = dbg; + + dbg = 0; + audit_msg = msg; + audit_rmap(vcpu); + audit_write_protection(vcpu); + audit_mappings(vcpu); + dbg = olddbg; +} + +#endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h new file mode 100644 index 00000000000..1fce19ec7a2 --- /dev/null +++ b/arch/x86/kvm/mmu.h @@ -0,0 +1,44 @@ +#ifndef __KVM_X86_MMU_H +#define __KVM_X86_MMU_H + +#include <linux/kvm_host.h> + +static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) +{ + if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + __kvm_mmu_free_some_pages(vcpu); +} + +static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) +{ + if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) + return 0; + + return kvm_mmu_load(vcpu); +} + +static inline int is_long_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return vcpu->arch.shadow_efer & EFER_LME; +#else + return 0; +#endif +} + +static inline int is_pae(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr4 & X86_CR4_PAE; +} + +static inline int is_pse(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr4 & X86_CR4_PSE; +} + +static inline int is_paging(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr0 & X86_CR0_PG; +} + +#endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 00000000000..03ba8608fe0 --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h @@ -0,0 +1,484 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * MMU support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +/* + * We need the mmu code to access both 32-bit and 64-bit guest ptes, + * so the code in this file is compiled twice, once per pte size. + */ + +#if PTTYPE == 64 + #define pt_element_t u64 + #define guest_walker guest_walker64 + #define FNAME(name) paging##64_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #define CMPXCHG cmpxchg + #else + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 2 + #endif +#elif PTTYPE == 32 + #define pt_element_t u32 + #define guest_walker guest_walker32 + #define FNAME(name) paging##32_##name + #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK + #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_INDEX(addr, level) PT32_INDEX(addr, level) + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) + #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_MAX_FULL_LEVELS 2 + #define CMPXCHG cmpxchg +#else + #error Invalid PTTYPE value +#endif + +#define gpte_to_gfn FNAME(gpte_to_gfn) +#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) + +/* + * The guest_walker structure emulates the behavior of the hardware page + * table walker. + */ +struct guest_walker { + int level; + gfn_t table_gfn[PT_MAX_FULL_LEVELS]; + pt_element_t ptes[PT_MAX_FULL_LEVELS]; + gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + unsigned pt_access; + unsigned pte_access; + gfn_t gfn; + u32 error_code; +}; + +static gfn_t gpte_to_gfn(pt_element_t gpte) +{ + return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static gfn_t gpte_to_gfn_pde(pt_element_t gpte) +{ + return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, + gfn_t table_gfn, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) +{ + pt_element_t ret; + pt_element_t *table; + struct page *page; + + page = gfn_to_page(kvm, table_gfn); + table = kmap_atomic(page, KM_USER0); + + ret = CMPXCHG(&table[index], orig_pte, new_pte); + + kunmap_atomic(table, KM_USER0); + + kvm_release_page_dirty(page); + + return (ret != orig_pte); +} + +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; +#if PTTYPE == 64 + if (is_nx(vcpu)) + access &= ~(gpte >> PT64_NX_SHIFT); +#endif + return access; +} + +/* + * Fetch a guest pte for a guest virtual address + */ +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + pt_element_t pte; + gfn_t table_gfn; + unsigned index, pt_access, pte_access; + gpa_t pte_gpa; + + pgprintk("%s: addr %lx\n", __FUNCTION__, addr); +walk: + walker->level = vcpu->arch.mmu.root_level; + pte = vcpu->arch.cr3; +#if PTTYPE == 64 + if (!is_long_mode(vcpu)) { + pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; + if (!is_present_pte(pte)) + goto not_present; + --walker->level; + } +#endif + ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || + (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + + pt_access = ACC_ALL; + + for (;;) { + index = PT_INDEX(addr, walker->level); + + table_gfn = gpte_to_gfn(pte); + pte_gpa = gfn_to_gpa(table_gfn); + pte_gpa += index * sizeof(pt_element_t); + walker->table_gfn[walker->level - 1] = table_gfn; + walker->pte_gpa[walker->level - 1] = pte_gpa; + pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + walker->level - 1, table_gfn); + + kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + + if (!is_present_pte(pte)) + goto not_present; + + if (write_fault && !is_writeble_pte(pte)) + if (user_fault || is_write_protection(vcpu)) + goto access_error; + + if (user_fault && !(pte & PT_USER_MASK)) + goto access_error; + +#if PTTYPE == 64 + if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) + goto access_error; +#endif + + if (!(pte & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, + index, pte, pte|PT_ACCESSED_MASK)) + goto walk; + pte |= PT_ACCESSED_MASK; + } + + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + + walker->ptes[walker->level - 1] = pte; + + if (walker->level == PT_PAGE_TABLE_LEVEL) { + walker->gfn = gpte_to_gfn(pte); + break; + } + + if (walker->level == PT_DIRECTORY_LEVEL + && (pte & PT_PAGE_SIZE_MASK) + && (PTTYPE == 64 || is_pse(vcpu))) { + walker->gfn = gpte_to_gfn_pde(pte); + walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); + if (PTTYPE == 32 && is_cpuid_PSE36()) + walker->gfn += pse36_gfn_delta(pte); + break; + } + + pt_access = pte_access; + --walker->level; + } + + if (write_fault && !is_dirty_pte(pte)) { + bool ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, + pte|PT_DIRTY_MASK); + if (ret) + goto walk; + pte |= PT_DIRTY_MASK; + kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + walker->ptes[walker->level - 1] = pte; + } + + walker->pt_access = pt_access; + walker->pte_access = pte_access; + pgprintk("%s: pte %llx pte_access %x pt_access %x\n", + __FUNCTION__, (u64)pte, pt_access, pte_access); + return 1; + +not_present: + walker->error_code = 0; + goto err; + +access_error: + walker->error_code = PFERR_PRESENT_MASK; + +err: + if (write_fault) + walker->error_code |= PFERR_WRITE_MASK; + if (user_fault) + walker->error_code |= PFERR_USER_MASK; + if (fetch_fault) + walker->error_code |= PFERR_FETCH_MASK; + return 0; +} + +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, + u64 *spte, const void *pte, int bytes, + int offset_in_pte) +{ + pt_element_t gpte; + unsigned pte_access; + struct page *npage; + + gpte = *(const pt_element_t *)pte; + if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { + if (!offset_in_pte && !is_present_pte(gpte)) + set_shadow_pte(spte, shadow_notrap_nonpresent_pte); + return; + } + if (bytes < sizeof(pt_element_t)) + return; + pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + return; + npage = vcpu->arch.update_pte.page; + if (!npage) + return; + get_page(npage); + mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); +} + +/* + * Fetch a shadow pte for a specific level in the paging hierarchy. + */ +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *walker, + int user_fault, int write_fault, int *ptwrite, + struct page *page) +{ + hpa_t shadow_addr; + int level; + u64 *shadow_ent; + unsigned access = walker->pt_access; + + if (!is_present_pte(walker->ptes[walker->level - 1])) + return NULL; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + for (; ; level--) { + u32 index = SHADOW_PT_INDEX(addr, level); + struct kvm_mmu_page *shadow_page; + u64 shadow_pte; + int metaphysical; + gfn_t table_gfn; + bool new_page = 0; + + shadow_ent = ((u64 *)__va(shadow_addr)) + index; + if (level == PT_PAGE_TABLE_LEVEL) + break; + if (is_shadow_present_pte(*shadow_ent)) { + shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; + continue; + } + + if (level - 1 == PT_PAGE_TABLE_LEVEL + && walker->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(walker->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(walker->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = walker->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, + shadow_ent, &new_page); + if (new_page && !metaphysical) { + int r; + pt_element_t curr_pte; + r = kvm_read_guest_atomic(vcpu->kvm, + walker->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != walker->ptes[level - 2]) { + kvm_release_page_clean(page); + return NULL; + } + } + shadow_addr = __pa(shadow_page->spt); + shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *shadow_ent = shadow_pte; + } + + mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, + user_fault, write_fault, + walker->ptes[walker->level-1] & PT_DIRTY_MASK, + ptwrite, walker->gfn, page); + + return shadow_ent; +} + +/* + * Page fault handler. There are several causes for a page fault: + * - there is no shadow pte for the guest pte + * - write access through a shadow pte marked read only so that we can set + * the dirty bit + * - write access to a shadow pte marked read only so we can update the page + * dirty bitmap, when userspace requests it + * - mmio access; in this case we will never install a present shadow pte + * - normal guest page fault due to the guest pte marked not present, not + * writable, or not executable + * + * Returns: 1 if we need to emulate the instruction, 0 otherwise, or + * a negative value on error. + */ +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, + u32 error_code) +{ + int write_fault = error_code & PFERR_WRITE_MASK; + int user_fault = error_code & PFERR_USER_MASK; + int fetch_fault = error_code & PFERR_FETCH_MASK; + struct guest_walker walker; + u64 *shadow_pte; + int write_pt = 0; + int r; + struct page *page; + + pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + kvm_mmu_audit(vcpu, "pre page fault"); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + down_read(¤t->mm->mmap_sem); + /* + * Look up the shadow pte for the faulting address. + */ + r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, + fetch_fault); + + /* + * The page is not mapped by the guest. Let the guest handle it. + */ + if (!r) { + pgprintk("%s: guest page fault\n", __FUNCTION__); + inject_page_fault(vcpu, addr, walker.error_code); + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + up_read(¤t->mm->mmap_sem); + return 0; + } + + page = gfn_to_page(vcpu->kvm, walker.gfn); + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + &write_pt, page); + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + shadow_pte, *shadow_pte, write_pt); + + if (!write_pt) + vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ + + /* + * mmio: emulate if accessible, otherwise its a guest fault. + */ + if (shadow_pte && is_io_pte(*shadow_pte)) { + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); + return 1; + } + + ++vcpu->stat.pf_fixed; + kvm_mmu_audit(vcpu, "post page fault (fixed)"); + spin_unlock(&vcpu->kvm->mmu_lock); + up_read(¤t->mm->mmap_sem); + + return write_pt; +} + +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } + + return gpa; +} + +static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i, offset = 0, r = 0; + pt_element_t pt; + + if (sp->role.metaphysical + || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { + nonpaging_prefetch_page(vcpu, sp); + return; + } + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + gpa_t pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, + sizeof(pt_element_t)); + if (r || is_present_pte(pt)) + sp->spt[i] = shadow_trap_nonpresent_pte; + else + sp->spt[i] = shadow_notrap_nonpresent_pte; + } +} + +#undef pt_element_t +#undef guest_walker +#undef FNAME +#undef PT_BASE_ADDR_MASK +#undef PT_INDEX +#undef SHADOW_PT_INDEX +#undef PT_LEVEL_MASK +#undef PT_DIR_BASE_ADDR_MASK +#undef PT_LEVEL_BITS +#undef PT_MAX_FULL_LEVELS +#undef gpte_to_gfn +#undef gpte_to_gfn_pde +#undef CMPXCHG diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h new file mode 100644 index 00000000000..56fc4c87338 --- /dev/null +++ b/arch/x86/kvm/segment_descriptor.h @@ -0,0 +1,29 @@ +#ifndef __SEGMENT_DESCRIPTOR_H +#define __SEGMENT_DESCRIPTOR_H + +struct segment_descriptor { + u16 limit_low; + u16 base_low; + u8 base_mid; + u8 type : 4; + u8 system : 1; + u8 dpl : 2; + u8 present : 1; + u8 limit_high : 4; + u8 avl : 1; + u8 long_mode : 1; + u8 default_op : 1; + u8 granularity : 1; + u8 base_high; +} __attribute__((packed)); + +#ifdef CONFIG_X86_64 +/* LDT or TSS descriptor in the GDT. 16 bytes. */ +struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; +}; + +#endif +#endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c new file mode 100644 index 00000000000..de755cb1431 --- /dev/null +++ b/arch/x86/kvm/svm.c @@ -0,0 +1,1731 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * AMD SVM support + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/kvm_host.h> + +#include "kvm_svm.h" +#include "irq.h" +#include "mmu.h" + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/sched.h> + +#include <asm/desc.h> + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +#define IOPM_ALLOC_ORDER 2 +#define MSRPM_ALLOC_ORDER 1 + +#define DB_VECTOR 1 +#define UD_VECTOR 6 +#define GP_VECTOR 13 + +#define DR7_GD_MASK (1 << 13) +#define DR6_BD_MASK (1 << 13) + +#define SEG_TYPE_LDT 2 +#define SEG_TYPE_BUSY_TSS16 3 + +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_DEATURE_SVML (1 << 2) + +static void kvm_reput_irq(struct vcpu_svm *svm); + +static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_svm, vcpu); +} + +unsigned long iopm_base; +unsigned long msrpm_base; + +struct kvm_ldttss_desc { + u16 limit0; + u16 base0; + unsigned base1 : 8, type : 5, dpl : 2, p : 1; + unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + u32 base3; + u32 zero1; +} __attribute__((packed)); + +struct svm_cpu_data { + int cpu; + + u64 asid_generation; + u32 max_asid; + u32 next_asid; + struct kvm_ldttss_desc *tss_desc; + + struct page *save_area; +}; + +static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +static uint32_t svm_features; + +struct svm_init_data { + int cpu; + int r; +}; + +static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; + +#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) +#define MSRS_RANGE_SIZE 2048 +#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) + +#define MAX_INST_SIZE 15 + +static inline u32 svm_has(u32 feat) +{ + return svm_features & feat; +} + +static inline u8 pop_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); + return irq; +} + +static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) +{ + set_bit(irq, vcpu->arch.irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); +} + +static inline void clgi(void) +{ + asm volatile (SVM_CLGI); +} + +static inline void stgi(void) +{ + asm volatile (SVM_STGI); +} + +static inline void invlpga(unsigned long addr, u32 asid) +{ + asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); +} + +static inline unsigned long kvm_read_cr2(void) +{ + unsigned long cr2; + + asm volatile ("mov %%cr2, %0" : "=r" (cr2)); + return cr2; +} + +static inline void kvm_write_cr2(unsigned long val) +{ + asm volatile ("mov %0, %%cr2" :: "r" (val)); +} + +static inline unsigned long read_dr6(void) +{ + unsigned long dr6; + + asm volatile ("mov %%dr6, %0" : "=r" (dr6)); + return dr6; +} + +static inline void write_dr6(unsigned long val) +{ + asm volatile ("mov %0, %%dr6" :: "r" (val)); +} + +static inline unsigned long read_dr7(void) +{ + unsigned long dr7; + + asm volatile ("mov %%dr7, %0" : "=r" (dr7)); + return dr7; +} + +static inline void write_dr7(unsigned long val) +{ + asm volatile ("mov %0, %%dr7" :: "r" (val)); +} + +static inline void force_new_asid(struct kvm_vcpu *vcpu) +{ + to_svm(vcpu)->asid_generation--; +} + +static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + + to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; + vcpu->arch.shadow_efer = efer; +} + +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; +} + +static bool svm_exception_injected(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); +} + +static int is_external_interrupt(u32 info) +{ + info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; + return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!svm->next_rip) { + printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); + return; + } + if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", + __FUNCTION__, + svm->vmcb->save.rip, + svm->next_rip); + + vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; + + vcpu->arch.interrupt_window_open = 1; +} + +static int has_svm(void) +{ + uint32_t eax, ebx, ecx, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + printk(KERN_INFO "has_svm: not amd\n"); + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + printk(KERN_DEBUG "has_svm: svm not available\n"); + return 0; + } + return 1; +} + +static void svm_hardware_disable(void *garbage) +{ + struct svm_cpu_data *svm_data + = per_cpu(svm_data, raw_smp_processor_id()); + + if (svm_data) { + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + per_cpu(svm_data, raw_smp_processor_id()) = NULL; + __free_page(svm_data->save_area); + kfree(svm_data); + } +} + +static void svm_hardware_enable(void *garbage) +{ + + struct svm_cpu_data *svm_data; + uint64_t efer; +#ifdef CONFIG_X86_64 + struct desc_ptr gdt_descr; +#else + struct desc_ptr gdt_descr; +#endif + struct desc_struct *gdt; + int me = raw_smp_processor_id(); + + if (!has_svm()) { + printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); + return; + } + svm_data = per_cpu(svm_data, me); + + if (!svm_data) { + printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", + me); + return; + } + + svm_data->asid_generation = 1; + svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; + svm_data->next_asid = svm_data->max_asid + 1; + svm_features = cpuid_edx(SVM_CPUID_FUNC); + + asm volatile ("sgdt %0" : "=m"(gdt_descr)); + gdt = (struct desc_struct *)gdt_descr.address; + svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); + + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); + + wrmsrl(MSR_VM_HSAVE_PA, + page_to_pfn(svm_data->save_area) << PAGE_SHIFT); +} + +static int svm_cpu_init(int cpu) +{ + struct svm_cpu_data *svm_data; + int r; + + svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); + if (!svm_data) + return -ENOMEM; + svm_data->cpu = cpu; + svm_data->save_area = alloc_page(GFP_KERNEL); + r = -ENOMEM; + if (!svm_data->save_area) + goto err_1; + + per_cpu(svm_data, cpu) = svm_data; + + return 0; + +err_1: + kfree(svm_data); + return r; + +} + +static void set_msr_interception(u32 *msrpm, unsigned msr, + int read, int write) +{ + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr >= msrpm_ranges[i] && + msr < msrpm_ranges[i] + MSRS_IN_RANGE) { + u32 msr_offset = (i * MSRS_IN_RANGE + msr - + msrpm_ranges[i]) * 2; + + u32 *base = msrpm + (msr_offset / 32); + u32 msr_shift = msr_offset % 32; + u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); + *base = (*base & ~(0x3 << msr_shift)) | + (mask << msr_shift); + return; + } + } + BUG(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + struct page *msrpm_pages; + void *iopm_va, *msrpm_va; + int r; + + iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); + clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + + r = -ENOMEM; + if (!msrpm_pages) + goto err_1; + + msrpm_va = page_address(msrpm_pages); + memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; + +#ifdef CONFIG_X86_64 + set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + + for_each_online_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err_2; + } + return 0; + +err_2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + msrpm_base = 0; +err_1: + __free_pages(iopm_pages, IOPM_ALLOC_ORDER); + iopm_base = 0; + return r; +} + +static __exit void svm_hardware_unsetup(void) +{ + __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); + iopm_base = msrpm_base = 0; +} + +static void init_seg(struct vmcb_seg *seg) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + seg->limit = 0xffff; + seg->base = 0; +} + +static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) +{ + seg->selector = 0; + seg->attrib = SVM_SELECTOR_P_MASK | type; + seg->limit = 0xffff; + seg->base = 0; +} + +static void init_vmcb(struct vmcb *vmcb) +{ + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; + + control->intercept_cr_read = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK | + INTERCEPT_CR8_MASK; + + control->intercept_cr_write = INTERCEPT_CR0_MASK | + INTERCEPT_CR3_MASK | + INTERCEPT_CR4_MASK | + INTERCEPT_CR8_MASK; + + control->intercept_dr_read = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK; + + control->intercept_dr_write = INTERCEPT_DR0_MASK | + INTERCEPT_DR1_MASK | + INTERCEPT_DR2_MASK | + INTERCEPT_DR3_MASK | + INTERCEPT_DR5_MASK | + INTERCEPT_DR7_MASK; + + control->intercept_exceptions = (1 << PF_VECTOR) | + (1 << UD_VECTOR); + + + control->intercept = (1ULL << INTERCEPT_INTR) | + (1ULL << INTERCEPT_NMI) | + (1ULL << INTERCEPT_SMI) | + /* + * selective cr0 intercept bug? + * 0: 0f 22 d8 mov %eax,%cr3 + * 3: 0f 20 c0 mov %cr0,%eax + * 6: 0d 00 00 00 80 or $0x80000000,%eax + * b: 0f 22 c0 mov %eax,%cr0 + * set cr3 ->interception + * get cr0 ->interception + * set cr0 -> no interception + */ + /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ + (1ULL << INTERCEPT_CPUID) | + (1ULL << INTERCEPT_INVD) | + (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPGA) | + (1ULL << INTERCEPT_IOIO_PROT) | + (1ULL << INTERCEPT_MSR_PROT) | + (1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_SHUTDOWN) | + (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL) | + (1ULL << INTERCEPT_VMLOAD) | + (1ULL << INTERCEPT_VMSAVE) | + (1ULL << INTERCEPT_STGI) | + (1ULL << INTERCEPT_CLGI) | + (1ULL << INTERCEPT_SKINIT) | + (1ULL << INTERCEPT_WBINVD) | + (1ULL << INTERCEPT_MONITOR) | + (1ULL << INTERCEPT_MWAIT); + + control->iopm_base_pa = iopm_base; + control->msrpm_base_pa = msrpm_base; + control->tsc_offset = 0; + control->int_ctl = V_INTR_MASKING_MASK; + + init_seg(&save->es); + init_seg(&save->ss); + init_seg(&save->ds); + init_seg(&save->fs); + init_seg(&save->gs); + + save->cs.selector = 0xf000; + /* Executable/Readable Code Segment */ + save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | + SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; + save->cs.limit = 0xffff; + /* + * cs.base should really be 0xffff0000, but vmx can't handle that, so + * be consistent with it. + * + * Replace when we have real mode working for vmx. + */ + save->cs.base = 0xf0000; + + save->gdtr.limit = 0xffff; + save->idtr.limit = 0xffff; + + init_sys_seg(&save->ldtr, SEG_TYPE_LDT); + init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); + + save->efer = MSR_EFER_SVME_MASK; + save->dr6 = 0xffff0ff0; + save->dr7 = 0x400; + save->rflags = 2; + save->rip = 0x0000fff0; + + /* + * cr0 val on cpu init should be 0x60000010, we enable cpu + * cache by default. the orderly way is to enable cache in bios. + */ + save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + save->cr4 = X86_CR4_PAE; + /* rdx = ?? */ +} + +static int svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + init_vmcb(svm->vmcb); + + if (vcpu->vcpu_id != 0) { + svm->vmcb->save.rip = 0; + svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; + svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; + } + + return 0; +} + +static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) +{ + struct vcpu_svm *svm; + struct page *page; + int err; + + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!svm) { + err = -ENOMEM; + goto out; + } + + err = kvm_vcpu_init(&svm->vcpu, kvm, id); + if (err) + goto free_svm; + + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto uninit; + } + + svm->vmcb = page_address(page); + clear_page(svm->vmcb); + svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->asid_generation = 0; + memset(svm->db_regs, 0, sizeof(svm->db_regs)); + init_vmcb(svm->vmcb); + + fx_init(&svm->vcpu); + svm->vcpu.fpu_active = 1; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (svm->vcpu.vcpu_id == 0) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + + return &svm->vcpu; + +uninit: + kvm_vcpu_uninit(&svm->vcpu); +free_svm: + kmem_cache_free(kvm_vcpu_cache, svm); +out: + return ERR_PTR(err); +} + +static void svm_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, svm); +} + +static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int i; + + if (unlikely(cpu != vcpu->cpu)) { + u64 tsc_this, delta; + + /* + * Make sure that the guest sees a monotonically + * increasing TSC. + */ + rdtscll(tsc_this); + delta = vcpu->arch.host_tsc - tsc_this; + svm->vmcb->control.tsc_offset += delta; + vcpu->cpu = cpu; + kvm_migrate_apic_timer(vcpu); + } + + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); +} + +static void svm_vcpu_put(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int i; + + ++vcpu->stat.host_state_reload; + for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) + wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + rdtscll(vcpu->arch.host_tsc); +} + +static void svm_vcpu_decache(struct kvm_vcpu *vcpu) +{ +} + +static void svm_cache_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.rip = svm->vmcb->save.rip; +} + +static void svm_decache_regs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.rip; +} + +static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) +{ + return to_svm(vcpu)->vmcb->save.rflags; +} + +static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + to_svm(vcpu)->vmcb->save.rflags = rflags; +} + +static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + + switch (seg) { + case VCPU_SREG_CS: return &save->cs; + case VCPU_SREG_DS: return &save->ds; + case VCPU_SREG_ES: return &save->es; + case VCPU_SREG_FS: return &save->fs; + case VCPU_SREG_GS: return &save->gs; + case VCPU_SREG_SS: return &save->ss; + case VCPU_SREG_TR: return &save->tr; + case VCPU_SREG_LDTR: return &save->ldtr; + } + BUG(); + return NULL; +} + +static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + return s->base; +} + +static void svm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vmcb_seg *s = svm_seg(vcpu, seg); + + var->base = s->base; + var->limit = s->limit; + var->selector = s->selector; + var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; + var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; + var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; + var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; + var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; + var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; + var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + var->unusable = !var->present; +} + +static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.idtr.limit; + dt->base = svm->vmcb->save.idtr.base; +} + +static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.idtr.limit = dt->limit; + svm->vmcb->save.idtr.base = dt->base ; +} + +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + dt->limit = svm->vmcb->save.gdtr.limit; + dt->base = svm->vmcb->save.gdtr.base; +} + +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.gdtr.limit = dt->limit; + svm->vmcb->save.gdtr.base = dt->base ; +} + +static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ +} + +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + struct vcpu_svm *svm = to_svm(vcpu); + +#ifdef CONFIG_X86_64 + if (vcpu->arch.shadow_efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { + vcpu->arch.shadow_efer |= EFER_LMA; + svm->vmcb->save.efer |= EFER_LMA | EFER_LME; + } + + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { + vcpu->arch.shadow_efer &= ~EFER_LMA; + svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); + } + } +#endif + if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + vcpu->fpu_active = 1; + } + + vcpu->arch.cr0 = cr0; + cr0 |= X86_CR0_PG | X86_CR0_WP; + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + svm->vmcb->save.cr0 = cr0; +} + +static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vcpu->arch.cr4 = cr4; + to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; +} + +static void svm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_seg *s = svm_seg(vcpu, seg); + + s->base = var->base; + s->limit = var->limit; + s->selector = var->selector; + if (var->unusable) + s->attrib = 0; + else { + s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); + s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; + s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; + s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; + s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; + s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; + s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; + s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; + } + if (seg == VCPU_SREG_CS) + svm->vmcb->save.cpl + = (svm->vmcb->save.cs.attrib + >> SVM_SELECTOR_DPL_SHIFT) & 3; + +} + +/* FIXME: + + svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; + svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); + +*/ + +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + return -EOPNOTSUPP; +} + +static int svm_get_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 exit_int_info = svm->vmcb->control.exit_int_info; + + if (is_external_interrupt(exit_int_info)) + return exit_int_info & SVM_EVTINJ_VEC_MASK; + return -1; +} + +static void load_host_msrs(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); +#endif +} + +static void save_host_msrs(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); +#endif +} + +static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) +{ + if (svm_data->next_asid > svm_data->max_asid) { + ++svm_data->asid_generation; + svm_data->next_asid = 1; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; + } + + svm->vcpu.cpu = svm_data->cpu; + svm->asid_generation = svm_data->asid_generation; + svm->vmcb->control.asid = svm_data->next_asid++; +} + +static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) +{ + return to_svm(vcpu)->db_regs[dr]; +} + +static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, + int *exception) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + *exception = 0; + + if (svm->vmcb->save.dr7 & DR7_GD_MASK) { + svm->vmcb->save.dr7 &= ~DR7_GD_MASK; + svm->vmcb->save.dr6 |= DR6_BD_MASK; + *exception = DB_VECTOR; + return; + } + + switch (dr) { + case 0 ... 3: + svm->db_regs[dr] = value; + return; + case 4 ... 5: + if (vcpu->arch.cr4 & X86_CR4_DE) { + *exception = UD_VECTOR; + return; + } + case 7: { + if (value & ~((1ULL << 32) - 1)) { + *exception = GP_VECTOR; + return; + } + svm->vmcb->save.dr7 = value; + return; + } + default: + printk(KERN_DEBUG "%s: unexpected dr %u\n", + __FUNCTION__, dr); + *exception = UD_VECTOR; + return; + } +} + +static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + u32 exit_int_info = svm->vmcb->control.exit_int_info; + struct kvm *kvm = svm->vcpu.kvm; + u64 fault_address; + u32 error_code; + + if (!irqchip_in_kernel(kvm) && + is_external_interrupt(exit_int_info)) + push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + + fault_address = svm->vmcb->control.exit_info_2; + error_code = svm->vmcb->control.exit_info_1; + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); +} + +static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + int er; + + er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + svm->vmcb->save.cr0 &= ~X86_CR0_TS; + svm->vcpu.fpu_active = 1; + + return 1; +} + +static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + /* + * VMCB is undefined after a SHUTDOWN intercept + * so reinitialize it. + */ + clear_page(svm->vmcb); + init_vmcb(svm->vmcb); + + kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ + int size, down, in, string, rep; + unsigned port; + + ++svm->vcpu.stat.io_exits; + + svm->next_rip = svm->vmcb->control.exit_info_2; + + string = (io_info & SVM_IOIO_STR_MASK) != 0; + + if (string) { + if (emulate_instruction(&svm->vcpu, + kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + port = io_info >> 16; + size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; + rep = (io_info & SVM_IOIO_REP_MASK) != 0; + down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + + return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); +} + +static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + return 1; +} + +static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->next_rip = svm->vmcb->save.rip + 1; + skip_emulated_instruction(&svm->vcpu); + return kvm_emulate_halt(&svm->vcpu); +} + +static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->next_rip = svm->vmcb->save.rip + 3; + skip_emulated_instruction(&svm->vcpu); + kvm_emulate_hypercall(&svm->vcpu); + return 1; +} + +static int invalid_op_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) +{ + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + +static int task_switch_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) +{ + pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + return 0; +} + +static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + svm->next_rip = svm->vmcb->save.rip + 2; + kvm_emulate_cpuid(&svm->vcpu); + return 1; +} + +static int emulate_on_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) +{ + if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); + return 1; +} + +static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); + if (irqchip_in_kernel(svm->vcpu.kvm)) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; +} + +static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + switch (ecx) { + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + *data = svm->vmcb->control.tsc_offset + tsc; + break; + } + case MSR_K6_STAR: + *data = svm->vmcb->save.star; + break; +#ifdef CONFIG_X86_64 + case MSR_LSTAR: + *data = svm->vmcb->save.lstar; + break; + case MSR_CSTAR: + *data = svm->vmcb->save.cstar; + break; + case MSR_KERNEL_GS_BASE: + *data = svm->vmcb->save.kernel_gs_base; + break; + case MSR_SYSCALL_MASK: + *data = svm->vmcb->save.sfmask; + break; +#endif + case MSR_IA32_SYSENTER_CS: + *data = svm->vmcb->save.sysenter_cs; + break; + case MSR_IA32_SYSENTER_EIP: + *data = svm->vmcb->save.sysenter_eip; + break; + case MSR_IA32_SYSENTER_ESP: + *data = svm->vmcb->save.sysenter_esp; + break; + default: + return kvm_get_msr_common(vcpu, ecx, data); + } + return 0; +} + +static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u64 data; + + if (svm_get_msr(&svm->vcpu, ecx, &data)) + kvm_inject_gp(&svm->vcpu, 0); + else { + svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; + svm->next_rip = svm->vmcb->save.rip + 2; + skip_emulated_instruction(&svm->vcpu); + } + return 1; +} + +static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + switch (ecx) { + case MSR_IA32_TIME_STAMP_COUNTER: { + u64 tsc; + + rdtscll(tsc); + svm->vmcb->control.tsc_offset = data - tsc; + break; + } + case MSR_K6_STAR: + svm->vmcb->save.star = data; + break; +#ifdef CONFIG_X86_64 + case MSR_LSTAR: + svm->vmcb->save.lstar = data; + break; + case MSR_CSTAR: + svm->vmcb->save.cstar = data; + break; + case MSR_KERNEL_GS_BASE: + svm->vmcb->save.kernel_gs_base = data; + break; + case MSR_SYSCALL_MASK: + svm->vmcb->save.sfmask = data; + break; +#endif + case MSR_IA32_SYSENTER_CS: + svm->vmcb->save.sysenter_cs = data; + break; + case MSR_IA32_SYSENTER_EIP: + svm->vmcb->save.sysenter_eip = data; + break; + case MSR_IA32_SYSENTER_ESP: + svm->vmcb->save.sysenter_esp = data; + break; + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + /* + * only support writing 0 to the performance counters for now + * to make Windows happy. Should be replaced by a real + * performance counter emulation later. + */ + if (data != 0) + goto unhandled; + break; + default: + unhandled: + return kvm_set_msr_common(vcpu, ecx, data); + } + return 0; +} + +static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + u64 data = (svm->vmcb->save.rax & -1u) + | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + svm->next_rip = svm->vmcb->save.rip + 2; + if (svm_set_msr(&svm->vcpu, ecx, data)) + kvm_inject_gp(&svm->vcpu, 0); + else + skip_emulated_instruction(&svm->vcpu); + return 1; +} + +static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (svm->vmcb->control.exit_info_1) + return wrmsr_interception(svm, kvm_run); + else + return rdmsr_interception(svm, kvm_run); +} + +static int interrupt_window_interception(struct vcpu_svm *svm, + struct kvm_run *kvm_run) +{ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (kvm_run->request_interrupt_window && + !svm->vcpu.arch.irq_summary) { + ++svm->vcpu.stat.irq_window_exits; + kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + return 0; + } + + return 1; +} + +static int (*svm_exit_handlers[])(struct vcpu_svm *svm, + struct kvm_run *kvm_run) = { + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + [SVM_EXIT_READ_CR8] = emulate_on_interception, + /* for now: */ + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_READ_DR1] = emulate_on_interception, + [SVM_EXIT_READ_DR2] = emulate_on_interception, + [SVM_EXIT_READ_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR0] = emulate_on_interception, + [SVM_EXIT_WRITE_DR1] = emulate_on_interception, + [SVM_EXIT_WRITE_DR2] = emulate_on_interception, + [SVM_EXIT_WRITE_DR3] = emulate_on_interception, + [SVM_EXIT_WRITE_DR5] = emulate_on_interception, + [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_INTR] = nop_on_interception, + [SVM_EXIT_NMI] = nop_on_interception, + [SVM_EXIT_SMI] = nop_on_interception, + [SVM_EXIT_INIT] = nop_on_interception, + [SVM_EXIT_VINTR] = interrupt_window_interception, + /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ + [SVM_EXIT_CPUID] = cpuid_interception, + [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_HLT] = halt_interception, + [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_MSR] = msr_interception, + [SVM_EXIT_TASK_SWITCH] = task_switch_interception, + [SVM_EXIT_SHUTDOWN] = shutdown_interception, + [SVM_EXIT_VMRUN] = invalid_op_interception, + [SVM_EXIT_VMMCALL] = vmmcall_interception, + [SVM_EXIT_VMLOAD] = invalid_op_interception, + [SVM_EXIT_VMSAVE] = invalid_op_interception, + [SVM_EXIT_STGI] = invalid_op_interception, + [SVM_EXIT_CLGI] = invalid_op_interception, + [SVM_EXIT_SKINIT] = invalid_op_interception, + [SVM_EXIT_WBINVD] = emulate_on_interception, + [SVM_EXIT_MONITOR] = invalid_op_interception, + [SVM_EXIT_MWAIT] = invalid_op_interception, +}; + + +static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 exit_code = svm->vmcb->control.exit_code; + + kvm_reput_irq(svm); + + if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = svm->vmcb->control.exit_code; + return 0; + } + + if (is_external_interrupt(svm->vmcb->control.exit_int_info) && + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " + "exit_code 0x%x\n", + __FUNCTION__, svm->vmcb->control.exit_int_info, + exit_code); + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + || !svm_exit_handlers[exit_code]) { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = exit_code; + return 0; + } + + return svm_exit_handlers[exit_code](svm, kvm_run); +} + +static void reload_tss(struct kvm_vcpu *vcpu) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ + load_TR_desc(); +} + +static void pre_svm_run(struct vcpu_svm *svm) +{ + int cpu = raw_smp_processor_id(); + + struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); + + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (svm->vcpu.cpu != cpu || + svm->asid_generation != svm_data->asid_generation) + new_asid(svm, svm_data); +} + + +static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) +{ + struct vmcb_control_area *control; + + control = &svm->vmcb->control; + control->int_vector = irq; + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); +} + +static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm_inject_irq(svm, irq); +} + +static void svm_intr_assist(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + int intr_vector = -1; + + if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && + ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { + intr_vector = vmcb->control.exit_int_info & + SVM_EVTINJ_VEC_MASK; + vmcb->control.exit_int_info = 0; + svm_inject_irq(svm, intr_vector); + return; + } + + if (vmcb->control.int_ctl & V_IRQ_MASK) + return; + + if (!kvm_cpu_has_interrupt(vcpu)) + return; + + if (!(vmcb->save.rflags & X86_EFLAGS_IF) || + (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || + (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { + /* unable to deliver irq, set pending irq */ + vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); + svm_inject_irq(svm, 0x0); + return; + } + /* Okay, we can deliver the interrupt: grab it and update PIC state. */ + intr_vector = kvm_cpu_get_interrupt(vcpu); + svm_inject_irq(svm, intr_vector); + kvm_timer_intr_post(vcpu, intr_vector); +} + +static void kvm_reput_irq(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control = &svm->vmcb->control; + + if ((control->int_ctl & V_IRQ_MASK) + && !irqchip_in_kernel(svm->vcpu.kvm)) { + control->int_ctl &= ~V_IRQ_MASK; + push_irq(&svm->vcpu, control->int_vector); + } + + svm->vcpu.arch.interrupt_window_open = + !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); +} + +static void svm_do_inject_vector(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); + svm_inject_irq(svm, irq); +} + +static void do_interrupt_requests(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + svm->vcpu.arch.interrupt_window_open = + (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && + (svm->vmcb->save.rflags & X86_EFLAGS_IF)); + + if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) + /* + * If interrupts enabled, and not blocked by sti or mov ss. Good. + */ + svm_do_inject_vector(svm); + + /* + * Interrupts blocked. Wait for unblock. + */ + if (!svm->vcpu.arch.interrupt_window_open && + (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) + control->intercept |= 1ULL << INTERCEPT_VINTR; + else + control->intercept &= ~(1ULL << INTERCEPT_VINTR); +} + +static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + return 0; +} + +static void save_db_regs(unsigned long *db_regs) +{ + asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); + asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); + asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); + asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); +} + +static void load_db_regs(unsigned long *db_regs) +{ + asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); + asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); + asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); + asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); +} + +static void svm_flush_tlb(struct kvm_vcpu *vcpu) +{ + force_new_asid(vcpu); +} + +static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +{ +} + +static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u16 fs_selector; + u16 gs_selector; + u16 ldt_selector; + + pre_svm_run(svm); + + save_host_msrs(vcpu); + fs_selector = read_fs(); + gs_selector = read_gs(); + ldt_selector = read_ldt(); + svm->host_cr2 = kvm_read_cr2(); + svm->host_dr6 = read_dr6(); + svm->host_dr7 = read_dr7(); + svm->vmcb->save.cr2 = vcpu->arch.cr2; + + if (svm->vmcb->save.dr7 & 0xff) { + write_dr7(0); + save_db_regs(svm->host_db_regs); + load_db_regs(svm->db_regs); + } + + clgi(); + + local_irq_enable(); + + asm volatile ( +#ifdef CONFIG_X86_64 + "push %%rbp; \n\t" +#else + "push %%ebp; \n\t" +#endif + +#ifdef CONFIG_X86_64 + "mov %c[rbx](%[svm]), %%rbx \n\t" + "mov %c[rcx](%[svm]), %%rcx \n\t" + "mov %c[rdx](%[svm]), %%rdx \n\t" + "mov %c[rsi](%[svm]), %%rsi \n\t" + "mov %c[rdi](%[svm]), %%rdi \n\t" + "mov %c[rbp](%[svm]), %%rbp \n\t" + "mov %c[r8](%[svm]), %%r8 \n\t" + "mov %c[r9](%[svm]), %%r9 \n\t" + "mov %c[r10](%[svm]), %%r10 \n\t" + "mov %c[r11](%[svm]), %%r11 \n\t" + "mov %c[r12](%[svm]), %%r12 \n\t" + "mov %c[r13](%[svm]), %%r13 \n\t" + "mov %c[r14](%[svm]), %%r14 \n\t" + "mov %c[r15](%[svm]), %%r15 \n\t" +#else + "mov %c[rbx](%[svm]), %%ebx \n\t" + "mov %c[rcx](%[svm]), %%ecx \n\t" + "mov %c[rdx](%[svm]), %%edx \n\t" + "mov %c[rsi](%[svm]), %%esi \n\t" + "mov %c[rdi](%[svm]), %%edi \n\t" + "mov %c[rbp](%[svm]), %%ebp \n\t" +#endif + +#ifdef CONFIG_X86_64 + /* Enter guest mode */ + "push %%rax \n\t" + "mov %c[vmcb](%[svm]), %%rax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%rax \n\t" +#else + /* Enter guest mode */ + "push %%eax \n\t" + "mov %c[vmcb](%[svm]), %%eax \n\t" + SVM_VMLOAD "\n\t" + SVM_VMRUN "\n\t" + SVM_VMSAVE "\n\t" + "pop %%eax \n\t" +#endif + + /* Save guest registers, load host registers */ +#ifdef CONFIG_X86_64 + "mov %%rbx, %c[rbx](%[svm]) \n\t" + "mov %%rcx, %c[rcx](%[svm]) \n\t" + "mov %%rdx, %c[rdx](%[svm]) \n\t" + "mov %%rsi, %c[rsi](%[svm]) \n\t" + "mov %%rdi, %c[rdi](%[svm]) \n\t" + "mov %%rbp, %c[rbp](%[svm]) \n\t" + "mov %%r8, %c[r8](%[svm]) \n\t" + "mov %%r9, %c[r9](%[svm]) \n\t" + "mov %%r10, %c[r10](%[svm]) \n\t" + "mov %%r11, %c[r11](%[svm]) \n\t" + "mov %%r12, %c[r12](%[svm]) \n\t" + "mov %%r13, %c[r13](%[svm]) \n\t" + "mov %%r14, %c[r14](%[svm]) \n\t" + "mov %%r15, %c[r15](%[svm]) \n\t" + + "pop %%rbp; \n\t" +#else + "mov %%ebx, %c[rbx](%[svm]) \n\t" + "mov %%ecx, %c[rcx](%[svm]) \n\t" + "mov %%edx, %c[rdx](%[svm]) \n\t" + "mov %%esi, %c[rsi](%[svm]) \n\t" + "mov %%edi, %c[rdi](%[svm]) \n\t" + "mov %%ebp, %c[rbp](%[svm]) \n\t" + + "pop %%ebp; \n\t" +#endif + : + : [svm]"a"(svm), + [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), + [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) +#ifdef CONFIG_X86_64 + , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) +#endif + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" + , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx" , "esi", "edi" +#endif + ); + + if ((svm->vmcb->save.dr7 & 0xff)) + load_db_regs(svm->host_db_regs); + + vcpu->arch.cr2 = svm->vmcb->save.cr2; + + write_dr6(svm->host_dr6); + write_dr7(svm->host_dr7); + kvm_write_cr2(svm->host_cr2); + + load_fs(fs_selector); + load_gs(gs_selector); + load_ldt(ldt_selector); + load_host_msrs(vcpu); + + reload_tss(vcpu); + + local_irq_disable(); + + stgi(); + + svm->next_rip = 0; +} + +static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->save.cr3 = root; + force_new_asid(vcpu); + + if (vcpu->fpu_active) { + svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + svm->vmcb->save.cr0 |= X86_CR0_TS; + vcpu->fpu_active = 0; + } +} + +static int is_disabled(void) +{ + u64 vm_cr; + + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) + return 1; + + return 0; +} + +static void +svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xd9; +} + +static void svm_check_processor_compat(void *rtn) +{ + *(int *)rtn = 0; +} + +static bool svm_cpu_has_accelerated_tpr(void) +{ + return false; +} + +static struct kvm_x86_ops svm_x86_ops = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, + .hardware_unsetup = svm_hardware_unsetup, + .check_processor_compatibility = svm_check_processor_compat, + .hardware_enable = svm_hardware_enable, + .hardware_disable = svm_hardware_disable, + .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + + .vcpu_create = svm_create_vcpu, + .vcpu_free = svm_free_vcpu, + .vcpu_reset = svm_vcpu_reset, + + .prepare_guest_switch = svm_prepare_guest_switch, + .vcpu_load = svm_vcpu_load, + .vcpu_put = svm_vcpu_put, + .vcpu_decache = svm_vcpu_decache, + + .set_guest_debug = svm_guest_debug, + .get_msr = svm_get_msr, + .set_msr = svm_set_msr, + .get_segment_base = svm_get_segment_base, + .get_segment = svm_get_segment, + .set_segment = svm_set_segment, + .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, + .set_cr0 = svm_set_cr0, + .set_cr3 = svm_set_cr3, + .set_cr4 = svm_set_cr4, + .set_efer = svm_set_efer, + .get_idt = svm_get_idt, + .set_idt = svm_set_idt, + .get_gdt = svm_get_gdt, + .set_gdt = svm_set_gdt, + .get_dr = svm_get_dr, + .set_dr = svm_set_dr, + .cache_regs = svm_cache_regs, + .decache_regs = svm_decache_regs, + .get_rflags = svm_get_rflags, + .set_rflags = svm_set_rflags, + + .tlb_flush = svm_flush_tlb, + + .run = svm_vcpu_run, + .handle_exit = handle_exit, + .skip_emulated_instruction = skip_emulated_instruction, + .patch_hypercall = svm_patch_hypercall, + .get_irq = svm_get_irq, + .set_irq = svm_set_irq, + .queue_exception = svm_queue_exception, + .exception_injected = svm_exception_injected, + .inject_pending_irq = svm_intr_assist, + .inject_pending_vectors = do_interrupt_requests, + + .set_tss_addr = svm_set_tss_addr, +}; + +static int __init svm_init(void) +{ + return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), + THIS_MODULE); +} + +static void __exit svm_exit(void) +{ + kvm_exit(); +} + +module_init(svm_init) +module_exit(svm_exit) diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h new file mode 100644 index 00000000000..5fd50491b55 --- /dev/null +++ b/arch/x86/kvm/svm.h @@ -0,0 +1,325 @@ +#ifndef __SVM_H +#define __SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[44]; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u8 reserved_4[16]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 lbr_ctl; + u8 reserved_5[832]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FEATURE_SHIFT 2 +#define SVM_CPUID_FUNC 0x8000000a + +#define MSR_EFER_SVME_MASK (1ULL << 12) +#define MSR_VM_CR 0xc0010114 +#define MSR_VM_HSAVE_PA 0xc0010117ULL + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_MASK 1 +#define INTERCEPT_CR3_MASK (1 << 3) +#define INTERCEPT_CR4_MASK (1 << 4) +#define INTERCEPT_CR8_MASK (1 << 8) + +#define INTERCEPT_DR0_MASK 1 +#define INTERCEPT_DR1_MASK (1 << 1) +#define INTERCEPT_DR2_MASK (1 << 2) +#define INTERCEPT_DR3_MASK (1 << 3) +#define INTERCEPT_DR4_MASK (1 << 4) +#define INTERCEPT_DR5_MASK (1 << 5) +#define INTERCEPT_DR6_MASK (1 << 6) +#define INTERCEPT_DR7_MASK (1 << 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ + +#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" +#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" +#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" +#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" +#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" +#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" + +#endif + diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c new file mode 100644 index 00000000000..ad36447e696 --- /dev/null +++ b/arch/x86/kvm/vmx.c @@ -0,0 +1,2679 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "irq.h" +#include "vmx.h" +#include "segment_descriptor.h" +#include "mmu.h" + +#include <linux/kvm_host.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/moduleparam.h> + +#include <asm/io.h> +#include <asm/desc.h> + +MODULE_AUTHOR("Qumranet"); +MODULE_LICENSE("GPL"); + +static int bypass_guest_pf = 1; +module_param(bypass_guest_pf, bool, 0); + +struct vmcs { + u32 revision_id; + u32 abort; + char data[0]; +}; + +struct vcpu_vmx { + struct kvm_vcpu vcpu; + int launched; + u8 fail; + u32 idt_vectoring_info; + struct kvm_msr_entry *guest_msrs; + struct kvm_msr_entry *host_msrs; + int nmsrs; + int save_nmsrs; + int msr_offset_efer; +#ifdef CONFIG_X86_64 + int msr_offset_kernel_gs_base; +#endif + struct vmcs *vmcs; + struct { + int loaded; + u16 fs_sel, gs_sel, ldt_sel; + int gs_ldt_reload_needed; + int fs_reload_needed; + int guest_efer_loaded; + } host_state; + struct { + struct { + bool pending; + u8 vector; + unsigned rip; + } irq; + } rmode; +}; + +static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_vmx, vcpu); +} + +static int init_rmode_tss(struct kvm *kvm); + +static DEFINE_PER_CPU(struct vmcs *, vmxarea); +static DEFINE_PER_CPU(struct vmcs *, current_vmcs); + +static struct page *vmx_io_bitmap_a; +static struct page *vmx_io_bitmap_b; + +static struct vmcs_config { + int size; + int order; + u32 revision_id; + u32 pin_based_exec_ctrl; + u32 cpu_based_exec_ctrl; + u32 cpu_based_2nd_exec_ctrl; + u32 vmexit_ctrl; + u32 vmentry_ctrl; +} vmcs_config; + +#define VMX_SEGMENT_FIELD(seg) \ + [VCPU_SREG_##seg] = { \ + .selector = GUEST_##seg##_SELECTOR, \ + .base = GUEST_##seg##_BASE, \ + .limit = GUEST_##seg##_LIMIT, \ + .ar_bytes = GUEST_##seg##_AR_BYTES, \ + } + +static struct kvm_vmx_segment_field { + unsigned selector; + unsigned base; + unsigned limit; + unsigned ar_bytes; +} kvm_vmx_segment_fields[] = { + VMX_SEGMENT_FIELD(CS), + VMX_SEGMENT_FIELD(DS), + VMX_SEGMENT_FIELD(ES), + VMX_SEGMENT_FIELD(FS), + VMX_SEGMENT_FIELD(GS), + VMX_SEGMENT_FIELD(SS), + VMX_SEGMENT_FIELD(TR), + VMX_SEGMENT_FIELD(LDTR), +}; + +/* + * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * away by decrementing the array size. + */ +static const u32 vmx_msr_index[] = { +#ifdef CONFIG_X86_64 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, +#endif + MSR_EFER, MSR_K6_STAR, +}; +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) + +static void load_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + wrmsrl(e[i].index, e[i].data); +} + +static void save_msrs(struct kvm_msr_entry *e, int n) +{ + int i; + + for (i = 0; i < n; ++i) + rdmsrl(e[i].index, e[i].data); +} + +static inline int is_page_fault(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_no_device(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_invalid_opcode(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | + INTR_INFO_VALID_MASK)) == + (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); +} + +static inline int is_external_interrupt(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static inline int cpu_has_vmx_tpr_shadow(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); +} + +static inline int vm_need_tpr_shadow(struct kvm *kvm) +{ + return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); +} + +static inline int cpu_has_secondary_exec_ctrls(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); +} + +static inline bool cpu_has_vmx_virtualize_apic_accesses(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); +} + +static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) +{ + return ((cpu_has_vmx_virtualize_apic_accesses()) && + (irqchip_in_kernel(kvm))); +} + +static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + for (i = 0; i < vmx->nmsrs; ++i) + if (vmx->guest_msrs[i].index == msr) + return i; + return -1; +} + +static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +{ + int i; + + i = __find_msr_index(vmx, msr); + if (i >= 0) + return &vmx->guest_msrs[i]; + return NULL; +} + +static void vmcs_clear(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", + vmcs, phys_addr); +} + +static void __vcpu_clear(void *arg) +{ + struct vcpu_vmx *vmx = arg; + int cpu = raw_smp_processor_id(); + + if (vmx->vcpu.cpu == cpu) + vmcs_clear(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) == vmx->vmcs) + per_cpu(current_vmcs, cpu) = NULL; + rdtscll(vmx->vcpu.arch.host_tsc); +} + +static void vcpu_clear(struct vcpu_vmx *vmx) +{ + if (vmx->vcpu.cpu == -1) + return; + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + vmx->launched = 0; +} + +static unsigned long vmcs_readl(unsigned long field) +{ + unsigned long value; + + asm volatile (ASM_VMX_VMREAD_RDX_RAX + : "=a"(value) : "d"(field) : "cc"); + return value; +} + +static u16 vmcs_read16(unsigned long field) +{ + return vmcs_readl(field); +} + +static u32 vmcs_read32(unsigned long field) +{ + return vmcs_readl(field); +} + +static u64 vmcs_read64(unsigned long field) +{ +#ifdef CONFIG_X86_64 + return vmcs_readl(field); +#else + return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); +#endif +} + +static noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); + dump_stack(); +} + +static void vmcs_writel(unsigned long field, unsigned long value) +{ + u8 error; + + asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + : "=q"(error) : "a"(value), "d"(field) : "cc"); + if (unlikely(error)) + vmwrite_error(field, value); +} + +static void vmcs_write16(unsigned long field, u16 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write32(unsigned long field, u32 value) +{ + vmcs_writel(field, value); +} + +static void vmcs_write64(unsigned long field, u64 value) +{ +#ifdef CONFIG_X86_64 + vmcs_writel(field, value); +#else + vmcs_writel(field, value); + asm volatile (""); + vmcs_writel(field+1, value >> 32); +#endif +} + +static void vmcs_clear_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) & ~mask); +} + +static void vmcs_set_bits(unsigned long field, u32 mask) +{ + vmcs_writel(field, vmcs_readl(field) | mask); +} + +static void update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + u32 eb; + + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); + if (!vcpu->fpu_active) + eb |= 1u << NM_VECTOR; + if (vcpu->guest_debug.enabled) + eb |= 1u << 1; + if (vcpu->arch.rmode.active) + eb = ~0; + vmcs_write32(EXCEPTION_BITMAP, eb); +} + +static void reload_tss(void) +{ +#ifndef CONFIG_X86_64 + + /* + * VT restores TR but not its size. Useless. + */ + struct descriptor_table gdt; + struct segment_descriptor *descs; + + get_gdt(&gdt); + descs = (void *)gdt.base; + descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ + load_TR_desc(); +#endif +} + +static void load_transition_efer(struct vcpu_vmx *vmx) +{ + int efer_offset = vmx->msr_offset_efer; + u64 host_efer = vmx->host_msrs[efer_offset].data; + u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 ignore_bits; + + if (efer_offset < 0) + return; + /* + * NX is emulated; LMA and LME handled by hardware; SCE meaninless + * outside long mode + */ + ignore_bits = EFER_NX | EFER_SCE; +#ifdef CONFIG_X86_64 + ignore_bits |= EFER_LMA | EFER_LME; + /* SCE is meaningful only in long mode on Intel */ + if (guest_efer & EFER_LMA) + ignore_bits &= ~(u64)EFER_SCE; +#endif + if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) + return; + + vmx->host_state.guest_efer_loaded = 1; + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + wrmsrl(MSR_EFER, guest_efer); + vmx->vcpu.stat.efer_reload++; +} + +static void reload_host_efer(struct vcpu_vmx *vmx) +{ + if (vmx->host_state.guest_efer_loaded) { + vmx->host_state.guest_efer_loaded = 0; + load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); + } +} + +static void vmx_save_host_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->host_state.loaded) + return; + + vmx->host_state.loaded = 1; + /* + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not + * allow segment selectors with cpl > 0 or ti == 1. + */ + vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + vmx->host_state.fs_sel = read_fs(); + if (!(vmx->host_state.fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + vmx->host_state.fs_reload_needed = 0; + } else { + vmcs_write16(HOST_FS_SELECTOR, 0); + vmx->host_state.fs_reload_needed = 1; + } + vmx->host_state.gs_sel = read_gs(); + if (!(vmx->host_state.gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + else { + vmcs_write16(HOST_GS_SELECTOR, 0); + vmx->host_state.gs_ldt_reload_needed = 1; + } + +#ifdef CONFIG_X86_64 + vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); + vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); +#else + vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); + vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); +#endif + +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + save_msrs(vmx->host_msrs + + vmx->msr_offset_kernel_gs_base, 1); + +#endif + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_transition_efer(vmx); +} + +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + unsigned long flags; + + if (!vmx->host_state.loaded) + return; + + ++vmx->vcpu.stat.host_state_reload; + vmx->host_state.loaded = 0; + if (vmx->host_state.fs_reload_needed) + load_fs(vmx->host_state.fs_sel); + if (vmx->host_state.gs_ldt_reload_needed) { + load_ldt(vmx->host_state.ldt_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. + */ + local_irq_save(flags); + load_gs(vmx->host_state.gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_restore(flags); + } + reload_tss(); + save_msrs(vmx->guest_msrs, vmx->save_nmsrs); + load_msrs(vmx->host_msrs, vmx->save_nmsrs); + reload_host_efer(vmx); +} + +/* + * Switches to specified vcpu, until a matching vcpu_put(), but assumes + * vcpu mutex is already taken. + */ +static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 phys_addr = __pa(vmx->vmcs); + u64 tsc_this, delta; + + if (vcpu->cpu != cpu) { + vcpu_clear(vmx); + kvm_migrate_apic_timer(vcpu); + } + + if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { + u8 error; + + per_cpu(current_vmcs, cpu) = vmx->vmcs; + asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmx->vmcs, phys_addr); + } + + if (vcpu->cpu != cpu) { + struct descriptor_table dt; + unsigned long sysenter_esp; + + vcpu->cpu = cpu; + /* + * Linux uses per-cpu TSS and GDT, so set these when switching + * processors. + */ + vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ + get_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* + * Make sure the time stamp counter is monotonous. + */ + rdtscll(tsc_this); + delta = vcpu->arch.host_tsc - tsc_this; + vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); + } +} + +static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_load_host_state(to_vmx(vcpu)); +} + +static void vmx_fpu_activate(struct kvm_vcpu *vcpu) +{ + if (vcpu->fpu_active) + return; + vcpu->fpu_active = 1; + vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); + if (vcpu->arch.cr0 & X86_CR0_TS) + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + update_exception_bitmap(vcpu); +} + +static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active) + return; + vcpu->fpu_active = 0; + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); + update_exception_bitmap(vcpu); +} + +static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) +{ + vcpu_clear(to_vmx(vcpu)); +} + +static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) +{ + return vmcs_readl(GUEST_RFLAGS); +} + +static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (vcpu->arch.rmode.active) + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + vmcs_writel(GUEST_RFLAGS, rflags); +} + +static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rip; + u32 interruptibility; + + rip = vmcs_readl(GUEST_RIP); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_writel(GUEST_RIP, rip); + + /* + * We emulated an instruction, so temporary interrupt blocking + * should be removed, if set. + */ + interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + if (interruptibility & 3) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + interruptibility & ~3); + vcpu->arch.interrupt_window_open = 1; +} + +static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code) +{ + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_EXCEPTION + | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); +} + +static bool vmx_exception_injected(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); +} + +/* + * Swap MSR entry in host/guest MSR entry array. + */ +#ifdef CONFIG_X86_64 +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +{ + struct kvm_msr_entry tmp; + + tmp = vmx->guest_msrs[to]; + vmx->guest_msrs[to] = vmx->guest_msrs[from]; + vmx->guest_msrs[from] = tmp; + tmp = vmx->host_msrs[to]; + vmx->host_msrs[to] = vmx->host_msrs[from]; + vmx->host_msrs[from] = tmp; +} +#endif + +/* + * Set up the vmcs to automatically save and restore system + * msrs. Don't touch the 64-bit msrs if the guest is in legacy + * mode, as fiddling with msrs is very expensive. + */ +static void setup_msrs(struct vcpu_vmx *vmx) +{ + int save_nmsrs; + + save_nmsrs = 0; +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + int index; + + index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_LSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_CSTAR); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); + /* + * MSR_K6_STAR is only needed on long mode guests, and only + * if efer.sce is enabled. + */ + index = __find_msr_index(vmx, MSR_K6_STAR); + if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) + move_msr_up(vmx, index, save_nmsrs++); + } +#endif + vmx->save_nmsrs = save_nmsrs; + +#ifdef CONFIG_X86_64 + vmx->msr_offset_kernel_gs_base = + __find_msr_index(vmx, MSR_KERNEL_GS_BASE); +#endif + vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); +} + +/* + * reads and returns guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset -- 21.3 + */ +static u64 guest_read_tsc(void) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* + * writes 'guest_tsc' into guest's timestamp counter "register" + * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + */ +static void guest_write_tsc(u64 guest_tsc) +{ + u64 host_tsc; + + rdtscll(host_tsc); + vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); +} + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + u64 data; + struct kvm_msr_entry *msr; + + if (!pdata) { + printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); + return -EINVAL; + } + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_FS_BASE: + data = vmcs_readl(GUEST_FS_BASE); + break; + case MSR_GS_BASE: + data = vmcs_readl(GUEST_GS_BASE); + break; + case MSR_EFER: + return kvm_get_msr_common(vcpu, msr_index, pdata); +#endif + case MSR_IA32_TIME_STAMP_COUNTER: + data = guest_read_tsc(); + break; + case MSR_IA32_SYSENTER_CS: + data = vmcs_read32(GUEST_SYSENTER_CS); + break; + case MSR_IA32_SYSENTER_EIP: + data = vmcs_readl(GUEST_SYSENTER_EIP); + break; + case MSR_IA32_SYSENTER_ESP: + data = vmcs_readl(GUEST_SYSENTER_ESP); + break; + default: + msr = find_msr_entry(to_vmx(vcpu), msr_index); + if (msr) { + data = msr->data; + break; + } + return kvm_get_msr_common(vcpu, msr_index, pdata); + } + + *pdata = data; + return 0; +} + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr; + int ret = 0; + + switch (msr_index) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + ret = kvm_set_msr_common(vcpu, msr_index, data); + if (vmx->host_state.loaded) { + reload_host_efer(vmx); + load_transition_efer(vmx); + } + break; + case MSR_FS_BASE: + vmcs_writel(GUEST_FS_BASE, data); + break; + case MSR_GS_BASE: + vmcs_writel(GUEST_GS_BASE, data); + break; +#endif + case MSR_IA32_SYSENTER_CS: + vmcs_write32(GUEST_SYSENTER_CS, data); + break; + case MSR_IA32_SYSENTER_EIP: + vmcs_writel(GUEST_SYSENTER_EIP, data); + break; + case MSR_IA32_SYSENTER_ESP: + vmcs_writel(GUEST_SYSENTER_ESP, data); + break; + case MSR_IA32_TIME_STAMP_COUNTER: + guest_write_tsc(data); + break; + default: + msr = find_msr_entry(vmx, msr_index); + if (msr) { + msr->data = data; + if (vmx->host_state.loaded) + load_msrs(vmx->guest_msrs, vmx->save_nmsrs); + break; + } + ret = kvm_set_msr_common(vcpu, msr_index, data); + } + + return ret; +} + +/* + * Sync the rsp and rip registers into the vcpu structure. This allows + * registers to be accessed by indexing vcpu->arch.regs. + */ +static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) +{ + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + vcpu->arch.rip = vmcs_readl(GUEST_RIP); +} + +/* + * Syncs rsp and rip back into the vmcs. Should be called after possible + * modification. + */ +static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +{ + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + vmcs_writel(GUEST_RIP, vcpu->arch.rip); +} + +static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) +{ + unsigned long dr7 = 0x400; + int old_singlestep; + + old_singlestep = vcpu->guest_debug.singlestep; + + vcpu->guest_debug.enabled = dbg->enabled; + if (vcpu->guest_debug.enabled) { + int i; + + dr7 |= 0x200; /* exact */ + for (i = 0; i < 4; ++i) { + if (!dbg->breakpoints[i].enabled) + continue; + vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; + dr7 |= 2 << (i*2); /* global enable */ + dr7 |= 0 << (i*4+16); /* execution breakpoint */ + } + + vcpu->guest_debug.singlestep = dbg->singlestep; + } else + vcpu->guest_debug.singlestep = 0; + + if (old_singlestep && !vcpu->guest_debug.singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + vmcs_writel(GUEST_RFLAGS, flags); + } + + update_exception_bitmap(vcpu); + vmcs_writel(GUEST_DR7, dr7); + + return 0; +} + +static int vmx_get_irq(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 idtv_info_field; + + idtv_info_field = vmx->idt_vectoring_info; + if (idtv_info_field & INTR_INFO_VALID_MASK) { + if (is_external_interrupt(idtv_info_field)) + return idtv_info_field & VECTORING_INFO_VECTOR_MASK; + else + printk(KERN_DEBUG "pending exception: not handled yet\n"); + } + return -1; +} + +static __init int cpu_has_kvm_support(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + +static __init int vmx_disabled_by_bios(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + == MSR_IA32_FEATURE_CONTROL_LOCKED; + /* locked but not enabled */ +} + +static void hardware_enable(void *garbage) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + u64 old; + + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + != (MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + /* enable and lock */ + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | + MSR_IA32_FEATURE_CONTROL_LOCKED | + MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + : "memory", "cc"); +} + +static void hardware_disable(void *garbage) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); +} + +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, + u32 msr, u32 *result) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 ctl = ctl_min | ctl_opt; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ + + /* Ensure minimum (required) set of control bits are supported. */ + if (ctl_min & ~ctl) + return -EIO; + + *result = ctl; + return 0; +} + +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) +{ + u32 vmx_msr_low, vmx_msr_high; + u32 min, opt; + u32 _pin_based_exec_control = 0; + u32 _cpu_based_exec_control = 0; + u32 _cpu_based_2nd_exec_control = 0; + u32 _vmexit_control = 0; + u32 _vmentry_control = 0; + + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + min = CPU_BASED_HLT_EXITING | +#ifdef CONFIG_X86_64 + CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING | +#endif + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MOV_DR_EXITING | + CPU_BASED_USE_TSC_OFFSETING; + opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control) < 0) + return -EIO; +#ifdef CONFIG_X86_64 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & + ~CPU_BASED_CR8_STORE_EXITING; +#endif + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { + min = 0; + opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_WBINVD_EXITING; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, + &_cpu_based_2nd_exec_control) < 0) + return -EIO; + } +#ifndef CONFIG_X86_64 + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; +#endif + + min = 0; +#ifdef CONFIG_X86_64 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; +#endif + opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control) < 0) + return -EIO; + + min = opt = 0; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control) < 0) + return -EIO; + + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + return -EIO; + +#ifdef CONFIG_X86_64 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ + if (vmx_msr_high & (1u<<16)) + return -EIO; +#endif + + /* Require Write-Back (WB) memory type for VMCS accesses. */ + if (((vmx_msr_high >> 18) & 15) != 6) + return -EIO; + + vmcs_conf->size = vmx_msr_high & 0x1fff; + vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->revision_id = vmx_msr_low; + + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->vmexit_ctrl = _vmexit_control; + vmcs_conf->vmentry_ctrl = _vmentry_control; + + return 0; +} + +static struct vmcs *alloc_vmcs_cpu(int cpu) +{ + int node = cpu_to_node(cpu); + struct page *pages; + struct vmcs *vmcs; + + pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + if (!pages) + return NULL; + vmcs = page_address(pages); + memset(vmcs, 0, vmcs_config.size); + vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ + return vmcs; +} + +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(raw_smp_processor_id()); +} + +static void free_vmcs(struct vmcs *vmcs) +{ + free_pages((unsigned long)vmcs, vmcs_config.order); +} + +static void free_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) + free_vmcs(per_cpu(vmxarea, cpu)); +} + +static __init int alloc_kvm_area(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct vmcs *vmcs; + + vmcs = alloc_vmcs_cpu(cpu); + if (!vmcs) { + free_kvm_area(); + return -ENOMEM; + } + + per_cpu(vmxarea, cpu) = vmcs; + } + return 0; +} + +static __init int hardware_setup(void) +{ + if (setup_vmcs_config(&vmcs_config) < 0) + return -EIO; + return alloc_kvm_area(); +} + +static __exit void hardware_unsetup(void) +{ + free_kvm_area(); +} + +static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { + vmcs_write16(sf->selector, save->selector); + vmcs_writel(sf->base, save->base); + vmcs_write32(sf->limit, save->limit); + vmcs_write32(sf->ar_bytes, save->ar); + } else { + u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) + << AR_DPL_SHIFT; + vmcs_write32(sf->ar_bytes, 0x93 | dpl); + } +} + +static void enter_pmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->arch.rmode.active = 0; + + vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + + flags = vmcs_readl(GUEST_RFLAGS); + flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + vmcs_writel(GUEST_RFLAGS, flags); + + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); + + update_exception_bitmap(vcpu); + + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + + vmcs_write16(GUEST_SS_SELECTOR, 0); + vmcs_write32(GUEST_SS_AR_BYTES, 0x93); + + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); +} + +static gva_t rmode_tss_base(struct kvm *kvm) +{ + if (!kvm->arch.tss_addr) { + gfn_t base_gfn = kvm->memslots[0].base_gfn + + kvm->memslots[0].npages - 3; + return base_gfn << PAGE_SHIFT; + } + return kvm->arch.tss_addr; +} + +static void fix_rmode_seg(int seg, struct kvm_save_segment *save) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + save->selector = vmcs_read16(sf->selector); + save->base = vmcs_readl(sf->base); + save->limit = vmcs_read32(sf->limit); + save->ar = vmcs_read32(sf->ar_bytes); + vmcs_write16(sf->selector, save->base >> 4); + vmcs_write32(sf->base, save->base & 0xfffff); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0xf3); +} + +static void enter_rmode(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + + vcpu->arch.rmode.active = 1; + + vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); + + vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); + + vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + flags = vmcs_readl(GUEST_RFLAGS); + vcpu->arch.rmode.save_iopl + = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + + vmcs_writel(GUEST_RFLAGS, flags); + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); + update_exception_bitmap(vcpu); + + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + + fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + + kvm_mmu_reset_context(vcpu); + init_rmode_tss(vcpu->kvm); +} + +#ifdef CONFIG_X86_64 + +static void enter_lmode(struct kvm_vcpu *vcpu) +{ + u32 guest_tr_ar; + + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); + if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + printk(KERN_DEBUG "%s: tss fixup for long mode. \n", + __FUNCTION__); + vmcs_write32(GUEST_TR_AR_BYTES, + (guest_tr_ar & ~AR_TYPE_MASK) + | AR_TYPE_BUSY_64_TSS); + } + + vcpu->arch.shadow_efer |= EFER_LMA; + + find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + | VM_ENTRY_IA32E_MODE); +} + +static void exit_lmode(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shadow_efer &= ~EFER_LMA; + + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) + & ~VM_ENTRY_IA32E_MODE); +} + +#endif + +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) +{ + vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; +} + +static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + vmx_fpu_deactivate(vcpu); + + if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); + + if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + +#ifdef CONFIG_X86_64 + if (vcpu->arch.shadow_efer & EFER_LME) { + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + enter_lmode(vcpu); + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + exit_lmode(vcpu); + } +#endif + + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, + (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); + vcpu->arch.cr0 = cr0; + + if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) + vmx_fpu_activate(vcpu); +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + vmcs_writel(GUEST_CR3, cr3); + if (vcpu->arch.cr0 & X86_CR0_PE) + vmx_fpu_deactivate(vcpu); +} + +static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + vmcs_writel(CR4_READ_SHADOW, cr4); + vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); + vcpu->arch.cr4 = cr4; +} + +#ifdef CONFIG_X86_64 + +static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + + vcpu->arch.shadow_efer = efer; + if (efer & EFER_LMA) { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) | + VM_ENTRY_IA32E_MODE); + msr->data = efer; + + } else { + vmcs_write32(VM_ENTRY_CONTROLS, + vmcs_read32(VM_ENTRY_CONTROLS) & + ~VM_ENTRY_IA32E_MODE); + + msr->data = efer & ~EFER_LME; + } + setup_msrs(vmx); +} + +#endif + +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + return vmcs_readl(sf->base); +} + +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + var->base = vmcs_readl(sf->base); + var->limit = vmcs_read32(sf->limit); + var->selector = vmcs_read16(sf->selector); + ar = vmcs_read32(sf->ar_bytes); + if (ar & AR_UNUSABLE_MASK) + ar = 0; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + var->present = (ar >> 7) & 1; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; + var->unusable = (ar >> 16) & 1; +} + +static u32 vmx_segment_access_rights(struct kvm_segment *var) +{ + u32 ar; + + if (var->unusable) + ar = 1 << 16; + else { + ar = var->type & 15; + ar |= (var->s & 1) << 4; + ar |= (var->dpl & 3) << 5; + ar |= (var->present & 1) << 7; + ar |= (var->avl & 1) << 12; + ar |= (var->l & 1) << 13; + ar |= (var->db & 1) << 14; + ar |= (var->g & 1) << 15; + } + if (ar == 0) /* a 0 value means unusable */ + ar = AR_UNUSABLE_MASK; + + return ar; +} + +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + u32 ar; + + if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { + vcpu->arch.rmode.tr.selector = var->selector; + vcpu->arch.rmode.tr.base = var->base; + vcpu->arch.rmode.tr.limit = var->limit; + vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + return; + } + vmcs_writel(sf->base, var->base); + vmcs_write32(sf->limit, var->limit); + vmcs_write16(sf->selector, var->selector); + if (vcpu->arch.rmode.active && var->s) { + /* + * Hack real-mode segments into vm86 compatibility. + */ + if (var->base == 0xffff0000 && var->selector == 0xf000) + vmcs_writel(sf->base, 0xf0000); + ar = 0xf3; + } else + ar = vmx_segment_access_rights(var); + vmcs_write32(sf->ar_bytes, ar); +} + +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); + dt->base = vmcs_readl(GUEST_IDTR_BASE); +} + +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_IDTR_BASE, dt->base); +} + +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); + dt->base = vmcs_readl(GUEST_GDTR_BASE); +} + +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +{ + vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); + vmcs_writel(GUEST_GDTR_BASE, dt->base); +} + +static int init_rmode_tss(struct kvm *kvm) +{ + gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + u16 data = 0; + int ret = 0; + int r; + + down_read(¤t->mm->mmap_sem); + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; + r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); + if (r < 0) + goto out; + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); + if (r < 0) + goto out; + data = ~0; + r = kvm_write_guest_page(kvm, fn, &data, + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, + sizeof(u8)); + if (r < 0) + goto out; + + ret = 1; +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +static void seg_setup(int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + + vmcs_write16(sf->selector, 0); + vmcs_writel(sf->base, 0); + vmcs_write32(sf->limit, 0xffff); + vmcs_write32(sf->ar_bytes, 0x93); +} + +static int alloc_apic_access_page(struct kvm *kvm) +{ + struct kvm_userspace_memory_region kvm_userspace_mem; + int r = 0; + + down_write(¤t->mm->mmap_sem); + if (kvm->arch.apic_access_page) + goto out; + kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; + kvm_userspace_mem.flags = 0; + kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; + kvm_userspace_mem.memory_size = PAGE_SIZE; + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); +out: + up_write(¤t->mm->mmap_sem); + return r; +} + +/* + * Sets up the vmcs for emulated real mode. + */ +static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +{ + u32 host_sysenter_cs; + u32 junk; + unsigned long a; + struct descriptor_table dt; + int i; + unsigned long kvm_vmx_return; + u32 exec_control; + + /* I/O */ + vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); + vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + + /* Control */ + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, + vmcs_config.pin_based_exec_ctrl); + + exec_control = vmcs_config.cpu_based_exec_ctrl; + if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + exec_control &= ~CPU_BASED_TPR_SHADOW; +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_STORE_EXITING | + CPU_BASED_CR8_LOAD_EXITING; +#endif + } + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + exec_control &= + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + } + + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ + rdmsrl(MSR_GS_BASE, a); + vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ +#else + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ +#endif + + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ + + get_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + + asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); + vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + + rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); + vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); + rdmsrl(MSR_IA32_SYSENTER_ESP, a); + vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ + rdmsrl(MSR_IA32_SYSENTER_EIP, a); + vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + + for (i = 0; i < NR_VMX_MSR; ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + u64 data; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + data = data_low | ((u64)data_high << 32); + vmx->host_msrs[j].index = index; + vmx->host_msrs[j].reserved = 0; + vmx->host_msrs[j].data = data; + vmx->guest_msrs[j] = vmx->host_msrs[j]; + ++vmx->nmsrs; + } + + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* 22.2.1, 20.8.1 */ + vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); + + vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); + vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) + return -ENOMEM; + + return 0; +} + +static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 msr; + int ret; + + if (!init_rmode_tss(vmx->vcpu.kvm)) { + ret = -ENOMEM; + goto out; + } + + vmx->vcpu.arch.rmode.active = 0; + + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + set_cr8(&vmx->vcpu, 0); + msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; + if (vmx->vcpu.vcpu_id == 0) + msr |= MSR_IA32_APICBASE_BSP; + kvm_set_apic_base(&vmx->vcpu, msr); + + fx_init(&vmx->vcpu); + + /* + * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode + * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. + */ + if (vmx->vcpu.vcpu_id == 0) { + vmcs_write16(GUEST_CS_SELECTOR, 0xf000); + vmcs_writel(GUEST_CS_BASE, 0x000f0000); + } else { + vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); + vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); + } + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + + seg_setup(VCPU_SREG_DS); + seg_setup(VCPU_SREG_ES); + seg_setup(VCPU_SREG_FS); + seg_setup(VCPU_SREG_GS); + seg_setup(VCPU_SREG_SS); + + vmcs_write16(GUEST_TR_SELECTOR, 0); + vmcs_writel(GUEST_TR_BASE, 0); + vmcs_write32(GUEST_TR_LIMIT, 0xffff); + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); + + vmcs_write16(GUEST_LDTR_SELECTOR, 0); + vmcs_writel(GUEST_LDTR_BASE, 0); + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + + vmcs_writel(GUEST_RFLAGS, 0x02); + if (vmx->vcpu.vcpu_id == 0) + vmcs_writel(GUEST_RIP, 0xfff0); + else + vmcs_writel(GUEST_RIP, 0); + vmcs_writel(GUEST_RSP, 0); + + /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ + vmcs_writel(GUEST_DR7, 0x400); + + vmcs_writel(GUEST_GDTR_BASE, 0); + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); + + vmcs_writel(GUEST_IDTR_BASE, 0); + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + + vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); + vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); + + guest_write_tsc(0); + + /* Special registers */ + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + setup_msrs(vmx); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + page_to_phys(vmx->vcpu.arch.apic->regs_page)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + vmcs_write64(APIC_ACCESS_ADDR, + page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + + vmx->vcpu.arch.cr0 = 0x60000010; + vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx_set_cr4(&vmx->vcpu, 0); +#ifdef CONFIG_X86_64 + vmx_set_efer(&vmx->vcpu, 0); +#endif + vmx_fpu_activate(&vmx->vcpu); + update_exception_bitmap(&vmx->vcpu); + + return 0; + +out: + return ret; +} + +static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = irq; + vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); +} + +static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) +{ + int word_index = __ffs(vcpu->arch.irq_summary); + int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); + int irq = word_index * BITS_PER_LONG + bit_index; + + clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); + if (!vcpu->arch.irq_pending[word_index]) + clear_bit(word_index, &vcpu->arch.irq_summary); + vmx_inject_irq(vcpu, irq); +} + + +static void do_interrupt_requests(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + u32 cpu_based_vm_exec_control; + + vcpu->arch.interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + + if (vcpu->arch.interrupt_window_open && + vcpu->arch.irq_summary && + !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) + /* + * If interrupts enabled, and not blocked by sti or mov ss. Good. + */ + kvm_do_inject_irq(vcpu); + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (!vcpu->arch.interrupt_window_open && + (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) + /* + * Interrupts blocked. Wait for unblock. + */ + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + else + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + int ret; + struct kvm_userspace_memory_region tss_mem = { + .slot = 8, + .guest_phys_addr = addr, + .memory_size = PAGE_SIZE * 3, + .flags = 0, + }; + + ret = kvm_set_memory_region(kvm, &tss_mem, 0); + if (ret) + return ret; + kvm->arch.tss_addr = addr; + return 0; +} + +static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) +{ + struct kvm_guest_debug *dbg = &vcpu->guest_debug; + + set_debugreg(dbg->bp[0], 0); + set_debugreg(dbg->bp[1], 1); + set_debugreg(dbg->bp[2], 2); + set_debugreg(dbg->bp[3], 3); + + if (dbg->singlestep) { + unsigned long flags; + + flags = vmcs_readl(GUEST_RFLAGS); + flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + vmcs_writel(GUEST_RFLAGS, flags); + } +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + if (!vcpu->arch.rmode.active) + return 0; + + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) + if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + return 1; + return 0; +} + +static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info, error_code; + unsigned long cr2, rip; + u32 vect_info; + enum emulation_result er; + + vect_info = vmx->idt_vectoring_info; + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !is_page_fault(intr_info)) + printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " + "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + + if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { + int irq = vect_info & VECTORING_INFO_VECTOR_MASK; + set_bit(irq, vcpu->arch.irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + } + + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + return 1; /* already handled by vmx_vcpu_run() */ + + if (is_no_device(intr_info)) { + vmx_fpu_activate(vcpu); + return 1; + } + + if (is_invalid_opcode(intr_info)) { + er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + if (er != EMULATE_DONE) + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + error_code = 0; + rip = vmcs_readl(GUEST_RIP); + if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + if (is_page_fault(intr_info)) { + cr2 = vmcs_readl(EXIT_QUALIFICATION); + return kvm_mmu_page_fault(vcpu, cr2, error_code); + } + + if (vcpu->arch.rmode.active && + handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, + error_code)) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } + + if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == + (INTR_TYPE_EXCEPTION | 1)) { + kvm_run->exit_reason = KVM_EXIT_DEBUG; + return 0; + } + kvm_run->exit_reason = KVM_EXIT_EXCEPTION; + kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + kvm_run->ex.error_code = error_code; + return 0; +} + +static int handle_external_interrupt(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + ++vcpu->stat.irq_exits; + return 1; +} + +static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + return 0; +} + +static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + unsigned long exit_qualification; + int size, down, in, string, rep; + unsigned port; + + ++vcpu->stat.io_exits; + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + string = (exit_qualification & 16) != 0; + + if (string) { + if (emulate_instruction(vcpu, + kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + return 0; + return 1; + } + + size = (exit_qualification & 7) + 1; + in = (exit_qualification & 8) != 0; + down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; + rep = (exit_qualification & 32) != 0; + port = exit_qualification >> 16; + + return kvm_emulate_pio(vcpu, kvm_run, in, size, port); +} + +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; +} + +static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + unsigned long exit_qualification; + int cr; + int reg; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + cr = exit_qualification & 15; + reg = (exit_qualification >> 8) & 15; + switch ((exit_qualification >> 4) & 3) { + case 0: /* mov to cr */ + switch (cr) { + case 0: + vcpu_load_rsp_rip(vcpu); + set_cr0(vcpu, vcpu->arch.regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 3: + vcpu_load_rsp_rip(vcpu); + set_cr3(vcpu, vcpu->arch.regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 4: + vcpu_load_rsp_rip(vcpu); + set_cr4(vcpu, vcpu->arch.regs[reg]); + skip_emulated_instruction(vcpu); + return 1; + case 8: + vcpu_load_rsp_rip(vcpu); + set_cr8(vcpu, vcpu->arch.regs[reg]); + skip_emulated_instruction(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + return 1; + kvm_run->exit_reason = KVM_EXIT_SET_TPR; + return 0; + }; + break; + case 2: /* clts */ + vcpu_load_rsp_rip(vcpu); + vmx_fpu_deactivate(vcpu); + vcpu->arch.cr0 &= ~X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + vmx_fpu_activate(vcpu); + skip_emulated_instruction(vcpu); + return 1; + case 1: /*mov from cr*/ + switch (cr) { + case 3: + vcpu_load_rsp_rip(vcpu); + vcpu->arch.regs[reg] = vcpu->arch.cr3; + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + case 8: + vcpu_load_rsp_rip(vcpu); + vcpu->arch.regs[reg] = get_cr8(vcpu); + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + break; + case 3: /* lmsw */ + lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + + skip_emulated_instruction(vcpu); + return 1; + default: + break; + } + kvm_run->exit_reason = 0; + pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + (int)(exit_qualification >> 4) & 3, cr); + return 0; +} + +static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + unsigned long exit_qualification; + unsigned long val; + int dr, reg; + + /* + * FIXME: this code assumes the host is debugging the guest. + * need to deal with guest debugging itself too. + */ + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + dr = exit_qualification & 7; + reg = (exit_qualification >> 8) & 15; + vcpu_load_rsp_rip(vcpu); + if (exit_qualification & 16) { + /* mov from dr */ + switch (dr) { + case 6: + val = 0xffff0ff0; + break; + case 7: + val = 0x400; + break; + default: + val = 0; + } + vcpu->arch.regs[reg] = val; + } else { + /* mov to dr */ + } + vcpu_put_rsp_rip(vcpu); + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_emulate_cpuid(vcpu); + return 1; +} + +static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data; + + if (vmx_get_msr(vcpu, ecx, &data)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* FIXME: handling of bits 32:63 of rax, rdx */ + vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + if (vmx_set_msr(vcpu, ecx, data) != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return 1; +} + +static int handle_interrupt_window(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending irq */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + /* + * If the user space waits to inject interrupts, exit as soon as + * possible + */ + if (kvm_run->request_interrupt_window && + !vcpu->arch.irq_summary) { + kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + ++vcpu->stat.irq_window_exits; + return 0; + } + return 1; +} + +static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + return kvm_emulate_halt(vcpu); +} + +static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + kvm_emulate_hypercall(vcpu); + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + skip_emulated_instruction(vcpu); + /* TODO: Add support for VT-d/pass-through device */ + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification; + enum emulation_result er; + unsigned long offset; + + exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + offset = exit_qualification & 0xffful; + + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + if (er != EMULATE_DONE) { + printk(KERN_ERR + "Fail to handle apic access vmexit! Offset is 0x%lx\n", + offset); + return -ENOTSUPP; + } + return 1; +} + +/* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs + * to be done to userspace and return 0. + */ +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) = { + [EXIT_REASON_EXCEPTION_NMI] = handle_exception, + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_IO_INSTRUCTION] = handle_io, + [EXIT_REASON_CR_ACCESS] = handle_cr, + [EXIT_REASON_DR_ACCESS] = handle_dr, + [EXIT_REASON_CPUID] = handle_cpuid, + [EXIT_REASON_MSR_READ] = handle_rdmsr, + [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_VMCALL] = handle_vmcall, + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, + [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_WBINVD] = handle_wbinvd, +}; + +static const int kvm_vmx_max_exit_handlers = + ARRAY_SIZE(kvm_vmx_exit_handlers); + +/* + * The guest has exited. See if we can fix it or if we need userspace + * assistance. + */ +static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vectoring_info = vmx->idt_vectoring_info; + + if (unlikely(vmx->fail)) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason + = vmcs_read32(VM_INSTRUCTION_ERROR); + return 0; + } + + if ((vectoring_info & VECTORING_INFO_VALID_MASK) && + exit_reason != EXIT_REASON_EXCEPTION_NMI) + printk(KERN_WARNING "%s: unexpected, valid vectoring info and " + "exit reason is 0x%x\n", __FUNCTION__, exit_reason); + if (exit_reason < kvm_vmx_max_exit_handlers + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + else { + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = exit_reason; + } + return 0; +} + +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ +} + +static void update_tpr_threshold(struct kvm_vcpu *vcpu) +{ + int max_irr, tpr; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + if (!kvm_lapic_enabled(vcpu) || + ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { + vmcs_write32(TPR_THRESHOLD, 0); + return; + } + + tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; + vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); +} + +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void vmx_intr_assist(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 idtv_info_field, intr_info_field; + int has_ext_irq, interrupt_window_open; + int vector; + + update_tpr_threshold(vcpu); + + has_ext_irq = kvm_cpu_has_interrupt(vcpu); + intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + idtv_info_field = vmx->idt_vectoring_info; + if (intr_info_field & INTR_INFO_VALID_MASK) { + if (idtv_info_field & INTR_INFO_VALID_MASK) { + /* TODO: fault when IDT_Vectoring */ + if (printk_ratelimit()) + printk(KERN_ERR "Fault when IDT_Vectoring\n"); + } + if (has_ext_irq) + enable_irq_window(vcpu); + return; + } + if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { + if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_EXT_INTR + && vcpu->arch.rmode.active) { + u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; + + vmx_inject_irq(vcpu, vect); + if (unlikely(has_ext_irq)) + enable_irq_window(vcpu); + return; + } + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + + if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + if (unlikely(has_ext_irq)) + enable_irq_window(vcpu); + return; + } + if (!has_ext_irq) + return; + interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + if (interrupt_window_open) { + vector = kvm_cpu_get_interrupt(vcpu); + vmx_inject_irq(vcpu, vector); + kvm_timer_intr_post(vcpu, vector); + } else + enable_irq_window(vcpu); +} + +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. + */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx) +{ + vmx->rmode.irq.pending = 0; + if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + return; + vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + vmx->idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx->rmode.irq.vector; +} + +static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 intr_info; + + /* + * Loading guest fpu may have cleared host cr0.ts + */ + vmcs_writel(HOST_CR0, read_cr0()); + + asm( + /* Store host registers */ +#ifdef CONFIG_X86_64 + "push %%rdx; push %%rbp;" + "push %%rcx \n\t" +#else + "push %%edx; push %%ebp;" + "push %%ecx \n\t" +#endif + ASM_VMX_VMWRITE_RSP_RDX "\n\t" + /* Check if vmlaunch of vmresume is needed */ + "cmpl $0, %c[launched](%0) \n\t" + /* Load guest registers. Don't clobber flags. */ +#ifdef CONFIG_X86_64 + "mov %c[cr2](%0), %%rax \n\t" + "mov %%rax, %%cr2 \n\t" + "mov %c[rax](%0), %%rax \n\t" + "mov %c[rbx](%0), %%rbx \n\t" + "mov %c[rdx](%0), %%rdx \n\t" + "mov %c[rsi](%0), %%rsi \n\t" + "mov %c[rdi](%0), %%rdi \n\t" + "mov %c[rbp](%0), %%rbp \n\t" + "mov %c[r8](%0), %%r8 \n\t" + "mov %c[r9](%0), %%r9 \n\t" + "mov %c[r10](%0), %%r10 \n\t" + "mov %c[r11](%0), %%r11 \n\t" + "mov %c[r12](%0), %%r12 \n\t" + "mov %c[r13](%0), %%r13 \n\t" + "mov %c[r14](%0), %%r14 \n\t" + "mov %c[r15](%0), %%r15 \n\t" + "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ +#else + "mov %c[cr2](%0), %%eax \n\t" + "mov %%eax, %%cr2 \n\t" + "mov %c[rax](%0), %%eax \n\t" + "mov %c[rbx](%0), %%ebx \n\t" + "mov %c[rdx](%0), %%edx \n\t" + "mov %c[rsi](%0), %%esi \n\t" + "mov %c[rdi](%0), %%edi \n\t" + "mov %c[rbp](%0), %%ebp \n\t" + "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ +#endif + /* Enter guest mode */ + "jne .Llaunched \n\t" + ASM_VMX_VMLAUNCH "\n\t" + "jmp .Lkvm_vmx_return \n\t" + ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Lkvm_vmx_return: " + /* Save guest registers, load host registers, keep flags */ +#ifdef CONFIG_X86_64 + "xchg %0, (%%rsp) \n\t" + "mov %%rax, %c[rax](%0) \n\t" + "mov %%rbx, %c[rbx](%0) \n\t" + "pushq (%%rsp); popq %c[rcx](%0) \n\t" + "mov %%rdx, %c[rdx](%0) \n\t" + "mov %%rsi, %c[rsi](%0) \n\t" + "mov %%rdi, %c[rdi](%0) \n\t" + "mov %%rbp, %c[rbp](%0) \n\t" + "mov %%r8, %c[r8](%0) \n\t" + "mov %%r9, %c[r9](%0) \n\t" + "mov %%r10, %c[r10](%0) \n\t" + "mov %%r11, %c[r11](%0) \n\t" + "mov %%r12, %c[r12](%0) \n\t" + "mov %%r13, %c[r13](%0) \n\t" + "mov %%r14, %c[r14](%0) \n\t" + "mov %%r15, %c[r15](%0) \n\t" + "mov %%cr2, %%rax \n\t" + "mov %%rax, %c[cr2](%0) \n\t" + + "pop %%rbp; pop %%rbp; pop %%rdx \n\t" +#else + "xchg %0, (%%esp) \n\t" + "mov %%eax, %c[rax](%0) \n\t" + "mov %%ebx, %c[rbx](%0) \n\t" + "pushl (%%esp); popl %c[rcx](%0) \n\t" + "mov %%edx, %c[rdx](%0) \n\t" + "mov %%esi, %c[rsi](%0) \n\t" + "mov %%edi, %c[rdi](%0) \n\t" + "mov %%ebp, %c[rbp](%0) \n\t" + "mov %%cr2, %%eax \n\t" + "mov %%eax, %c[cr2](%0) \n\t" + + "pop %%ebp; pop %%ebp; pop %%edx \n\t" +#endif + "setbe %c[fail](%0) \n\t" + : : "c"(vmx), "d"((unsigned long)HOST_RSP), + [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), + [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), + [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), + [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), + [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), + [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), + [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), +#ifdef CONFIG_X86_64 + [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), + [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), + [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), + [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), + [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), + [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), + [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), + [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), +#endif + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rbx", "rdi", "rsi" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "ebx", "edi", "rsi" +#endif + ); + + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); + + vcpu->arch.interrupt_window_open = + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + + asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + vmx->launched = 1; + + intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* We need to handle NMIs before interrupts are enabled */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + asm("int $2"); +} + +static void vmx_free_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (vmx->vmcs) { + on_each_cpu(__vcpu_clear, vmx, 0, 1); + free_vmcs(vmx->vmcs); + vmx->vmcs = NULL; + } +} + +static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx_free_vmcs(vcpu); + kfree(vmx->host_msrs); + kfree(vmx->guest_msrs); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vmx); +} + +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +{ + int err; + struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + int cpu; + + if (!vmx) + return ERR_PTR(-ENOMEM); + + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + if (err) + goto free_vcpu; + + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->guest_msrs) { + err = -ENOMEM; + goto uninit_vcpu; + } + + vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!vmx->host_msrs) + goto free_guest_msrs; + + vmx->vmcs = alloc_vmcs(); + if (!vmx->vmcs) + goto free_msrs; + + vmcs_clear(vmx->vmcs); + + cpu = get_cpu(); + vmx_vcpu_load(&vmx->vcpu, cpu); + err = vmx_vcpu_setup(vmx); + vmx_vcpu_put(&vmx->vcpu); + put_cpu(); + if (err) + goto free_vmcs; + + return &vmx->vcpu; + +free_vmcs: + free_vmcs(vmx->vmcs); +free_msrs: + kfree(vmx->host_msrs); +free_guest_msrs: + kfree(vmx->guest_msrs); +uninit_vcpu: + kvm_vcpu_uninit(&vmx->vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vmx); + return ERR_PTR(err); +} + +static void __init vmx_check_processor_compat(void *rtn) +{ + struct vmcs_config vmcs_conf; + + *(int *)rtn = 0; + if (setup_vmcs_config(&vmcs_conf) < 0) + *(int *)rtn = -EIO; + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", + smp_processor_id()); + *(int *)rtn = -EIO; + } +} + +static struct kvm_x86_ops vmx_x86_ops = { + .cpu_has_kvm_support = cpu_has_kvm_support, + .disabled_by_bios = vmx_disabled_by_bios, + .hardware_setup = hardware_setup, + .hardware_unsetup = hardware_unsetup, + .check_processor_compatibility = vmx_check_processor_compat, + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_guest_switch = vmx_save_host_state, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + .vcpu_decache = vmx_vcpu_decache, + + .set_guest_debug = set_guest_debug, + .guest_debug_pre = kvm_guest_debug_pre, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, + .set_cr0 = vmx_set_cr0, + .set_cr3 = vmx_set_cr3, + .set_cr4 = vmx_set_cr4, +#ifdef CONFIG_X86_64 + .set_efer = vmx_set_efer, +#endif + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .cache_regs = vcpu_load_rsp_rip, + .decache_regs = vcpu_put_rsp_rip, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + + .run = vmx_vcpu_run, + .handle_exit = kvm_handle_exit, + .skip_emulated_instruction = skip_emulated_instruction, + .patch_hypercall = vmx_patch_hypercall, + .get_irq = vmx_get_irq, + .set_irq = vmx_inject_irq, + .queue_exception = vmx_queue_exception, + .exception_injected = vmx_exception_injected, + .inject_pending_irq = vmx_intr_assist, + .inject_pending_vectors = do_interrupt_requests, + + .set_tss_addr = vmx_set_tss_addr, +}; + +static int __init vmx_init(void) +{ + void *iova; + int r; + + vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_a) + return -ENOMEM; + + vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_io_bitmap_b) { + r = -ENOMEM; + goto out; + } + + /* + * Allow direct access to the PC debug port (it is often used for I/O + * delays, but the vmexits simply slow things down). + */ + iova = kmap(vmx_io_bitmap_a); + memset(iova, 0xff, PAGE_SIZE); + clear_bit(0x80, iova); + kunmap(vmx_io_bitmap_a); + + iova = kmap(vmx_io_bitmap_b); + memset(iova, 0xff, PAGE_SIZE); + kunmap(vmx_io_bitmap_b); + + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + if (r) + goto out1; + + if (bypass_guest_pf) + kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); + + return 0; + +out1: + __free_page(vmx_io_bitmap_b); +out: + __free_page(vmx_io_bitmap_a); + return r; +} + +static void __exit vmx_exit(void) +{ + __free_page(vmx_io_bitmap_b); + __free_page(vmx_io_bitmap_a); + + kvm_exit(); +} + +module_init(vmx_init) +module_exit(vmx_exit) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h new file mode 100644 index 00000000000..d52ae8d7303 --- /dev/null +++ b/arch/x86/kvm/vmx.h @@ -0,0 +1,324 @@ +#ifndef VMX_H +#define VMX_H + +/* + * vmx.h: VMX Architecture related definitions + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * A few random additions are: + * Copyright (C) 2006 Qumranet + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + */ + +/* + * Definitions of Primary Processor-Based VM-Execution Controls. + */ +#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_HLT_EXITING 0x00000080 +#define CPU_BASED_INVLPG_EXITING 0x00000200 +#define CPU_BASED_MWAIT_EXITING 0x00000400 +#define CPU_BASED_RDPMC_EXITING 0x00000800 +#define CPU_BASED_RDTSC_EXITING 0x00001000 +#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 +#define CPU_BASED_CR8_STORE_EXITING 0x00100000 +#define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_MOV_DR_EXITING 0x00800000 +#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 +#define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_USE_MSR_BITMAPS 0x10000000 +#define CPU_BASED_MONITOR_EXITING 0x20000000 +#define CPU_BASED_PAUSE_EXITING 0x40000000 +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 +/* + * Definitions of Secondary Processor-Based VM-Execution Controls. + */ +#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 + + +#define PIN_BASED_EXT_INTR_MASK 0x00000001 +#define PIN_BASED_NMI_EXITING 0x00000008 +#define PIN_BASED_VIRTUAL_NMIS 0x00000020 + +#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 + +#define VM_ENTRY_IA32E_MODE 0x00000200 +#define VM_ENTRY_SMM 0x00000400 +#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 + +/* VMCS Encodings */ +enum vmcs_field { + GUEST_ES_SELECTOR = 0x00000800, + GUEST_CS_SELECTOR = 0x00000802, + GUEST_SS_SELECTOR = 0x00000804, + GUEST_DS_SELECTOR = 0x00000806, + GUEST_FS_SELECTOR = 0x00000808, + GUEST_GS_SELECTOR = 0x0000080a, + GUEST_LDTR_SELECTOR = 0x0000080c, + GUEST_TR_SELECTOR = 0x0000080e, + HOST_ES_SELECTOR = 0x00000c00, + HOST_CS_SELECTOR = 0x00000c02, + HOST_SS_SELECTOR = 0x00000c04, + HOST_DS_SELECTOR = 0x00000c06, + HOST_FS_SELECTOR = 0x00000c08, + HOST_GS_SELECTOR = 0x00000c0a, + HOST_TR_SELECTOR = 0x00000c0c, + IO_BITMAP_A = 0x00002000, + IO_BITMAP_A_HIGH = 0x00002001, + IO_BITMAP_B = 0x00002002, + IO_BITMAP_B_HIGH = 0x00002003, + MSR_BITMAP = 0x00002004, + MSR_BITMAP_HIGH = 0x00002005, + VM_EXIT_MSR_STORE_ADDR = 0x00002006, + VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, + VM_EXIT_MSR_LOAD_ADDR = 0x00002008, + VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, + VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, + VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, + TSC_OFFSET = 0x00002010, + TSC_OFFSET_HIGH = 0x00002011, + VIRTUAL_APIC_PAGE_ADDR = 0x00002012, + VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, + APIC_ACCESS_ADDR = 0x00002014, + APIC_ACCESS_ADDR_HIGH = 0x00002015, + VMCS_LINK_POINTER = 0x00002800, + VMCS_LINK_POINTER_HIGH = 0x00002801, + GUEST_IA32_DEBUGCTL = 0x00002802, + GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + PIN_BASED_VM_EXEC_CONTROL = 0x00004000, + CPU_BASED_VM_EXEC_CONTROL = 0x00004002, + EXCEPTION_BITMAP = 0x00004004, + PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, + PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, + CR3_TARGET_COUNT = 0x0000400a, + VM_EXIT_CONTROLS = 0x0000400c, + VM_EXIT_MSR_STORE_COUNT = 0x0000400e, + VM_EXIT_MSR_LOAD_COUNT = 0x00004010, + VM_ENTRY_CONTROLS = 0x00004012, + VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, + VM_ENTRY_INTR_INFO_FIELD = 0x00004016, + VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, + VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, + TPR_THRESHOLD = 0x0000401c, + SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + VM_INSTRUCTION_ERROR = 0x00004400, + VM_EXIT_REASON = 0x00004402, + VM_EXIT_INTR_INFO = 0x00004404, + VM_EXIT_INTR_ERROR_CODE = 0x00004406, + IDT_VECTORING_INFO_FIELD = 0x00004408, + IDT_VECTORING_ERROR_CODE = 0x0000440a, + VM_EXIT_INSTRUCTION_LEN = 0x0000440c, + VMX_INSTRUCTION_INFO = 0x0000440e, + GUEST_ES_LIMIT = 0x00004800, + GUEST_CS_LIMIT = 0x00004802, + GUEST_SS_LIMIT = 0x00004804, + GUEST_DS_LIMIT = 0x00004806, + GUEST_FS_LIMIT = 0x00004808, + GUEST_GS_LIMIT = 0x0000480a, + GUEST_LDTR_LIMIT = 0x0000480c, + GUEST_TR_LIMIT = 0x0000480e, + GUEST_GDTR_LIMIT = 0x00004810, + GUEST_IDTR_LIMIT = 0x00004812, + GUEST_ES_AR_BYTES = 0x00004814, + GUEST_CS_AR_BYTES = 0x00004816, + GUEST_SS_AR_BYTES = 0x00004818, + GUEST_DS_AR_BYTES = 0x0000481a, + GUEST_FS_AR_BYTES = 0x0000481c, + GUEST_GS_AR_BYTES = 0x0000481e, + GUEST_LDTR_AR_BYTES = 0x00004820, + GUEST_TR_AR_BYTES = 0x00004822, + GUEST_INTERRUPTIBILITY_INFO = 0x00004824, + GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_SYSENTER_CS = 0x0000482A, + HOST_IA32_SYSENTER_CS = 0x00004c00, + CR0_GUEST_HOST_MASK = 0x00006000, + CR4_GUEST_HOST_MASK = 0x00006002, + CR0_READ_SHADOW = 0x00006004, + CR4_READ_SHADOW = 0x00006006, + CR3_TARGET_VALUE0 = 0x00006008, + CR3_TARGET_VALUE1 = 0x0000600a, + CR3_TARGET_VALUE2 = 0x0000600c, + CR3_TARGET_VALUE3 = 0x0000600e, + EXIT_QUALIFICATION = 0x00006400, + GUEST_LINEAR_ADDRESS = 0x0000640a, + GUEST_CR0 = 0x00006800, + GUEST_CR3 = 0x00006802, + GUEST_CR4 = 0x00006804, + GUEST_ES_BASE = 0x00006806, + GUEST_CS_BASE = 0x00006808, + GUEST_SS_BASE = 0x0000680a, + GUEST_DS_BASE = 0x0000680c, + GUEST_FS_BASE = 0x0000680e, + GUEST_GS_BASE = 0x00006810, + GUEST_LDTR_BASE = 0x00006812, + GUEST_TR_BASE = 0x00006814, + GUEST_GDTR_BASE = 0x00006816, + GUEST_IDTR_BASE = 0x00006818, + GUEST_DR7 = 0x0000681a, + GUEST_RSP = 0x0000681c, + GUEST_RIP = 0x0000681e, + GUEST_RFLAGS = 0x00006820, + GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, + GUEST_SYSENTER_ESP = 0x00006824, + GUEST_SYSENTER_EIP = 0x00006826, + HOST_CR0 = 0x00006c00, + HOST_CR3 = 0x00006c02, + HOST_CR4 = 0x00006c04, + HOST_FS_BASE = 0x00006c06, + HOST_GS_BASE = 0x00006c08, + HOST_TR_BASE = 0x00006c0a, + HOST_GDTR_BASE = 0x00006c0c, + HOST_IDTR_BASE = 0x00006c0e, + HOST_IA32_SYSENTER_ESP = 0x00006c10, + HOST_IA32_SYSENTER_EIP = 0x00006c12, + HOST_RSP = 0x00006c14, + HOST_RIP = 0x00006c16, +}; + +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 + +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_WBINVD 54 + +/* + * Interruption-information format + */ +#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ +#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ +#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ + +#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK +#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK +#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK + +#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ + +/* + * Exit Qualifications for MOV for Control Register Access + */ +#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ +#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ +#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ +#define LMSW_SOURCE_DATA_SHIFT 16 +#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ +#define REG_EAX (0 << 8) +#define REG_ECX (1 << 8) +#define REG_EDX (2 << 8) +#define REG_EBX (3 << 8) +#define REG_ESP (4 << 8) +#define REG_EBP (5 << 8) +#define REG_ESI (6 << 8) +#define REG_EDI (7 << 8) +#define REG_R8 (8 << 8) +#define REG_R9 (9 << 8) +#define REG_R10 (10 << 8) +#define REG_R11 (11 << 8) +#define REG_R12 (12 << 8) +#define REG_R13 (13 << 8) +#define REG_R14 (14 << 8) +#define REG_R15 (15 << 8) + +/* + * Exit Qualifications for MOV for Debug Register Access + */ +#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ +#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ +#define TYPE_MOV_TO_DR (0 << 4) +#define TYPE_MOV_FROM_DR (1 << 4) +#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ + + +/* segment AR */ +#define SEGMENT_AR_L_MASK (1 << 13) + +#define AR_TYPE_ACCESSES_MASK 1 +#define AR_TYPE_READABLE_MASK (1 << 1) +#define AR_TYPE_WRITEABLE_MASK (1 << 2) +#define AR_TYPE_CODE_MASK (1 << 3) +#define AR_TYPE_MASK 0x0f +#define AR_TYPE_BUSY_64_TSS 11 +#define AR_TYPE_BUSY_32_TSS 11 +#define AR_TYPE_BUSY_16_TSS 3 +#define AR_TYPE_LDT 2 + +#define AR_UNUSABLE_MASK (1 << 16) +#define AR_S_MASK (1 << 4) +#define AR_P_MASK (1 << 7) +#define AR_L_MASK (1 << 13) +#define AR_DB_MASK (1 << 14) +#define AR_G_MASK (1 << 15) +#define AR_DPL_SHIFT 5 +#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) + +#define AR_RESERVD_MASK 0xfffe0f00 + +#define MSR_IA32_VMX_BASIC 0x480 +#define MSR_IA32_VMX_PINBASED_CTLS 0x481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 +#define MSR_IA32_VMX_EXIT_CTLS 0x483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_VMX_MISC 0x485 +#define MSR_IA32_VMX_CR0_FIXED0 0x486 +#define MSR_IA32_VMX_CR0_FIXED1 0x487 +#define MSR_IA32_VMX_CR4_FIXED0 0x488 +#define MSR_IA32_VMX_CR4_FIXED1 0x489 +#define MSR_IA32_VMX_VMCS_ENUM 0x48a +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b + +#define MSR_IA32_FEATURE_CONTROL 0x3a +#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 +#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 + +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 + +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c new file mode 100644 index 00000000000..8f94a0b89df --- /dev/null +++ b/arch/x86/kvm/x86.c @@ -0,0 +1,3287 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * derived from drivers/kvm/kvm_main.c + * + * Copyright (C) 2006 Qumranet, Inc. + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/kvm_host.h> +#include "segment_descriptor.h" +#include "irq.h" +#include "mmu.h" + +#include <linux/kvm.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/mman.h> +#include <linux/highmem.h> + +#include <asm/uaccess.h> +#include <asm/msr.h> + +#define MAX_IO_MSRS 256 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) +#define EFER_RESERVED_BITS 0xfffffffffffff2fe + +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +struct kvm_x86_ops *kvm_x86_ops; + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "pf_fixed", VCPU_STAT(pf_fixed) }, + { "pf_guest", VCPU_STAT(pf_guest) }, + { "tlb_flush", VCPU_STAT(tlb_flush) }, + { "invlpg", VCPU_STAT(invlpg) }, + { "exits", VCPU_STAT(exits) }, + { "io_exits", VCPU_STAT(io_exits) }, + { "mmio_exits", VCPU_STAT(mmio_exits) }, + { "signal_exits", VCPU_STAT(signal_exits) }, + { "irq_window", VCPU_STAT(irq_window_exits) }, + { "halt_exits", VCPU_STAT(halt_exits) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "request_irq", VCPU_STAT(request_irq_exits) }, + { "irq_exits", VCPU_STAT(irq_exits) }, + { "host_state_reload", VCPU_STAT(host_state_reload) }, + { "efer_reload", VCPU_STAT(efer_reload) }, + { "fpu_reload", VCPU_STAT(fpu_reload) }, + { "insn_emulation", VCPU_STAT(insn_emulation) }, + { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, + { "mmu_pte_write", VM_STAT(mmu_pte_write) }, + { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, + { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, + { "mmu_flooded", VM_STAT(mmu_flooded) }, + { "mmu_recycled", VM_STAT(mmu_recycled) }, + { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { NULL } +}; + + +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct segment_descriptor *d; + unsigned long table_base; + unsigned long v; + + if (selector == 0) + return 0; + + asm("sgdt %0" : "=m"(gdt)); + table_base = gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector; + + asm("sldt %0" : "=g"(ldt_selector)); + table_base = segment_base(ldt_selector); + } + d = (struct segment_descriptor *)(table_base + (selector & ~7)); + v = d->base_low | ((unsigned long)d->base_mid << 16) | + ((unsigned long)d->base_high << 24); +#ifdef CONFIG_X86_64 + if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long) \ + ((struct segment_descriptor_64 *)d)->base_higher) << 32; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + +u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return vcpu->arch.apic_base; + else + return vcpu->arch.apic_base; +} +EXPORT_SYMBOL_GPL(kvm_get_apic_base); + +void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) +{ + /* TODO: reserve bits check */ + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_base(vcpu, data); + else + vcpu->arch.apic_base = data; +} +EXPORT_SYMBOL_GPL(kvm_set_apic_base); + +void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.nr = nr; +} +EXPORT_SYMBOL_GPL(kvm_queue_exception); + +void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, + u32 error_code) +{ + ++vcpu->stat.pf_guest; + if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { + printk(KERN_DEBUG "kvm: inject_page_fault:" + " double fault 0x%lx\n", addr); + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; + } + vcpu->arch.cr2 = addr; + kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); +} + +void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + WARN_ON(vcpu->arch.exception.pending); + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; +} +EXPORT_SYMBOL_GPL(kvm_queue_exception_e); + +static void __queue_exception(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); +} + +/* + * Load the pae pdptrs. Return true is they are all valid. + */ +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; + unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + int i; + int ret; + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + + down_read(¤t->mm->mmap_sem); + ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte)); + if (ret < 0) { + ret = 0; + goto out; + } + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + ret = 0; + goto out; + } + } + ret = 1; + + memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); +out: + up_read(¤t->mm->mmap_sem); + + return ret; +} + +static bool pdptrs_changed(struct kvm_vcpu *vcpu) +{ + u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + bool changed = true; + int r; + + if (is_long_mode(vcpu) || !is_pae(vcpu)) + return false; + + down_read(¤t->mm->mmap_sem); + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + if (r < 0) + goto out; + changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; +out: + up_read(¤t->mm->mmap_sem); + + return changed; +} + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (cr0 & CR0_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", + cr0, vcpu->arch.cr0); + kvm_inject_gp(vcpu, 0); + return; + } + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { + printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { + printk(KERN_DEBUG "set_cr0: #GP, set PG flag " + "and a clear PE flag\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 + if ((vcpu->arch.shadow_efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while PAE is disabled\n"); + kvm_inject_gp(vcpu, 0); + return; + } + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while CS.L == 1\n"); + kvm_inject_gp(vcpu, 0); + return; + + } + } else +#endif + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_DEBUG "set_cr0: #GP, pdptrs " + "reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + kvm_mmu_reset_context(vcpu); + return; +} +EXPORT_SYMBOL_GPL(set_cr0); + +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(lmsw); + +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { + printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " + "in long mode\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) + && !load_pdptrs(vcpu, vcpu->arch.cr3)) { + printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + if (cr4 & X86_CR4_VMXE) { + printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); + kvm_inject_gp(vcpu, 0); + return; + } + kvm_x86_ops->set_cr4(vcpu, cr4); + vcpu->arch.cr4 = cr4; + kvm_mmu_reset_context(vcpu); +} +EXPORT_SYMBOL_GPL(set_cr4); + +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_flush_tlb(vcpu); + return; + } + + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } else { + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + kvm_inject_gp(vcpu, 0); + return; + } + } + /* + * We don't check reserved bits in nonpae mode, because + * this isn't enforced, and VMware depends on this. + */ + } + + down_read(¤t->mm->mmap_sem); + /* + * Does the new cr3 value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + kvm_inject_gp(vcpu, 0); + else { + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + } + up_read(¤t->mm->mmap_sem); +} +EXPORT_SYMBOL_GPL(set_cr3); + +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); + kvm_inject_gp(vcpu, 0); + return; + } + if (irqchip_in_kernel(vcpu->kvm)) + kvm_lapic_set_tpr(vcpu, cr8); + else + vcpu->arch.cr8 = cr8; +} +EXPORT_SYMBOL_GPL(set_cr8); + +unsigned long get_cr8(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) + return kvm_lapic_get_cr8(vcpu); + else + return vcpu->arch.cr8; +} +EXPORT_SYMBOL_GPL(get_cr8); + +/* + * List of msr numbers which we expose to userspace through KVM_GET_MSRS + * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. + * + * This list is modified at module load time to reflect the + * capabilities of the host cpu. + */ +static u32 msrs_to_save[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_K6_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TIME_STAMP_COUNTER, +}; + +static unsigned num_msrs_to_save; + +static u32 emulated_msrs[] = { + MSR_IA32_MISC_ENABLE, +}; + +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + kvm_inject_gp(vcpu, 0); + return; + } + + if (is_paging(vcpu) + && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + kvm_inject_gp(vcpu, 0); + return; + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &= ~EFER_LMA; + efer |= vcpu->arch.shadow_efer & EFER_LMA; + + vcpu->arch.shadow_efer = efer; +} + +#endif + +/* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return kvm_set_msr(vcpu, index, *data); +} + + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + kvm_set_apic_base(vcpu, data); + break; + case MSR_IA32_MISC_ENABLE: + vcpu->arch.ia32_misc_enable_msr = data; + break; + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_msr_common); + + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data = 0; + break; + case 0xcd: /* fsb frequency */ + data = 3; + break; + case MSR_IA32_APICBASE: + data = kvm_get_apic_base(vcpu); + break; + case MSR_IA32_MISC_ENABLE: + data = vcpu->arch.ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data = vcpu->arch.shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_msr_common); + +/* + * Read or write a bunch of msrs. All parameters are kernel addresses. + * + * @return number of msrs set successfully. + */ +static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, + struct kvm_msr_entry *entries, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data)) +{ + int i; + + vcpu_load(vcpu); + + for (i = 0; i < msrs->nmsrs; ++i) + if (do_msr(vcpu, entries[i].index, &entries[i].data)) + break; + + vcpu_put(vcpu); + + return i; +} + +/* + * Read or write a bunch of msrs. Parameters are user addresses. + * + * @return number of msrs set successfully. + */ +static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, + int (*do_msr)(struct kvm_vcpu *vcpu, + unsigned index, u64 *data), + int writeback) +{ + struct kvm_msrs msrs; + struct kvm_msr_entry *entries; + int r, n; + unsigned size; + + r = -EFAULT; + if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + goto out; + + r = -E2BIG; + if (msrs.nmsrs >= MAX_IO_MSRS) + goto out; + + r = -ENOMEM; + size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; + entries = vmalloc(size); + if (!entries) + goto out; + + r = -EFAULT; + if (copy_from_user(entries, user_msrs->entries, size)) + goto out_free; + + r = n = __msr_io(vcpu, &msrs, entries, do_msr); + if (r < 0) + goto out_free; + + r = -EFAULT; + if (writeback && copy_to_user(user_msrs->entries, entries, size)) + goto out_free; + + r = n; + +out_free: + vfree(entries); +out: + return r; +} + +/* + * Make sure that a cpu that is being hot-unplugged does not have any vcpus + * cached on it. + */ +void decache_vcpus_on_cpu(int cpu) +{ + struct kvm *vm; + struct kvm_vcpu *vcpu; + int i; + + spin_lock(&kvm_lock); + list_for_each_entry(vm, &vm_list, vm_list) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = vm->vcpus[i]; + if (!vcpu) + continue; + /* + * If the vcpu is locked, then it is running on some + * other cpu and therefore it is not cached on the + * cpu in question. + * + * If it's not locked, check the last cpu it executed + * on. + */ + if (mutex_trylock(&vcpu->mutex)) { + if (vcpu->cpu == cpu) { + kvm_x86_ops->vcpu_decache(vcpu); + vcpu->cpu = -1; + } + mutex_unlock(&vcpu->mutex); + } + } + spin_unlock(&kvm_lock); +} + +int kvm_dev_ioctl_check_extension(long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_HLT: + case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: + case KVM_CAP_USER_MEMORY: + case KVM_CAP_SET_TSS_ADDR: + case KVM_CAP_EXT_CPUID: + r = 1; + break; + case KVM_CAP_VAPIC: + r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + break; + default: + r = 0; + break; + } + return r; + +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r; + + switch (ioctl) { + case KVM_GET_MSR_INDEX_LIST: { + struct kvm_msr_list __user *user_msr_list = argp; + struct kvm_msr_list msr_list; + unsigned n; + + r = -EFAULT; + if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + goto out; + n = msr_list.nmsrs; + msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + goto out; + r = -E2BIG; + if (n < num_msrs_to_save) + goto out; + r = -EFAULT; + if (copy_to_user(user_msr_list->indices, &msrs_to_save, + num_msrs_to_save * sizeof(u32))) + goto out; + if (copy_to_user(user_msr_list->indices + + num_msrs_to_save * sizeof(u32), + &emulated_msrs, + ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + goto out; + r = 0; + break; + } + default: + r = -EINVAL; + } +out: + return r; +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->vcpu_load(vcpu, cpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); +} + +static int is_efer_nx(void) +{ + u64 efer; + + rdmsrl(MSR_EFER, efer); + return efer & EFER_NX; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; + + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +/* when an old userspace process fills a new kernel module */ +static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; + return 0; + +out: + return r; +} + +static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) + goto out; + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | + bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | + bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); + const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | + bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | + bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | + bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | + bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | + bit(X86_FEATURE_PGE) | + bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | + bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | + bit(X86_FEATURE_SYSCALL) | + (bit(X86_FEATURE_NX) && is_efer_nx()) | +#ifdef CONFIG_X86_64 + bit(X86_FEATURE_LM) | +#endif + bit(X86_FEATURE_MMXEXT) | + bit(X86_FEATURE_3DNOWEXT) | + bit(X86_FEATURE_3DNOW); + const u32 kvm_supported_word3_x86_features = + bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); + const u32 kvm_supported_word6_x86_features = + bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); + + /* all func 2 cpuid_count() should be called on the same cpu */ + get_cpu(); + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xb); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + entry->ecx &= kvm_supported_word3_x86_features; + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 and 0xb have additional index. */ + case 4: { + int index, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + cache_type = entry[index - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xb: { + int index, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (index = 1; *nent < maxnent; ++index) { + level_type = entry[index - 1].ecx & 0xff; + if (!level_type) + break; + do_cpuid_1_ent(&entry[index], function, index); + entry[index].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + entry->ecx &= kvm_supported_word6_x86_features; + break; + } + put_cpu(); +} + +static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG; + u32 func; + + if (cpuid->nent < 1) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -EFAULT; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s) +{ + vcpu_load(vcpu); + memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); + kvm_apic_post_state_restore(vcpu); + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + if (irqchip_in_kernel(vcpu->kvm)) + return -ENXIO; + vcpu_load(vcpu); + + set_bit(irq->irq, vcpu->arch.irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); + + vcpu_put(vcpu); + + return 0; +} + +static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, + struct kvm_tpr_access_ctl *tac) +{ + if (tac->flags) + return -EINVAL; + vcpu->arch.tpr_access_reporting = !!tac->enabled; + return 0; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + int r; + + switch (ioctl) { + case KVM_GET_LAPIC: { + struct kvm_lapic_state lapic; + + memset(&lapic, 0, sizeof lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &lapic, sizeof lapic)) + goto out; + r = 0; + break; + } + case KVM_SET_LAPIC: { + struct kvm_lapic_state lapic; + + r = -EFAULT; + if (copy_from_user(&lapic, argp, sizeof lapic)) + goto out; + r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + if (r) + goto out; + r = 0; + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof irq)) + goto out; + r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); + if (r) + goto out; + r = 0; + break; + } + case KVM_SET_CPUID: { + struct kvm_cpuid __user *cpuid_arg = argp; + struct kvm_cpuid cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_SET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + break; + } + case KVM_GET_CPUID2: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + case KVM_GET_MSRS: + r = msr_io(vcpu, argp, kvm_get_msr, 1); + break; + case KVM_SET_MSRS: + r = msr_io(vcpu, argp, do_set_msr, 0); + break; + case KVM_TPR_ACCESS_REPORTING: { + struct kvm_tpr_access_ctl tac; + + r = -EFAULT; + if (copy_from_user(&tac, argp, sizeof tac)) + goto out; + r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &tac, sizeof tac)) + goto out; + r = 0; + break; + }; + case KVM_SET_VAPIC_ADDR: { + struct kvm_vapic_addr va; + + r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + goto out; + r = -EFAULT; + if (copy_from_user(&va, argp, sizeof va)) + goto out; + r = 0; + kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + break; + } + default: + r = -EINVAL; + } +out: + return r; +} + +static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) +{ + int ret; + + if (addr > (unsigned int)(-3 * PAGE_SIZE)) + return -1; + ret = kvm_x86_ops->set_tss_addr(kvm, addr); + return ret; +} + +static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, + u32 kvm_nr_mmu_pages) +{ + if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) + return -EINVAL; + + down_write(¤t->mm->mmap_sem); + + kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); + kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; + + up_write(¤t->mm->mmap_sem); + return 0; +} + +static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +{ + return kvm->arch.n_alloc_mmu_pages; +} + +gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i = 0; i < kvm->arch.naliases; ++i) { + alias = &kvm->arch.aliases[i]; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +/* + * Set a new alias region. Aliases map a portion of physical memory into + * another portion. This is useful for memory windows, for example the PC + * VGA region. + */ +static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, + struct kvm_memory_alias *alias) +{ + int r, n; + struct kvm_mem_alias *p; + + r = -EINVAL; + /* General sanity checks */ + if (alias->memory_size & (PAGE_SIZE - 1)) + goto out; + if (alias->guest_phys_addr & (PAGE_SIZE - 1)) + goto out; + if (alias->slot >= KVM_ALIAS_SLOTS) + goto out; + if (alias->guest_phys_addr + alias->memory_size + < alias->guest_phys_addr) + goto out; + if (alias->target_phys_addr + alias->memory_size + < alias->target_phys_addr) + goto out; + + down_write(¤t->mm->mmap_sem); + + p = &kvm->arch.aliases[alias->slot]; + p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; + p->npages = alias->memory_size >> PAGE_SHIFT; + p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + + for (n = KVM_ALIAS_SLOTS; n > 0; --n) + if (kvm->arch.aliases[n - 1].npages) + break; + kvm->arch.naliases = n; + + kvm_mmu_zap_all(kvm); + + up_write(¤t->mm->mmap_sem); + + return 0; + +out: + return r; +} + +static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&chip->chip.pic, + &pic_irqchip(kvm)->pics[0], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&chip->chip.pic, + &pic_irqchip(kvm)->pics[1], + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy(&chip->chip.ioapic, + ioapic_irqchip(kvm), + sizeof(struct kvm_ioapic_state)); + break; + default: + r = -EINVAL; + break; + } + return r; +} + +static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) +{ + int r; + + r = 0; + switch (chip->chip_id) { + case KVM_IRQCHIP_PIC_MASTER: + memcpy(&pic_irqchip(kvm)->pics[0], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_PIC_SLAVE: + memcpy(&pic_irqchip(kvm)->pics[1], + &chip->chip.pic, + sizeof(struct kvm_pic_state)); + break; + case KVM_IRQCHIP_IOAPIC: + memcpy(ioapic_irqchip(kvm), + &chip->chip.ioapic, + sizeof(struct kvm_ioapic_state)); + break; + default: + r = -EINVAL; + break; + } + kvm_pic_update_irq(pic_irqchip(kvm)); + return r; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + int r; + int n; + struct kvm_memory_slot *memslot; + int is_dirty = 0; + + down_write(¤t->mm->mmap_sem); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_flush_remote_tlbs(kvm); + memslot = &kvm->memslots[log->slot]; + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + memset(memslot->dirty_bitmap, 0, n); + } + r = 0; +out: + up_write(¤t->mm->mmap_sem); + return r; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm = filp->private_data; + void __user *argp = (void __user *)arg; + int r = -EINVAL; + + switch (ioctl) { + case KVM_SET_TSS_ADDR: + r = kvm_vm_ioctl_set_tss_addr(kvm, arg); + if (r < 0) + goto out; + break; + case KVM_SET_MEMORY_REGION: { + struct kvm_memory_region kvm_mem; + struct kvm_userspace_memory_region kvm_userspace_mem; + + r = -EFAULT; + if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) + goto out; + kvm_userspace_mem.slot = kvm_mem.slot; + kvm_userspace_mem.flags = kvm_mem.flags; + kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; + kvm_userspace_mem.memory_size = kvm_mem.memory_size; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); + if (r) + goto out; + break; + } + case KVM_SET_NR_MMU_PAGES: + r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); + if (r) + goto out; + break; + case KVM_GET_NR_MMU_PAGES: + r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); + break; + case KVM_SET_MEMORY_ALIAS: { + struct kvm_memory_alias alias; + + r = -EFAULT; + if (copy_from_user(&alias, argp, sizeof alias)) + goto out; + r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + if (r) + goto out; + break; + } + case KVM_CREATE_IRQCHIP: + r = -ENOMEM; + kvm->arch.vpic = kvm_create_pic(kvm); + if (kvm->arch.vpic) { + r = kvm_ioapic_init(kvm); + if (r) { + kfree(kvm->arch.vpic); + kvm->arch.vpic = NULL; + goto out; + } + } else + goto out; + break; + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + if (irqchip_in_kernel(kvm)) { + mutex_lock(&kvm->lock); + if (irq_event.irq < 16) + kvm_pic_set_irq(pic_irqchip(kvm), + irq_event.irq, + irq_event.level); + kvm_ioapic_set_irq(kvm->arch.vioapic, + irq_event.irq, + irq_event.level); + mutex_unlock(&kvm->lock); + r = 0; + } + break; + } + case KVM_GET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &chip, sizeof chip)) + goto out; + r = 0; + break; + } + case KVM_SET_IRQCHIP: { + /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ + struct kvm_irqchip chip; + + r = -EFAULT; + if (copy_from_user(&chip, argp, sizeof chip)) + goto out; + r = -ENXIO; + if (!irqchip_in_kernel(kvm)) + goto out; + r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + if (r) + goto out; + r = 0; + break; + } + case KVM_GET_SUPPORTED_CPUID: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + goto out; + r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + goto out; + r = 0; + break; + } + default: + ; + } +out: + return r; +} + +static void kvm_init_msr_list(void) +{ + u32 dummy[2]; + unsigned i, j; + + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + continue; + if (j < i) + msrs_to_save[j] = msrs_to_save[i]; + j++; + } + num_msrs_to_save = j; +} + +/* + * Only apic need an MMIO device hook, so shortcut now.. + */ +static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + struct kvm_io_device *dev; + + if (vcpu->arch.apic) { + dev = &vcpu->arch.apic->dev; + if (dev->in_range(dev, addr)) + return dev; + } + return NULL; +} + + +static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + struct kvm_io_device *dev; + + dev = vcpu_find_pervcpu_dev(vcpu, addr); + if (dev == NULL) + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + return dev; +} + +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data = val; + int r = X86EMUL_CONTINUE; + + down_read(¤t->mm->mmap_sem); + while (bytes) { + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + unsigned offset = addr & (PAGE_SIZE-1); + unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); + int ret; + + if (gpa == UNMAPPED_GVA) { + r = X86EMUL_PROPAGATE_FAULT; + goto out; + } + ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); + if (ret < 0) { + r = X86EMUL_UNHANDLEABLE; + goto out; + } + + bytes -= tocopy; + data += tocopy; + addr += tocopy; + } +out: + up_read(¤t->mm->mmap_sem); + return r; +} +EXPORT_SYMBOL_GPL(emulator_read_std); + +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + struct kvm_io_device *mmio_dev; + gpa_t gpa; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed = 0; + return X86EMUL_CONTINUE; + } + + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(¤t->mm->mmap_sem); + + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_read_std(addr, val, bytes, vcpu) + == X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + +mmio: + /* + * Is this MMIO handled locally? + */ + mutex_lock(&vcpu->kvm->lock); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); + return X86EMUL_CONTINUE; + } + mutex_unlock(&vcpu->kvm->lock); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 0; + + return X86EMUL_UNHANDLEABLE; +} + +static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + int ret; + + down_read(¤t->mm->mmap_sem); + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) { + up_read(¤t->mm->mmap_sem); + return 0; + } + kvm_mmu_pte_write(vcpu, gpa, val, bytes); + up_read(¤t->mm->mmap_sem); + return 1; +} + +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + struct kvm_io_device *mmio_dev; + gpa_t gpa; + + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + up_read(¤t->mm->mmap_sem); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, 2); + return X86EMUL_PROPAGATE_FAULT; + } + + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto mmio; + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + +mmio: + /* + * Is this MMIO handled locally? + */ + mutex_lock(&vcpu->kvm->lock); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + mutex_unlock(&vcpu->kvm->lock); + return X86EMUL_CONTINUE; + } + mutex_unlock(&vcpu->kvm->lock); + + vcpu->mmio_needed = 1; + vcpu->mmio_phys_addr = gpa; + vcpu->mmio_size = bytes; + vcpu->mmio_is_write = 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; +} + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now = -addr & ~PAGE_MASK; + rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + addr += now; + val += now; + bytes -= now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); +} +EXPORT_SYMBOL_GPL(emulator_write_emulated); + +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + static int reported; + + if (!reported) { + reported = 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } +#ifndef CONFIG_X86_64 + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes == 8) { + gpa_t gpa; + struct page *page; + char *addr; + u64 val; + + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; + + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; + + val = *(u64 *)new; + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + addr = kmap_atomic(page, KM_USER0); + set_64bit((u64 *)(addr + offset_in_page(gpa)), val); + kunmap_atomic(addr, KM_USER0); + kvm_release_page_dirty(page); + emul_write: + up_read(¤t->mm->mmap_sem); + } +#endif + + return emulator_write_emulated(addr, new, bytes, vcpu); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + return X86EMUL_CONTINUE; +} + +int emulate_clts(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + return X86EMUL_CONTINUE; +} + +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +{ + struct kvm_vcpu *vcpu = ctxt->vcpu; + + switch (dr) { + case 0 ... 3: + *dest = kvm_x86_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; + } +} + +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +{ + unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; + int exception; + + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} + +void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip = vcpu->arch.rip; + unsigned long rip_linear; + + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); + + if (reported) + return; + + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); + + printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", + context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported = 1; +} +EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); + +struct x86_emulate_ops emulate_ops = { + .read_std = emulator_read_std, + .read_emulated = emulator_read_emulated, + .write_emulated = emulator_write_emulated, + .cmpxchg_emulated = emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code, + int emulation_type) +{ + int r; + struct decode_cache *c; + + vcpu->arch.mmio_fault_cr2 = cr2; + kvm_x86_ops->cache_regs(vcpu); + + vcpu->mmio_is_write = 0; + vcpu->arch.pio.string = 0; + + if (!(emulation_type & EMULTYPE_NO_DECODE)) { + int cs_db, cs_l; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.mode = + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { + vcpu->arch.emulate_ctxt.cs_base = 0; + vcpu->arch.emulate_ctxt.ds_base = 0; + vcpu->arch.emulate_ctxt.es_base = 0; + vcpu->arch.emulate_ctxt.ss_base = 0; + } else { + vcpu->arch.emulate_ctxt.cs_base = + get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.emulate_ctxt.ds_base = + get_segment_base(vcpu, VCPU_SREG_DS); + vcpu->arch.emulate_ctxt.es_base = + get_segment_base(vcpu, VCPU_SREG_ES); + vcpu->arch.emulate_ctxt.ss_base = + get_segment_base(vcpu, VCPU_SREG_SS); + } + + vcpu->arch.emulate_ctxt.gs_base = + get_segment_base(vcpu, VCPU_SREG_GS); + vcpu->arch.emulate_ctxt.fs_base = + get_segment_base(vcpu, VCPU_SREG_FS); + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + /* Reject the instructions other than VMCALL/VMMCALL when + * try to emulate invalid opcode */ + c = &vcpu->arch.emulate_ctxt.decode; + if ((emulation_type & EMULTYPE_TRAP_UD) && + (!(c->twobyte && c->b == 0x01 && + (c->modrm_reg == 0 || c->modrm_reg == 3) && + c->modrm_mod == 3 && c->modrm_rm == 1))) + return EMULATE_FAIL; + + ++vcpu->stat.insn_emulation; + if (r) { + ++vcpu->stat.insn_emulation_fail; + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + return EMULATE_FAIL; + } + } + + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + + if (vcpu->arch.pio.string) + return EMULATE_DO_MMIO; + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len = vcpu->mmio_size; + run->mmio.is_write = vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +static void free_pio_guest_pages(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) + if (vcpu->arch.pio.guest_pages[i]) { + kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); + vcpu->arch.pio.guest_pages[i] = NULL; + } +} + +static int pio_copy_data(struct kvm_vcpu *vcpu) +{ + void *p = vcpu->arch.pio_data; + void *q; + unsigned bytes; + int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; + + q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, + PAGE_KERNEL); + if (!q) { + free_pio_guest_pages(vcpu); + return -ENOMEM; + } + q += vcpu->arch.pio.guest_page_offset; + bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; + if (vcpu->arch.pio.in) + memcpy(q, p, bytes); + else + memcpy(p, q, bytes); + q -= vcpu->arch.pio.guest_page_offset; + vunmap(q); + free_pio_guest_pages(vcpu); + return 0; +} + +int complete_pio(struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->arch.pio; + long delta; + int r; + + kvm_x86_ops->cache_regs(vcpu); + + if (!io->string) { + if (io->in) + memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, + io->size); + } else { + if (io->in) { + r = pio_copy_data(vcpu); + if (r) { + kvm_x86_ops->cache_regs(vcpu); + return r; + } + } + + delta = 1; + if (io->rep) { + delta *= io->cur_count; + /* + * The size of the register should really depend on + * current address size. + */ + vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + } + if (io->down) + delta = -delta; + delta *= io->size; + if (io->in) + vcpu->arch.regs[VCPU_REGS_RDI] += delta; + else + vcpu->arch.regs[VCPU_REGS_RSI] += delta; + } + + kvm_x86_ops->decache_regs(vcpu); + + io->count -= io->cur_count; + io->cur_count = 0; + + return 0; +} + +static void kernel_pio(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu, + void *pd) +{ + /* TODO: String I/O for in kernel device */ + + mutex_lock(&vcpu->kvm->lock); + if (vcpu->arch.pio.in) + kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, + vcpu->arch.pio.size, + pd); + else + kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, + vcpu->arch.pio.size, + pd); + mutex_unlock(&vcpu->kvm->lock); +} + +static void pio_string_write(struct kvm_io_device *pio_dev, + struct kvm_vcpu *vcpu) +{ + struct kvm_pio_request *io = &vcpu->arch.pio; + void *pd = vcpu->arch.pio_data; + int i; + + mutex_lock(&vcpu->kvm->lock); + for (i = 0; i < io->cur_count; i++) { + kvm_iodevice_write(pio_dev, io->port, + io->size, + pd); + pd += io->size; + } + mutex_unlock(&vcpu->kvm->lock); +} + +static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, + gpa_t addr) +{ + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); +} + +int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned port) +{ + struct kvm_io_device *pio_dev; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 0; + vcpu->arch.pio.down = 0; + vcpu->arch.pio.guest_page_offset = 0; + vcpu->arch.pio.rep = 0; + + kvm_x86_ops->cache_regs(vcpu); + memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + kvm_x86_ops->decache_regs(vcpu); + + kvm_x86_ops->skip_emulated_instruction(vcpu); + + pio_dev = vcpu_find_pio_dev(vcpu, port); + if (pio_dev) { + kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + complete_pio(vcpu); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio); + +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, + int size, unsigned long count, int down, + gva_t address, int rep, unsigned port) +{ + unsigned now, in_page; + int i, ret = 0; + int nr_pages = 1; + struct page *page; + struct kvm_io_device *pio_dev; + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; + vcpu->run->io.size = vcpu->arch.pio.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; + vcpu->run->io.port = vcpu->arch.pio.port = port; + vcpu->arch.pio.in = in; + vcpu->arch.pio.string = 1; + vcpu->arch.pio.down = down; + vcpu->arch.pio.guest_page_offset = offset_in_page(address); + vcpu->arch.pio.rep = rep; + + if (!count) { + kvm_x86_ops->skip_emulated_instruction(vcpu); + return 1; + } + + if (!down) + in_page = PAGE_SIZE - offset_in_page(address); + else + in_page = offset_in_page(address) + size; + now = min(count, (unsigned long)in_page / size); + if (!now) { + /* + * String I/O straddles page boundary. Pin two guest pages + * so that we satisfy atomicity constraints. Do just one + * transaction to avoid complexity. + */ + nr_pages = 2; + now = 1; + } + if (down) { + /* + * String I/O in reverse. Yuck. Kill the guest, fix later. + */ + pr_unimpl(vcpu, "guest string pio down\n"); + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->run->io.count = now; + vcpu->arch.pio.cur_count = now; + + if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) + kvm_x86_ops->skip_emulated_instruction(vcpu); + + for (i = 0; i < nr_pages; ++i) { + down_read(¤t->mm->mmap_sem); + page = gva_to_page(vcpu, address + i * PAGE_SIZE); + vcpu->arch.pio.guest_pages[i] = page; + up_read(¤t->mm->mmap_sem); + if (!page) { + kvm_inject_gp(vcpu, 0); + free_pio_guest_pages(vcpu); + return 1; + } + } + + pio_dev = vcpu_find_pio_dev(vcpu, port); + if (!vcpu->arch.pio.in) { + /* string PIO write */ + ret = pio_copy_data(vcpu); + if (ret >= 0 && pio_dev) { + pio_string_write(pio_dev, vcpu); + complete_pio(vcpu); + if (vcpu->arch.pio.count == 0) + ret = 1; + } + } else if (pio_dev) + pr_unimpl(vcpu, "no string pio read support yet, " + "port %x size %d count %ld\n", + port, size, count); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); + +int kvm_arch_init(void *opaque) +{ + int r; + struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; + + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + r = -EEXIST; + goto out; + } + + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + r = -EOPNOTSUPP; + goto out; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + r = -EOPNOTSUPP; + goto out; + } + + r = kvm_mmu_module_init(); + if (r) + goto out; + + kvm_init_msr_list(); + + kvm_x86_ops = ops; + kvm_mmu_set_nonpresent_ptes(0ull, 0ull); + return 0; + +out: + return r; +} + +void kvm_arch_exit(void) +{ + kvm_x86_ops = NULL; + kvm_mmu_module_exit(); +} + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.halt_exits; + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; + kvm_vcpu_block(vcpu); + if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) + return -EINTR; + return 1; + } else { + vcpu->run->exit_reason = KVM_EXIT_HLT; + return 0; + } +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + + kvm_x86_ops->cache_regs(vcpu); + + nr = vcpu->arch.regs[VCPU_REGS_RAX]; + a0 = vcpu->arch.regs[VCPU_REGS_RBX]; + a1 = vcpu->arch.regs[VCPU_REGS_RCX]; + a2 = vcpu->arch.regs[VCPU_REGS_RDX]; + a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + + if (!is_long_mode(vcpu)) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + default: + ret = -KVM_ENOSYS; + break; + } + vcpu->arch.regs[VCPU_REGS_RAX] = ret; + kvm_x86_ops->decache_regs(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); + +int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +{ + char instruction[3]; + int ret = 0; + + + /* + * Blow out the MMU to ensure that no other VCPU has an active mapping + * to ensure that the updated hypercall appears atomically across all + * VCPUs. + */ + kvm_mmu_zap_all(vcpu->kvm); + + kvm_x86_ops->cache_regs(vcpu); + kvm_x86_ops->patch_hypercall(vcpu, instruction); + if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + != X86EMUL_CONTINUE) + ret = -EFAULT; + + return ret; +} + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_x86_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt = { limit, base }; + + kvm_x86_ops->set_idt(vcpu, &dt); +} + +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + lmsw(vcpu, msw); + *rflags = kvm_x86_ops->get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + switch (cr) { + case 0: + return vcpu->arch.cr0; + case 2: + return vcpu->arch.cr2; + case 3: + return vcpu->arch.cr3; + case 4: + return vcpu->arch.cr4; + case 8: + return get_cr8(vcpu); + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} + +void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + *rflags = kvm_x86_ops->get_rflags(vcpu); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + break; + case 8: + set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + } +} + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; j == i; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + u32 function, u32 index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function, index; + struct kvm_cpuid_entry2 *e, *best; + + kvm_x86_ops->cache_regs(vcpu); + function = vcpu->arch.regs[VCPU_REGS_RAX]; + index = vcpu->arch.regs[VCPU_REGS_RCX]; + vcpu->arch.regs[VCPU_REGS_RAX] = 0; + vcpu->arch.regs[VCPU_REGS_RBX] = 0; + vcpu->arch.regs[VCPU_REGS_RCX] = 0; + vcpu->arch.regs[VCPU_REGS_RDX] = 0; + best = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) == 0) + if (!best || e->function > best->function) + best = e; + } + if (best) { + vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; + vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; + vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; + vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + } + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + +/* + * Check if userspace requested an interrupt window, and that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + return (!vcpu->arch.irq_summary && + kvm_run->request_interrupt_window && + vcpu->arch.interrupt_window_open && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); +} + +static void post_kvm_run_save(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->cr8 = get_cr8(vcpu); + kvm_run->apic_base = kvm_get_apic_base(vcpu); + if (irqchip_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = 1; + else + kvm_run->ready_for_interrupt_injection = + (vcpu->arch.interrupt_window_open && + vcpu->arch.irq_summary == 0); +} + +static void vapic_enter(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct page *page; + + if (!apic || !apic->vapic_addr) + return; + + down_read(¤t->mm->mmap_sem); + page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + vcpu->arch.apic->vapic_page = page; + up_read(¤t->mm->mmap_sem); +} + +static void vapic_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!apic || !apic->vapic_addr) + return; + + kvm_release_page_dirty(apic->vapic_page); + mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); +} + +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + } + + vapic_enter(vcpu); + +preempted: + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); + +again: + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + + if (vcpu->requests) { + if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + __kvm_migrate_apic_timer(vcpu); + if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, + &vcpu->requests)) { + kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; + r = 0; + goto out; + } + } + + kvm_inject_pending_timer_irqs(vcpu); + + preempt_disable(); + + kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_load_guest_fpu(vcpu); + + local_irq_disable(); + + if (need_resched()) { + local_irq_enable(); + preempt_enable(); + r = 1; + goto out; + } + + if (signal_pending(current)) { + local_irq_enable(); + preempt_enable(); + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + goto out; + } + + if (vcpu->arch.exception.pending) + __queue_exception(vcpu); + else if (irqchip_in_kernel(vcpu->kvm)) + kvm_x86_ops->inject_pending_irq(vcpu); + else + kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); + + kvm_lapic_sync_to_vapic(vcpu); + + vcpu->guest_mode = 1; + kvm_guest_enter(); + + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); + + kvm_x86_ops->run(vcpu, kvm_run); + + vcpu->guest_mode = 0; + local_irq_enable(); + + ++vcpu->stat.exits; + + /* + * We must have an instruction between local_irq_enable() and + * kvm_guest_exit(), so the timer interrupt isn't delayed by + * the interrupt shadow. The stat.exits increment will do nicely. + * But we need to prevent reordering, hence this barrier(): + */ + barrier(); + + kvm_guest_exit(); + + preempt_enable(); + + /* + * Profile KVM exit RIPs: + */ + if (unlikely(prof_on == KVM_PROFILING)) { + kvm_x86_ops->cache_regs(vcpu); + profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); + } + + if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) + vcpu->arch.exception.pending = false; + + kvm_lapic_sync_from_vapic(vcpu); + + r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + goto out; + } + if (!need_resched()) + goto again; + } + +out: + if (r > 0) { + kvm_resched(vcpu); + goto preempted; + } + + post_kvm_run_save(vcpu, kvm_run); + + vapic_exit(vcpu); + + return r; +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + vcpu_load(vcpu); + + if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + kvm_vcpu_block(vcpu); + vcpu_put(vcpu); + return -EAGAIN; + } + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + /* re-sync apic's tpr */ + if (!irqchip_in_kernel(vcpu->kvm)) + set_cr8(vcpu, kvm_run->cr8); + + if (vcpu->arch.pio.cur_count) { + r = complete_pio(vcpu); + if (r) + goto out; + } +#if CONFIG_HAS_IOMEM + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + r = emulate_instruction(vcpu, kvm_run, + vcpu->arch.mmio_fault_cr2, 0, + EMULTYPE_NO_DECODE); + if (r == EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. + */ + r = 0; + goto out; + } + } +#endif + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { + kvm_x86_ops->cache_regs(vcpu); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; + kvm_x86_ops->decache_regs(vcpu); + } + + r = __vcpu_run(vcpu, kvm_run); + +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return r; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_x86_ops->cache_regs(vcpu); + + regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; + regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; + regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; + regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; + regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; + regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; + regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; +#ifdef CONFIG_X86_64 + regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; + regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; + regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; + regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; + regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; + regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; + regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; + regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; +#endif + + regs->rip = vcpu->arch.rip; + regs->rflags = kvm_x86_ops->get_rflags(vcpu); + + /* + * Don't leak debug flags in case they were set for guest debugging + */ + if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; + vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; + vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; + vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; + vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; + vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; + vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; + vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; +#ifdef CONFIG_X86_64 + vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; + vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; + vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; + vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; + vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; + vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; + vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; + vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; +#endif + + vcpu->arch.rip = regs->rip; + kvm_x86_ops->set_rflags(vcpu, regs->rflags); + + kvm_x86_ops->decache_regs(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +static void get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_x86_ops->get_segment(vcpu, var, seg); +} + +void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} +EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + int pending_vec; + + vcpu_load(vcpu); + + get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit = dt.limit; + sregs->idt.base = dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit = dt.limit; + sregs->gdt.base = dt.base; + + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + sregs->cr0 = vcpu->arch.cr0; + sregs->cr2 = vcpu->arch.cr2; + sregs->cr3 = vcpu->arch.cr3; + sregs->cr4 = vcpu->arch.cr4; + sregs->cr8 = get_cr8(vcpu); + sregs->efer = vcpu->arch.shadow_efer; + sregs->apic_base = kvm_get_apic_base(vcpu); + + if (irqchip_in_kernel(vcpu->kvm)) { + memset(sregs->interrupt_bitmap, 0, + sizeof sregs->interrupt_bitmap); + pending_vec = kvm_x86_ops->get_irq(vcpu); + if (pending_vec >= 0) + set_bit(pending_vec, + (unsigned long *)sregs->interrupt_bitmap); + } else + memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, + sizeof sregs->interrupt_bitmap); + + vcpu_put(vcpu); + + return 0; +} + +static void set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_x86_ops->set_segment(vcpu, var, seg); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int mmu_reset_needed = 0; + int i, pending_vec, max_bits; + struct descriptor_table dt; + + vcpu_load(vcpu); + + dt.limit = sregs->idt.limit; + dt.base = sregs->idt.base; + kvm_x86_ops->set_idt(vcpu, &dt); + dt.limit = sregs->gdt.limit; + dt.base = sregs->gdt.base; + kvm_x86_ops->set_gdt(vcpu, &dt); + + vcpu->arch.cr2 = sregs->cr2; + mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + vcpu->arch.cr3 = sregs->cr3; + + set_cr8(vcpu, sregs->cr8); + + mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; +#ifdef CONFIG_X86_64 + kvm_x86_ops->set_efer(vcpu, sregs->efer); +#endif + kvm_set_apic_base(vcpu, sregs->apic_base); + + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + + mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + vcpu->arch.cr0 = sregs->cr0; + kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + + mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (!is_long_mode(vcpu) && is_pae(vcpu)) + load_pdptrs(vcpu, vcpu->arch.cr3); + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + if (!irqchip_in_kernel(vcpu->kvm)) { + memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, + sizeof vcpu->arch.irq_pending); + vcpu->arch.irq_summary = 0; + for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) + if (vcpu->arch.irq_pending[i]) + __set_bit(i, &vcpu->arch.irq_summary); + } else { + max_bits = (sizeof sregs->interrupt_bitmap) << 3; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, + max_bits); + /* Only pending external irq is handled here */ + if (pending_vec < max_bits) { + kvm_x86_ops->set_irq(vcpu, pending_vec); + pr_debug("Set back pending irq %d\n", + pending_vec); + } + } + + set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, + struct kvm_debug_guest *dbg) +{ + int r; + + vcpu_load(vcpu); + + r = kvm_x86_ops->set_guest_debug(vcpu, dbg); + + vcpu_put(vcpu); + + return r; +} + +/* + * fxsave fpu state. Taken from x86_64/processor.h. To be killed when + * we have asm/x86/processor.h + */ +struct fxsave { + u16 cwd; + u16 swd; + u16 twd; + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +#ifdef CONFIG_X86_64 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ +#else + u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ +#endif +}; + +/* + * Translate a guest virtual address to a guest physical address. + */ +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + unsigned long vaddr = tr->linear_address; + gpa_t gpa; + + vcpu_load(vcpu); + down_read(¤t->mm->mmap_sem); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); + up_read(¤t->mm->mmap_sem); + tr->physical_address = gpa; + tr->valid = gpa != UNMAPPED_GVA; + tr->writeable = 1; + tr->usermode = 0; + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fpu->fpr, fxsave->st_space, 128); + fpu->fcw = fxsave->cwd; + fpu->fsw = fxsave->swd; + fpu->ftwx = fxsave->twd; + fpu->last_opcode = fxsave->fop; + fpu->last_ip = fxsave->rip; + fpu->last_dp = fxsave->rdp; + memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + + vcpu_load(vcpu); + + memcpy(fxsave->st_space, fpu->fpr, 128); + fxsave->cwd = fpu->fcw; + fxsave->swd = fpu->fsw; + fxsave->twd = fpu->ftwx; + fxsave->fop = fpu->last_opcode; + fxsave->rip = fpu->last_ip; + fxsave->rdp = fpu->last_dp; + memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + + vcpu_put(vcpu); + + return 0; +} + +void fx_init(struct kvm_vcpu *vcpu) +{ + unsigned after_mxcsr_mask; + + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->arch.host_fx_image); + fpu_init(); + fx_save(&vcpu->arch.guest_fx_image); + fx_restore(&vcpu->arch.host_fx_image); + preempt_enable(); + + vcpu->arch.cr0 |= X86_CR0_ET; + after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); + vcpu->arch.guest_fx_image.mxcsr = 0x1f80; + memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); +} +EXPORT_SYMBOL_GPL(fx_init); + +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 1; + fx_save(&vcpu->arch.host_fx_image); + fx_restore(&vcpu->arch.guest_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); + +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded = 0; + fx_save(&vcpu->arch.guest_fx_image); + fx_restore(&vcpu->arch.host_fx_image); + ++vcpu->stat.fpu_reload; +} +EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->vcpu_free(vcpu); +} + +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, + unsigned int id) +{ + return kvm_x86_ops->vcpu_create(kvm, id); +} + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + int r; + + /* We do fxsave: this must be aligned. */ + BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + + vcpu_load(vcpu); + r = kvm_arch_vcpu_reset(vcpu); + if (r == 0) + r = kvm_mmu_setup(vcpu); + vcpu_put(vcpu); + if (r < 0) + goto free_vcpu; + + return 0; +free_vcpu: + kvm_x86_ops->vcpu_free(vcpu); + return r; +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); + + kvm_x86_ops->vcpu_free(vcpu); +} + +int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->vcpu_reset(vcpu); +} + +void kvm_arch_hardware_enable(void *garbage) +{ + kvm_x86_ops->hardware_enable(garbage); +} + +void kvm_arch_hardware_disable(void *garbage) +{ + kvm_x86_ops->hardware_disable(garbage); +} + +int kvm_arch_hardware_setup(void) +{ + return kvm_x86_ops->hardware_setup(); +} + +void kvm_arch_hardware_unsetup(void) +{ + kvm_x86_ops->hardware_unsetup(); +} + +void kvm_arch_check_processor_compat(void *rtn) +{ + kvm_x86_ops->check_processor_compatibility(rtn); +} + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + struct page *page; + struct kvm *kvm; + int r; + + BUG_ON(vcpu->kvm == NULL); + kvm = vcpu->kvm; + + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) + vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r = -ENOMEM; + goto fail; + } + vcpu->arch.pio_data = page_address(page); + + r = kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + if (irqchip_in_kernel(kvm)) { + r = kvm_create_lapic(vcpu); + if (r < 0) + goto fail_mmu_destroy; + } + + return 0; + +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail: + return r; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_free_lapic(vcpu); + kvm_mmu_destroy(vcpu); + free_page((unsigned long)vcpu->arch.pio_data); +} + +struct kvm *kvm_arch_create_vm(void) +{ + struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + + if (!kvm) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + + return kvm; +} + +static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_mmu_unload(vcpu); + vcpu_put(vcpu); +} + +static void kvm_free_vcpus(struct kvm *kvm) +{ + unsigned int i; + + /* + * Unpin any mmu pages first. + */ + for (i = 0; i < KVM_MAX_VCPUS; ++i) + if (kvm->vcpus[i]) + kvm_unload_vcpu_mmu(kvm->vcpus[i]); + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (kvm->vcpus[i]) { + kvm_arch_vcpu_free(kvm->vcpus[i]); + kvm->vcpus[i] = NULL; + } + } + +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + kfree(kvm->arch.vpic); + kfree(kvm->arch.vioapic); + kvm_free_vcpus(kvm); + kvm_free_physmem(kvm); + kfree(kvm); +} + +int kvm_arch_set_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + int npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + + /*To keep backward compatibility with older userspace, + *x86 needs to hanlde !user_alloc case. + */ + if (!user_alloc) { + if (npages && !old.rmap) { + memslot->userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); + + if (IS_ERR((void *)memslot->userspace_addr)) + return PTR_ERR((void *)memslot->userspace_addr); + } else { + if (!old.user_alloc && old.rmap) { + int ret; + + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + } + } + + if (!kvm->arch.n_requested_mmu_pages) { + unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } + + kvm_mmu_slot_remove_write_access(kvm, mem->slot); + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE + || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; +} + +static void vcpu_kick_intr(void *info) +{ +#ifdef DEBUG + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; + printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); +#endif +} + +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int ipi_pcpu = vcpu->cpu; + + if (waitqueue_active(&vcpu->wq)) { + wake_up_interruptible(&vcpu->wq); + ++vcpu->stat.halt_wakeup; + } + if (vcpu->guest_mode) + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); +} diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c new file mode 100644 index 00000000000..79586003397 --- /dev/null +++ b/arch/x86/kvm/x86_emulate.c @@ -0,0 +1,1912 @@ +/****************************************************************************** + * x86_emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include <stdio.h> +#include <stdint.h> +#include <public/xen.h> +#define DPRINTF(_f, _a ...) printf(_f , ## _a) +#else +#include <linux/kvm_host.h> +#define DPRINTF(x...) do {} while (0) +#endif +#include <linux/module.h> +#include <asm/kvm_x86_emulate.h> + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov <imm>,<reg>' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstMask (3<<1) +/* Source operand type. */ +#define SrcNone (0<<3) /* No source operand. */ +#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<3) /* Register operand. */ +#define SrcMem (2<<3) /* Memory operand. */ +#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ +#define SrcImm (5<<3) /* Immediate operand. */ +#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<3) +/* Generic ModRM decode. */ +#define ModRM (1<<6) +/* Destination is only written; never read. */ +#define Mov (1<<7) +#define BitOp (1<<8) +#define MemAbs (1<<9) /* Memory operand is absolute displacement */ +#define String (1<<10) /* String instruction (rep capable) */ +#define Stack (1<<11) /* Stack instruction (push/pop) */ + +static u16 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + SrcImmByte, SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x48 - 0x4F */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x50 - 0x57 */ + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + /* 0x58 - 0x5F */ + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + 0, 0, ImplicitOps | Mov | Stack, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x78 - 0x7F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x80 - 0x87 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xB0 - 0xBF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps | Stack, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE8 - 0xEF */ + ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, + 0, 0, 0, 0, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, ImplicitOps, + ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + /* 0xF8 - 0xFF */ + ImplicitOps, 0, ImplicitOps, ImplicitOps, + 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM +}; + +static u16 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* EFLAGS bit definitions. */ +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ + "movl %"_sav",%"_LO32 _tmp"; " \ + "push %"_tmp"; " \ + "push %"_tmp"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + "pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"w %"_wx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _wy ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"l %"_lx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _ly ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + case 8: \ + __emulate_2op_8byte(_op, _src, _dst, \ + _eflags, _qx, _qy); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"b %"_bx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _by ((_src).val), "i" (EFLAGS_MASK)); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 1: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"b %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 2: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"w %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 4: \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"l %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + break; \ + case 8: \ + __emulate_1op_8byte(_op, _dst, _eflags); \ + break; \ + } \ + } while (0) + +/* Emulate an instruction with quadword operands (x86/64 only). */ +#if defined(CONFIG_X86_64) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op"q %"_qx"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : _qy ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + +#define __emulate_1op_8byte(_op, _dst, _eflags) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op"q %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + } while (0) + +#elif defined(__i386__) +#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) +#define __emulate_1op_8byte(_op, _dst, _eflags) +#endif /* __i386__ */ + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != 0) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +/* Access/update address held in a register, based on addressing mode. */ +#define address_mask(reg) \ + ((c->ad_bytes == sizeof(unsigned long)) ? \ + (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) +#define register_address(base, reg) \ + ((base) + address_mask(reg)) +#define register_address_increment(reg, inc) \ + do { \ + /* signed type ensures sign extension to long */ \ + int _inc = (inc); \ + if (c->ad_bytes == sizeof(unsigned long)) \ + (reg) += _inc; \ + else \ + (reg) = ((reg) & \ + ~((1UL << (c->ad_bytes << 3)) - 1)) | \ + (((reg) + _inc) & \ + ((1UL << (c->ad_bytes << 3)) - 1)); \ + } while (0) + +#define JMP_REL(rel) \ + do { \ + register_address_increment(c->eip, rel); \ + } while (0) + +static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long linear, u8 *dest) +{ + struct fetch_cache *fc = &ctxt->decode.fetch; + int rc; + int size; + + if (linear < fc->start || linear >= fc->end) { + size = min(15UL, PAGE_SIZE - offset_in_page(linear)); + rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + if (rc) + return rc; + fc->start = linear; + fc->end = linear + size; + } + *dest = fc->data[linear - fc->start]; + return 0; +} + +static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, void *dest, unsigned size) +{ + int rc = 0; + + eip += ctxt->cs_base; + while (size--) { + rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + if (rc) + return rc; + } + return 0; +} + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +static void decode_register_operand(struct operand *op, + struct decode_cache *c, + int inhibit_bytereg) +{ + unsigned reg = c->modrm_reg; + int highbyte_regs = c->rex_prefix == 0; + + if (!(c->d & ModRM)) + reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + op->type = OP_REG; + if ((c->d & ByteOp) && !inhibit_bytereg) { + op->ptr = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->ptr; + op->bytes = 1; + } else { + op->ptr = decode_register(reg, c->regs, 0); + op->bytes = c->op_bytes; + switch (op->bytes) { + case 2: + op->val = *(u16 *)op->ptr; + break; + case 4: + op->val = *(u32 *)op->ptr; + break; + case 8: + op->val = *(u64 *) op->ptr; + break; + } + } + op->orig_val = op->val; +} + +static int decode_modrm(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + u8 sib; + int index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int rc = 0; + + if (c->rex_prefix) { + c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ + c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + } + + c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm_mod |= (c->modrm & 0xc0) >> 6; + c->modrm_reg |= (c->modrm & 0x38) >> 3; + c->modrm_rm |= (c->modrm & 0x07); + c->modrm_ea = 0; + c->use_modrm_ea = 1; + + if (c->modrm_mod == 3) { + c->modrm_val = *(unsigned long *) + decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + return rc; + } + + if (c->ad_bytes == 2) { + unsigned bx = c->regs[VCPU_REGS_RBX]; + unsigned bp = c->regs[VCPU_REGS_RBP]; + unsigned si = c->regs[VCPU_REGS_RSI]; + unsigned di = c->regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 6) + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + } + switch (c->modrm_rm) { + case 0: + c->modrm_ea += bx + si; + break; + case 1: + c->modrm_ea += bx + di; + break; + case 2: + c->modrm_ea += bp + si; + break; + case 3: + c->modrm_ea += bp + di; + break; + case 4: + c->modrm_ea += si; + break; + case 5: + c->modrm_ea += di; + break; + case 6: + if (c->modrm_mod != 0) + c->modrm_ea += bp; + break; + case 7: + c->modrm_ea += bx; + break; + } + if (c->modrm_rm == 2 || c->modrm_rm == 3 || + (c->modrm_rm == 6 && c->modrm_mod != 0)) + if (!c->override_base) + c->override_base = &ctxt->ss_base; + c->modrm_ea = (u16)c->modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + switch (c->modrm_rm) { + case 4: + case 12: + sib = insn_fetch(u8, 1, c->eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + switch (base_reg) { + case 5: + if (c->modrm_mod != 0) + c->modrm_ea += c->regs[base_reg]; + else + c->modrm_ea += + insn_fetch(s32, 4, c->eip); + break; + default: + c->modrm_ea += c->regs[base_reg]; + } + switch (index_reg) { + case 4: + break; + default: + c->modrm_ea += c->regs[index_reg] << scale; + } + break; + case 5: + if (c->modrm_mod != 0) + c->modrm_ea += c->regs[c->modrm_rm]; + else if (ctxt->mode == X86EMUL_MODE_PROT64) + rip_relative = 1; + break; + default: + c->modrm_ea += c->regs[c->modrm_rm]; + break; + } + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 5) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + } + } + if (rip_relative) { + c->modrm_ea += c->eip; + switch (c->d & SrcMask) { + case SrcImmByte: + c->modrm_ea += 1; + break; + case SrcImm: + if (c->d & ByteOp) + c->modrm_ea += 1; + else + if (c->op_bytes == 8) + c->modrm_ea += 4; + else + c->modrm_ea += c->op_bytes; + } + } +done: + return rc; +} + +static int decode_abs(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->ad_bytes) { + case 2: + c->modrm_ea = insn_fetch(u16, 2, c->eip); + break; + case 4: + c->modrm_ea = insn_fetch(u32, 4, c->eip); + break; + case 8: + c->modrm_ea = insn_fetch(u64, 8, c->eip); + break; + } +done: + return rc; +} + +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes; + + /* Shadow copy of register state. Committed on successful emulation. */ + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = ctxt->vcpu->arch.rip; + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x2e: /* CS override */ + c->override_base = &ctxt->cs_base; + break; + case 0x3e: /* DS override */ + c->override_base = &ctxt->ds_base; + break; + case 0x26: /* ES override */ + c->override_base = &ctxt->es_base; + break; + case 0x64: /* FS override */ + c->override_base = &ctxt->fs_base; + break; + case 0x65: /* GS override */ + c->override_base = &ctxt->gs_base; + break; + case 0x36: /* SS override */ + c->override_base = &ctxt->ss_base; + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + c->d = opcode_table[c->b]; + if (c->d == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + c->d = twobyte_table[c->b]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc) + goto done; + + if (!c->override_base) + c->override_base = &ctxt->ds_base; + if (mode == X86EMUL_MODE_PROT64 && + c->override_base != &ctxt->fs_base && + c->override_base != &ctxt->gs_base) + c->override_base = NULL; + + if (c->override_base) + c->modrm_ea += *c->override_base; + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + break; + } + c->src.type = OP_MEM; + break; + case SrcImm: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + break; + case SrcImmByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + c->src.val = insn_fetch(s8, 1, c->eip); + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.type = OP_REG; + break; + } + c->dst.type = OP_MEM; + break; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]); +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ops->read_std(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), + &c->dst.val, c->dst.bytes, ctxt->vcpu); + if (rc != 0) + return rc; + + register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); + + return 0; +} + +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + switch (c->modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + break; + } +} + +static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->modrm_reg) { + case 0 ... 1: /* test */ + /* + * Special case in Grp3: test has an immediate + * source operand. + */ + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 2: /* not */ + c->dst.val = ~c->dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", c->dst, ctxt->eflags); + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + rc = X86EMUL_UNHANDLEABLE; + break; + } +done: + return rc; +} + +static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + switch (c->modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 1: /* dec */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 4: /* jmp abs */ + if (c->b == 0xff) + c->eip = c->dst.val; + else { + DPRINTF("Cannot emulate %02x\n", c->b); + return X86EMUL_UNHANDLEABLE; + } + break; + case 6: /* push */ + + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + + if (ctxt->mode == X86EMUL_MODE_PROT64) { + c->dst.bytes = 8; + rc = ops->read_std((unsigned long)c->dst.ptr, + &c->dst.val, 8, ctxt->vcpu); + if (rc != 0) + return rc; + } + register_address_increment(c->regs[VCPU_REGS_RSP], + -c->dst.bytes); + rc = ops->write_emulated(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), &c->dst.val, + c->dst.bytes, ctxt->vcpu); + if (rc != 0) + return rc; + c->dst.type = OP_NONE; + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + return X86EMUL_UNHANDLEABLE; + } + return 0; +} + +static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long memop) +{ + struct decode_cache *c = &ctxt->decode; + u64 old, new; + int rc; + + rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); + if (rc != 0) + return rc; + + if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + ctxt->eflags &= ~EFLG_ZF; + + } else { + new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + + rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); + if (rc != 0) + return rc; + ctxt->eflags |= EFLG_ZF; + } + return 0; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != 0) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return 0; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned long memop = 0; + u64 msr_data; + unsigned long saved_eip = 0; + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + /* Shadow copy of register state. Committed on successful emulation. + * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't + * modify them. + */ + + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + saved_eip = c->eip; + + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) + memop = c->modrm_ea; + + if (c->rep_prefix && (c->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (c->regs[VCPU_REGS_RCX] == 0) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if ((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) { + if ((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + if ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { + ctxt->vcpu->arch.rip = c->eip; + goto done; + } + } + c->regs[VCPU_REGS_RCX]--; + c->eip = ctxt->vcpu->arch.rip; + } + + if (c->src.type == OP_MEM) { + c->src.ptr = (unsigned long *)memop; + c->src.val = 0; + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + c->src.orig_val = c->src.val; + } + + if ((c->d & DstMask) == ImplicitOps) + goto special_insn; + + + if (c->dst.type == OP_MEM) { + c->dst.ptr = (unsigned long *)memop; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + if (!(c->d & Mov) && + /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0)) + goto done; + } + c->dst.orig_val = c->dst.val; + +special_insn: + + if (c->twobyte) + goto twobyte_insn; + + switch (c->b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + break; + case 0x20 ... 0x23: + and: /* and */ + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + break; + case 0x24: /* and al imm8 */ + c->dst.type = OP_REG; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + c->dst.val = *(u8 *)c->dst.ptr; + c->dst.bytes = 1; + c->dst.orig_val = c->dst.val; + goto and; + case 0x25: /* and ax imm16, or eax imm32 */ + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + if (c->op_bytes == 2) + c->dst.val = *(u16 *)c->dst.ptr; + else + c->dst.val = *(u32 *)c->dst.ptr; + c->dst.orig_val = c->dst.val; + goto and; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + break; + case 0x40 ... 0x47: /* inc r16/r32 */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 0x48 ... 0x4f: /* dec r16/r32 */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 0x50 ... 0x57: /* push reg */ + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c->regs[VCPU_REGS_RSP], + -c->op_bytes); + c->dst.ptr = (void *) register_address( + ctxt->ss_base, c->regs[VCPU_REGS_RSP]); + break; + case 0x58 ... 0x5f: /* pop reg */ + pop_instruction: + if ((rc = ops->read_std(register_address(ctxt->ss_base, + c->regs[VCPU_REGS_RSP]), c->dst.ptr, + c->op_bytes, ctxt->vcpu)) != 0) + goto done; + + register_address_increment(c->regs[VCPU_REGS_RSP], + c->op_bytes); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0x63: /* movsxd */ + if (ctxt->mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + c->dst.val = (s32) c->src.val; + break; + case 0x6a: /* push imm8 */ + c->src.val = 0L; + c->src.val = insn_fetch(s8, 1, c->eip); + emulate_push(ctxt); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(ctxt->es_base, + c->regs[VCPU_REGS_RDI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c->override_base ? + *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ { + int rel = insn_fetch(s8, 1, c->eip); + + if (test_cc(c->b, ctxt->eflags)) + JMP_REL(rel); + break; + } + case 0x80 ... 0x83: /* Grp1 */ + switch (c->modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 0x86 ... 0x87: /* xchg */ + /* Write back the register source. */ + switch (c->dst.bytes) { + case 1: + *(u8 *) c->src.ptr = (u8) c->dst.val; + break; + case 2: + *(u16 *) c->src.ptr = (u16) c->dst.val; + break; + case 4: + *c->src.ptr = (u32) c->dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *c->src.ptr = c->dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + c->dst.val = c->src.val; + c->lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8d: /* lea r16/r32, m */ + c->dst.val = c->modrm_val; + break; + case 0x8f: /* pop (sole member of Grp1a) */ + rc = emulate_grp1a(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0x9c: /* pushf */ + c->src.val = (unsigned long) ctxt->eflags; + emulate_push(ctxt); + break; + case 0x9d: /* popf */ + c->dst.ptr = (unsigned long *) &ctxt->eflags; + goto pop_instruction; + case 0xa0 ... 0xa1: /* mov */ + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.val = c->src.val; + break; + case 0xa2 ... 0xa3: /* mov */ + c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; + break; + case 0xa4 ... 0xa5: /* movs */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + c->src.type = OP_NONE; /* Disable writeback. */ + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *)register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]); + if ((rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu)) != 0) + goto done; + + c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->src.bytes + : c->src.bytes); + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + + break; + case 0xaa ... 0xab: /* stos */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address( + ctxt->es_base, + c->regs[VCPU_REGS_RDI]); + c->dst.val = c->regs[VCPU_REGS_RAX]; + register_address_increment(c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(register_address( + c->override_base ? *c->override_base : + ctxt->ds_base, + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xc0 ... 0xc1: + emulate_grp2(ctxt); + break; + case 0xc3: /* ret */ + c->dst.ptr = &c->eip; + goto pop_instruction; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + c->dst.val = c->src.val; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + c->src.val = 1; + emulate_grp2(ctxt); + break; + case 0xd2 ... 0xd3: /* Grp2 */ + c->src.val = c->regs[VCPU_REGS_RCX]; + emulate_grp2(ctxt); + break; + case 0xe8: /* call (near) */ { + long int rel; + switch (c->op_bytes) { + case 2: + rel = insn_fetch(s16, 2, c->eip); + break; + case 4: + rel = insn_fetch(s32, 4, c->eip); + break; + default: + DPRINTF("Call: Invalid op_bytes\n"); + goto cannot_emulate; + } + c->src.val = (unsigned long) c->eip; + JMP_REL(rel); + c->op_bytes = c->ad_bytes; + emulate_push(ctxt); + break; + } + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ + JMP_REL(c->src.val); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf4: /* hlt */ + ctxt->vcpu->arch.halt_request = 1; + goto done; + case 0xf5: /* cmc */ + /* complement carry flag from eflags reg */ + ctxt->eflags ^= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf6 ... 0xf7: /* Grp3 */ + rc = emulate_grp3(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0xf8: /* clc */ + ctxt->eflags &= ~EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfa: /* cli */ + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfb: /* sti */ + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + rc = emulate_grp45(ctxt, ops); + if (rc != 0) + goto done; + break; + } + +writeback: + rc = writeback(ctxt, ops); + if (rc != 0) + goto done; + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + ctxt->vcpu->arch.rip = c->eip; + +done: + if (rc == X86EMUL_UNHANDLEABLE) { + c->eip = saved_eip; + return -1; + } + return 0; + +twobyte_insn: + switch (c->b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (c->modrm_reg) { + u16 size; + unsigned long address; + + case 0: /* vmcall */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + goto cannot_emulate; + + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + + kvm_emulate_hypercall(ctxt->vcpu); + break; + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, c->op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + break; + case 3: /* lidt/vmmcall */ + if (c->modrm_mod == 3 && c->modrm_rm == 1) { + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + kvm_emulate_hypercall(ctxt->vcpu); + } else { + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, + c->op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + } + break; + case 4: /* smsw */ + if (c->modrm_mod != 3) + goto cannot_emulate; + *(u16 *)&c->regs[c->modrm_rm] + = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, + &ctxt->eflags); + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, memop); + break; + default: + goto cannot_emulate; + } + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 0x06: + emulate_clts(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ + case 0x09: /* wbinvd */ + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + c->dst.type = OP_NONE; + break; + case 0x20: /* mov cr, reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + c->regs[c->modrm_rm] = + realmode_get_cr(ctxt->vcpu, c->modrm_reg); + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x21: /* mov from dr to reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x22: /* mov reg, cr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 0x23: /* mov from reg to dr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, c->modrm_reg, + c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)c->regs[VCPU_REGS_RAX] + | ((u64)c->regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = ctxt->vcpu->arch.rip; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = ctxt->vcpu->arch.rip; + } else { + c->regs[VCPU_REGS_RAX] = (u32)msr_data; + c->regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x40 ... 0x4f: /* cmov */ + c->dst.val = c->dst.orig_val = c->src.val; + if (!test_cc(c->b, ctxt->eflags)) + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ { + long int rel; + + switch (c->op_bytes) { + case 2: + rel = insn_fetch(s16, 2, c->eip); + break; + case 4: + rel = insn_fetch(s32, 4, c->eip); + break; + case 8: + rel = insn_fetch(s64, 8, c->eip); + break; + default: + DPRINTF("jnz: Invalid op_bytes\n"); + goto cannot_emulate; + } + if (test_cc(c->b, ctxt->eflags)) + JMP_REL(rel); + c->dst.type = OP_NONE; + break; + } + case 0xa3: + bt: /* bt */ + c->dst.type = OP_NONE; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + break; + case 0xab: + bts: /* bts */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + c->src.orig_val = c->src.val; + c->src.val = c->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + c->dst.val = c->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (u8) c->src.val + : (u16) c->src.val; + break; + case 0xba: /* Grp8 */ + switch (c->modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : + (s16) c->src.val; + break; + case 0xc3: /* movnti */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : + (u64) c->src.val; + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ + rc = emulate_grp9(ctxt, ops, memop); + if (rc != 0) + goto done; + c->dst.type = OP_NONE; + break; + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", c->b); + c->eip = saved_eip; + return -1; +} diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 19626ace0f5..964dfa36d36 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -1,6 +1,7 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT + depends on X86_32 depends on !X86_PAE depends on !(X86_VISWS || X86_VOYAGER) select VIRTIO diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d6b18e2e543..5afdde4895d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -176,8 +176,8 @@ static void lguest_leave_lazy_mode(void) * check there when it wants to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "eflags"). The - * eflags word contains all kind of stuff, but in practice Linux only cares +/* save_flags() is expected to return the processor state (ie. "flags"). The + * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ static unsigned long save_fl(void) { @@ -218,19 +218,20 @@ static void irq_enable(void) * address of the handler, and... well, who cares? The Guest just asks the * Host to make the change anyway, because the Host controls the real IDT. */ -static void lguest_write_idt_entry(struct desc_struct *dt, - int entrynum, u32 low, u32 high) +static void lguest_write_idt_entry(gate_desc *dt, + int entrynum, const gate_desc *g) { + u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ - write_dt_entry(dt, entrynum, low, high); + native_write_idt_entry(dt, entrynum, g); /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } /* Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the * Host about them. */ -static void lguest_load_idt(const struct Xgt_desc_struct *desc) +static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; struct desc_struct *idt = (void *)desc->address; @@ -253,7 +254,7 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc) * hypercall and use that repeatedly to load a new IDT. I don't think it * really matters, but wouldn't it be nice if they were the same? */ -static void lguest_load_gdt(const struct Xgt_desc_struct *desc) +static void lguest_load_gdt(const struct desc_ptr *desc) { BUG_ON((desc->size+1)/8 != GDT_ENTRIES); hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); @@ -262,10 +263,10 @@ static void lguest_load_gdt(const struct Xgt_desc_struct *desc) /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare * that this naive implementation is reasonable. */ -static void lguest_write_gdt_entry(struct desc_struct *dt, - int entrynum, u32 low, u32 high) +static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, + const void *desc, int type) { - write_dt_entry(dt, entrynum, low, high); + native_write_gdt_entry(dt, entrynum, desc, type); hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); } @@ -324,30 +325,30 @@ static void lguest_load_tr_desc(void) * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get * too worked up about it. */ -static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void lguest_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { - int function = *eax; + int function = *ax; - native_cpuid(eax, ebx, ecx, edx); + native_cpuid(ax, bx, cx, dx); switch (function) { case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ - *ecx &= 0x00002201; + *cx &= 0x00002201; /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ - *edx &= 0x07808101; + *dx &= 0x07808101; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ - *edx |= 0x00002000; + *dx |= 0x00002000; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended * processor information there is, limit it to known fields. */ - if (*eax > 0x80000008) - *eax = 0x80000008; + if (*ax > 0x80000008) + *ax = 0x80000008; break; } } @@ -756,10 +757,10 @@ static void lguest_time_init(void) * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ -static void lguest_load_esp0(struct tss_struct *tss, +static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, + lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, THREAD_SIZE/PAGE_SIZE); } @@ -789,11 +790,11 @@ static void lguest_wbinvd(void) * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */ #ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(unsigned long reg, unsigned long v) +static void lguest_apic_write(unsigned long reg, u32 v) { } -static unsigned long lguest_apic_read(unsigned long reg) +static u32 lguest_apic_read(unsigned long reg) { return 0; } @@ -963,7 +964,7 @@ __init void lguest_init(void) pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_esp0 = lguest_load_esp0; + pv_cpu_ops.load_sp0 = lguest_load_sp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; pv_cpu_ops.load_tls = lguest_load_tls; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 329da276c6f..4876182daf8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -1,5 +1,27 @@ +# +# Makefile for x86 specific library files. +# + +obj-$(CONFIG_SMP) := msr-on-cpu.o + +lib-y := delay_$(BITS).o +lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o +lib-y += memcpy_$(BITS).o + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/lib/Makefile_32 + lib-y += checksum_32.o + lib-y += strstr_32.o + lib-y += bitops_32.o semaphore_32.o string_32.o + + lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else -include ${srctree}/arch/x86/lib/Makefile_64 + obj-y += io_64.o iomap_copy_64.o + + CFLAGS_csum-partial_64.o := -funroll-loops + + lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o + lib-y += thunk_64.o clear_page_64.o copy_page_64.o + lib-y += bitstr_64.o bitops_64.o + lib-y += memmove_64.o memset_64.o + lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o endif diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32 deleted file mode 100644 index 98d1f1e2e2e..00000000000 --- a/arch/x86/lib/Makefile_32 +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for i386-specific library files.. -# - - -lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \ - bitops_32.o semaphore_32.o string_32.o - -lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o - -obj-$(CONFIG_SMP) += msr-on-cpu.o diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64 deleted file mode 100644 index bbabad3c933..00000000000 --- a/arch/x86/lib/Makefile_64 +++ /dev/null @@ -1,13 +0,0 @@ -# -# Makefile for x86_64-specific library files. -# - -CFLAGS_csum-partial_64.o := -funroll-loops - -obj-y := io_64.o iomap_copy_64.o -obj-$(CONFIG_SMP) += msr-on-cpu.o - -lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ - usercopy_64.o getuser_64.o putuser_64.o \ - thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o -lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 8ac51b82a63..37756b6fb32 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -34,8 +34,8 @@ void *memmove(void *dest, const void *src, size_t n) "cld" : "=&c" (d0), "=&S" (d1), "=&D" (d2) :"0" (n), - "1" (n-1+(const char *)src), - "2" (n-1+(char *)dest) + "1" (n-1+src), + "2" (n-1+dest) :"memory"); } return dest; diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index 751ebae8ec4..80175e47b19 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -11,8 +11,8 @@ void *memmove(void * dest,const void *src,size_t count) if (dest < src) { return memcpy(dest,src,count); } else { - char *p = (char *) dest + count; - char *s = (char *) src + count; + char *p = dest + count; + const char *s = src + count; while (count--) *--p = *--s; } diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 444fba40098..3899bd37fdf 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -29,7 +29,7 @@ * registers (%eax, %edx and %ecx) except %eax whish is either a return * value or just clobbered.. */ - .section .sched.text + .section .sched.text, "ax" ENTRY(__down_failed) CFI_STARTPROC FRAME @@ -49,7 +49,7 @@ ENTRY(__down_failed) ENDFRAME ret CFI_ENDPROC - END(__down_failed) + ENDPROC(__down_failed) ENTRY(__down_failed_interruptible) CFI_STARTPROC @@ -70,7 +70,7 @@ ENTRY(__down_failed_interruptible) ENDFRAME ret CFI_ENDPROC - END(__down_failed_interruptible) + ENDPROC(__down_failed_interruptible) ENTRY(__down_failed_trylock) CFI_STARTPROC @@ -91,7 +91,7 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - END(__down_failed_trylock) + ENDPROC(__down_failed_trylock) ENTRY(__up_wakeup) CFI_STARTPROC @@ -112,7 +112,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - END(__up_wakeup) + ENDPROC(__up_wakeup) /* * rw spinlock fallbacks @@ -132,7 +132,7 @@ ENTRY(__write_lock_failed) ENDFRAME ret CFI_ENDPROC - END(__write_lock_failed) + ENDPROC(__write_lock_failed) ENTRY(__read_lock_failed) CFI_STARTPROC @@ -148,7 +148,7 @@ ENTRY(__read_lock_failed) ENDFRAME ret CFI_ENDPROC - END(__read_lock_failed) + ENDPROC(__read_lock_failed) #endif @@ -170,7 +170,7 @@ ENTRY(call_rwsem_down_read_failed) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_down_read_failed) + ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) CFI_STARTPROC @@ -182,7 +182,7 @@ ENTRY(call_rwsem_down_write_failed) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_down_write_failed) + ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) CFI_STARTPROC @@ -196,7 +196,7 @@ ENTRY(call_rwsem_wake) CFI_ADJUST_CFA_OFFSET -4 1: ret CFI_ENDPROC - END(call_rwsem_wake) + ENDPROC(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) @@ -214,6 +214,6 @@ ENTRY(call_rwsem_downgrade_wake) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_downgrade_wake) + ENDPROC(call_rwsem_downgrade_wake) #endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 6ea73f3de56..8b92d428ab0 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -33,7 +33,7 @@ .endm - .section .sched.text + .section .sched.text, "ax" #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile new file mode 100644 index 00000000000..1faac8125e3 --- /dev/null +++ b/arch/x86/mach-rdc321x/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the RDC321x specific parts of the kernel +# +obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o wdt.o + diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c new file mode 100644 index 00000000000..031269163bd --- /dev/null +++ b/arch/x86/mach-rdc321x/gpio.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org> + * RDC321x architecture specific GPIO support + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/autoconf.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/delay.h> + +#include <asm/mach-rdc321x/rdc321x_defs.h> + +static inline int rdc_gpio_is_valid(unsigned gpio) +{ + return (gpio <= RDC_MAX_GPIO); +} + +static unsigned int rdc_gpio_read(unsigned gpio) +{ + unsigned int val; + + val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48)); + outl(val, RDC3210_CFGREG_ADDR); + udelay(10); + val = inl(RDC3210_CFGREG_DATA); + val |= (0x1 << (gpio & 0x1F)); + outl(val, RDC3210_CFGREG_DATA); + udelay(10); + val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C)); + outl(val, RDC3210_CFGREG_ADDR); + udelay(10); + val = inl(RDC3210_CFGREG_DATA); + + return val; +} + +static void rdc_gpio_write(unsigned int val) +{ + if (val) { + outl(val, RDC3210_CFGREG_DATA); + udelay(10); + } +} + +int rdc_gpio_get_value(unsigned gpio) +{ + if (rdc_gpio_is_valid(gpio)) + return (int)rdc_gpio_read(gpio); + else + return -EINVAL; +} +EXPORT_SYMBOL(rdc_gpio_get_value); + +void rdc_gpio_set_value(unsigned gpio, int value) +{ + unsigned int val; + + if (!rdc_gpio_is_valid(gpio)) + return; + + val = rdc_gpio_read(gpio); + + if (value) + val &= ~(0x1 << (gpio & 0x1F)); + else + val |= (0x1 << (gpio & 0x1F)); + + rdc_gpio_write(val); +} +EXPORT_SYMBOL(rdc_gpio_set_value); + +int rdc_gpio_direction_input(unsigned gpio) +{ + return 0; +} +EXPORT_SYMBOL(rdc_gpio_direction_input); + +int rdc_gpio_direction_output(unsigned gpio, int value) +{ + return 0; +} +EXPORT_SYMBOL(rdc_gpio_direction_output); + + diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c new file mode 100644 index 00000000000..dda6024a586 --- /dev/null +++ b/arch/x86/mach-rdc321x/platform.c @@ -0,0 +1,68 @@ +/* + * Generic RDC321x platform devices + * + * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/version.h> +#include <linux/leds.h> + +#include <asm/gpio.h> + +/* LEDS */ +static struct gpio_led default_leds[] = { + { .name = "rdc:dmz", .gpio = 1, }, +}; + +static struct gpio_led_platform_data rdc321x_led_data = { + .num_leds = ARRAY_SIZE(default_leds), + .leds = default_leds, +}; + +static struct platform_device rdc321x_leds = { + .name = "leds-gpio", + .id = -1, + .dev = { + .platform_data = &rdc321x_led_data, + } +}; + +/* Watchdog */ +static struct platform_device rdc321x_wdt = { + .name = "rdc321x-wdt", + .id = -1, + .num_resources = 0, +}; + +static struct platform_device *rdc321x_devs[] = { + &rdc321x_leds, + &rdc321x_wdt +}; + +static int __init rdc_board_setup(void) +{ + return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs)); +} + +arch_initcall(rdc_board_setup); diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c new file mode 100644 index 00000000000..ec5625ae706 --- /dev/null +++ b/arch/x86/mach-rdc321x/wdt.c @@ -0,0 +1,275 @@ +/* + * RDC321x watchdog driver + * + * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org> + * + * This driver is highly inspired from the cpu5_wdt driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/timer.h> +#include <linux/completion.h> +#include <linux/jiffies.h> +#include <linux/platform_device.h> +#include <linux/watchdog.h> +#include <linux/io.h> +#include <linux/uaccess.h> + +#include <asm/mach-rdc321x/rdc321x_defs.h> + +#define RDC_WDT_MASK 0x80000000 /* Mask */ +#define RDC_WDT_EN 0x00800000 /* Enable bit */ +#define RDC_WDT_WTI 0x00200000 /* Generate CPU reset/NMI/WDT on timeout */ +#define RDC_WDT_RST 0x00100000 /* Reset bit */ +#define RDC_WDT_WIF 0x00040000 /* WDT IRQ Flag */ +#define RDC_WDT_IRT 0x00000100 /* IRQ Routing table */ +#define RDC_WDT_CNT 0x00000001 /* WDT count */ + +#define RDC_CLS_TMR 0x80003844 /* Clear timer */ + +#define RDC_WDT_INTERVAL (HZ/10+1) + +int nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, int, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +static int ticks = 1000; + +/* some device data */ + +static struct { + struct completion stop; + volatile int running; + struct timer_list timer; + volatile int queue; + int default_ticks; + unsigned long inuse; +} rdc321x_wdt_device; + +/* generic helper functions */ + +static void rdc321x_wdt_trigger(unsigned long unused) +{ + if (rdc321x_wdt_device.running) + ticks--; + + /* keep watchdog alive */ + outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA); + + /* requeue?? */ + if (rdc321x_wdt_device.queue && ticks) + mod_timer(&rdc321x_wdt_device.timer, + jiffies + RDC_WDT_INTERVAL); + else { + /* ticks doesn't matter anyway */ + complete(&rdc321x_wdt_device.stop); + } + +} + +static void rdc321x_wdt_reset(void) +{ + ticks = rdc321x_wdt_device.default_ticks; +} + +static void rdc321x_wdt_start(void) +{ + if (!rdc321x_wdt_device.queue) { + rdc321x_wdt_device.queue = 1; + + /* Clear the timer */ + outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR); + + /* Enable watchdog and set the timeout to 81.92 us */ + outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA); + + mod_timer(&rdc321x_wdt_device.timer, + jiffies + RDC_WDT_INTERVAL); + } + + /* if process dies, counter is not decremented */ + rdc321x_wdt_device.running++; +} + +static int rdc321x_wdt_stop(void) +{ + if (rdc321x_wdt_device.running) + rdc321x_wdt_device.running = 0; + + ticks = rdc321x_wdt_device.default_ticks; + + return -EIO; +} + +/* filesystem operations */ + +static int rdc321x_wdt_open(struct inode *inode, struct file *file) +{ + if (test_and_set_bit(0, &rdc321x_wdt_device.inuse)) + return -EBUSY; + + return nonseekable_open(inode, file); +} + +static int rdc321x_wdt_release(struct inode *inode, struct file *file) +{ + clear_bit(0, &rdc321x_wdt_device.inuse); + return 0; +} + +static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int value; + static struct watchdog_info ident = { + .options = WDIOF_CARDRESET, + .identity = "RDC321x WDT", + }; + + switch (cmd) { + case WDIOC_KEEPALIVE: + rdc321x_wdt_reset(); + break; + case WDIOC_GETSTATUS: + /* Read the value from the DATA register */ + value = inl(RDC3210_CFGREG_DATA); + if (copy_to_user(argp, &value, sizeof(int))) + return -EFAULT; + break; + case WDIOC_GETSUPPORT: + if (copy_to_user(argp, &ident, sizeof(ident))) + return -EFAULT; + break; + case WDIOC_SETOPTIONS: + if (copy_from_user(&value, argp, sizeof(int))) + return -EFAULT; + switch (value) { + case WDIOS_ENABLECARD: + rdc321x_wdt_start(); + break; + case WDIOS_DISABLECARD: + return rdc321x_wdt_stop(); + default: + return -EINVAL; + } + break; + default: + return -ENOTTY; + } + return 0; +} + +static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (!count) + return -EIO; + + rdc321x_wdt_reset(); + + return count; +} + +static const struct file_operations rdc321x_wdt_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .ioctl = rdc321x_wdt_ioctl, + .open = rdc321x_wdt_open, + .write = rdc321x_wdt_write, + .release = rdc321x_wdt_release, +}; + +static struct miscdevice rdc321x_wdt_misc = { + .minor = WATCHDOG_MINOR, + .name = "watchdog", + .fops = &rdc321x_wdt_fops, +}; + +static int __devinit rdc321x_wdt_probe(struct platform_device *pdev) +{ + int err; + + err = misc_register(&rdc321x_wdt_misc); + if (err < 0) { + printk(KERN_ERR PFX "watchdog misc_register failed\n"); + return err; + } + + /* Reset the watchdog */ + outl(RDC_WDT_RST, RDC3210_CFGREG_DATA); + + init_completion(&rdc321x_wdt_device.stop); + rdc321x_wdt_device.queue = 0; + + clear_bit(0, &rdc321x_wdt_device.inuse); + + setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0); + + rdc321x_wdt_device.default_ticks = ticks; + + printk(KERN_INFO PFX "watchdog init success\n"); + + return 0; +} + +static int rdc321x_wdt_remove(struct platform_device *pdev) +{ + if (rdc321x_wdt_device.queue) { + rdc321x_wdt_device.queue = 0; + wait_for_completion(&rdc321x_wdt_device.stop); + } + + misc_deregister(&rdc321x_wdt_misc); + + return 0; +} + +static struct platform_driver rdc321x_wdt_driver = { + .probe = rdc321x_wdt_probe, + .remove = rdc321x_wdt_remove, + .driver = { + .owner = THIS_MODULE, + .name = "rdc321x-wdt", + }, +}; + +static int __init rdc321x_wdt_init(void) +{ + return platform_driver_register(&rdc321x_wdt_driver); +} + +static void __exit rdc321x_wdt_exit(void) +{ + platform_driver_unregister(&rdc321x_wdt_driver); +} + +module_init(rdc321x_wdt_init); +module_exit(rdc321x_wdt_exit); + +MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>"); +MODULE_DESCRIPTION("RDC321x watchdog driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index f3c74fab8b9..2a8456a1f44 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -36,19 +36,19 @@ unsigned int __initdata maxcpus = NR_CPUS; static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, logical_apicid; + int ver, logical_apicid; physid_mask_t apic_cpus; - + if (!(m->mpc_cpuflag & CPU_ENABLED)) return; logical_apicid = m->mpc_apicid; - printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n", - m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver); + printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", + m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) boot_cpu_physical_apicid = m->mpc_apicid; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 3bef977cb29..5ae5466b9eb 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -37,14 +37,14 @@ void __init pre_setup_arch_hook(void) { /* Voyagers run their CPUs from independent clocks, so disable * the TSC code because we can't sync them */ - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } void __init trap_init_hook(void) { } -static struct irqaction irq0 = { +static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, .mask = CPU_MASK_NONE, @@ -59,44 +59,47 @@ void __init time_init_hook(void) /* Hook for machine specific memory setup. */ -char * __init machine_specific_memory_setup(void) +char *__init machine_specific_memory_setup(void) { char *who; who = "NOT VOYAGER"; - if(voyager_level == 5) { + if (voyager_level == 5) { __u32 addr, length; int i; who = "Voyager-SUS"; e820.nr_map = 0; - for(i=0; voyager_memory_detect(i, &addr, &length); i++) { + for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { add_memory_region(addr, length, E820_RAM); } return who; - } else if(voyager_level == 4) { + } else if (voyager_level == 4) { __u32 tom; - __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; + __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; /* select the DINO config space */ outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT); /* Read DINO top of memory register */ tom = ((inb(catbase + 0x4) & 0xf0) << 16) - + ((inb(catbase + 0x5) & 0x7f) << 24); + + ((inb(catbase + 0x5) & 0x7f) << 24); - if(inb(catbase) != VOYAGER_DINO) { - printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); - tom = (boot_params.screen_info.ext_mem_k)<<10; + if (inb(catbase) != VOYAGER_DINO) { + printk(KERN_ERR + "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); + tom = (boot_params.screen_info.ext_mem_k) << 10; } who = "Voyager-TOM"; add_memory_region(0, 0x9f000, E820_RAM); /* map from 1M to top of memory */ - add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM); + add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, + E820_RAM); /* FIXME: Should check the ASICs to see if I need to * take out the 8M window. Just do it at the moment * */ - add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED); + add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024, + E820_RESERVED); return who; } @@ -114,8 +117,7 @@ char * __init machine_specific_memory_setup(void) unsigned long mem_size; /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -126,6 +128,6 @@ char * __init machine_specific_memory_setup(void) e820.nr_map = 0; add_memory_region(0, LOWMEMSIZE(), E820_RAM); add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + } return who; } diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c index 9b77b39b71a..6a949e4edde 100644 --- a/arch/x86/mach-voyager/voyager_basic.c +++ b/arch/x86/mach-voyager/voyager_basic.c @@ -35,7 +35,7 @@ /* * Power off function, if any */ -void (*pm_power_off)(void); +void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); int voyager_level = 0; @@ -43,39 +43,38 @@ int voyager_level = 0; struct voyager_SUS *voyager_SUS = NULL; #ifdef CONFIG_SMP -static void -voyager_dump(int dummy1, struct tty_struct *dummy3) +static void voyager_dump(int dummy1, struct tty_struct *dummy3) { /* get here via a sysrq */ voyager_smp_dump(); } static struct sysrq_key_op sysrq_voyager_dump_op = { - .handler = voyager_dump, - .help_msg = "Voyager", - .action_msg = "Dump Voyager Status", + .handler = voyager_dump, + .help_msg = "Voyager", + .action_msg = "Dump Voyager Status", }; #endif -void -voyager_detect(struct voyager_bios_info *bios) +void voyager_detect(struct voyager_bios_info *bios) { - if(bios->len != 0xff) { - int class = (bios->class_1 << 8) - | (bios->class_2 & 0xff); + if (bios->len != 0xff) { + int class = (bios->class_1 << 8) + | (bios->class_2 & 0xff); printk("Voyager System detected.\n" " Class %x, Revision %d.%d\n", class, bios->major, bios->minor); - if(class == VOYAGER_LEVEL4) + if (class == VOYAGER_LEVEL4) voyager_level = 4; - else if(class < VOYAGER_LEVEL5_AND_ABOVE) + else if (class < VOYAGER_LEVEL5_AND_ABOVE) voyager_level = 3; else voyager_level = 5; printk(" Architecture Level %d\n", voyager_level); - if(voyager_level < 4) - printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); + if (voyager_level < 4) + printk + ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); /* install the power off handler */ pm_power_off = voyager_power_off; #ifdef CONFIG_SMP @@ -86,15 +85,13 @@ voyager_detect(struct voyager_bios_info *bios) } } -void -voyager_system_interrupt(int cpl, void *dev_id) +void voyager_system_interrupt(int cpl, void *dev_id) { printk("Voyager: detected system interrupt\n"); } /* Routine to read information from the extended CMOS area */ -__u8 -voyager_extended_cmos_read(__u16 addr) +__u8 voyager_extended_cmos_read(__u16 addr) { outb(addr & 0xff, 0x74); outb((addr >> 8) & 0xff, 0x75); @@ -108,12 +105,11 @@ voyager_extended_cmos_read(__u16 addr) typedef struct ClickMap { struct Entry { - __u32 Address; - __u32 Length; + __u32 Address; + __u32 Length; } Entry[CLICK_ENTRIES]; } ClickMap_t; - /* This routine is pretty much an awful hack to read the bios clickmap by * mapping it into page 0. There are usually three regions in the map: * Base Memory @@ -122,8 +118,7 @@ typedef struct ClickMap { * * Returns are 0 for failure and 1 for success on extracting region. */ -int __init -voyager_memory_detect(int region, __u32 *start, __u32 *length) +int __init voyager_memory_detect(int region, __u32 * start, __u32 * length) { int i; int retval = 0; @@ -132,13 +127,14 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) unsigned long map_addr; unsigned long old; - if(region >= CLICK_ENTRIES) { + if (region >= CLICK_ENTRIES) { printk("Voyager: Illegal ClickMap region %d\n", region); return 0; } - for(i = 0; i < sizeof(cmos); i++) - cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); + for (i = 0; i < sizeof(cmos); i++) + cmos[i] = + voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); map_addr = *(unsigned long *)cmos; @@ -147,10 +143,10 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); local_flush_tlb(); /* now clear everything out but page 0 */ - map = (ClickMap_t *)(map_addr & (~PAGE_MASK)); + map = (ClickMap_t *) (map_addr & (~PAGE_MASK)); /* zero length is the end of the clickmap */ - if(map->Entry[region].Length != 0) { + if (map->Entry[region].Length != 0) { *length = map->Entry[region].Length * CLICK_SIZE; *start = map->Entry[region].Address; retval = 1; @@ -165,10 +161,9 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) /* voyager specific handling code for timer interrupts. Used to hand * off the timer tick to the SMP code, since the VIC doesn't have an * internal timer (The QIC does, but that's another story). */ -void -voyager_timer_interrupt(void) +void voyager_timer_interrupt(void) { - if((jiffies & 0x3ff) == 0) { + if ((jiffies & 0x3ff) == 0) { /* There seems to be something flaky in either * hardware or software that is resetting the timer 0 @@ -186,18 +181,20 @@ voyager_timer_interrupt(void) __u16 val; spin_lock(&i8253_lock); - + outb_p(0x00, 0x43); val = inb_p(0x40); val |= inb(0x40) << 8; spin_unlock(&i8253_lock); - if(val > LATCH) { - printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val); + if (val > LATCH) { + printk + ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", + val); spin_lock(&i8253_lock); - outb(0x34,0x43); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - outb(LATCH >> 8 , 0x40); /* MSB */ + outb(0x34, 0x43); + outb_p(LATCH & 0xff, 0x40); /* LSB */ + outb(LATCH >> 8, 0x40); /* MSB */ spin_unlock(&i8253_lock); } } @@ -206,14 +203,13 @@ voyager_timer_interrupt(void) #endif } -void -voyager_power_off(void) +void voyager_power_off(void) { printk("VOYAGER Power Off\n"); - if(voyager_level == 5) { + if (voyager_level == 5) { voyager_cat_power_off(); - } else if(voyager_level == 4) { + } else if (voyager_level == 4) { /* This doesn't apparently work on most L4 machines, * but the specs say to do this to get automatic power * off. Unfortunately, if it doesn't power off the @@ -222,10 +218,8 @@ voyager_power_off(void) #if 0 int port; - /* enable the voyager Configuration Space */ - outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, - VOYAGER_MC_SETUP); + outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP); /* the port for the power off flag is an offset from the floating base */ port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; @@ -235,62 +229,57 @@ voyager_power_off(void) } /* and wait for it to happen */ local_irq_disable(); - for(;;) + for (;;) halt(); } /* copied from process.c */ -static inline void -kb_wait(void) +static inline void kb_wait(void) { int i; - for (i=0; i<0x10000; i++) + for (i = 0; i < 0x10000; i++) if ((inb_p(0x64) & 0x02) == 0) break; } -void -machine_shutdown(void) +void machine_shutdown(void) { /* Architecture specific shutdown needed before a kexec */ } -void -machine_restart(char *cmd) +void machine_restart(char *cmd) { printk("Voyager Warm Restart\n"); kb_wait(); - if(voyager_level == 5) { + if (voyager_level == 5) { /* write magic values to the RTC to inform system that * shutdown is beginning */ outb(0x8f, 0x70); - outb(0x5 , 0x71); - + outb(0x5, 0x71); + udelay(50); - outb(0xfe,0x64); /* pull reset low */ - } else if(voyager_level == 4) { - __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; + outb(0xfe, 0x64); /* pull reset low */ + } else if (voyager_level == 4) { + __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; __u8 basebd = inb(VOYAGER_MC_SETUP); - + outb(basebd | 0x08, VOYAGER_MC_SETUP); outb(0x02, catbase + 0x21); } local_irq_disable(); - for(;;) + for (;;) halt(); } -void -machine_emergency_restart(void) +void machine_emergency_restart(void) { /*for now, just hook this to a warm restart */ machine_restart(NULL); } -void -mca_nmi_hook(void) +void mca_nmi_hook(void) { __u8 dumpval __maybe_unused = inb(0xf823); __u8 swnmi __maybe_unused = inb(0xf813); @@ -301,8 +290,8 @@ mca_nmi_hook(void) /* clear swnmi */ outb(0xff, 0xf813); /* tell SUS to ignore dump */ - if(voyager_level == 5 && voyager_SUS != NULL) { - if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { + if (voyager_level == 5 && voyager_SUS != NULL) { + if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; udelay(1000); @@ -310,15 +299,14 @@ mca_nmi_hook(void) voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; } } - printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id()); + printk(KERN_ERR + "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", + smp_processor_id()); show_stack(NULL, NULL); show_state(); } - - -void -machine_halt(void) +void machine_halt(void) { /* treat a halt like a power off */ machine_power_off(); diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c index 2132ca652df..17a7904f75b 100644 --- a/arch/x86/mach-voyager/voyager_cat.c +++ b/arch/x86/mach-voyager/voyager_cat.c @@ -39,34 +39,32 @@ #define CAT_DATA (sspb + 0xd) /* the internal cat functions */ -static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, - __u16 num_bits); -static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data, +static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits); +static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits); -static void cat_build_header(__u8 *header, const __u16 len, +static void cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits, const __u16 longest_reg_bits); -static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op); -static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, - __u8 reg, __u8 *value); -static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, +static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, + __u8 reg, __u8 * value); +static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits); -static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, +static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value); -static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value); -static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value); +static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len, void *buf); -static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value); -static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp); -static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp); +static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp); +static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp); -static inline const char * -cat_module_name(int module_id) +static inline const char *cat_module_name(int module_id) { - switch(module_id) { + switch (module_id) { case 0x10: return "Processor Slot 0"; case 0x11: @@ -105,14 +103,14 @@ voyager_module_t *voyager_cat_list; /* the I/O port assignments for the VIC and QIC */ static struct resource vic_res = { - .name = "Voyager Interrupt Controller", - .start = 0xFC00, - .end = 0xFC6F + .name = "Voyager Interrupt Controller", + .start = 0xFC00, + .end = 0xFC6F }; static struct resource qic_res = { - .name = "Quad Interrupt Controller", - .start = 0xFC70, - .end = 0xFCFF + .name = "Quad Interrupt Controller", + .start = 0xFC70, + .end = 0xFCFF }; /* This function is used to pack a data bit stream inside a message. @@ -120,7 +118,7 @@ static struct resource qic_res = { * Note: This function assumes that any unused bit in the data stream * is set to zero so that the ors will work correctly */ static void -cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) { /* compute initial shift needed */ const __u16 offset = start_bit % BITS_PER_BYTE; @@ -130,7 +128,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) int i; /* adjust if we have more than a byte of residue */ - if(residue >= BITS_PER_BYTE) { + if (residue >= BITS_PER_BYTE) { residue -= BITS_PER_BYTE; len++; } @@ -138,24 +136,25 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) /* clear out the bits. We assume here that if len==0 then * residue >= offset. This is always true for the catbus * operations */ - msg[byte] &= 0xff << (BITS_PER_BYTE - offset); + msg[byte] &= 0xff << (BITS_PER_BYTE - offset); msg[byte++] |= data[0] >> offset; - if(len == 0) + if (len == 0) return; - for(i = 1; i < len; i++) - msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset)) - | (data[i] >> offset); - if(residue != 0) { + for (i = 1; i < len; i++) + msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset)) + | (data[i] >> offset); + if (residue != 0) { __u8 mask = 0xff >> residue; - __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset) - | (data[i] >> offset); - + __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset) + | (data[i] >> offset); + last_byte &= ~mask; msg[byte] &= mask; msg[byte] |= last_byte; } return; } + /* unpack the data again (same arguments as cat_pack()). data buffer * must be zero populated. * @@ -163,7 +162,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) * data (starting at bit 0 in data). */ static void -cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) { /* compute initial shift needed */ const __u16 offset = start_bit % BITS_PER_BYTE; @@ -172,97 +171,97 @@ cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) __u16 byte = start_bit / BITS_PER_BYTE; int i; - if(last_bits != 0) + if (last_bits != 0) len++; /* special case: want < 8 bits from msg and we can get it from * a single byte of the msg */ - if(len == 0 && BITS_PER_BYTE - offset >= num_bits) { + if (len == 0 && BITS_PER_BYTE - offset >= num_bits) { data[0] = msg[byte] << offset; data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); return; } - for(i = 0; i < len; i++) { + for (i = 0; i < len; i++) { /* this annoying if has to be done just in case a read of * msg one beyond the array causes a panic */ - if(offset != 0) { + if (offset != 0) { data[i] = msg[byte++] << offset; data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); - } - else { + } else { data[i] = msg[byte++]; } } /* do we need to truncate the final byte */ - if(last_bits != 0) { - data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits); + if (last_bits != 0) { + data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits); } return; } static void -cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits, +cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits, const __u16 longest_reg_bits) { int i; __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; __u8 *last_byte = &header[len - 1]; - if(start_bit == 0) + if (start_bit == 0) start_bit = 1; /* must have at least one bit in the hdr */ - - for(i=0; i < len; i++) + + for (i = 0; i < len; i++) header[i] = 0; - for(i = start_bit; i > 0; i--) + for (i = start_bit; i > 0; i--) *last_byte = ((*last_byte) << 1) + 1; } static int -cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) +cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op) { __u8 parity, inst, inst_buf[4] = { 0 }; __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; __u16 ibytes, hbytes, padbits; int i; - + /* * Parity is the parity of the register number + 1 (READ_REGISTER * and WRITE_REGISTER always add '1' to the number of bits == 1) */ - parity = (__u8)(1 + (reg & 0x01) + - ((__u8)(reg & 0x02) >> 1) + - ((__u8)(reg & 0x04) >> 2) + - ((__u8)(reg & 0x08) >> 3)) % 2; + parity = (__u8) (1 + (reg & 0x01) + + ((__u8) (reg & 0x02) >> 1) + + ((__u8) (reg & 0x04) >> 2) + + ((__u8) (reg & 0x08) >> 3)) % 2; inst = ((parity << 7) | (reg << 2) | op); outb(VOYAGER_CAT_IRCYC, CAT_CMD); - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { - printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { + printk + ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); return 1; } outb(VOYAGER_CAT_HEADER, CAT_DATA); outb(inst, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); return 1; } return 0; } ibytes = modp->inst_bits / BITS_PER_BYTE; - if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { + if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; ibytes++; } hbytes = modp->largest_reg / BITS_PER_BYTE; - if(modp->largest_reg % BITS_PER_BYTE) + if (modp->largest_reg % BITS_PER_BYTE) hbytes++; CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); /* initialise the instruction sequence to 0xff */ - for(i=0; i < ibytes + hbytes; i++) + for (i = 0; i < ibytes + hbytes; i++) iseq[i] = 0xff; cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); @@ -271,11 +270,11 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("ins = 0x%x, iseq: ", inst); - for(i=0; i< ibytes + hbytes; i++) + for (i = 0; i < ibytes + hbytes; i++) printk("0x%x ", iseq[i]); printk("\n"); #endif - if(cat_shiftout(iseq, ibytes, hbytes, padbits)) { + if (cat_shiftout(iseq, ibytes, hbytes, padbits)) { CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); return 1; } @@ -284,72 +283,74 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) } static int -cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value) +cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value) { - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); return 1; } - if(reg > VOYAGER_SUBADDRHI) + if (reg > VOYAGER_SUBADDRHI) outb(VOYAGER_CAT_RUN, CAT_CMD); outb(VOYAGER_CAT_DRCYC, CAT_CMD); outb(VOYAGER_CAT_HEADER, CAT_DATA); *value = inb(CAT_DATA); outb(0xAA, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); return 1; } return 0; - } - else { - __u16 sbits = modp->num_asics -1 + asicp->ireg_length; + } else { + __u16 sbits = modp->num_asics - 1 + asicp->ireg_length; __u16 sbytes = sbits / BITS_PER_BYTE; __u16 tbytes; - __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE]; + __u8 string[VOYAGER_MAX_SCAN_PATH], + trailer[VOYAGER_MAX_REG_SIZE]; __u8 padbits; int i; - + outb(VOYAGER_CAT_DRCYC, CAT_CMD); - if((padbits = sbits % BITS_PER_BYTE) != 0) { + if ((padbits = sbits % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; sbytes++; } tbytes = asicp->ireg_length / BITS_PER_BYTE; - if(asicp->ireg_length % BITS_PER_BYTE) + if (asicp->ireg_length % BITS_PER_BYTE) tbytes++; CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", - tbytes, sbytes, padbits)); + tbytes, sbytes, padbits)); cat_build_header(trailer, tbytes, 1, asicp->ireg_length); - - for(i = tbytes - 1; i >= 0; i--) { + for (i = tbytes - 1; i >= 0; i--) { outb(trailer[i], CAT_DATA); string[sbytes + i] = inb(CAT_DATA); } - for(i = sbytes - 1; i >= 0; i--) { + for (i = sbytes - 1; i >= 0; i--) { outb(0xaa, CAT_DATA); string[i] = inb(CAT_DATA); } *value = 0; - cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length); + cat_unpack(string, + padbits + (tbytes * BITS_PER_BYTE) + + asicp->asic_location, value, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("value=0x%x, string: ", *value); - for(i=0; i< tbytes+sbytes; i++) + for (i = 0; i < tbytes + sbytes; i++) printk("0x%x ", string[i]); printk("\n"); #endif - + /* sanity check the rest of the return */ - for(i=0; i < tbytes; i++) { + for (i = 0; i < tbytes; i++) { __u8 input = 0; - cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE); - if(trailer[i] != input) { + cat_unpack(string, padbits + (i * BITS_PER_BYTE), + &input, BITS_PER_BYTE); + if (trailer[i] != input) { CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); return 1; } @@ -360,14 +361,14 @@ cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, } static int -cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) +cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) { int i; - - for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--) + + for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--) outb(data[i], CAT_DATA); - for(i = header_bytes - 1; i >= 0; i--) { + for (i = header_bytes - 1; i >= 0; i--) { __u8 header = 0; __u8 input; @@ -376,7 +377,7 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) CDEBUG(("cat_shiftout: returned 0x%x\n", input)); cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, &header, BITS_PER_BYTE); - if(input != header) { + if (input != header) { CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); return 1; } @@ -385,57 +386,57 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) } static int -cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, +cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value) { outb(VOYAGER_CAT_DRCYC, CAT_CMD); - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); return 1; } outb(VOYAGER_CAT_HEADER, CAT_DATA); outb(value, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); return 1; } - if(reg > VOYAGER_SUBADDRHI) { + if (reg > VOYAGER_SUBADDRHI) { outb(VOYAGER_CAT_RUN, CAT_CMD); outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); } - + return 0; - } - else { + } else { __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; - __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE; - __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], - hseq[VOYAGER_MAX_REG_SIZE]; + __u16 dbytes = + (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE; + __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], + hseq[VOYAGER_MAX_REG_SIZE]; int i; - if((padbits = (modp->num_asics - 1 - + asicp->ireg_length) % BITS_PER_BYTE) != 0) { + if ((padbits = (modp->num_asics - 1 + + asicp->ireg_length) % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; dbytes++; } - if(asicp->ireg_length % BITS_PER_BYTE) + if (asicp->ireg_length % BITS_PER_BYTE) hbytes++; - + cat_build_header(hseq, hbytes, 1, asicp->ireg_length); - - for(i = 0; i < dbytes + hbytes; i++) + + for (i = 0; i < dbytes + hbytes; i++) dseq[i] = 0xff; CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", dbytes, hbytes, padbits)); cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, hseq, hbytes * BITS_PER_BYTE); - cat_pack(dseq, asicp->asic_location, &value, + cat_pack(dseq, asicp->asic_location, &value, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("dseq "); - for(i=0; i<hbytes+dbytes; i++) { + for (i = 0; i < hbytes + dbytes; i++) { printk("0x%x ", dseq[i]); } printk("\n"); @@ -445,121 +446,125 @@ cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, } static int -cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 value) +cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value) { - if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) + if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) return 1; return cat_senddata(modp, asicp, reg, value); } static int -cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value) +cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value) { - if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) + if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) return 1; return cat_getdata(modp, asicp, reg, value); } static int -cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, +cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len) { __u8 val; - if(len > 1) { + if (len > 1) { /* set auto increment */ __u8 newval; - - if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { + + if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); return 1; } - CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val)); + CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", + val)); newval = val | VOYAGER_AUTO_INC; - if(newval != val) { - if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { + if (newval != val) { + if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); return 1; } } } - if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) { + if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) { CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); return 1; } - if(asicp->subaddr > VOYAGER_SUBADDR_LO) { - if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) { + if (asicp->subaddr > VOYAGER_SUBADDR_LO) { + if (cat_write + (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) { CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); return 1; } cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); - CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val)); + CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, + val)); } cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); return 0; } - + static int -cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, - __u16 len, void *buf) +cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, + __u16 len, void *buf) { int i, retval; /* FIXME: need special actions for VOYAGER_CAT_ID here */ - if(asicp->asic_id == VOYAGER_CAT_ID) { + if (asicp->asic_id == VOYAGER_CAT_ID) { CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); /* FIXME -- This is supposed to be handled better * There is a problem writing to the cat asic in the * PSI. The 30us delay seems to work, though */ udelay(30); } - - if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + + if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { printk("cat_subwrite: cat_subaddrsetup FAILED\n"); return retval; } - - if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { + + if (cat_sendinst + (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { printk("cat_subwrite: cat_sendinst FAILED\n"); return 1; } - for(i = 0; i < len; i++) { - if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) { - printk("cat_subwrite: cat_sendata element at %d FAILED\n", i); + for (i = 0; i < len; i++) { + if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) { + printk + ("cat_subwrite: cat_sendata element at %d FAILED\n", + i); return 1; } } return 0; } static int -cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, +cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len, void *buf) { int i, retval; - if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); return retval; } - if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { + if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { CDEBUG(("cat_subread: cat_sendinst failed\n")); return 1; } - for(i = 0; i < len; i++) { - if(cat_getdata(modp, asicp, 0xFF, - &((__u8 *)buf)[i])) { - CDEBUG(("cat_subread: cat_getdata element %d failed\n", i)); + for (i = 0; i < len; i++) { + if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) { + CDEBUG(("cat_subread: cat_getdata element %d failed\n", + i)); return 1; } } return 0; } - /* buffer for storing EPROM data read in during initialisation */ static __initdata __u8 eprom_buf[0xFFFF]; static voyager_module_t *voyager_initial_module; @@ -568,8 +573,7 @@ static voyager_module_t *voyager_initial_module; * boot cpu *after* all memory initialisation has been done (so we can * use kmalloc) but before smp initialisation, so we can probe the SMP * configuration and pick up necessary information. */ -void __init -voyager_cat_init(void) +void __init voyager_cat_init(void) { voyager_module_t **modpp = &voyager_initial_module; voyager_asic_t **asicpp; @@ -578,27 +582,29 @@ voyager_cat_init(void) unsigned long qic_addr = 0; __u8 qabc_data[0x20]; __u8 num_submodules, val; - voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0]; - + voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0]; + __u8 cmos[4]; unsigned long addr; - + /* initiallise the SUS mailbox */ - for(i=0; i<sizeof(cmos); i++) + for (i = 0; i < sizeof(cmos); i++) cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); addr = *(unsigned long *)cmos; - if((addr & 0xff000000) != 0xff000000) { - printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr); + if ((addr & 0xff000000) != 0xff000000) { + printk(KERN_ERR + "Voyager failed to get SUS mailbox (addr = 0x%lx\n", + addr); } else { static struct resource res; - + res.name = "voyager SUS"; res.start = addr; - res.end = addr+0x3ff; - + res.end = addr + 0x3ff; + request_resource(&iomem_resource, &res); voyager_SUS = (struct voyager_SUS *) - ioremap(addr, 0x400); + ioremap(addr, 0x400); printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", voyager_SUS->SUS_version); voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; @@ -609,8 +615,6 @@ voyager_cat_init(void) voyager_extended_vic_processors = 0; voyager_quad_processors = 0; - - printk("VOYAGER: beginning CAT bus probe\n"); /* set up the SuperSet Port Block which tells us where the * CAT communication port is */ @@ -618,14 +622,14 @@ voyager_cat_init(void) VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); /* now find out if were 8 slot or normal */ - if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) - == EIGHT_SLOT_IDENTIFIER) { + if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) + == EIGHT_SLOT_IDENTIFIER) { voyager_8slot = 1; - printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n"); + printk(KERN_NOTICE + "Voyager: Eight slot 51xx configuration detected\n"); } - for(i = VOYAGER_MIN_MODULE; - i <= VOYAGER_MAX_MODULE; i++) { + for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) { __u8 input; int asic; __u16 eprom_size; @@ -643,21 +647,21 @@ voyager_cat_init(void) outb(0xAA, CAT_DATA); input = inb(CAT_DATA); outb(VOYAGER_CAT_END, CAT_CMD); - if(input != VOYAGER_CAT_HEADER) { + if (input != VOYAGER_CAT_HEADER) { continue; } CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, cat_module_name(i))); - *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/ - if(*modpp == NULL) { + *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */ + if (*modpp == NULL) { printk("**WARNING** kmalloc failure in cat_init\n"); continue; } memset(*modpp, 0, sizeof(voyager_module_t)); /* need temporary asic for cat_subread. It will be * filled in correctly later */ - (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/ - if((*modpp)->asic == NULL) { + (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */ + if ((*modpp)->asic == NULL) { printk("**WARNING** kmalloc failure in cat_init\n"); continue; } @@ -666,47 +670,52 @@ voyager_cat_init(void) (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; (*modpp)->module_addr = i; (*modpp)->scan_path_connected = 0; - if(i == VOYAGER_PSI) { + if (i == VOYAGER_PSI) { /* Exception leg for modules with no EEPROM */ printk("Module \"%s\"\n", cat_module_name(i)); continue; } - + CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(*modpp, (*modpp)->asic); - if(cat_subread(*modpp, (*modpp)->asic, - VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), - &eprom_size)) { - printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + if (cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk + ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", + i); outb(VOYAGER_CAT_END, CAT_CMD); continue; } - if(eprom_size > sizeof(eprom_buf)) { - printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); + if (eprom_size > sizeof(eprom_buf)) { + printk + ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", + i, eprom_size); outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); - CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); - if(cat_subread(*modpp, (*modpp)->asic, 0, - eprom_size, eprom_buf)) { + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, + eprom_size)); + if (cat_subread + (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", cat_module_name(i), eprom_hdr->version_id, - *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics); + *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics); (*modpp)->ee_size = eprom_hdr->ee_size; (*modpp)->num_asics = eprom_hdr->num_asics; asicpp = &((*modpp)->asic); sp_offset = eprom_hdr->scan_path_offset; /* All we really care about are the Quad cards. We - * identify them because they are in a processor slot - * and have only four asics */ - if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) { + * identify them because they are in a processor slot + * and have only four asics */ + if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) { modpp = &((*modpp)->next); continue; } @@ -717,16 +726,17 @@ voyager_cat_init(void) &num_submodules); /* lowest two bits, active low */ num_submodules = ~(0xfc | num_submodules); - CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules)); - if(num_submodules == 0) { + CDEBUG(("VOYAGER CAT: %d submodules present\n", + num_submodules)); + if (num_submodules == 0) { /* fill in the dyadic extended processors */ __u8 cpu = i & 0x07; printk("Module \"%s\": Dyadic Processor Card\n", cat_module_name(i)); - voyager_extended_vic_processors |= (1<<cpu); + voyager_extended_vic_processors |= (1 << cpu); cpu += 4; - voyager_extended_vic_processors |= (1<<cpu); + voyager_extended_vic_processors |= (1 << cpu); outb(VOYAGER_CAT_END, CAT_CMD); continue; } @@ -740,28 +750,32 @@ voyager_cat_init(void) cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); outb(VOYAGER_CAT_END, CAT_CMD); - CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(*modpp, (*modpp)->asic); - if(cat_subread(*modpp, (*modpp)->asic, - VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), - &eprom_size)) { - printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + if (cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk + ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", + i); outb(VOYAGER_CAT_END, CAT_CMD); continue; } - if(eprom_size > sizeof(eprom_buf)) { - printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); + if (eprom_size > sizeof(eprom_buf)) { + printk + ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", + i, eprom_size); outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); - CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); - if(cat_subread(*modpp, (*modpp)->asic, 0, - eprom_size, eprom_buf)) { + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, + eprom_size)); + if (cat_subread + (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { outb(VOYAGER_CAT_END, CAT_CMD); continue; } @@ -773,30 +787,35 @@ voyager_cat_init(void) sp_offset = eprom_hdr->scan_path_offset; /* get rid of the dummy CAT asic and read the real one */ kfree((*modpp)->asic); - for(asic=0; asic < (*modpp)->num_asics; asic++) { + for (asic = 0; asic < (*modpp)->num_asics; asic++) { int j; - voyager_asic_t *asicp = *asicpp - = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */ voyager_sp_table_t *sp_table; voyager_at_t *asic_table; voyager_jtt_t *jtag_table; - if(asicp == NULL) { - printk("**WARNING** kmalloc failure in cat_init\n"); + if (asicp == NULL) { + printk + ("**WARNING** kmalloc failure in cat_init\n"); continue; } asicpp = &(asicp->next); asicp->asic_location = asic; - sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); + sp_table = + (voyager_sp_table_t *) (eprom_buf + sp_offset); asicp->asic_id = sp_table->asic_id; - asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset); - for(j=0; j<4; j++) + asic_table = + (voyager_at_t *) (eprom_buf + + sp_table->asic_data_offset); + for (j = 0; j < 4; j++) asicp->jtag_id[j] = asic_table->jtag_id[j]; - jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset); + jtag_table = + (voyager_jtt_t *) (eprom_buf + + asic_table->jtag_offset); asicp->ireg_length = jtag_table->ireg_len; asicp->bit_location = (*modpp)->inst_bits; (*modpp)->inst_bits += asicp->ireg_length; - if(asicp->ireg_length > (*modpp)->largest_reg) + if (asicp->ireg_length > (*modpp)->largest_reg) (*modpp)->largest_reg = asicp->ireg_length; if (asicp->ireg_length < (*modpp)->smallest_reg || (*modpp)->smallest_reg == 0) @@ -804,15 +823,13 @@ voyager_cat_init(void) CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", asicp->asic_id, asicp->ireg_length, asicp->bit_location)); - if(asicp->asic_id == VOYAGER_QUAD_QABC) { + if (asicp->asic_id == VOYAGER_QUAD_QABC) { CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); qabc_asic = asicp; } sp_offset += sizeof(voyager_sp_table_t); } - CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", - (*modpp)->inst_bits, (*modpp)->largest_reg, - (*modpp)->smallest_reg)); + CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg)); /* OK, now we have the QUAD ASICs set up, use them. * we need to: * @@ -828,10 +845,11 @@ voyager_cat_init(void) qic_addr = qabc_data[5] << 8; qic_addr = (qic_addr | qabc_data[6]) << 8; qic_addr = (qic_addr | qabc_data[7]) << 8; - printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", - cat_module_name(i), qic_addr, qabc_data[8]); + printk + ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", + cat_module_name(i), qic_addr, qabc_data[8]); #if 0 /* plumbing fails---FIXME */ - if((qabc_data[8] & 0xf0) == 0) { + if ((qabc_data[8] & 0xf0) == 0) { /* FIXME: 32 way 8 CPU slot monster cannot be * plumbed this way---need to check for it */ @@ -842,94 +860,97 @@ voyager_cat_init(void) #ifdef VOYAGER_CAT_DEBUG /* verify plumbing */ cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); - if((qabc_data[8] & 0xf0) == 0) { - CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8])); + if ((qabc_data[8] & 0xf0) == 0) { + CDEBUG(("PLUMBING FAILED: 0x%x\n", + qabc_data[8])); } #endif } #endif { - struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); + struct resource *res = + kzalloc(sizeof(struct resource), GFP_KERNEL); res->name = kmalloc(128, GFP_KERNEL); - sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); + sprintf((char *)res->name, "Voyager %s Quad CPI", + cat_module_name(i)); res->start = qic_addr; res->end = qic_addr + 0x3ff; request_resource(&iomem_resource, res); } qic_addr = (unsigned long)ioremap(qic_addr, 0x400); - - for(j = 0; j < 4; j++) { + + for (j = 0; j < 4; j++) { __u8 cpu; - if(voyager_8slot) { + if (voyager_8slot) { /* 8 slot has a different mapping, * each slot has only one vic line, so * 1 cpu in each slot must be < 8 */ - cpu = (i & 0x07) + j*8; + cpu = (i & 0x07) + j * 8; } else { - cpu = (i & 0x03) + j*4; + cpu = (i & 0x03) + j * 4; } - if( (qabc_data[8] & (1<<j))) { - voyager_extended_vic_processors |= (1<<cpu); + if ((qabc_data[8] & (1 << j))) { + voyager_extended_vic_processors |= (1 << cpu); } - if(qabc_data[8] & (1<<(j+4)) ) { + if (qabc_data[8] & (1 << (j + 4))) { /* Second SET register plumbed: Quad * card has two VIC connected CPUs. * Secondary cannot be booted as a VIC * CPU */ - voyager_extended_vic_processors |= (1<<cpu); - voyager_allowed_boot_processors &= (~(1<<cpu)); + voyager_extended_vic_processors |= (1 << cpu); + voyager_allowed_boot_processors &= + (~(1 << cpu)); } - voyager_quad_processors |= (1<<cpu); + voyager_quad_processors |= (1 << cpu); voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) - (qic_addr+(j<<8)); + (qic_addr + (j << 8)); CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, (unsigned long)voyager_quad_cpi_addr[cpu])); } outb(VOYAGER_CAT_END, CAT_CMD); - - *asicpp = NULL; modpp = &((*modpp)->next); } *modpp = NULL; - printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors); + printk + ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", + voyager_extended_vic_processors, voyager_quad_processors, + voyager_allowed_boot_processors); request_resource(&ioport_resource, &vic_res); - if(voyager_quad_processors) + if (voyager_quad_processors) request_resource(&ioport_resource, &qic_res); /* set up the front power switch */ } -int -voyager_cat_readb(__u8 module, __u8 asic, int reg) +int voyager_cat_readb(__u8 module, __u8 asic, int reg) { return 0; } -static int -cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) +static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp) { __u8 val; int err = 0; - if(!modp->scan_path_connected) + if (!modp->scan_path_connected) return 0; - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("cat_disconnect: ASIC is not CAT\n")); return 1; } err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); - if(err) { + if (err) { CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); return err; } val &= VOYAGER_DISCONNECT_ASIC; err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); - if(err) { + if (err) { CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); return err; } @@ -940,27 +961,26 @@ cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) return 0; } -static int -cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) +static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp) { __u8 val; int err = 0; - if(modp->scan_path_connected) + if (modp->scan_path_connected) return 0; - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("cat_connect: ASIC is not CAT\n")); return 1; } err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); - if(err) { + if (err) { CDEBUG(("cat_connect: failed to read SCANPATH\n")); return err; } val |= VOYAGER_CONNECT_ASIC; err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); - if(err) { + if (err) { CDEBUG(("cat_connect: failed to write SCANPATH\n")); return err; } @@ -971,11 +991,10 @@ cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) return 0; } -void -voyager_cat_power_off(void) +void voyager_cat_power_off(void) { /* Power the machine off by writing to the PSI over the CAT - * bus */ + * bus */ __u8 data; voyager_module_t psi = { 0 }; voyager_asic_t psi_asic = { 0 }; @@ -1009,8 +1028,7 @@ voyager_cat_power_off(void) struct voyager_status voyager_status = { 0 }; -void -voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) +void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data) { voyager_module_t psi = { 0 }; voyager_asic_t psi_asic = { 0 }; @@ -1027,7 +1045,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(&psi, &psi_asic); - switch(cmd) { + switch (cmd) { case VOYAGER_PSI_READ: cat_read(&psi, &psi_asic, reg, data); break; @@ -1047,8 +1065,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) outb(VOYAGER_CAT_END, CAT_CMD); } -void -voyager_cat_do_common_interrupt(void) +void voyager_cat_do_common_interrupt(void) { /* This is caused either by a memory parity error or something * in the PSI */ @@ -1057,7 +1074,7 @@ voyager_cat_do_common_interrupt(void) voyager_asic_t psi_asic = { 0 }; struct voyager_psi psi_reg; int i; - re_read: + re_read: psi.asic = &psi_asic; psi.asic->asic_id = VOYAGER_CAT_ID; psi.asic->subaddr = VOYAGER_SUBADDR_HI; @@ -1072,43 +1089,45 @@ voyager_cat_do_common_interrupt(void) cat_disconnect(&psi, &psi_asic); /* Read the status. NOTE: Need to read *all* the PSI regs here * otherwise the cmn int will be reasserted */ - for(i = 0; i < sizeof(psi_reg.regs); i++) { - cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]); + for (i = 0; i < sizeof(psi_reg.regs); i++) { + cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]); } outb(VOYAGER_CAT_END, CAT_CMD); - if((psi_reg.regs.checkbit & 0x02) == 0) { + if ((psi_reg.regs.checkbit & 0x02) == 0) { psi_reg.regs.checkbit |= 0x02; cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); printk("VOYAGER RE-READ PSI\n"); goto re_read; } outb(VOYAGER_CAT_RUN, CAT_CMD); - for(i = 0; i < sizeof(psi_reg.subregs); i++) { + for (i = 0; i < sizeof(psi_reg.subregs); i++) { /* This looks strange, but the PSI doesn't do auto increment * correctly */ - cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, - 1, &((__u8 *)&psi_reg.subregs)[i]); + cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, + 1, &((__u8 *) & psi_reg.subregs)[i]); } outb(VOYAGER_CAT_END, CAT_CMD); #ifdef VOYAGER_CAT_DEBUG printk("VOYAGER PSI: "); - for(i=0; i<sizeof(psi_reg.regs); i++) - printk("%02x ", ((__u8 *)&psi_reg.regs)[i]); + for (i = 0; i < sizeof(psi_reg.regs); i++) + printk("%02x ", ((__u8 *) & psi_reg.regs)[i]); printk("\n "); - for(i=0; i<sizeof(psi_reg.subregs); i++) - printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]); + for (i = 0; i < sizeof(psi_reg.subregs); i++) + printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]); printk("\n"); #endif - if(psi_reg.regs.intstatus & PSI_MON) { + if (psi_reg.regs.intstatus & PSI_MON) { /* switch off or power fail */ - if(psi_reg.subregs.supply & PSI_SWITCH_OFF) { - if(voyager_status.switch_off) { - printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n"); + if (psi_reg.subregs.supply & PSI_SWITCH_OFF) { + if (voyager_status.switch_off) { + printk(KERN_ERR + "Voyager front panel switch turned off again---Immediate power off!\n"); voyager_cat_power_off(); /* not reached */ } else { - printk(KERN_ERR "Voyager front panel switch turned off\n"); + printk(KERN_ERR + "Voyager front panel switch turned off\n"); voyager_status.switch_off = 1; voyager_status.request_from_kernel = 1; wake_up_process(voyager_thread); @@ -1127,7 +1146,7 @@ voyager_cat_do_common_interrupt(void) VDEBUG(("Voyager ac fail reg 0x%x\n", psi_reg.subregs.ACfail)); - if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { + if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { /* No further update */ return; } @@ -1135,20 +1154,20 @@ voyager_cat_do_common_interrupt(void) /* Don't bother trying to find out who failed. * FIXME: This probably makes the code incorrect on * anything other than a 345x */ - for(i=0; i< 5; i++) { - if( psi_reg.subregs.ACfail &(1<<i)) { + for (i = 0; i < 5; i++) { + if (psi_reg.subregs.ACfail & (1 << i)) { break; } } printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); #endif /* DON'T do this: it shuts down the AC PSI - outb(VOYAGER_CAT_RUN, CAT_CMD); - data = PSI_MASK_MASK | i; - cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, - 1, &data); - outb(VOYAGER_CAT_END, CAT_CMD); - */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + data = PSI_MASK_MASK | i; + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, + 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + */ printk(KERN_ERR "Voyager AC power failure\n"); outb(VOYAGER_CAT_RUN, CAT_CMD); data = PSI_COLD_START; @@ -1159,16 +1178,16 @@ voyager_cat_do_common_interrupt(void) voyager_status.request_from_kernel = 1; wake_up_process(voyager_thread); } - - - } else if(psi_reg.regs.intstatus & PSI_FAULT) { + + } else if (psi_reg.regs.intstatus & PSI_FAULT) { /* Major fault! */ - printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n"); + printk(KERN_ERR + "Voyager PSI Detected major fault, immediate power off!\n"); voyager_cat_power_off(); /* not reached */ - } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM - | PSI_CURRENT | PSI_DVM - | PSI_PSCFAULT | PSI_STAT_CHG)) { + } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM + | PSI_CURRENT | PSI_DVM + | PSI_PSCFAULT | PSI_STAT_CHG)) { /* other psi fault */ printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 88124dd3540..dffa786f61f 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -32,7 +32,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 }; /* CPU IRQ affinity -- set to all ones initially */ -static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL }; +static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = + {[0 ... NR_CPUS-1] = ~0UL }; /* per CPU data structure (for /proc/cpuinfo et al), visible externally * indexed physically */ @@ -76,7 +77,6 @@ EXPORT_SYMBOL(cpu_online_map); * by scheduler but indexed physically */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; - /* The internal functions */ static void send_CPI(__u32 cpuset, __u8 cpi); static void ack_CPI(__u8 cpi); @@ -101,94 +101,86 @@ int hard_smp_processor_id(void); int safe_smp_processor_id(void); /* Inline functions */ -static inline void -send_one_QIC_CPI(__u8 cpu, __u8 cpi) +static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi) { voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = - (smp_processor_id() << 16) + cpi; + (smp_processor_id() << 16) + cpi; } -static inline void -send_QIC_CPI(__u32 cpuset, __u8 cpi) +static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi) { int cpu; for_each_online_cpu(cpu) { - if(cpuset & (1<<cpu)) { + if (cpuset & (1 << cpu)) { #ifdef VOYAGER_DEBUG - if(!cpu_isset(cpu, cpu_online_map)) - VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); + if (!cpu_isset(cpu, cpu_online_map)) + VDEBUG(("CPU%d sending cpi %d to CPU%d not in " + "cpu_online_map\n", + hard_smp_processor_id(), cpi, cpu)); #endif send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); } } } -static inline void -wrapper_smp_local_timer_interrupt(void) +static inline void wrapper_smp_local_timer_interrupt(void) { irq_enter(); smp_local_timer_interrupt(); irq_exit(); } -static inline void -send_one_CPI(__u8 cpu, __u8 cpi) +static inline void send_one_CPI(__u8 cpu, __u8 cpi) { - if(voyager_quad_processors & (1<<cpu)) + if (voyager_quad_processors & (1 << cpu)) send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); else - send_CPI(1<<cpu, cpi); + send_CPI(1 << cpu, cpi); } -static inline void -send_CPI_allbutself(__u8 cpi) +static inline void send_CPI_allbutself(__u8 cpi) { __u8 cpu = smp_processor_id(); __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); send_CPI(mask, cpi); } -static inline int -is_cpu_quad(void) +static inline int is_cpu_quad(void) { __u8 cpumask = inb(VIC_PROC_WHO_AM_I); return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); } -static inline int -is_cpu_extended(void) +static inline int is_cpu_extended(void) { __u8 cpu = hard_smp_processor_id(); - return(voyager_extended_vic_processors & (1<<cpu)); + return (voyager_extended_vic_processors & (1 << cpu)); } -static inline int -is_cpu_vic_boot(void) +static inline int is_cpu_vic_boot(void) { __u8 cpu = hard_smp_processor_id(); - return(voyager_extended_vic_processors - & voyager_allowed_boot_processors & (1<<cpu)); + return (voyager_extended_vic_processors + & voyager_allowed_boot_processors & (1 << cpu)); } - -static inline void -ack_CPI(__u8 cpi) +static inline void ack_CPI(__u8 cpi) { - switch(cpi) { + switch (cpi) { case VIC_CPU_BOOT_CPI: - if(is_cpu_quad() && !is_cpu_vic_boot()) + if (is_cpu_quad() && !is_cpu_vic_boot()) ack_QIC_CPI(cpi); else ack_VIC_CPI(cpi); break; case VIC_SYS_INT: - case VIC_CMN_INT: + case VIC_CMN_INT: /* These are slightly strange. Even on the Quad card, * They are vectored as VIC CPIs */ - if(is_cpu_quad()) + if (is_cpu_quad()) ack_special_QIC_CPI(cpi); else ack_VIC_CPI(cpi); @@ -205,11 +197,11 @@ ack_CPI(__u8 cpi) * 8259 IRQs except that masks and things must be kept per processor */ static struct irq_chip vic_chip = { - .name = "VIC", - .startup = startup_vic_irq, - .mask = mask_vic_irq, - .unmask = unmask_vic_irq, - .set_affinity = set_vic_irq_affinity, + .name = "VIC", + .startup = startup_vic_irq, + .mask = mask_vic_irq, + .unmask = unmask_vic_irq, + .set_affinity = set_vic_irq_affinity, }; /* used to count up as CPUs are brought on line (starts at 0) */ @@ -223,7 +215,7 @@ static __u32 trampoline_base; /* The per cpu profile stuff - used in smp_local_timer_interrupt */ static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; /* the map used to check if a CPU has booted */ static __u32 cpu_booted_map; @@ -235,7 +227,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* This is for the new dynamic CPU boot code */ cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_possible_map); @@ -246,9 +237,9 @@ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; /* Lock for enable/disable of VIC interrupts */ -static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); +static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); -/* The boot processor is correctly set up in PC mode when it +/* The boot processor is correctly set up in PC mode when it * comes up, but the secondaries need their master/slave 8259 * pairs initializing correctly */ @@ -262,8 +253,7 @@ static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 }; static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; /* debugging routine to read the isr of the cpu's pic */ -static inline __u16 -vic_read_isr(void) +static inline __u16 vic_read_isr(void) { __u16 isr; @@ -275,17 +265,16 @@ vic_read_isr(void) return isr; } -static __init void -qic_setup(void) +static __init void qic_setup(void) { - if(!is_cpu_quad()) { + if (!is_cpu_quad()) { /* not a quad, no setup */ return; } outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); - - if(is_cpu_extended()) { + + if (is_cpu_extended()) { /* the QIC duplicate of the VIC base register */ outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); @@ -295,8 +284,7 @@ qic_setup(void) } } -static __init void -vic_setup_pic(void) +static __init void vic_setup_pic(void) { outb(1, VIC_REDIRECT_REGISTER_1); /* clear the claim registers for dynamic routing */ @@ -333,7 +321,7 @@ vic_setup_pic(void) /* ICW2: slave vector base */ outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); - + /* ICW3: slave ID */ outb(0x02, 0xA1); @@ -341,19 +329,18 @@ vic_setup_pic(void) outb(0x01, 0xA1); } -static void -do_quad_bootstrap(void) +static void do_quad_bootstrap(void) { - if(is_cpu_quad() && is_cpu_vic_boot()) { + if (is_cpu_quad() && is_cpu_vic_boot()) { int i; unsigned long flags; __u8 cpuid = hard_smp_processor_id(); local_irq_save(flags); - for(i = 0; i<4; i++) { + for (i = 0; i < 4; i++) { /* FIXME: this would be >>3 &0x7 on the 32 way */ - if(((cpuid >> 2) & 0x03) == i) + if (((cpuid >> 2) & 0x03) == i) /* don't lower our own mask! */ continue; @@ -368,12 +355,10 @@ do_quad_bootstrap(void) } } - /* Set up all the basic stuff: read the SMP config and make all the * SMP information reflect only the boot cpu. All others will be * brought on-line later. */ -void __init -find_smp_config(void) +void __init find_smp_config(void) { int i; @@ -382,24 +367,31 @@ find_smp_config(void) printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); /* initialize the CPU structures (moved from smp_boot_cpus) */ - for(i=0; i<NR_CPUS; i++) { + for (i = 0; i < NR_CPUS; i++) { cpu_irq_affinity[i] = ~0; } cpu_online_map = cpumask_of_cpu(boot_cpu_id); /* The boot CPU must be extended */ - voyager_extended_vic_processors = 1<<boot_cpu_id; + voyager_extended_vic_processors = 1 << boot_cpu_id; /* initially, all of the first 8 CPUs can boot */ voyager_allowed_boot_processors = 0xff; /* set up everything for just this CPU, we can alter * this as we start the other CPUs later */ /* now get the CPU disposition from the extended CMOS */ - cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16; - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24; + cpus_addr(phys_cpu_present_map)[0] = + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + + 2) << 16; + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + + 3) << 24; cpu_possible_map = phys_cpu_present_map; - printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]); + printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", + cpus_addr(phys_cpu_present_map)[0]); /* Here we set up the VIC to enable SMP */ /* enable the CPIs by writing the base vector to their register */ outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); @@ -427,8 +419,7 @@ find_smp_config(void) /* * The bootstrap kernel entry code has set these up. Save them * for a given CPU, id is physical */ -void __init -smp_store_cpu_info(int id) +void __init smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); @@ -438,21 +429,19 @@ smp_store_cpu_info(int id) } /* set up the trampoline and return the physical address of the code */ -static __u32 __init -setup_trampoline(void) +static __u32 __init setup_trampoline(void) { /* these two are global symbols in trampoline.S */ extern const __u8 trampoline_end[]; extern const __u8 trampoline_data[]; - memcpy((__u8 *)trampoline_base, trampoline_data, + memcpy((__u8 *) trampoline_base, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys((__u8 *)trampoline_base); + return virt_to_phys((__u8 *) trampoline_base); } /* Routine initially called when a non-boot CPU is brought online */ -static void __init -start_secondary(void *unused) +static void __init start_secondary(void *unused) { __u8 cpuid = hard_smp_processor_id(); /* external functions not defined in the headers */ @@ -464,17 +453,18 @@ start_secondary(void *unused) ack_CPI(VIC_CPU_BOOT_CPI); /* setup the 8259 master slave pair belonging to this CPU --- - * we won't actually receive any until the boot CPU - * relinquishes it's static routing mask */ + * we won't actually receive any until the boot CPU + * relinquishes it's static routing mask */ vic_setup_pic(); qic_setup(); - if(is_cpu_quad() && !is_cpu_vic_boot()) { + if (is_cpu_quad() && !is_cpu_vic_boot()) { /* clear the boot CPI */ __u8 dummy; - dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; + dummy = + voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; printk("read dummy %d\n", dummy); } @@ -516,7 +506,6 @@ start_secondary(void *unused) cpu_idle(); } - /* Routine to kick start the given CPU and wait for it to report ready * (or timeout in startup). When this routine returns, the requested * CPU is either fully running and configured or known to be dead. @@ -524,29 +513,28 @@ start_secondary(void *unused) * We call this routine sequentially 1 CPU at a time, so no need for * locking */ -static void __init -do_boot_cpu(__u8 cpu) +static void __init do_boot_cpu(__u8 cpu) { struct task_struct *idle; int timeout; unsigned long flags; - int quad_boot = (1<<cpu) & voyager_quad_processors - & ~( voyager_extended_vic_processors - & voyager_allowed_boot_processors); + int quad_boot = (1 << cpu) & voyager_quad_processors + & ~(voyager_extended_vic_processors + & voyager_allowed_boot_processors); /* This is an area in head.S which was used to set up the * initial kernel stack. We need to alter this to give the * booting CPU a new stack (taken from its idle process) */ extern struct { - __u8 *esp; + __u8 *sp; unsigned short ss; } stack_start; /* This is the format of the CPI IDT gate (in real mode) which * we're hijacking to boot the CPU */ - union IDTFormat { + union IDTFormat { struct seg { - __u16 Offset; - __u16 Segment; + __u16 Offset; + __u16 Segment; } idt; __u32 val; } hijack_source; @@ -565,37 +553,44 @@ do_boot_cpu(__u8 cpu) alternatives_smp_switch(1); idle = fork_idle(cpu); - if(IS_ERR(idle)) + if (IS_ERR(idle)) panic("failed fork for CPU%d", cpu); - idle->thread.eip = (unsigned long) start_secondary; + idle->thread.ip = (unsigned long)start_secondary; /* init_tasks (in sched.c) is indexed logically */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *)idle->thread.sp; init_gdt(cpu); - per_cpu(current_task, cpu) = idle; + per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ - VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, + VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, (unsigned long)hijack_source.val, hijack_source.idt.Segment, - hijack_source.idt.Offset, stack_start.esp)); + hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); flush_tlb_all(); - if(quad_boot) { + if (quad_boot) { printk("CPU %d: non extended Quad boot\n", cpu); - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; } else { printk("CPU%d: extended VIC boot\n", cpu); - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; /* VIC errata, may also receive interrupt at this address */ - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + + VIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; } /* All non-boot CPUs start with interrupts fully masked. Need @@ -603,73 +598,76 @@ do_boot_cpu(__u8 cpu) * this in the VIC by masquerading as the processor we're * about to boot and lowering its interrupt mask */ local_irq_save(flags); - if(quad_boot) { + if (quad_boot) { send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); } else { outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); /* here we're altering registers belonging to `cpu' */ - + outb(VIC_BOOT_INTERRUPT_MASK, 0x21); /* now go back to our original identity */ outb(boot_cpu_id, VIC_PROCESSOR_ID); /* and boot the CPU */ - send_CPI((1<<cpu), VIC_CPU_BOOT_CPI); + send_CPI((1 << cpu), VIC_CPU_BOOT_CPI); } cpu_booted_map = 0; local_irq_restore(flags); /* now wait for it to become ready (or timeout) */ - for(timeout = 0; timeout < 50000; timeout++) { - if(cpu_booted_map) + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_booted_map) break; udelay(100); } /* reset the page table */ zap_low_mappings(); - + if (cpu_booted_map) { VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", cpu, smp_processor_id())); - + printk("CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); wmb(); cpu_set(cpu, cpu_callout_map); cpu_set(cpu, cpu_present_map); - } - else { + } else { printk("CPU%d FAILED TO BOOT: ", cpu); - if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5) + if (* + ((volatile unsigned char *)phys_to_virt(start_phys_address)) + == 0xA5) printk("Stuck.\n"); else printk("Not responding.\n"); - + cpucount--; } } -void __init -smp_boot_cpus(void) +void __init smp_boot_cpus(void) { int i; /* CAT BUS initialisation must be done after the memory */ /* FIXME: The L4 has a catbus too, it just needs to be * accessed in a totally different way */ - if(voyager_level == 5) { + if (voyager_level == 5) { voyager_cat_init(); /* now that the cat has probed the Voyager System Bus, sanity * check the cpu map */ - if( ((voyager_quad_processors | voyager_extended_vic_processors) - & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) { + if (((voyager_quad_processors | voyager_extended_vic_processors) + & cpus_addr(phys_cpu_present_map)[0]) != + cpus_addr(phys_cpu_present_map)[0]) { /* should panic */ - printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n"); + printk("\n\n***WARNING*** " + "Sanity check of CPU present map FAILED\n"); } - } else if(voyager_level == 4) - voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0]; + } else if (voyager_level == 4) + voyager_extended_vic_processors = + cpus_addr(phys_cpu_present_map)[0]; /* this sets up the idle task to run on the current cpu */ voyager_extended_cpus = 1; @@ -678,14 +676,14 @@ smp_boot_cpus(void) //global_irq_holder = boot_cpu_id; /* FIXME: Need to do something about this but currently only works - * on CPUs with a tsc which none of mine have. - smp_tune_scheduling(); + * on CPUs with a tsc which none of mine have. + smp_tune_scheduling(); */ smp_store_cpu_info(boot_cpu_id); printk("CPU%d: ", boot_cpu_id); print_cpu_info(&cpu_data(boot_cpu_id)); - if(is_cpu_quad()) { + if (is_cpu_quad()) { /* booting on a Quad CPU */ printk("VOYAGER SMP: Boot CPU is Quad\n"); qic_setup(); @@ -697,11 +695,11 @@ smp_boot_cpus(void) cpu_set(boot_cpu_id, cpu_online_map); cpu_set(boot_cpu_id, cpu_callout_map); - - /* loop over all the extended VIC CPUs and boot them. The + + /* loop over all the extended VIC CPUs and boot them. The * Quad CPUs must be bootstrapped by their extended VIC cpu */ - for(i = 0; i < NR_CPUS; i++) { - if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) + for (i = 0; i < NR_CPUS; i++) { + if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) continue; do_boot_cpu(i); /* This udelay seems to be needed for the Quad boots @@ -715,25 +713,26 @@ smp_boot_cpus(void) for (i = 0; i < NR_CPUS; i++) if (cpu_isset(i, cpu_online_map)) bogosum += cpu_data(i).loops_per_jiffy; - printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + printk(KERN_INFO "Total of %d processors activated " + "(%lu.%02lu BogoMIPS).\n", + cpucount + 1, bogosum / (500000 / HZ), + (bogosum / (5000 / HZ)) % 100); } voyager_extended_cpus = hweight32(voyager_extended_vic_processors); - printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus); + printk("VOYAGER: Extended (interrupt handling CPUs): " + "%d, non-extended: %d\n", voyager_extended_cpus, + num_booting_cpus() - voyager_extended_cpus); /* that's it, switch to symmetric mode */ outb(0, VIC_PRIORITY_REGISTER); outb(0, VIC_CLAIM_REGISTER_0); outb(0, VIC_CLAIM_REGISTER_1); - + VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); } /* Reload the secondary CPUs task structure (this function does not * return ) */ -void __init -initialize_secondary(void) +void __init initialize_secondary(void) { #if 0 // AC kernels only @@ -745,11 +744,9 @@ initialize_secondary(void) * basically just the stack pointer and the eip. */ - asm volatile( - "movl %0,%%esp\n\t" - "jmp *%1" - : - :"r" (current->thread.esp),"r" (current->thread.eip)); + asm volatile ("movl %0,%%esp\n\t" + "jmp *%1"::"r" (current->thread.sp), + "r"(current->thread.ip)); } /* handle a Voyager SYS_INT -- If we don't, the base board will @@ -758,25 +755,23 @@ initialize_secondary(void) * System interrupts occur because some problem was detected on the * various busses. To find out what you have to probe all the * hardware via the CAT bus. FIXME: At the moment we do nothing. */ -fastcall void -smp_vic_sys_interrupt(struct pt_regs *regs) +void smp_vic_sys_interrupt(struct pt_regs *regs) { ack_CPI(VIC_SYS_INT); - printk("Voyager SYSTEM INTERRUPT\n"); + printk("Voyager SYSTEM INTERRUPT\n"); } /* Handle a voyager CMN_INT; These interrupts occur either because of * a system status change or because a single bit memory error * occurred. FIXME: At the moment, ignore all this. */ -fastcall void -smp_vic_cmn_interrupt(struct pt_regs *regs) +void smp_vic_cmn_interrupt(struct pt_regs *regs) { static __u8 in_cmn_int = 0; static DEFINE_SPINLOCK(cmn_int_lock); /* common ints are broadcast, so make sure we only do this once */ _raw_spin_lock(&cmn_int_lock); - if(in_cmn_int) + if (in_cmn_int) goto unlock_end; in_cmn_int++; @@ -784,12 +779,12 @@ smp_vic_cmn_interrupt(struct pt_regs *regs) VDEBUG(("Voyager COMMON INTERRUPT\n")); - if(voyager_level == 5) + if (voyager_level == 5) voyager_cat_do_common_interrupt(); _raw_spin_lock(&cmn_int_lock); in_cmn_int = 0; - unlock_end: + unlock_end: _raw_spin_unlock(&cmn_int_lock); ack_CPI(VIC_CMN_INT); } @@ -797,26 +792,23 @@ smp_vic_cmn_interrupt(struct pt_regs *regs) /* * Reschedule call back. Nothing to do, all the work is done * automatically when we return from the interrupt. */ -static void -smp_reschedule_interrupt(void) +static void smp_reschedule_interrupt(void) { /* do nothing */ } -static struct mm_struct * flush_mm; +static struct mm_struct *flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void -leave_mm (unsigned long cpu) +static inline void voyager_leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); @@ -824,12 +816,10 @@ leave_mm (unsigned long cpu) load_cr3(swapper_pg_dir); } - /* * Invalidate call-back */ -static void -smp_invalidate_interrupt(void) +static void smp_invalidate_interrupt(void) { __u8 cpu = smp_processor_id(); @@ -837,18 +827,18 @@ smp_invalidate_interrupt(void) return; /* This will flood messages. Don't uncomment unless you see * Problems with cross cpu invalidation - VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", - smp_processor_id())); - */ + VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", + smp_processor_id())); + */ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); } else - leave_mm(cpu); + voyager_leave_mm(cpu); } smp_mb__before_clear_bit(); clear_bit(cpu, &smp_invalidate_needed); @@ -857,11 +847,10 @@ smp_invalidate_interrupt(void) /* All the new flush operations for 2.4 */ - /* This routine is called with a physical cpu mask */ static void -voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, - unsigned long va) +voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm, + unsigned long va) { int stuck = 50000; @@ -875,7 +864,7 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, BUG(); spin_lock(&tlbstate_lock); - + flush_mm = mm; flush_va = va; atomic_set_mask(cpumask, &smp_invalidate_needed); @@ -887,23 +876,23 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, while (smp_invalidate_needed) { mb(); - if(--stuck == 0) { - printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id()); + if (--stuck == 0) { + printk("***WARNING*** Stuck doing invalidate CPI " + "(CPU%d)\n", smp_processor_id()); break; } } /* Uncomment only to debug invalidation problems - VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); - */ + VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); + */ flush_mm = NULL; flush_va = 0; spin_unlock(&tlbstate_lock); } -void -flush_tlb_current_task(void) +void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; unsigned long cpu_mask; @@ -913,14 +902,12 @@ flush_tlb_current_task(void) cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); local_flush_tlb(); if (cpu_mask) - voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } - -void -flush_tlb_mm (struct mm_struct * mm) +void flush_tlb_mm(struct mm_struct *mm) { unsigned long cpu_mask; @@ -932,15 +919,15 @@ flush_tlb_mm (struct mm_struct * mm) if (current->mm) local_flush_tlb(); else - leave_mm(smp_processor_id()); + voyager_leave_mm(smp_processor_id()); } if (cpu_mask) - voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; unsigned long cpu_mask; @@ -949,10 +936,10 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); if (current->active_mm == mm) { - if(current->mm) + if (current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + voyager_leave_mm(smp_processor_id()); } if (cpu_mask) @@ -960,21 +947,21 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } + EXPORT_SYMBOL(flush_tlb_page); /* enable the requested IRQs */ -static void -smp_enable_irq_interrupt(void) +static void smp_enable_irq_interrupt(void) { __u8 irq; __u8 cpu = get_cpu(); VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, - vic_irq_enable_mask[cpu])); + vic_irq_enable_mask[cpu])); spin_lock(&vic_irq_lock); - for(irq = 0; irq < 16; irq++) { - if(vic_irq_enable_mask[cpu] & (1<<irq)) + for (irq = 0; irq < 16; irq++) { + if (vic_irq_enable_mask[cpu] & (1 << irq)) enable_local_vic_irq(irq); } vic_irq_enable_mask[cpu] = 0; @@ -982,17 +969,16 @@ smp_enable_irq_interrupt(void) put_cpu_no_resched(); } - + /* * CPU halt call-back */ -static void -smp_stop_cpu_function(void *dummy) +static void smp_stop_cpu_function(void *dummy) { VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); - for(;;) + for (;;) halt(); } @@ -1006,14 +992,13 @@ struct call_data_struct { int wait; }; -static struct call_data_struct * call_data; +static struct call_data_struct *call_data; /* execute a thread on a new CPU. The function to be called must be * previously set up. This is used to schedule a function for * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ -static void -smp_call_function_interrupt(void) +static void smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -1027,16 +1012,17 @@ smp_call_function_interrupt(void) * about to execute the function */ mb(); - if(!test_and_clear_bit(cpu, &call_data->started)) { + if (!test_and_clear_bit(cpu, &call_data->started)) { /* If the bit wasn't set, this could be a replay */ - printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion with no call pending\n", cpu); + printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion" + " with no call pending\n", cpu); return; } /* * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func) (info); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); if (wait) { @@ -1046,14 +1032,13 @@ smp_call_function_interrupt(void) } static int -voyager_smp_call_function_mask (cpumask_t cpumask, - void (*func) (void *info), void *info, - int wait) +voyager_smp_call_function_mask(cpumask_t cpumask, + void (*func) (void *info), void *info, int wait) { struct call_data_struct data; u32 mask = cpus_addr(cpumask)[0]; - mask &= ~(1<<smp_processor_id()); + mask &= ~(1 << smp_processor_id()); if (!mask) return 0; @@ -1093,7 +1078,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask, * so we use the system clock to interrupt one processor, which in * turn, broadcasts a timer CPI to all the others --- we receive that * CPI here. We don't use this actually for counting so losing - * ticks doesn't matter + * ticks doesn't matter * * FIXME: For those CPUs which actually have a local APIC, we could * try to use it to trigger this interrupt instead of having to @@ -1101,8 +1086,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask, * no local APIC, so I can't do this * * This function is currently a placeholder and is unused in the code */ -fastcall void -smp_apic_timer_interrupt(struct pt_regs *regs) +void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); wrapper_smp_local_timer_interrupt(); @@ -1110,8 +1094,7 @@ smp_apic_timer_interrupt(struct pt_regs *regs) } /* All of the QUAD interrupt GATES */ -fastcall void -smp_qic_timer_interrupt(struct pt_regs *regs) +void smp_qic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); ack_QIC_CPI(QIC_TIMER_CPI); @@ -1119,127 +1102,112 @@ smp_qic_timer_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } -fastcall void -smp_qic_invalidate_interrupt(struct pt_regs *regs) +void smp_qic_invalidate_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_INVALIDATE_CPI); smp_invalidate_interrupt(); } -fastcall void -smp_qic_reschedule_interrupt(struct pt_regs *regs) +void smp_qic_reschedule_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_RESCHEDULE_CPI); smp_reschedule_interrupt(); } -fastcall void -smp_qic_enable_irq_interrupt(struct pt_regs *regs) +void smp_qic_enable_irq_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); smp_enable_irq_interrupt(); } -fastcall void -smp_qic_call_function_interrupt(struct pt_regs *regs) +void smp_qic_call_function_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); smp_call_function_interrupt(); } -fastcall void -smp_vic_cpi_interrupt(struct pt_regs *regs) +void smp_vic_cpi_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); __u8 cpu = smp_processor_id(); - if(is_cpu_quad()) + if (is_cpu_quad()) ack_QIC_CPI(VIC_CPI_LEVEL0); else ack_VIC_CPI(VIC_CPI_LEVEL0); - if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) wrapper_smp_local_timer_interrupt(); - if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) smp_invalidate_interrupt(); - if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) smp_reschedule_interrupt(); - if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) smp_enable_irq_interrupt(); - if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) smp_call_function_interrupt(); set_irq_regs(old_regs); } -static void -do_flush_tlb_all(void* info) +static void do_flush_tlb_all(void *info) { unsigned long cpu = smp_processor_id(); __flush_tlb_all(); if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) - leave_mm(cpu); + voyager_leave_mm(cpu); } - /* flush the TLB of every active CPU in the system */ -void -flush_tlb_all(void) +void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } /* used to set up the trampoline for other CPUs when the memory manager * is sorted out */ -void __init -smp_alloc_memory(void) +void __init smp_alloc_memory(void) { - trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE); - if(__pa(trampoline_base) >= 0x93000) + trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE); + if (__pa(trampoline_base) >= 0x93000) BUG(); } /* send a reschedule CPI to one CPU by physical CPU number*/ -static void -voyager_smp_send_reschedule(int cpu) +static void voyager_smp_send_reschedule(int cpu) { send_one_CPI(cpu, VIC_RESCHEDULE_CPI); } - -int -hard_smp_processor_id(void) +int hard_smp_processor_id(void) { __u8 i; __u8 cpumask = inb(VIC_PROC_WHO_AM_I); - if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) + if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) return cpumask & 0x1F; - for(i = 0; i < 8; i++) { - if(cpumask & (1<<i)) + for (i = 0; i < 8; i++) { + if (cpumask & (1 << i)) return i; } printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); return 0; } -int -safe_smp_processor_id(void) +int safe_smp_processor_id(void) { return hard_smp_processor_id(); } /* broadcast a halt to all other CPUs */ -static void -voyager_smp_send_stop(void) +static void voyager_smp_send_stop(void) { smp_call_function(smp_stop_cpu_function, NULL, 1, 1); } /* this function is triggered in time.c when a clock tick fires * we need to re-broadcast the tick to all CPUs */ -void -smp_vic_timer_interrupt(void) +void smp_vic_timer_interrupt(void) { send_CPI_allbutself(VIC_TIMER_CPI); smp_local_timer_interrupt(); @@ -1253,8 +1221,7 @@ smp_vic_timer_interrupt(void) * multiplier is 1 and it can be changed by writing the new multiplier * value into /proc/profile. */ -void -smp_local_timer_interrupt(void) +void smp_local_timer_interrupt(void) { int cpu = smp_processor_id(); long weight; @@ -1269,18 +1236,18 @@ smp_local_timer_interrupt(void) * * Interrupts are already masked off at this point. */ - per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu); + per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { + per_cpu(prof_old_multiplier, cpu)) { /* FIXME: need to update the vic timer tick here */ per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); + per_cpu(prof_counter, cpu); } update_process_times(user_mode_vm(get_irq_regs())); } - if( ((1<<cpu) & voyager_extended_vic_processors) == 0) + if (((1 << cpu) & voyager_extended_vic_processors) == 0) /* only extended VIC processors participate in * interrupt distribution */ return; @@ -1296,12 +1263,12 @@ smp_local_timer_interrupt(void) * we can take more than 100K local irqs per second on a 100 MHz P5. */ - if((++vic_tick[cpu] & 0x7) != 0) + if ((++vic_tick[cpu] & 0x7) != 0) return; /* get here every 16 ticks (about every 1/6 of a second) */ /* Change our priority to give someone else a chance at getting - * the IRQ. The algorithm goes like this: + * the IRQ. The algorithm goes like this: * * In the VIC, the dynamically routed interrupt is always * handled by the lowest priority eligible (i.e. receiving @@ -1325,18 +1292,18 @@ smp_local_timer_interrupt(void) * affinity code since we now try to even up the interrupt * counts when an affinity binding is keeping them on a * particular CPU*/ - weight = (vic_intr_count[cpu]*voyager_extended_cpus + weight = (vic_intr_count[cpu] * voyager_extended_cpus - vic_intr_total) >> 4; weight += 4; - if(weight > 7) + if (weight > 7) weight = 7; - if(weight < 0) + if (weight < 0) weight = 0; - - outb((__u8)weight, VIC_PRIORITY_REGISTER); + + outb((__u8) weight, VIC_PRIORITY_REGISTER); #ifdef VOYAGER_DEBUG - if((vic_tick[cpu] & 0xFFF) == 0) { + if ((vic_tick[cpu] & 0xFFF) == 0) { /* print this message roughly every 25 secs */ printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", cpu, vic_tick[cpu], weight); @@ -1345,15 +1312,14 @@ smp_local_timer_interrupt(void) } /* setup the profiling timer */ -int -setup_profiling_timer(unsigned int multiplier) +int setup_profiling_timer(unsigned int multiplier) { int i; - if ( (!multiplier)) + if ((!multiplier)) return -EINVAL; - /* + /* * Set the new multiplier for each CPU. CPUs don't start using the * new values until the next timer interrupt in which they do process * accounting. @@ -1367,15 +1333,13 @@ setup_profiling_timer(unsigned int multiplier) /* This is a bit of a mess, but forced on us by the genirq changes * there's no genirq handler that really does what voyager wants * so hack it up with the simple IRQ handler */ -static void fastcall -handle_vic_irq(unsigned int irq, struct irq_desc *desc) +static void handle_vic_irq(unsigned int irq, struct irq_desc *desc) { before_handle_vic_irq(irq); handle_simple_irq(irq, desc); after_handle_vic_irq(irq); } - /* The CPIs are handled in the per cpu 8259s, so they must be * enabled to be received: FIX: enabling the CPIs in the early * boot sequence interferes with bug checking; enable them later @@ -1385,13 +1349,12 @@ handle_vic_irq(unsigned int irq, struct irq_desc *desc) #define QIC_SET_GATE(cpi, vector) \ set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) -void __init -smp_intr_init(void) +void __init smp_intr_init(void) { int i; /* initialize the per cpu irq mask to all disabled */ - for(i = 0; i < NR_CPUS; i++) + for (i = 0; i < NR_CPUS; i++) vic_irq_mask[i] = 0xFFFF; VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); @@ -1404,42 +1367,40 @@ smp_intr_init(void) QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); - - /* now put the VIC descriptor into the first 48 IRQs + /* now put the VIC descriptor into the first 48 IRQs * * This is for later: first 16 correspond to PC IRQs; next 16 * are Primary MC IRQs and final 16 are Secondary MC IRQs */ - for(i = 0; i < 48; i++) + for (i = 0; i < 48; i++) set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq); } /* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per * processor to receive CPI */ -static void -send_CPI(__u32 cpuset, __u8 cpi) +static void send_CPI(__u32 cpuset, __u8 cpi) { int cpu; __u32 quad_cpuset = (cpuset & voyager_quad_processors); - if(cpi < VIC_START_FAKE_CPI) { - /* fake CPI are only used for booting, so send to the + if (cpi < VIC_START_FAKE_CPI) { + /* fake CPI are only used for booting, so send to the * extended quads as well---Quads must be VIC booted */ - outb((__u8)(cpuset), VIC_CPI_Registers[cpi]); + outb((__u8) (cpuset), VIC_CPI_Registers[cpi]); return; } - if(quad_cpuset) + if (quad_cpuset) send_QIC_CPI(quad_cpuset, cpi); cpuset &= ~quad_cpuset; cpuset &= 0xff; /* only first 8 CPUs vaild for VIC CPI */ - if(cpuset == 0) + if (cpuset == 0) return; for_each_online_cpu(cpu) { - if(cpuset & (1<<cpu)) + if (cpuset & (1 << cpu)) set_bit(cpi, &vic_cpi_mailbox[cpu]); } - if(cpuset) - outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); + if (cpuset) + outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); } /* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and @@ -1448,20 +1409,19 @@ send_CPI(__u32 cpuset, __u8 cpi) * DON'T make this inline otherwise the cache line read will be * optimised away * */ -static int -ack_QIC_CPI(__u8 cpi) { +static int ack_QIC_CPI(__u8 cpi) +{ __u8 cpu = hard_smp_processor_id(); cpi &= 7; - outb(1<<cpi, QIC_INTERRUPT_CLEAR1); + outb(1 << cpi, QIC_INTERRUPT_CLEAR1); return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; } -static void -ack_special_QIC_CPI(__u8 cpi) +static void ack_special_QIC_CPI(__u8 cpi) { - switch(cpi) { + switch (cpi) { case VIC_CMN_INT: outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); break; @@ -1474,8 +1434,7 @@ ack_special_QIC_CPI(__u8 cpi) } /* Acknowledge receipt of CPI in the VIC (essentially an EOI) */ -static void -ack_VIC_CPI(__u8 cpi) +static void ack_VIC_CPI(__u8 cpi) { #ifdef VOYAGER_DEBUG unsigned long flags; @@ -1484,17 +1443,17 @@ ack_VIC_CPI(__u8 cpi) local_irq_save(flags); isr = vic_read_isr(); - if((isr & (1<<(cpi &7))) == 0) { + if ((isr & (1 << (cpi & 7))) == 0) { printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); } #endif /* send specific EOI; the two system interrupts have * bit 4 set for a separate vector but behave as the * corresponding 3 bit intr */ - outb_p(0x60|(cpi & 7),0x20); + outb_p(0x60 | (cpi & 7), 0x20); #ifdef VOYAGER_DEBUG - if((vic_read_isr() & (1<<(cpi &7))) != 0) { + if ((vic_read_isr() & (1 << (cpi & 7))) != 0) { printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); } local_irq_restore(flags); @@ -1502,12 +1461,11 @@ ack_VIC_CPI(__u8 cpi) } /* cribbed with thanks from irq.c */ -#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define __byte(x,y) (((unsigned char *)&(y))[x]) #define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) #define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) -static unsigned int -startup_vic_irq(unsigned int irq) +static unsigned int startup_vic_irq(unsigned int irq) { unmask_vic_irq(irq); @@ -1535,13 +1493,12 @@ startup_vic_irq(unsigned int irq) * broadcast an Interrupt enable CPI which causes all other CPUs to * adjust their masks accordingly. */ -static void -unmask_vic_irq(unsigned int irq) +static void unmask_vic_irq(unsigned int irq) { /* linux doesn't to processor-irq affinity, so enable on * all CPUs we know about */ int cpu = smp_processor_id(), real_cpu; - __u16 mask = (1<<irq); + __u16 mask = (1 << irq); __u32 processorList = 0; unsigned long flags; @@ -1549,78 +1506,72 @@ unmask_vic_irq(unsigned int irq) irq, cpu, cpu_irq_affinity[cpu])); spin_lock_irqsave(&vic_irq_lock, flags); for_each_online_cpu(real_cpu) { - if(!(voyager_extended_vic_processors & (1<<real_cpu))) + if (!(voyager_extended_vic_processors & (1 << real_cpu))) continue; - if(!(cpu_irq_affinity[real_cpu] & mask)) { + if (!(cpu_irq_affinity[real_cpu] & mask)) { /* irq has no affinity for this CPU, ignore */ continue; } - if(real_cpu == cpu) { + if (real_cpu == cpu) { enable_local_vic_irq(irq); - } - else if(vic_irq_mask[real_cpu] & mask) { + } else if (vic_irq_mask[real_cpu] & mask) { vic_irq_enable_mask[real_cpu] |= mask; - processorList |= (1<<real_cpu); + processorList |= (1 << real_cpu); } } spin_unlock_irqrestore(&vic_irq_lock, flags); - if(processorList) + if (processorList) send_CPI(processorList, VIC_ENABLE_IRQ_CPI); } -static void -mask_vic_irq(unsigned int irq) +static void mask_vic_irq(unsigned int irq) { /* lazy disable, do nothing */ } -static void -enable_local_vic_irq(unsigned int irq) +static void enable_local_vic_irq(unsigned int irq) { __u8 cpu = smp_processor_id(); __u16 mask = ~(1 << irq); __u16 old_mask = vic_irq_mask[cpu]; vic_irq_mask[cpu] &= mask; - if(vic_irq_mask[cpu] == old_mask) + if (vic_irq_mask[cpu] == old_mask) return; VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", irq, cpu)); if (irq & 8) { - outb_p(cached_A1(cpu),0xA1); + outb_p(cached_A1(cpu), 0xA1); (void)inb_p(0xA1); - } - else { - outb_p(cached_21(cpu),0x21); + } else { + outb_p(cached_21(cpu), 0x21); (void)inb_p(0x21); } } -static void -disable_local_vic_irq(unsigned int irq) +static void disable_local_vic_irq(unsigned int irq) { __u8 cpu = smp_processor_id(); __u16 mask = (1 << irq); __u16 old_mask = vic_irq_mask[cpu]; - if(irq == 7) + if (irq == 7) return; vic_irq_mask[cpu] |= mask; - if(old_mask == vic_irq_mask[cpu]) + if (old_mask == vic_irq_mask[cpu]) return; VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", irq, cpu)); if (irq & 8) { - outb_p(cached_A1(cpu),0xA1); + outb_p(cached_A1(cpu), 0xA1); (void)inb_p(0xA1); - } - else { - outb_p(cached_21(cpu),0x21); + } else { + outb_p(cached_21(cpu), 0x21); (void)inb_p(0x21); } } @@ -1631,8 +1582,7 @@ disable_local_vic_irq(unsigned int irq) * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If * this interrupt actually comes in, then we mask and ack here to push * the interrupt off to another CPU */ -static void -before_handle_vic_irq(unsigned int irq) +static void before_handle_vic_irq(unsigned int irq) { irq_desc_t *desc = irq_desc + irq; __u8 cpu = smp_processor_id(); @@ -1641,16 +1591,16 @@ before_handle_vic_irq(unsigned int irq) vic_intr_total++; vic_intr_count[cpu]++; - if(!(cpu_irq_affinity[cpu] & (1<<irq))) { + if (!(cpu_irq_affinity[cpu] & (1 << irq))) { /* The irq is not in our affinity mask, push it off * onto another CPU */ - VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n", - irq, cpu)); + VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d " + "on cpu %d\n", irq, cpu)); disable_local_vic_irq(irq); /* set IRQ_INPROGRESS to prevent the handler in irq.c from * actually calling the interrupt routine */ desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; - } else if(desc->status & IRQ_DISABLED) { + } else if (desc->status & IRQ_DISABLED) { /* Damn, the interrupt actually arrived, do the lazy * disable thing. The interrupt routine in irq.c will * not handle a IRQ_DISABLED interrupt, so nothing more @@ -1667,8 +1617,7 @@ before_handle_vic_irq(unsigned int irq) } /* Finish the VIC interrupt: basically mask */ -static void -after_handle_vic_irq(unsigned int irq) +static void after_handle_vic_irq(unsigned int irq) { irq_desc_t *desc = irq_desc + irq; @@ -1685,11 +1634,11 @@ after_handle_vic_irq(unsigned int irq) #ifdef VOYAGER_DEBUG /* DEBUG: before we ack, check what's in progress */ isr = vic_read_isr(); - if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) { + if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) { int i; __u8 cpu = smp_processor_id(); __u8 real_cpu; - int mask; /* Um... initialize me??? --RR */ + int mask; /* Um... initialize me??? --RR */ printk("VOYAGER SMP: CPU%d lost interrupt %d\n", cpu, irq); @@ -1698,9 +1647,10 @@ after_handle_vic_irq(unsigned int irq) outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, VIC_PROCESSOR_ID); isr = vic_read_isr(); - if(isr & (1<<irq)) { - printk("VOYAGER SMP: CPU%d ack irq %d\n", - real_cpu, irq); + if (isr & (1 << irq)) { + printk + ("VOYAGER SMP: CPU%d ack irq %d\n", + real_cpu, irq); ack_vic_irq(irq); } outb(cpu, VIC_PROCESSOR_ID); @@ -1711,7 +1661,7 @@ after_handle_vic_irq(unsigned int irq) * receipt by another CPU so everything must be in * order here */ ack_vic_irq(irq); - if(status & IRQ_REPLAY) { + if (status & IRQ_REPLAY) { /* replay is set if we disable the interrupt * in the before_handle_vic_irq() routine, so * clear the in progress bit here to allow the @@ -1720,9 +1670,9 @@ after_handle_vic_irq(unsigned int irq) } #ifdef VOYAGER_DEBUG isr = vic_read_isr(); - if((isr & (1<<irq)) != 0) - printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n", - irq, isr); + if ((isr & (1 << irq)) != 0) + printk("VOYAGER SMP: after_handle_vic_irq() after " + "ack irq=%d, isr=0x%x\n", irq, isr); #endif /* VOYAGER_DEBUG */ } _raw_spin_unlock(&vic_irq_lock); @@ -1731,7 +1681,6 @@ after_handle_vic_irq(unsigned int irq) * may be intercepted by another CPU if reasserted */ } - /* Linux processor - interrupt affinity manipulations. * * For each processor, we maintain a 32 bit irq affinity mask. @@ -1748,8 +1697,7 @@ after_handle_vic_irq(unsigned int irq) * change the mask and then do an interrupt enable CPI to re-enable on * the selected processors */ -void -set_vic_irq_affinity(unsigned int irq, cpumask_t mask) +void set_vic_irq_affinity(unsigned int irq, cpumask_t mask) { /* Only extended processors handle interrupts */ unsigned long real_mask; @@ -1757,13 +1705,13 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) int cpu; real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; - - if(cpus_addr(mask)[0] == 0) + + if (cpus_addr(mask)[0] == 0) /* can't have no CPUs to accept the interrupt -- extremely * bad things will happen */ return; - if(irq == 0) + if (irq == 0) /* can't change the affinity of the timer IRQ. This * is due to the constraint in the voyager * architecture that the CPI also comes in on and IRQ @@ -1772,7 +1720,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) * will no-longer be able to accept VIC CPIs */ return; - if(irq >= 32) + if (irq >= 32) /* You can only have 32 interrupts in a voyager system * (and 32 only if you have a secondary microchannel * bus) */ @@ -1780,8 +1728,8 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) for_each_online_cpu(cpu) { unsigned long cpu_mask = 1 << cpu; - - if(cpu_mask & real_mask) { + + if (cpu_mask & real_mask) { /* enable the interrupt for this cpu */ cpu_irq_affinity[cpu] |= irq_mask; } else { @@ -1800,25 +1748,23 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) unmask_vic_irq(irq); } -static void -ack_vic_irq(unsigned int irq) +static void ack_vic_irq(unsigned int irq) { if (irq & 8) { - outb(0x62,0x20); /* Specific EOI to cascade */ - outb(0x60|(irq & 7),0xA0); + outb(0x62, 0x20); /* Specific EOI to cascade */ + outb(0x60 | (irq & 7), 0xA0); } else { - outb(0x60 | (irq & 7),0x20); + outb(0x60 | (irq & 7), 0x20); } } /* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 * but are not vectored by it. This means that the 8259 mask must be * lowered to receive them */ -static __init void -vic_enable_cpi(void) +static __init void vic_enable_cpi(void) { __u8 cpu = smp_processor_id(); - + /* just take a copy of the current mask (nop for boot cpu) */ vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; @@ -1827,7 +1773,7 @@ vic_enable_cpi(void) /* for sys int and cmn int */ enable_local_vic_irq(7); - if(is_cpu_quad()) { + if (is_cpu_quad()) { outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", @@ -1838,8 +1784,7 @@ vic_enable_cpi(void) cpu, vic_irq_mask[cpu])); } -void -voyager_smp_dump() +void voyager_smp_dump() { int old_cpu = smp_processor_id(), cpu; @@ -1865,10 +1810,10 @@ voyager_smp_dump() cpu, vic_irq_mask[cpu], imr, irr, isr); #if 0 /* These lines are put in to try to unstick an un ack'd irq */ - if(isr != 0) { + if (isr != 0) { int irq; - for(irq=0; irq<16; irq++) { - if(isr & (1<<irq)) { + for (irq = 0; irq < 16; irq++) { + if (isr & (1 << irq)) { printk("\tCPU%d: ack irq %d\n", cpu, irq); local_irq_save(flags); @@ -1884,17 +1829,15 @@ voyager_smp_dump() } } -void -smp_voyager_power_off(void *dummy) +void smp_voyager_power_off(void *dummy) { - if(smp_processor_id() == boot_cpu_id) + if (smp_processor_id() == boot_cpu_id) voyager_power_off(); else smp_stop_cpu_function(NULL); } -static void __init -voyager_smp_prepare_cpus(unsigned int max_cpus) +static void __init voyager_smp_prepare_cpus(unsigned int max_cpus) { /* FIXME: ignore max_cpus for now */ smp_boot_cpus(); @@ -1911,8 +1854,7 @@ static void __cpuinit voyager_smp_prepare_boot_cpu(void) cpu_set(smp_processor_id(), cpu_present_map); } -static int __cpuinit -voyager_cpu_up(unsigned int cpu) +static int __cpuinit voyager_cpu_up(unsigned int cpu) { /* This only works at boot for x86. See "rewrite" above. */ if (cpu_isset(cpu, smp_commenced_mask)) @@ -1928,14 +1870,12 @@ voyager_cpu_up(unsigned int cpu) return 0; } -static void __init -voyager_smp_cpus_done(unsigned int max_cpus) +static void __init voyager_smp_cpus_done(unsigned int max_cpus) { zap_low_mappings(); } -void __init -smp_setup_processor_id(void) +void __init smp_setup_processor_id(void) { current_thread_info()->cpu = hard_smp_processor_id(); x86_write_percpu(cpu_number, hard_smp_processor_id()); diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c index 50f9366c411..c69c931818e 100644 --- a/arch/x86/mach-voyager/voyager_thread.c +++ b/arch/x86/mach-voyager/voyager_thread.c @@ -30,12 +30,10 @@ #include <asm/mtrr.h> #include <asm/msr.h> - struct task_struct *voyager_thread; static __u8 set_timeout; -static int -execute(const char *string) +static int execute(const char *string) { int ret; @@ -52,48 +50,48 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { - printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", - string, ret); + if ((ret = + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { + printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, + ret); } return ret; } -static void -check_from_kernel(void) +static void check_from_kernel(void) { - if(voyager_status.switch_off) { - + if (voyager_status.switch_off) { + /* FIXME: This should be configurable via proc */ execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); - } else if(voyager_status.power_fail) { + } else if (voyager_status.power_fail) { VDEBUG(("Voyager daemon detected AC power failure\n")); - + /* FIXME: This should be configureable via proc */ execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); set_timeout = 1; } } -static void -check_continuing_condition(void) +static void check_continuing_condition(void) { - if(voyager_status.power_fail) { + if (voyager_status.power_fail) { __u8 data; - voyager_cat_psi(VOYAGER_PSI_SUBREAD, + voyager_cat_psi(VOYAGER_PSI_SUBREAD, VOYAGER_PSI_AC_FAIL_REG, &data); - if((data & 0x1f) == 0) { + if ((data & 0x1f) == 0) { /* all power restored */ - printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n"); + printk(KERN_NOTICE + "VOYAGER AC power restored, cancelling shutdown\n"); /* FIXME: should be user configureable */ - execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); + execute + ("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); set_timeout = 0; } } } -static int -thread(void *unused) +static int thread(void *unused) { printk(KERN_NOTICE "Voyager starting monitor thread\n"); @@ -102,7 +100,7 @@ thread(void *unused) schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT); VDEBUG(("Voyager Daemon awoken\n")); - if(voyager_status.request_from_kernel == 0) { + if (voyager_status.request_from_kernel == 0) { /* probably awoken from timeout */ check_continuing_condition(); } else { @@ -112,20 +110,18 @@ thread(void *unused) } } -static int __init -voyager_thread_start(void) +static int __init voyager_thread_start(void) { voyager_thread = kthread_run(thread, NULL, "kvoyagerd"); if (IS_ERR(voyager_thread)) { - printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n"); + printk(KERN_ERR + "Voyager: Failed to create system monitor thread.\n"); return PTR_ERR(voyager_thread); } return 0; } - -static void __exit -voyager_thread_stop(void) +static void __exit voyager_thread_stop(void) { kthread_stop(voyager_thread); } diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a1b0d22f697..59d353d2c59 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -33,45 +33,41 @@ #undef PRINT_MESSAGES /* */ - #if 0 void Un_impl(void) { - u_char byte1, FPU_modrm; - unsigned long address = FPU_ORIG_EIP; - - RE_ENTRANT_CHECK_OFF; - /* No need to check access_ok(), we have previously fetched these bytes. */ - printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address); - if ( FPU_CS == __USER_CS ) - { - while ( 1 ) - { - FPU_get_user(byte1, (u_char __user *) address); - if ( (byte1 & 0xf8) == 0xd8 ) break; - printk("[%02x]", byte1); - address++; + u_char byte1, FPU_modrm; + unsigned long address = FPU_ORIG_EIP; + + RE_ENTRANT_CHECK_OFF; + /* No need to check access_ok(), we have previously fetched these bytes. */ + printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address); + if (FPU_CS == __USER_CS) { + while (1) { + FPU_get_user(byte1, (u_char __user *) address); + if ((byte1 & 0xf8) == 0xd8) + break; + printk("[%02x]", byte1); + address++; + } + printk("%02x ", byte1); + FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); + + if (FPU_modrm >= 0300) + printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, + FPU_modrm & 7); + else + printk("/%d\n", (FPU_modrm >> 3) & 7); + } else { + printk("cs selector = %04x\n", FPU_CS); } - printk("%02x ", byte1); - FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); - - if (FPU_modrm >= 0300) - printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); - else - printk("/%d\n", (FPU_modrm >> 3) & 7); - } - else - { - printk("cs selector = %04x\n", FPU_CS); - } - - RE_ENTRANT_CHECK_ON; - - EXCEPTION(EX_Invalid); -} -#endif /* 0 */ + RE_ENTRANT_CHECK_ON; + EXCEPTION(EX_Invalid); + +} +#endif /* 0 */ /* Called for opcodes which are illegal and which are known to result in a @@ -79,139 +75,152 @@ void Un_impl(void) */ void FPU_illegal(void) { - math_abort(FPU_info,SIGILL); + math_abort(FPU_info, SIGILL); } - - void FPU_printall(void) { - int i; - static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", - "DeNorm", "Inf", "NaN" }; - u_char byte1, FPU_modrm; - unsigned long address = FPU_ORIG_EIP; - - RE_ENTRANT_CHECK_OFF; - /* No need to check access_ok(), we have previously fetched these bytes. */ - printk("At %p:", (void *) address); - if ( FPU_CS == __USER_CS ) - { + int i; + static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", + "DeNorm", "Inf", "NaN" + }; + u_char byte1, FPU_modrm; + unsigned long address = FPU_ORIG_EIP; + + RE_ENTRANT_CHECK_OFF; + /* No need to check access_ok(), we have previously fetched these bytes. */ + printk("At %p:", (void *)address); + if (FPU_CS == __USER_CS) { #define MAX_PRINTED_BYTES 20 - for ( i = 0; i < MAX_PRINTED_BYTES; i++ ) - { - FPU_get_user(byte1, (u_char __user *) address); - if ( (byte1 & 0xf8) == 0xd8 ) - { - printk(" %02x", byte1); - break; - } - printk(" [%02x]", byte1); - address++; - } - if ( i == MAX_PRINTED_BYTES ) - printk(" [more..]\n"); - else - { - FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); - - if (FPU_modrm >= 0300) - printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); - else - printk(" /%d, mod=%d rm=%d\n", - (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7); + for (i = 0; i < MAX_PRINTED_BYTES; i++) { + FPU_get_user(byte1, (u_char __user *) address); + if ((byte1 & 0xf8) == 0xd8) { + printk(" %02x", byte1); + break; + } + printk(" [%02x]", byte1); + address++; + } + if (i == MAX_PRINTED_BYTES) + printk(" [more..]\n"); + else { + FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); + + if (FPU_modrm >= 0300) + printk(" %02x (%02x+%d)\n", FPU_modrm, + FPU_modrm & 0xf8, FPU_modrm & 7); + else + printk(" /%d, mod=%d rm=%d\n", + (FPU_modrm >> 3) & 7, + (FPU_modrm >> 6) & 3, FPU_modrm & 7); + } + } else { + printk("%04x\n", FPU_CS); } - } - else - { - printk("%04x\n", FPU_CS); - } - partial_status = status_word(); + partial_status = status_word(); #ifdef DEBUGGING -if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n"); -if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n"); -if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n"); -if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n"); -if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n"); -if ( partial_status & SW_Summary ) printk("SW: exception summary\n"); -if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n"); -if ( partial_status & SW_Precision ) printk("SW: loss of precision\n"); -if ( partial_status & SW_Underflow ) printk("SW: underflow\n"); -if ( partial_status & SW_Overflow ) printk("SW: overflow\n"); -if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n"); -if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n"); -if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n"); + if (partial_status & SW_Backward) + printk("SW: backward compatibility\n"); + if (partial_status & SW_C3) + printk("SW: condition bit 3\n"); + if (partial_status & SW_C2) + printk("SW: condition bit 2\n"); + if (partial_status & SW_C1) + printk("SW: condition bit 1\n"); + if (partial_status & SW_C0) + printk("SW: condition bit 0\n"); + if (partial_status & SW_Summary) + printk("SW: exception summary\n"); + if (partial_status & SW_Stack_Fault) + printk("SW: stack fault\n"); + if (partial_status & SW_Precision) + printk("SW: loss of precision\n"); + if (partial_status & SW_Underflow) + printk("SW: underflow\n"); + if (partial_status & SW_Overflow) + printk("SW: overflow\n"); + if (partial_status & SW_Zero_Div) + printk("SW: divide by zero\n"); + if (partial_status & SW_Denorm_Op) + printk("SW: denormalized operand\n"); + if (partial_status & SW_Invalid) + printk("SW: invalid operation\n"); #endif /* DEBUGGING */ - printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", - partial_status & 0x8000 ? 1 : 0, /* busy */ - (partial_status & 0x3800) >> 11, /* stack top pointer */ - partial_status & 0x80 ? 1 : 0, /* Error summary status */ - partial_status & 0x40 ? 1 : 0, /* Stack flag */ - partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */ - partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */ - partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0, - partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0, - partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0); - -printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n", - control_word & 0x1000 ? 1 : 0, - (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, - (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, - control_word & 0x80 ? 1 : 0, - control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0, - control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0, - control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0); - - for ( i = 0; i < 8; i++ ) - { - FPU_REG *r = &st(i); - u_char tagi = FPU_gettagi(i); - switch (tagi) - { - case TAG_Empty: - continue; - break; - case TAG_Zero: - case TAG_Special: - tagi = FPU_Special(r); - case TAG_Valid: - printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, - getsign(r) ? '-' : '+', - (long)(r->sigh >> 16), - (long)(r->sigh & 0xFFFF), - (long)(r->sigl >> 16), - (long)(r->sigl & 0xFFFF), - exponent(r) - EXP_BIAS + 1); - break; - default: - printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi); - continue; - break; + printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0, /* busy */ + (partial_status & 0x3800) >> 11, /* stack top pointer */ + partial_status & 0x80 ? 1 : 0, /* Error summary status */ + partial_status & 0x40 ? 1 : 0, /* Stack flag */ + partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0, /* cc */ + partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0, /* cc */ + partial_status & SW_Precision ? 1 : 0, + partial_status & SW_Underflow ? 1 : 0, + partial_status & SW_Overflow ? 1 : 0, + partial_status & SW_Zero_Div ? 1 : 0, + partial_status & SW_Denorm_Op ? 1 : 0, + partial_status & SW_Invalid ? 1 : 0); + + printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d ef=%d%d%d%d%d%d\n", + control_word & 0x1000 ? 1 : 0, + (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, + (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, + control_word & 0x80 ? 1 : 0, + control_word & SW_Precision ? 1 : 0, + control_word & SW_Underflow ? 1 : 0, + control_word & SW_Overflow ? 1 : 0, + control_word & SW_Zero_Div ? 1 : 0, + control_word & SW_Denorm_Op ? 1 : 0, + control_word & SW_Invalid ? 1 : 0); + + for (i = 0; i < 8; i++) { + FPU_REG *r = &st(i); + u_char tagi = FPU_gettagi(i); + switch (tagi) { + case TAG_Empty: + continue; + break; + case TAG_Zero: + case TAG_Special: + tagi = FPU_Special(r); + case TAG_Valid: + printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, + getsign(r) ? '-' : '+', + (long)(r->sigh >> 16), + (long)(r->sigh & 0xFFFF), + (long)(r->sigl >> 16), + (long)(r->sigl & 0xFFFF), + exponent(r) - EXP_BIAS + 1); + break; + default: + printk("Whoops! Error in errors.c: tag%d is %d ", i, + tagi); + continue; + break; + } + printk("%s\n", tag_desc[(int)(unsigned)tagi]); } - printk("%s\n", tag_desc[(int) (unsigned) tagi]); - } - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_ON; } static struct { - int type; - const char *name; + int type; + const char *name; } exception_names[] = { - { EX_StackOver, "stack overflow" }, - { EX_StackUnder, "stack underflow" }, - { EX_Precision, "loss of precision" }, - { EX_Underflow, "underflow" }, - { EX_Overflow, "overflow" }, - { EX_ZeroDiv, "divide by zero" }, - { EX_Denormal, "denormalized operand" }, - { EX_Invalid, "invalid operation" }, - { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION }, - { 0, NULL } + { + EX_StackOver, "stack overflow"}, { + EX_StackUnder, "stack underflow"}, { + EX_Precision, "loss of precision"}, { + EX_Underflow, "underflow"}, { + EX_Overflow, "overflow"}, { + EX_ZeroDiv, "divide by zero"}, { + EX_Denormal, "denormalized operand"}, { + EX_Invalid, "invalid operation"}, { + EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, { + 0, NULL} }; /* @@ -295,445 +304,386 @@ static struct { asmlinkage void FPU_exception(int n) { - int i, int_type; - - int_type = 0; /* Needed only to stop compiler warnings */ - if ( n & EX_INTERNAL ) - { - int_type = n - EX_INTERNAL; - n = EX_INTERNAL; - /* Set lots of exception bits! */ - partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); - } - else - { - /* Extract only the bits which we use to set the status word */ - n &= (SW_Exc_Mask); - /* Set the corresponding exception bit */ - partial_status |= n; - /* Set summary bits iff exception isn't masked */ - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - if ( n & (SW_Stack_Fault | EX_Precision) ) - { - if ( !(n & SW_C1) ) - /* This bit distinguishes over- from underflow for a stack fault, - and roundup from round-down for precision loss. */ - partial_status &= ~SW_C1; + int i, int_type; + + int_type = 0; /* Needed only to stop compiler warnings */ + if (n & EX_INTERNAL) { + int_type = n - EX_INTERNAL; + n = EX_INTERNAL; + /* Set lots of exception bits! */ + partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); + } else { + /* Extract only the bits which we use to set the status word */ + n &= (SW_Exc_Mask); + /* Set the corresponding exception bit */ + partial_status |= n; + /* Set summary bits iff exception isn't masked */ + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + if (n & (SW_Stack_Fault | EX_Precision)) { + if (!(n & SW_C1)) + /* This bit distinguishes over- from underflow for a stack fault, + and roundup from round-down for precision loss. */ + partial_status &= ~SW_C1; + } } - } - RE_ENTRANT_CHECK_OFF; - if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) ) - { + RE_ENTRANT_CHECK_OFF; + if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) { #ifdef PRINT_MESSAGES - /* My message from the sponsor */ - printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n"); + /* My message from the sponsor */ + printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n"); #endif /* PRINT_MESSAGES */ - - /* Get a name string for error reporting */ - for (i=0; exception_names[i].type; i++) - if ( (exception_names[i].type & n) == exception_names[i].type ) - break; - - if (exception_names[i].type) - { + + /* Get a name string for error reporting */ + for (i = 0; exception_names[i].type; i++) + if ((exception_names[i].type & n) == + exception_names[i].type) + break; + + if (exception_names[i].type) { #ifdef PRINT_MESSAGES - printk("FP Exception: %s!\n", exception_names[i].name); + printk("FP Exception: %s!\n", exception_names[i].name); #endif /* PRINT_MESSAGES */ - } - else - printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); - - if ( n == EX_INTERNAL ) - { - printk("FPU emulator: Internal error type 0x%04x\n", int_type); - FPU_printall(); - } + } else + printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); + + if (n == EX_INTERNAL) { + printk("FPU emulator: Internal error type 0x%04x\n", + int_type); + FPU_printall(); + } #ifdef PRINT_MESSAGES - else - FPU_printall(); + else + FPU_printall(); #endif /* PRINT_MESSAGES */ - /* - * The 80486 generates an interrupt on the next non-control FPU - * instruction. So we need some means of flagging it. - * We use the ES (Error Summary) bit for this. - */ - } - RE_ENTRANT_CHECK_ON; + /* + * The 80486 generates an interrupt on the next non-control FPU + * instruction. So we need some means of flagging it. + * We use the ES (Error Summary) bit for this. + */ + } + RE_ENTRANT_CHECK_ON; #ifdef __DEBUG__ - math_abort(FPU_info,SIGFPE); + math_abort(FPU_info, SIGFPE); #endif /* __DEBUG__ */ } - /* Real operation attempted on a NaN. */ /* Returns < 0 if the exception is unmasked */ int real_1op_NaN(FPU_REG *a) { - int signalling, isNaN; - - isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); - - /* The default result for the case of two "equal" NaNs (signs may - differ) is chosen to reproduce 80486 behaviour */ - signalling = isNaN && !(a->sigh & 0x40000000); - - if ( !signalling ) - { - if ( !isNaN ) /* pseudo-NaN, or other unsupported? */ - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - reg_copy(&CONST_QNaN, a); - } - EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + int signalling, isNaN; + + isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); + + /* The default result for the case of two "equal" NaNs (signs may + differ) is chosen to reproduce 80486 behaviour */ + signalling = isNaN && !(a->sigh & 0x40000000); + + if (!signalling) { + if (!isNaN) { /* pseudo-NaN, or other unsupported? */ + if (control_word & CW_Invalid) { + /* Masked response */ + reg_copy(&CONST_QNaN, a); + } + EXCEPTION(EX_Invalid); + return (!(control_word & CW_Invalid) ? FPU_Exception : + 0) | TAG_Special; + } + return TAG_Special; } - return TAG_Special; - } - if ( control_word & CW_Invalid ) - { - /* The masked response */ - if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */ - { - reg_copy(&CONST_QNaN, a); + if (control_word & CW_Invalid) { + /* The masked response */ + if (!(a->sigh & 0x80000000)) { /* pseudo-NaN ? */ + reg_copy(&CONST_QNaN, a); + } + /* ensure a Quiet NaN */ + a->sigh |= 0x40000000; } - /* ensure a Quiet NaN */ - a->sigh |= 0x40000000; - } - EXCEPTION(EX_Invalid); + EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; } - /* Real operation attempted on two operands, one a NaN. */ /* Returns < 0 if the exception is unmasked */ int real_2op_NaN(FPU_REG const *b, u_char tagb, - int deststnr, - FPU_REG const *defaultNaN) + int deststnr, FPU_REG const *defaultNaN) { - FPU_REG *dest = &st(deststnr); - FPU_REG const *a = dest; - u_char taga = FPU_gettagi(deststnr); - FPU_REG const *x; - int signalling, unsupported; - - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); - - /* TW_NaN is also used for unsupported data types. */ - unsupported = ((taga == TW_NaN) - && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000))) - || ((tagb == TW_NaN) - && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); - if ( unsupported ) - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); - } - EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; - } - - if (taga == TW_NaN) - { - x = a; - if (tagb == TW_NaN) - { - signalling = !(a->sigh & b->sigh & 0x40000000); - if ( significand(b) > significand(a) ) - x = b; - else if ( significand(b) == significand(a) ) - { - /* The default result for the case of two "equal" NaNs (signs may - differ) is chosen to reproduce 80486 behaviour */ - x = defaultNaN; - } - } - else - { - /* return the quiet version of the NaN in a */ - signalling = !(a->sigh & 0x40000000); + FPU_REG *dest = &st(deststnr); + FPU_REG const *a = dest; + u_char taga = FPU_gettagi(deststnr); + FPU_REG const *x; + int signalling, unsupported; + + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); + + /* TW_NaN is also used for unsupported data types. */ + unsupported = ((taga == TW_NaN) + && !((exponent(a) == EXP_OVER) + && (a->sigh & 0x80000000))) + || ((tagb == TW_NaN) + && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); + if (unsupported) { + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); + } + EXCEPTION(EX_Invalid); + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | + TAG_Special; } - } - else + + if (taga == TW_NaN) { + x = a; + if (tagb == TW_NaN) { + signalling = !(a->sigh & b->sigh & 0x40000000); + if (significand(b) > significand(a)) + x = b; + else if (significand(b) == significand(a)) { + /* The default result for the case of two "equal" NaNs (signs may + differ) is chosen to reproduce 80486 behaviour */ + x = defaultNaN; + } + } else { + /* return the quiet version of the NaN in a */ + signalling = !(a->sigh & 0x40000000); + } + } else #ifdef PARANOID - if (tagb == TW_NaN) + if (tagb == TW_NaN) #endif /* PARANOID */ - { - signalling = !(b->sigh & 0x40000000); - x = b; - } + { + signalling = !(b->sigh & 0x40000000); + x = b; + } #ifdef PARANOID - else - { - signalling = 0; - EXCEPTION(EX_INTERNAL|0x113); - x = &CONST_QNaN; - } + else { + signalling = 0; + EXCEPTION(EX_INTERNAL | 0x113); + x = &CONST_QNaN; + } #endif /* PARANOID */ - if ( (!signalling) || (control_word & CW_Invalid) ) - { - if ( ! x ) - x = b; + if ((!signalling) || (control_word & CW_Invalid)) { + if (!x) + x = b; - if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */ - x = &CONST_QNaN; + if (!(x->sigh & 0x80000000)) /* pseudo-NaN ? */ + x = &CONST_QNaN; - FPU_copy_to_regi(x, TAG_Special, deststnr); + FPU_copy_to_regi(x, TAG_Special, deststnr); - if ( !signalling ) - return TAG_Special; + if (!signalling) + return TAG_Special; - /* ensure a Quiet NaN */ - dest->sigh |= 0x40000000; - } + /* ensure a Quiet NaN */ + dest->sigh |= 0x40000000; + } - EXCEPTION(EX_Invalid); + EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; } - /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ asmlinkage int arith_invalid(int deststnr) { - EXCEPTION(EX_Invalid); - - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); - } - - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid; + EXCEPTION(EX_Invalid); -} + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); + } + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid; + +} /* Divide a finite number by zero */ asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) { - FPU_REG *dest = &st(deststnr); - int tag = TAG_Valid; + FPU_REG *dest = &st(deststnr); + int tag = TAG_Valid; + + if (control_word & CW_ZeroDiv) { + /* The masked response */ + FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr); + setsign(dest, sign); + tag = TAG_Special; + } - if ( control_word & CW_ZeroDiv ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr); - setsign(dest, sign); - tag = TAG_Special; - } - - EXCEPTION(EX_ZeroDiv); + EXCEPTION(EX_ZeroDiv); - return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; + return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; } - /* This may be called often, so keep it lean */ int set_precision_flag(int flags) { - if ( control_word & CW_Precision ) - { - partial_status &= ~(SW_C1 & flags); - partial_status |= flags; /* The masked response */ - return 0; - } - else - { - EXCEPTION(flags); - return 1; - } + if (control_word & CW_Precision) { + partial_status &= ~(SW_C1 & flags); + partial_status |= flags; /* The masked response */ + return 0; + } else { + EXCEPTION(flags); + return 1; + } } - /* This may be called often, so keep it lean */ asmlinkage void set_precision_flag_up(void) { - if ( control_word & CW_Precision ) - partial_status |= (SW_Precision | SW_C1); /* The masked response */ - else - EXCEPTION(EX_Precision | SW_C1); + if (control_word & CW_Precision) + partial_status |= (SW_Precision | SW_C1); /* The masked response */ + else + EXCEPTION(EX_Precision | SW_C1); } - /* This may be called often, so keep it lean */ asmlinkage void set_precision_flag_down(void) { - if ( control_word & CW_Precision ) - { /* The masked response */ - partial_status &= ~SW_C1; - partial_status |= SW_Precision; - } - else - EXCEPTION(EX_Precision); + if (control_word & CW_Precision) { /* The masked response */ + partial_status &= ~SW_C1; + partial_status |= SW_Precision; + } else + EXCEPTION(EX_Precision); } - asmlinkage int denormal_operand(void) { - if ( control_word & CW_Denormal ) - { /* The masked response */ - partial_status |= SW_Denorm_Op; - return TAG_Special; - } - else - { - EXCEPTION(EX_Denormal); - return TAG_Special | FPU_Exception; - } + if (control_word & CW_Denormal) { /* The masked response */ + partial_status |= SW_Denorm_Op; + return TAG_Special; + } else { + EXCEPTION(EX_Denormal); + return TAG_Special | FPU_Exception; + } } - asmlinkage int arith_overflow(FPU_REG *dest) { - int tag = TAG_Valid; + int tag = TAG_Valid; - if ( control_word & CW_Overflow ) - { - /* The masked response */ + if (control_word & CW_Overflow) { + /* The masked response */ /* ###### The response here depends upon the rounding mode */ - reg_copy(&CONST_INF, dest); - tag = TAG_Special; - } - else - { - /* Subtract the magic number from the exponent */ - addexponent(dest, (-3 * (1 << 13))); - } - - EXCEPTION(EX_Overflow); - if ( control_word & CW_Overflow ) - { - /* The overflow exception is masked. */ - /* By definition, precision is lost. - The roundup bit (C1) is also set because we have - "rounded" upwards to Infinity. */ - EXCEPTION(EX_Precision | SW_C1); - return tag; - } - - return tag; + reg_copy(&CONST_INF, dest); + tag = TAG_Special; + } else { + /* Subtract the magic number from the exponent */ + addexponent(dest, (-3 * (1 << 13))); + } -} + EXCEPTION(EX_Overflow); + if (control_word & CW_Overflow) { + /* The overflow exception is masked. */ + /* By definition, precision is lost. + The roundup bit (C1) is also set because we have + "rounded" upwards to Infinity. */ + EXCEPTION(EX_Precision | SW_C1); + return tag; + } + + return tag; +} asmlinkage int arith_underflow(FPU_REG *dest) { - int tag = TAG_Valid; - - if ( control_word & CW_Underflow ) - { - /* The masked response */ - if ( exponent16(dest) <= EXP_UNDER - 63 ) - { - reg_copy(&CONST_Z, dest); - partial_status &= ~SW_C1; /* Round down. */ - tag = TAG_Zero; + int tag = TAG_Valid; + + if (control_word & CW_Underflow) { + /* The masked response */ + if (exponent16(dest) <= EXP_UNDER - 63) { + reg_copy(&CONST_Z, dest); + partial_status &= ~SW_C1; /* Round down. */ + tag = TAG_Zero; + } else { + stdexp(dest); + } + } else { + /* Add the magic number to the exponent. */ + addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias); } - else - { - stdexp(dest); + + EXCEPTION(EX_Underflow); + if (control_word & CW_Underflow) { + /* The underflow exception is masked. */ + EXCEPTION(EX_Precision); + return tag; } - } - else - { - /* Add the magic number to the exponent. */ - addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias); - } - - EXCEPTION(EX_Underflow); - if ( control_word & CW_Underflow ) - { - /* The underflow exception is masked. */ - EXCEPTION(EX_Precision); - return tag; - } - - return tag; -} + return tag; +} void FPU_stack_overflow(void) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - top--; - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } + if (control_word & CW_Invalid) { + /* The masked response */ + top--; + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } - EXCEPTION(EX_StackOver); + EXCEPTION(EX_StackOver); - return; + return; } - void FPU_stack_underflow(void) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - void FPU_stack_underflow_i(int i) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - void FPU_stack_underflow_pop(int i) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); - FPU_pop(); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); + FPU_pop(); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h index b463f21a811..67f43a4683d 100644 --- a/arch/x86/math-emu/exception.h +++ b/arch/x86/math-emu/exception.h @@ -9,7 +9,6 @@ #ifndef _EXCEPTION_H_ #define _EXCEPTION_H_ - #ifdef __ASSEMBLY__ #define Const_(x) $##x #else @@ -20,8 +19,8 @@ #include "fpu_emu.h" #endif /* SW_C1 */ -#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ -#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ +#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ +#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ /* Special exceptions: */ #define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */ #define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */ @@ -34,11 +33,9 @@ #define EX_Denormal Const_(0x0002) /* denormalized operand */ #define EX_Invalid Const_(0x0001) /* invalid operation */ - #define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) #define PRECISION_LOST_DOWN Const_(EX_Precision) - #ifndef __ASSEMBLY__ #ifdef DEBUG @@ -48,6 +45,6 @@ #define EXCEPTION(x) FPU_exception(x) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLY__ */ #endif /* _EXCEPTION_H_ */ diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c index 6972dec01af..aeab24e083c 100644 --- a/arch/x86/math-emu/fpu_arith.c +++ b/arch/x86/math-emu/fpu_arith.c @@ -15,160 +15,138 @@ #include "control_w.h" #include "status_w.h" - void fadd__(void) { - /* fadd st,st(i) */ - int i = FPU_rm; - clear_C1(); - FPU_add(&st(i), FPU_gettagi(i), 0, control_word); + /* fadd st,st(i) */ + int i = FPU_rm; + clear_C1(); + FPU_add(&st(i), FPU_gettagi(i), 0, control_word); } - void fmul__(void) { - /* fmul st,st(i) */ - int i = FPU_rm; - clear_C1(); - FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); + /* fmul st,st(i) */ + int i = FPU_rm; + clear_C1(); + FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); } - - void fsub__(void) { - /* fsub st,st(i) */ - clear_C1(); - FPU_sub(0, FPU_rm, control_word); + /* fsub st,st(i) */ + clear_C1(); + FPU_sub(0, FPU_rm, control_word); } - void fsubr_(void) { - /* fsubr st,st(i) */ - clear_C1(); - FPU_sub(REV, FPU_rm, control_word); + /* fsubr st,st(i) */ + clear_C1(); + FPU_sub(REV, FPU_rm, control_word); } - void fdiv__(void) { - /* fdiv st,st(i) */ - clear_C1(); - FPU_div(0, FPU_rm, control_word); + /* fdiv st,st(i) */ + clear_C1(); + FPU_div(0, FPU_rm, control_word); } - void fdivr_(void) { - /* fdivr st,st(i) */ - clear_C1(); - FPU_div(REV, FPU_rm, control_word); + /* fdivr st,st(i) */ + clear_C1(); + FPU_div(REV, FPU_rm, control_word); } - - void fadd_i(void) { - /* fadd st(i),st */ - int i = FPU_rm; - clear_C1(); - FPU_add(&st(i), FPU_gettagi(i), i, control_word); + /* fadd st(i),st */ + int i = FPU_rm; + clear_C1(); + FPU_add(&st(i), FPU_gettagi(i), i, control_word); } - void fmul_i(void) { - /* fmul st(i),st */ - clear_C1(); - FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); + /* fmul st(i),st */ + clear_C1(); + FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); } - void fsubri(void) { - /* fsubr st(i),st */ - clear_C1(); - FPU_sub(DEST_RM, FPU_rm, control_word); + /* fsubr st(i),st */ + clear_C1(); + FPU_sub(DEST_RM, FPU_rm, control_word); } - void fsub_i(void) { - /* fsub st(i),st */ - clear_C1(); - FPU_sub(REV|DEST_RM, FPU_rm, control_word); + /* fsub st(i),st */ + clear_C1(); + FPU_sub(REV | DEST_RM, FPU_rm, control_word); } - void fdivri(void) { - /* fdivr st(i),st */ - clear_C1(); - FPU_div(DEST_RM, FPU_rm, control_word); + /* fdivr st(i),st */ + clear_C1(); + FPU_div(DEST_RM, FPU_rm, control_word); } - void fdiv_i(void) { - /* fdiv st(i),st */ - clear_C1(); - FPU_div(REV|DEST_RM, FPU_rm, control_word); + /* fdiv st(i),st */ + clear_C1(); + FPU_div(REV | DEST_RM, FPU_rm, control_word); } - - void faddp_(void) { - /* faddp st(i),st */ - int i = FPU_rm; - clear_C1(); - if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 ) - FPU_pop(); + /* faddp st(i),st */ + int i = FPU_rm; + clear_C1(); + if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0) + FPU_pop(); } - void fmulp_(void) { - /* fmulp st(i),st */ - clear_C1(); - if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fmulp st(i),st */ + clear_C1(); + if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0) + FPU_pop(); } - - void fsubrp(void) { - /* fsubrp st(i),st */ - clear_C1(); - if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fsubrp st(i),st */ + clear_C1(); + if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fsubp_(void) { - /* fsubp st(i),st */ - clear_C1(); - if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fsubp st(i),st */ + clear_C1(); + if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fdivrp(void) { - /* fdivrp st(i),st */ - clear_C1(); - if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fdivrp st(i),st */ + clear_C1(); + if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fdivp_(void) { - /* fdivp st(i),st */ - clear_C1(); - if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fdivp st(i),st */ + clear_C1(); + if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h index 9ba12416df1..955b932735a 100644 --- a/arch/x86/math-emu/fpu_asm.h +++ b/arch/x86/math-emu/fpu_asm.h @@ -14,7 +14,6 @@ #define EXCEPTION FPU_exception - #define PARAM1 8(%ebp) #define PARAM2 12(%ebp) #define PARAM3 16(%ebp) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 20886cfb9f7..491e737ce54 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -16,34 +16,34 @@ #include "status_w.h" #include "control_w.h" - static void fnop(void) { } static void fclex(void) { - partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision| - SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op| - SW_Invalid); - no_ip_update = 1; + partial_status &= + ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision | + SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op | + SW_Invalid); + no_ip_update = 1; } /* Needs to be externally visible */ void finit(void) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; - /* The behaviour is different from that detailed in - Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + control_word = 0x037f; + partial_status = 0; + top = 0; /* We don't keep top in the status word internally. */ + fpu_tag_word = 0xffff; + /* The behaviour is different from that detailed in + Section 15.1.6 of the Intel manual */ + operand_address.offset = 0; + operand_address.selector = 0; + instruction_address.offset = 0; + instruction_address.selector = 0; + instruction_address.opcode = 0; + no_ip_update = 1; } /* @@ -54,151 +54,134 @@ void finit(void) #define fsetpm fnop static FUNC const finit_table[] = { - feni, fdisi, fclex, finit, - fsetpm, FPU_illegal, FPU_illegal, FPU_illegal + feni, fdisi, fclex, finit, + fsetpm, FPU_illegal, FPU_illegal, FPU_illegal }; void finit_(void) { - (finit_table[FPU_rm])(); + (finit_table[FPU_rm]) (); } - static void fstsw_ax(void) { - *(short *) &FPU_EAX = status_word(); - no_ip_update = 1; + *(short *)&FPU_EAX = status_word(); + no_ip_update = 1; } static FUNC const fstsw_table[] = { - fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, - FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal + fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, + FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal }; void fstsw_(void) { - (fstsw_table[FPU_rm])(); + (fstsw_table[FPU_rm]) (); } - static FUNC const fp_nop_table[] = { - fnop, FPU_illegal, FPU_illegal, FPU_illegal, - FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal + fnop, FPU_illegal, FPU_illegal, FPU_illegal, + FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal }; void fp_nop(void) { - (fp_nop_table[FPU_rm])(); + (fp_nop_table[FPU_rm]) (); } - void fld_i_(void) { - FPU_REG *st_new_ptr; - int i; - u_char tag; - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - /* fld st(i) */ - i = FPU_rm; - if ( NOT_EMPTY(i) ) - { - reg_copy(&st(i), st_new_ptr); - tag = FPU_gettagi(i); - push(); - FPU_settag0(tag); - } - else - { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_stack_underflow(); + FPU_REG *st_new_ptr; + int i; + u_char tag; + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - else - EXCEPTION(EX_StackUnder); - } -} + /* fld st(i) */ + i = FPU_rm; + if (NOT_EMPTY(i)) { + reg_copy(&st(i), st_new_ptr); + tag = FPU_gettagi(i); + push(); + FPU_settag0(tag); + } else { + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_stack_underflow(); + } else + EXCEPTION(EX_StackUnder); + } +} void fxch_i(void) { - /* fxch st(i) */ - FPU_REG t; - int i = FPU_rm; - FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); - long tag_word = fpu_tag_word; - int regnr = top & 7, regnri = ((regnr + i) & 7); - u_char st0_tag = (tag_word >> (regnr*2)) & 3; - u_char sti_tag = (tag_word >> (regnri*2)) & 3; - - if ( st0_tag == TAG_Empty ) - { - if ( sti_tag == TAG_Empty ) - { - FPU_stack_underflow(); - FPU_stack_underflow_i(i); - return; + /* fxch st(i) */ + FPU_REG t; + int i = FPU_rm; + FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); + long tag_word = fpu_tag_word; + int regnr = top & 7, regnri = ((regnr + i) & 7); + u_char st0_tag = (tag_word >> (regnr * 2)) & 3; + u_char sti_tag = (tag_word >> (regnri * 2)) & 3; + + if (st0_tag == TAG_Empty) { + if (sti_tag == TAG_Empty) { + FPU_stack_underflow(); + FPU_stack_underflow_i(i); + return; + } + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_reg0(sti_ptr, sti_tag); + } + FPU_stack_underflow_i(i); + return; } - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_reg0(sti_ptr, sti_tag); + if (sti_tag == TAG_Empty) { + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_regi(st0_ptr, st0_tag, i); + } + FPU_stack_underflow(); + return; } - FPU_stack_underflow_i(i); - return; - } - if ( sti_tag == TAG_Empty ) - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_regi(st0_ptr, st0_tag, i); - } - FPU_stack_underflow(); - return; - } - clear_C1(); - - reg_copy(st0_ptr, &t); - reg_copy(sti_ptr, st0_ptr); - reg_copy(&t, sti_ptr); - - tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2)); - tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2)); - fpu_tag_word = tag_word; -} + clear_C1(); + reg_copy(st0_ptr, &t); + reg_copy(sti_ptr, st0_ptr); + reg_copy(&t, sti_ptr); + + tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2)); + tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2)); + fpu_tag_word = tag_word; +} void ffree_(void) { - /* ffree st(i) */ - FPU_settagi(FPU_rm, TAG_Empty); + /* ffree st(i) */ + FPU_settagi(FPU_rm, TAG_Empty); } - void ffreep(void) { - /* ffree st(i) + pop - unofficial code */ - FPU_settagi(FPU_rm, TAG_Empty); - FPU_pop(); + /* ffree st(i) + pop - unofficial code */ + FPU_settagi(FPU_rm, TAG_Empty); + FPU_pop(); } - void fst_i_(void) { - /* fst st(i) */ - FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); + /* fst st(i) */ + FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); } - void fstp_i(void) { - /* fstp st(i) */ - FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); - FPU_pop(); + /* fstp st(i) */ + FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); + FPU_pop(); } - diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index 65120f52385..4dae511c85a 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -7,7 +7,6 @@ | | +---------------------------------------------------------------------------*/ - #ifndef _FPU_EMU_H_ #define _FPU_EMU_H_ @@ -28,15 +27,15 @@ #endif #define EXP_BIAS Const(0) -#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ -#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ -#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but - still a 16 bit nr. */ +#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ +#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ +#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but + still a 16 bit nr. */ #define EXP_Infinity EXP_OVER #define EXP_NaN EXP_OVER #define EXTENDED_Ebias Const(0x3fff) -#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ +#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ #define SIGN_POS Const(0) #define SIGN_NEG Const(0x80) @@ -44,10 +43,9 @@ #define SIGN_Positive Const(0) #define SIGN_Negative Const(0x8000) - /* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */ /* The following fold to 2 (Special) in the Tag Word */ -#define TW_Denormal Const(4) /* De-normal */ +#define TW_Denormal Const(4) /* De-normal */ #define TW_Infinity Const(5) /* + or - infinity */ #define TW_NaN Const(6) /* Not a Number */ #define TW_Unsupported Const(7) /* Not supported by an 80486 */ @@ -67,14 +65,13 @@ #define DEST_RM 0x20 #define LOADED 0x40 -#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ - +#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ #ifndef __ASSEMBLY__ #include "fpu_system.h" -#include <asm/sigcontext.h> /* for struct _fpstate */ +#include <asm/sigcontext.h> /* for struct _fpstate */ #include <asm/math_emu.h> #include <linux/linkage.h> @@ -112,30 +109,33 @@ extern u_char emulating; #define PREFIX_DEFAULT 7 struct address { - unsigned int offset; - unsigned int selector:16; - unsigned int opcode:11; - unsigned int empty:5; + unsigned int offset; + unsigned int selector:16; + unsigned int opcode:11; + unsigned int empty:5; }; struct fpu__reg { - unsigned sigl; - unsigned sigh; - short exp; + unsigned sigl; + unsigned sigh; + short exp; }; -typedef void (*FUNC)(void); +typedef void (*FUNC) (void); typedef struct fpu__reg FPU_REG; -typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag); -typedef struct { u_char address_size, operand_size, segment; } - overrides; +typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag); +typedef struct { + u_char address_size, operand_size, segment; +} overrides; /* This structure is 32 bits: */ -typedef struct { overrides override; - u_char default_mode; } fpu_addr_modes; +typedef struct { + overrides override; + u_char default_mode; +} fpu_addr_modes; /* PROTECTED has a restricted meaning in the emulator; it is used to signal that the emulator needs to do special things to ensure that protection is respected in a segmented model. */ #define PROTECTED 4 -#define SIXTEEN 1 /* We rely upon this being 1 (true) */ +#define SIXTEEN 1 /* We rely upon this being 1 (true) */ #define VM86 SIXTEEN #define PM16 (SIXTEEN | PROTECTED) #define SEG32 PROTECTED @@ -168,8 +168,8 @@ extern u_char const data_sizes_16[32]; static inline void reg_copy(FPU_REG const *x, FPU_REG *y) { - *(short *)&(y->exp) = *(const short *)&(x->exp); - *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); + *(short *)&(y->exp) = *(const short *)&(x->exp); + *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); } #define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias) @@ -184,27 +184,26 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y) #define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] ) - /*----- Prototypes for functions written in assembler -----*/ /* extern void reg_move(FPU_REG *a, FPU_REG *b); */ asmlinkage int FPU_normalize(FPU_REG *x); asmlinkage int FPU_normalize_nuo(FPU_REG *x); asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expa, int expb); asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expon); asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign); + FPU_REG * answ, unsigned int control_w, u_char sign); asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expa, int expb); asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2, unsigned int control_w, u_char sign); -asmlinkage unsigned FPU_shrx(void *l, unsigned x); -asmlinkage unsigned FPU_shrxs(void *v, unsigned x); +asmlinkage unsigned FPU_shrx(void *l, unsigned x); +asmlinkage unsigned FPU_shrxs(void *v, unsigned x); asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y); asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, unsigned int control_w, u_char sign); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 1853524c8b5..760baeea5f0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -25,10 +25,11 @@ +---------------------------------------------------------------------------*/ #include <linux/signal.h> -#include <linux/ptrace.h> +#include <linux/regset.h> #include <asm/uaccess.h> #include <asm/desc.h> +#include <asm/user.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -36,726 +37,727 @@ #include "control_w.h" #include "status_w.h" -#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ +#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ +#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ /* WARNING: These codes are not documented by Intel in their 80486 manual and may not work on FPU clones or later Intel FPUs. */ /* Changes to support the un-doc codes provided by Linus Torvalds. */ -#define _d9_d8_ fstp_i /* unofficial code (19) */ -#define _dc_d0_ fcom_st /* unofficial code (14) */ -#define _dc_d8_ fcompst /* unofficial code (1c) */ -#define _dd_c8_ fxch_i /* unofficial code (0d) */ -#define _de_d0_ fcompst /* unofficial code (16) */ -#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ -#define _df_c8_ fxch_i /* unofficial code (0f) */ -#define _df_d0_ fstp_i /* unofficial code (17) */ -#define _df_d8_ fstp_i /* unofficial code (1f) */ +#define _d9_d8_ fstp_i /* unofficial code (19) */ +#define _dc_d0_ fcom_st /* unofficial code (14) */ +#define _dc_d8_ fcompst /* unofficial code (1c) */ +#define _dd_c8_ fxch_i /* unofficial code (0d) */ +#define _de_d0_ fcompst /* unofficial code (16) */ +#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ +#define _df_c8_ fxch_i /* unofficial code (0f) */ +#define _df_d0_ fstp_i /* unofficial code (17) */ +#define _df_d8_ fstp_i /* unofficial code (1f) */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, - fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, - fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, + fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, + fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, }; -#else /* Support only documented FPU op-codes */ +#else /* Support only documented FPU op-codes */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, - fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, - fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, + fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, + fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, }; #endif /* NO_UNDOC_CODE */ - -#define _NONE_ 0 /* Take no special action */ -#define _REG0_ 1 /* Need to check for not empty st(0) */ -#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ -#define _REGi_ 0 /* Uses st(rm) */ -#define _PUSH_ 3 /* Need to check for space to push onto stack */ -#define _null_ 4 /* Function illegal or not implemented */ -#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ -#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ -#define _REGIc 0 /* Compare st(0) and st(rm) */ -#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ +#define _NONE_ 0 /* Take no special action */ +#define _REG0_ 1 /* Need to check for not empty st(0) */ +#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ +#define _REGi_ 0 /* Uses st(rm) */ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ +#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ +#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ +#define _REGIc 0 /* Compare st(0) and st(rm) */ +#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ #ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. (see above) */ static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, - _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, + _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ }; -#else /* Support only documented FPU op-codes */ +#else /* Support only documented FPU op-codes */ static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, - _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, + _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ }; #endif /* NO_UNDOC_CODE */ - #ifdef RE_ENTRANT_CHECKING -u_char emulating=0; +u_char emulating = 0; #endif /* RE_ENTRANT_CHECKING */ -static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, - overrides *override); +static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip, + overrides * override); asmlinkage void math_emulate(long arg) { - u_char FPU_modrm, byte1; - unsigned short code; - fpu_addr_modes addr_modes; - int unmasked; - FPU_REG loaded_data; - FPU_REG *st0_ptr; - u_char loaded_tag, st0_tag; - void __user *data_address; - struct address data_sel_off; - struct address entry_sel_off; - unsigned long code_base = 0; - unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ - struct desc_struct code_descriptor; + u_char FPU_modrm, byte1; + unsigned short code; + fpu_addr_modes addr_modes; + int unmasked; + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char loaded_tag, st0_tag; + void __user *data_address; + struct address data_sel_off; + struct address entry_sel_off; + unsigned long code_base = 0; + unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ + struct desc_struct code_descriptor; #ifdef RE_ENTRANT_CHECKING - if ( emulating ) - { - printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); - } - RE_ENTRANT_CHECK_ON; + if (emulating) { + printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); + } + RE_ENTRANT_CHECK_ON; #endif /* RE_ENTRANT_CHECKING */ - if (!used_math()) - { - finit(); - set_used_math(); - } - - SETUP_DATA_AREA(arg); - - FPU_ORIG_EIP = FPU_EIP; - - if ( (FPU_EFLAGS & 0x00020000) != 0 ) - { - /* Virtual 8086 mode */ - addr_modes.default_mode = VM86; - FPU_EIP += code_base = FPU_CS << 4; - code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */ - } - else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS ) - { - addr_modes.default_mode = 0; - } - else if ( FPU_CS == __KERNEL_CS ) - { - printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP); - panic("Math emulation needed in kernel"); - } - else - { - - if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */ - { - /* Can only handle segmented addressing via the LDT - for now, and it must be 16 bit */ - printk("FPU emulator: Unsupported addressing mode\n"); - math_abort(FPU_info, SIGILL); + if (!used_math()) { + finit(); + set_used_math(); } - code_descriptor = LDT_DESCRIPTOR(FPU_CS); - if ( SEG_D_SIZE(code_descriptor) ) - { - /* The above test may be wrong, the book is not clear */ - /* Segmented 32 bit protected mode */ - addr_modes.default_mode = SEG32; + SETUP_DATA_AREA(arg); + + FPU_ORIG_EIP = FPU_EIP; + + if ((FPU_EFLAGS & 0x00020000) != 0) { + /* Virtual 8086 mode */ + addr_modes.default_mode = VM86; + FPU_EIP += code_base = FPU_CS << 4; + code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */ + } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) { + addr_modes.default_mode = 0; + } else if (FPU_CS == __KERNEL_CS) { + printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP); + panic("Math emulation needed in kernel"); + } else { + + if ((FPU_CS & 4) != 4) { /* Must be in the LDT */ + /* Can only handle segmented addressing via the LDT + for now, and it must be 16 bit */ + printk("FPU emulator: Unsupported addressing mode\n"); + math_abort(FPU_info, SIGILL); + } + + code_descriptor = LDT_DESCRIPTOR(FPU_CS); + if (SEG_D_SIZE(code_descriptor)) { + /* The above test may be wrong, the book is not clear */ + /* Segmented 32 bit protected mode */ + addr_modes.default_mode = SEG32; + } else { + /* 16 bit protected mode */ + addr_modes.default_mode = PM16; + } + FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); + code_limit = code_base + + (SEG_LIMIT(code_descriptor) + + 1) * SEG_GRANULARITY(code_descriptor) + - 1; + if (code_limit < code_base) + code_limit = 0xffffffff; } - else - { - /* 16 bit protected mode */ - addr_modes.default_mode = PM16; + + FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF); + + if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP, + &addr_modes.override)) { + RE_ENTRANT_CHECK_OFF; + printk + ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n" + "FPU emulator: self-modifying code! (emulation impossible)\n", + byte1); + RE_ENTRANT_CHECK_ON; + EXCEPTION(EX_INTERNAL | 0x126); + math_abort(FPU_info, SIGILL); } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor) - - 1; - if ( code_limit < code_base ) code_limit = 0xffffffff; - } - - FPU_lookahead = 1; - if (current->ptrace & PT_PTRACED) - FPU_lookahead = 0; - - if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP, - &addr_modes.override) ) - { - RE_ENTRANT_CHECK_OFF; - printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n" - "FPU emulator: self-modifying code! (emulation impossible)\n", - byte1); - RE_ENTRANT_CHECK_ON; - EXCEPTION(EX_INTERNAL|0x126); - math_abort(FPU_info,SIGILL); - } - -do_another_FPU_instruction: - - no_ip_update = 0; - - FPU_EIP++; /* We have fetched the prefix and first code bytes. */ - - if ( addr_modes.default_mode ) - { - /* This checks for the minimum instruction bytes. - We also need to check any extra (address mode) code access. */ - if ( FPU_EIP > code_limit ) - math_abort(FPU_info,SIGSEGV); - } - - if ( (byte1 & 0xf8) != 0xd8 ) - { - if ( byte1 == FWAIT_OPCODE ) - { - if (partial_status & SW_Summary) - goto do_the_FPU_interrupt; - else - goto FPU_fwait_done; + + do_another_FPU_instruction: + + no_ip_update = 0; + + FPU_EIP++; /* We have fetched the prefix and first code bytes. */ + + if (addr_modes.default_mode) { + /* This checks for the minimum instruction bytes. + We also need to check any extra (address mode) code access. */ + if (FPU_EIP > code_limit) + math_abort(FPU_info, SIGSEGV); } + + if ((byte1 & 0xf8) != 0xd8) { + if (byte1 == FWAIT_OPCODE) { + if (partial_status & SW_Summary) + goto do_the_FPU_interrupt; + else + goto FPU_fwait_done; + } #ifdef PARANOID - EXCEPTION(EX_INTERNAL|0x128); - math_abort(FPU_info,SIGILL); + EXCEPTION(EX_INTERNAL | 0x128); + math_abort(FPU_info, SIGILL); #endif /* PARANOID */ - } - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP); - RE_ENTRANT_CHECK_ON; - FPU_EIP++; - - if (partial_status & SW_Summary) - { - /* Ignore the error for now if the current instruction is a no-wait - control instruction */ - /* The 80486 manual contradicts itself on this topic, - but a real 80486 uses the following instructions: - fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex. - */ - code = (FPU_modrm << 8) | byte1; - if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */ - (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv, - fnstsw */ - ((code & 0xc000) != 0xc000))) ) ) - { - /* - * We need to simulate the action of the kernel to FPU - * interrupts here. - */ - do_the_FPU_interrupt: - - FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */ - - RE_ENTRANT_CHECK_OFF; - current->thread.trap_no = 16; - current->thread.error_code = 0; - send_sig(SIGFPE, current, 1); - return; - } - } - - entry_sel_off.offset = FPU_ORIG_EIP; - entry_sel_off.selector = FPU_CS; - entry_sel_off.opcode = (byte1 << 8) | FPU_modrm; - - FPU_rm = FPU_modrm & 7; - - if ( FPU_modrm < 0300 ) - { - /* All of these instructions use the mod/rm byte to get a data address */ - - if ( (addr_modes.default_mode & SIXTEEN) - ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) ) - data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off, - addr_modes); - else - data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off, - addr_modes); - - if ( addr_modes.default_mode ) - { - if ( FPU_EIP-1 > code_limit ) - math_abort(FPU_info,SIGSEGV); } - if ( !(byte1 & 1) ) - { - unsigned short status1 = partial_status; - - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - - /* Stack underflow has priority */ - if ( NOT_EMPTY_ST0 ) - { - if ( addr_modes.default_mode & PROTECTED ) - { - /* This table works for 16 and 32 bit protected mode */ - if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] ) - math_abort(FPU_info,SIGSEGV); + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP); + RE_ENTRANT_CHECK_ON; + FPU_EIP++; + + if (partial_status & SW_Summary) { + /* Ignore the error for now if the current instruction is a no-wait + control instruction */ + /* The 80486 manual contradicts itself on this topic, + but a real 80486 uses the following instructions: + fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex. + */ + code = (FPU_modrm << 8) | byte1; + if (!((((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */ + (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv, + fnstsw */ + ((code & 0xc000) != 0xc000))))) { + /* + * We need to simulate the action of the kernel to FPU + * interrupts here. + */ + do_the_FPU_interrupt: + + FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */ + + RE_ENTRANT_CHECK_OFF; + current->thread.trap_no = 16; + current->thread.error_code = 0; + send_sig(SIGFPE, current, 1); + return; } + } - unmasked = 0; /* Do this here to stop compiler warnings. */ - switch ( (byte1 >> 1) & 3 ) - { - case 0: - unmasked = FPU_load_single((float __user *)data_address, - &loaded_data); - loaded_tag = unmasked & 0xff; - unmasked &= ~0xff; - break; - case 1: - loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); - break; - case 2: - unmasked = FPU_load_double((double __user *)data_address, - &loaded_data); - loaded_tag = unmasked & 0xff; - unmasked &= ~0xff; - break; - case 3: - default: /* Used here to suppress gcc warnings. */ - loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); - break; - } + entry_sel_off.offset = FPU_ORIG_EIP; + entry_sel_off.selector = FPU_CS; + entry_sel_off.opcode = (byte1 << 8) | FPU_modrm; - /* No more access to user memory, it is safe - to use static data now */ - - /* NaN operands have the next priority. */ - /* We have to delay looking at st(0) until after - loading the data, because that data might contain an SNaN */ - if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) || - ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) ) - { - /* Restore the status word; we might have loaded a - denormal. */ - partial_status = status1; - if ( (FPU_modrm & 0x30) == 0x10 ) - { - /* fcom or fcomp */ - EXCEPTION(EX_Invalid); - setcc(SW_C3 | SW_C2 | SW_C0); - if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) - FPU_pop(); /* fcomp, masked, so we pop. */ - } - else - { - if ( loaded_tag == TAG_Special ) - loaded_tag = FPU_Special(&loaded_data); -#ifdef PECULIAR_486 - /* This is not really needed, but gives behaviour - identical to an 80486 */ - if ( (FPU_modrm & 0x28) == 0x20 ) - /* fdiv or fsub */ - real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data); - else -#endif /* PECULIAR_486 */ - /* fadd, fdivr, fmul, or fsubr */ - real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr); - } - goto reg_mem_instr_done; - } + FPU_rm = FPU_modrm & 7; - if ( unmasked && !((FPU_modrm & 0x30) == 0x10) ) - { - /* Is not a comparison instruction. */ - if ( (FPU_modrm & 0x38) == 0x38 ) - { - /* fdivr */ - if ( (st0_tag == TAG_Zero) && - ((loaded_tag == TAG_Valid) - || (loaded_tag == TAG_Special - && isdenormal(&loaded_data))) ) - { - if ( FPU_divide_by_zero(0, getsign(&loaded_data)) - < 0 ) - { - /* We use the fact here that the unmasked - exception in the loaded data was for a - denormal operand */ - /* Restore the state of the denormal op bit */ - partial_status &= ~SW_Denorm_Op; - partial_status |= status1 & SW_Denorm_Op; - } - else - setsign(st0_ptr, getsign(&loaded_data)); - } - } - goto reg_mem_instr_done; - } + if (FPU_modrm < 0300) { + /* All of these instructions use the mod/rm byte to get a data address */ - switch ( (FPU_modrm >> 3) & 7 ) - { - case 0: /* fadd */ - clear_C1(); - FPU_add(&loaded_data, loaded_tag, 0, control_word); - break; - case 1: /* fmul */ - clear_C1(); - FPU_mul(&loaded_data, loaded_tag, 0, control_word); - break; - case 2: /* fcom */ - FPU_compare_st_data(&loaded_data, loaded_tag); - break; - case 3: /* fcomp */ - if ( !FPU_compare_st_data(&loaded_data, loaded_tag) - && !unmasked ) - FPU_pop(); - break; - case 4: /* fsub */ - clear_C1(); - FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 5: /* fsubr */ - clear_C1(); - FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 6: /* fdiv */ - clear_C1(); - FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 7: /* fdivr */ - clear_C1(); - if ( st0_tag == TAG_Zero ) - partial_status = status1; /* Undo any denorm tag, - zero-divide has priority. */ - FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); - break; + if ((addr_modes.default_mode & SIXTEEN) + ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX)) + data_address = + FPU_get_address_16(FPU_modrm, &FPU_EIP, + &data_sel_off, addr_modes); + else + data_address = + FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off, + addr_modes); + + if (addr_modes.default_mode) { + if (FPU_EIP - 1 > code_limit) + math_abort(FPU_info, SIGSEGV); } - } - else - { - if ( (FPU_modrm & 0x30) == 0x10 ) - { - /* The instruction is fcom or fcomp */ - EXCEPTION(EX_StackUnder); - setcc(SW_C3 | SW_C2 | SW_C0); - if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) - FPU_pop(); /* fcomp */ + + if (!(byte1 & 1)) { + unsigned short status1 = partial_status; + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + + /* Stack underflow has priority */ + if (NOT_EMPTY_ST0) { + if (addr_modes.default_mode & PROTECTED) { + /* This table works for 16 and 32 bit protected mode */ + if (access_limit < + data_sizes_16[(byte1 >> 1) & 3]) + math_abort(FPU_info, SIGSEGV); + } + + unmasked = 0; /* Do this here to stop compiler warnings. */ + switch ((byte1 >> 1) & 3) { + case 0: + unmasked = + FPU_load_single((float __user *) + data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 1: + loaded_tag = + FPU_load_int32((long __user *) + data_address, + &loaded_data); + break; + case 2: + unmasked = + FPU_load_double((double __user *) + data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 3: + default: /* Used here to suppress gcc warnings. */ + loaded_tag = + FPU_load_int16((short __user *) + data_address, + &loaded_data); + break; + } + + /* No more access to user memory, it is safe + to use static data now */ + + /* NaN operands have the next priority. */ + /* We have to delay looking at st(0) until after + loading the data, because that data might contain an SNaN */ + if (((st0_tag == TAG_Special) && isNaN(st0_ptr)) + || ((loaded_tag == TAG_Special) + && isNaN(&loaded_data))) { + /* Restore the status word; we might have loaded a + denormal. */ + partial_status = status1; + if ((FPU_modrm & 0x30) == 0x10) { + /* fcom or fcomp */ + EXCEPTION(EX_Invalid); + setcc(SW_C3 | SW_C2 | SW_C0); + if ((FPU_modrm & 0x08) + && (control_word & + CW_Invalid)) + FPU_pop(); /* fcomp, masked, so we pop. */ + } else { + if (loaded_tag == TAG_Special) + loaded_tag = + FPU_Special + (&loaded_data); +#ifdef PECULIAR_486 + /* This is not really needed, but gives behaviour + identical to an 80486 */ + if ((FPU_modrm & 0x28) == 0x20) + /* fdiv or fsub */ + real_2op_NaN + (&loaded_data, + loaded_tag, 0, + &loaded_data); + else +#endif /* PECULIAR_486 */ + /* fadd, fdivr, fmul, or fsubr */ + real_2op_NaN + (&loaded_data, + loaded_tag, 0, + st0_ptr); + } + goto reg_mem_instr_done; + } + + if (unmasked && !((FPU_modrm & 0x30) == 0x10)) { + /* Is not a comparison instruction. */ + if ((FPU_modrm & 0x38) == 0x38) { + /* fdivr */ + if ((st0_tag == TAG_Zero) && + ((loaded_tag == TAG_Valid) + || (loaded_tag == + TAG_Special + && + isdenormal + (&loaded_data)))) { + if (FPU_divide_by_zero + (0, + getsign + (&loaded_data)) + < 0) { + /* We use the fact here that the unmasked + exception in the loaded data was for a + denormal operand */ + /* Restore the state of the denormal op bit */ + partial_status + &= + ~SW_Denorm_Op; + partial_status + |= + status1 & + SW_Denorm_Op; + } else + setsign(st0_ptr, + getsign + (&loaded_data)); + } + } + goto reg_mem_instr_done; + } + + switch ((FPU_modrm >> 3) & 7) { + case 0: /* fadd */ + clear_C1(); + FPU_add(&loaded_data, loaded_tag, 0, + control_word); + break; + case 1: /* fmul */ + clear_C1(); + FPU_mul(&loaded_data, loaded_tag, 0, + control_word); + break; + case 2: /* fcom */ + FPU_compare_st_data(&loaded_data, + loaded_tag); + break; + case 3: /* fcomp */ + if (!FPU_compare_st_data + (&loaded_data, loaded_tag) + && !unmasked) + FPU_pop(); + break; + case 4: /* fsub */ + clear_C1(); + FPU_sub(LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 5: /* fsubr */ + clear_C1(); + FPU_sub(REV | LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 6: /* fdiv */ + clear_C1(); + FPU_div(LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 7: /* fdivr */ + clear_C1(); + if (st0_tag == TAG_Zero) + partial_status = status1; /* Undo any denorm tag, + zero-divide has priority. */ + FPU_div(REV | LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + } + } else { + if ((FPU_modrm & 0x30) == 0x10) { + /* The instruction is fcom or fcomp */ + EXCEPTION(EX_StackUnder); + setcc(SW_C3 | SW_C2 | SW_C0); + if ((FPU_modrm & 0x08) + && (control_word & CW_Invalid)) + FPU_pop(); /* fcomp */ + } else + FPU_stack_underflow(); + } + reg_mem_instr_done: + operand_address = data_sel_off; + } else { + if (!(no_ip_update = + FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) + >> 1, addr_modes, data_address))) { + operand_address = data_sel_off; + } } - else - FPU_stack_underflow(); - } - reg_mem_instr_done: - operand_address = data_sel_off; - } - else - { - if ( !(no_ip_update = - FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1, - addr_modes, data_address)) ) - { - operand_address = data_sel_off; - } - } - } - else - { - /* None of these instructions access user memory */ - u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7); + } else { + /* None of these instructions access user memory */ + u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7); #ifdef PECULIAR_486 - /* This is supposed to be undefined, but a real 80486 seems - to do this: */ - operand_address.offset = 0; - operand_address.selector = FPU_DS; + /* This is supposed to be undefined, but a real 80486 seems + to do this: */ + operand_address.offset = 0; + operand_address.selector = FPU_DS; #endif /* PECULIAR_486 */ - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - switch ( type_table[(int) instr_index] ) - { - case _NONE_: /* also _REGIc: _REGIn */ - break; - case _REG0_: - if ( !NOT_EMPTY_ST0 ) - { - FPU_stack_underflow(); - goto FPU_instruction_done; - } - break; - case _REGIi: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow_i(FPU_rm); - goto FPU_instruction_done; - } - break; - case _REGIp: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow_pop(FPU_rm); - goto FPU_instruction_done; - } - break; - case _REGI_: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow(); - goto FPU_instruction_done; - } - break; - case _PUSH_: /* Only used by the fld st(i) instruction */ - break; - case _null_: - FPU_illegal(); - goto FPU_instruction_done; - default: - EXCEPTION(EX_INTERNAL|0x111); - goto FPU_instruction_done; - } - (*st_instr_table[(int) instr_index])(); + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + switch (type_table[(int)instr_index]) { + case _NONE_: /* also _REGIc: _REGIn */ + break; + case _REG0_: + if (!NOT_EMPTY_ST0) { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _REGIi: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow_i(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGIp: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow_pop(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGI_: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _PUSH_: /* Only used by the fld st(i) instruction */ + break; + case _null_: + FPU_illegal(); + goto FPU_instruction_done; + default: + EXCEPTION(EX_INTERNAL | 0x111); + goto FPU_instruction_done; + } + (*st_instr_table[(int)instr_index]) (); -FPU_instruction_done: - ; - } + FPU_instruction_done: + ; + } - if ( ! no_ip_update ) - instruction_address = entry_sel_off; + if (!no_ip_update) + instruction_address = entry_sel_off; -FPU_fwait_done: + FPU_fwait_done: #ifdef DEBUG - RE_ENTRANT_CHECK_OFF; - FPU_printall(); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_printall(); + RE_ENTRANT_CHECK_ON; #endif /* DEBUG */ - if (FPU_lookahead && !need_resched()) - { - FPU_ORIG_EIP = FPU_EIP - code_base; - if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP, - &addr_modes.override) ) - goto do_another_FPU_instruction; - } + if (FPU_lookahead && !need_resched()) { + FPU_ORIG_EIP = FPU_EIP - code_base; + if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP, + &addr_modes.override)) + goto do_another_FPU_instruction; + } - if ( addr_modes.default_mode ) - FPU_EIP -= code_base; + if (addr_modes.default_mode) + FPU_EIP -= code_base; - RE_ENTRANT_CHECK_OFF; + RE_ENTRANT_CHECK_OFF; } - /* Support for prefix bytes is not yet complete. To properly handle all prefix bytes, further changes are needed in the emulator code which accesses user address space. Access to separate segments is important for msdos emulation. */ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, - overrides *override) + overrides * override) { - u_char byte; - u_char __user *ip = *fpu_eip; - - *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */ - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(byte, ip); - RE_ENTRANT_CHECK_ON; - - while ( 1 ) - { - switch ( byte ) - { - case ADDR_SIZE_PREFIX: - override->address_size = ADDR_SIZE_PREFIX; - goto do_next_byte; - - case OP_SIZE_PREFIX: - override->operand_size = OP_SIZE_PREFIX; - goto do_next_byte; - - case PREFIX_CS: - override->segment = PREFIX_CS_; - goto do_next_byte; - case PREFIX_ES: - override->segment = PREFIX_ES_; - goto do_next_byte; - case PREFIX_SS: - override->segment = PREFIX_SS_; - goto do_next_byte; - case PREFIX_FS: - override->segment = PREFIX_FS_; - goto do_next_byte; - case PREFIX_GS: - override->segment = PREFIX_GS_; - goto do_next_byte; - case PREFIX_DS: - override->segment = PREFIX_DS_; - goto do_next_byte; + u_char byte; + u_char __user *ip = *fpu_eip; + + *override = (overrides) { + 0, 0, PREFIX_DEFAULT}; /* defaults */ + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + + while (1) { + switch (byte) { + case ADDR_SIZE_PREFIX: + override->address_size = ADDR_SIZE_PREFIX; + goto do_next_byte; + + case OP_SIZE_PREFIX: + override->operand_size = OP_SIZE_PREFIX; + goto do_next_byte; + + case PREFIX_CS: + override->segment = PREFIX_CS_; + goto do_next_byte; + case PREFIX_ES: + override->segment = PREFIX_ES_; + goto do_next_byte; + case PREFIX_SS: + override->segment = PREFIX_SS_; + goto do_next_byte; + case PREFIX_FS: + override->segment = PREFIX_FS_; + goto do_next_byte; + case PREFIX_GS: + override->segment = PREFIX_GS_; + goto do_next_byte; + case PREFIX_DS: + override->segment = PREFIX_DS_; + goto do_next_byte; /* lock is not a valid prefix for FPU instructions, let the cpu handle it to generate a SIGILL. */ /* case PREFIX_LOCK: */ - /* rep.. prefixes have no meaning for FPU instructions */ - case PREFIX_REPE: - case PREFIX_REPNE: - - do_next_byte: - ip++; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(byte, ip); - RE_ENTRANT_CHECK_ON; - break; - case FWAIT_OPCODE: - *Byte = byte; - return 1; - default: - if ( (byte & 0xf8) == 0xd8 ) - { - *Byte = byte; - *fpu_eip = ip; - return 1; - } - else - { - /* Not a valid sequence of prefix bytes followed by - an FPU instruction. */ - *Byte = byte; /* Needed for error message. */ - return 0; - } + /* rep.. prefixes have no meaning for FPU instructions */ + case PREFIX_REPE: + case PREFIX_REPNE: + + do_next_byte: + ip++; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + break; + case FWAIT_OPCODE: + *Byte = byte; + return 1; + default: + if ((byte & 0xf8) == 0xd8) { + *Byte = byte; + *fpu_eip = ip; + return 1; + } else { + /* Not a valid sequence of prefix bytes followed by + an FPU instruction. */ + *Byte = byte; /* Needed for error message. */ + return 0; + } + } } - } } - -void math_abort(struct info * info, unsigned int signal) +void math_abort(struct info *info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; current->thread.trap_no = 16; current->thread.error_code = 0; - send_sig(signal,current,1); + send_sig(signal, current, 1); RE_ENTRANT_CHECK_OFF; - __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); + __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4)); #ifdef PARANOID - printk("ERROR: wm-FPU-emu math_abort failed!\n"); + printk("ERROR: wm-FPU-emu math_abort failed!\n"); #endif /* PARANOID */ } - - #define S387 ((struct i387_soft_struct *)s387) #define sstatus_word() \ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) -int restore_i387_soft(void *s387, struct _fpstate __user *buf) +int fpregs_soft_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) { - u_char __user *d = (u_char __user *)buf; - int offset, other, i, tags, regnr, tag, newtop; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10); - if (__copy_from_user(&S387->cwd, d, 7*4)) - return -1; - RE_ENTRANT_CHECK_ON; - - d += 7*4; - - S387->ftop = (S387->swd >> SW_Top_Shift) & 7; - offset = (S387->ftop & 7) * 10; - other = 80 - offset; - - RE_ENTRANT_CHECK_OFF; - /* Copy all registers in stack order. */ - if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other)) - return -1; - if ( offset ) - if (__copy_from_user((u_char *)&S387->st_space, d+other, offset)) - return -1; - RE_ENTRANT_CHECK_ON; - - /* The tags may need to be corrected now. */ - tags = S387->twd; - newtop = S387->ftop; - for ( i = 0; i < 8; i++ ) - { - regnr = (i+newtop) & 7; - if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty ) - { - /* The loaded data over-rides all other cases. */ - tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr)); - tags &= ~(3 << (regnr*2)); - tags |= (tag & 3) << (regnr*2); + struct i387_soft_struct *s387 = &target->thread.i387.soft; + void *space = s387->st_space; + int ret; + int offset, other, i, tags, regnr, tag, newtop; + + RE_ENTRANT_CHECK_OFF; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0, + offsetof(struct i387_soft_struct, st_space)); + RE_ENTRANT_CHECK_ON; + + if (ret) + return ret; + + S387->ftop = (S387->swd >> SW_Top_Shift) & 7; + offset = (S387->ftop & 7) * 10; + other = 80 - offset; + + RE_ENTRANT_CHECK_OFF; + + /* Copy all registers in stack order. */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + space + offset, 0, other); + if (!ret && offset) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + space, 0, offset); + + RE_ENTRANT_CHECK_ON; + + /* The tags may need to be corrected now. */ + tags = S387->twd; + newtop = S387->ftop; + for (i = 0; i < 8; i++) { + regnr = (i + newtop) & 7; + if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) { + /* The loaded data over-rides all other cases. */ + tag = + FPU_tagof((FPU_REG *) ((u_char *) S387->st_space + + 10 * regnr)); + tags &= ~(3 << (regnr * 2)); + tags |= (tag & 3) << (regnr * 2); + } } - } - S387->twd = tags; + S387->twd = tags; - return 0; + return ret; } - -int save_i387_soft(void *s387, struct _fpstate __user * buf) +int fpregs_soft_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { - u_char __user *d = (u_char __user *)buf; - int offset = (S387->ftop & 7) * 10, other = 80 - offset; + struct i387_soft_struct *s387 = &target->thread.i387.soft; + const void *space = s387->st_space; + int ret; + int offset = (S387->ftop & 7) * 10, other = 80 - offset; + + RE_ENTRANT_CHECK_OFF; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10); #ifdef PECULIAR_486 - S387->cwd &= ~0xe080; - /* An 80486 sets nearly all of the reserved bits to 1. */ - S387->cwd |= 0xffff0040; - S387->swd = sstatus_word() | 0xffff0000; - S387->twd |= 0xffff0000; - S387->fcs &= ~0xf8000000; - S387->fos |= 0xffff0000; + S387->cwd &= ~0xe080; + /* An 80486 sets nearly all of the reserved bits to 1. */ + S387->cwd |= 0xffff0040; + S387->swd = sstatus_word() | 0xffff0000; + S387->twd |= 0xffff0000; + S387->fcs &= ~0xf8000000; + S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - if (__copy_to_user(d, &S387->cwd, 7*4)) - return -1; - RE_ENTRANT_CHECK_ON; - - d += 7*4; - - RE_ENTRANT_CHECK_OFF; - /* Copy all registers in stack order. */ - if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other)) - return -1; - if ( offset ) - if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset)) - return -1; - RE_ENTRANT_CHECK_ON; - - return 1; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, + offsetof(struct i387_soft_struct, st_space)); + + /* Copy all registers in stack order. */ + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + space + offset, 0, other); + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + space, 0, offset); + + RE_ENTRANT_CHECK_ON; + + return ret; } diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c index e3b5d465587..233e5af566f 100644 --- a/arch/x86/math-emu/fpu_etc.c +++ b/arch/x86/math-emu/fpu_etc.c @@ -16,128 +16,115 @@ #include "status_w.h" #include "reg_constant.h" - static void fchs(FPU_REG *st0_ptr, u_char st0tag) { - if ( st0tag ^ TAG_Empty ) - { - signbyte(st0_ptr) ^= SIGN_NEG; - clear_C1(); - } - else - FPU_stack_underflow(); + if (st0tag ^ TAG_Empty) { + signbyte(st0_ptr) ^= SIGN_NEG; + clear_C1(); + } else + FPU_stack_underflow(); } - static void fabs(FPU_REG *st0_ptr, u_char st0tag) { - if ( st0tag ^ TAG_Empty ) - { - setpositive(st0_ptr); - clear_C1(); - } - else - FPU_stack_underflow(); + if (st0tag ^ TAG_Empty) { + setpositive(st0_ptr); + clear_C1(); + } else + FPU_stack_underflow(); } - static void ftst_(FPU_REG *st0_ptr, u_char st0tag) { - switch (st0tag) - { - case TAG_Zero: - setcc(SW_C3); - break; - case TAG_Valid: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - break; - case TAG_Special: - switch ( FPU_Special(st0_ptr) ) - { - case TW_Denormal: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - if ( denormal_operand() < 0 ) - { -#ifdef PECULIAR_486 - /* This is weird! */ - if (getsign(st0_ptr) == SIGN_POS) + switch (st0tag) { + case TAG_Zero: setcc(SW_C3); + break; + case TAG_Valid: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + case TAG_Special: + switch (FPU_Special(st0_ptr)) { + case TW_Denormal: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + if (denormal_operand() < 0) { +#ifdef PECULIAR_486 + /* This is weird! */ + if (getsign(st0_ptr) == SIGN_POS) + setcc(SW_C3); #endif /* PECULIAR_486 */ - return; - } - break; - case TW_NaN: - setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ - EXCEPTION(EX_Invalid); - break; - case TW_Infinity: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - break; - default: - setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ - EXCEPTION(EX_INTERNAL|0x14); - break; + return; + } + break; + case TW_NaN: + setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_Invalid); + break; + case TW_Infinity: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + default: + setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_INTERNAL | 0x14); + break; + } + break; + case TAG_Empty: + setcc(SW_C0 | SW_C2 | SW_C3); + EXCEPTION(EX_StackUnder); + break; } - break; - case TAG_Empty: - setcc(SW_C0|SW_C2|SW_C3); - EXCEPTION(EX_StackUnder); - break; - } } - static void fxam(FPU_REG *st0_ptr, u_char st0tag) { - int c = 0; - switch (st0tag) - { - case TAG_Empty: - c = SW_C3|SW_C0; - break; - case TAG_Zero: - c = SW_C3; - break; - case TAG_Valid: - c = SW_C2; - break; - case TAG_Special: - switch ( FPU_Special(st0_ptr) ) - { - case TW_Denormal: - c = SW_C2|SW_C3; /* Denormal */ - break; - case TW_NaN: - /* We also use NaN for unsupported types. */ - if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) ) - c = SW_C0; - break; - case TW_Infinity: - c = SW_C2|SW_C0; - break; + int c = 0; + switch (st0tag) { + case TAG_Empty: + c = SW_C3 | SW_C0; + break; + case TAG_Zero: + c = SW_C3; + break; + case TAG_Valid: + c = SW_C2; + break; + case TAG_Special: + switch (FPU_Special(st0_ptr)) { + case TW_Denormal: + c = SW_C2 | SW_C3; /* Denormal */ + break; + case TW_NaN: + /* We also use NaN for unsupported types. */ + if ((st0_ptr->sigh & 0x80000000) + && (exponent(st0_ptr) == EXP_OVER)) + c = SW_C0; + break; + case TW_Infinity: + c = SW_C2 | SW_C0; + break; + } } - } - if ( getsign(st0_ptr) == SIGN_NEG ) - c |= SW_C1; - setcc(c); + if (getsign(st0_ptr) == SIGN_NEG) + c |= SW_C1; + setcc(c); } - static FUNC_ST0 const fp_etc_table[] = { - fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal, - ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal + fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal, + ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal }; void FPU_etc(void) { - (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0()); + (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0()); } diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 37a8a7fe7e2..aa49b6a0d85 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -66,7 +66,7 @@ extern int FPU_Special(FPU_REG const *ptr); extern int isNaN(FPU_REG const *ptr); extern void FPU_pop(void); extern int FPU_empty_i(int stnr); -extern int FPU_stackoverflow(FPU_REG **st_new_ptr); +extern int FPU_stackoverflow(FPU_REG ** st_new_ptr); extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr); extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag); extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag); @@ -75,21 +75,23 @@ extern void FPU_triga(void); extern void FPU_trigb(void); /* get_address.c */ extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, fpu_addr_modes addr_modes); + struct address *addr, + fpu_addr_modes addr_modes); extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, fpu_addr_modes addr_modes); + struct address *addr, + fpu_addr_modes addr_modes); /* load_store.c */ extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes, - void __user *data_address); + void __user * data_address); /* poly_2xm1.c */ -extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result); +extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result); /* poly_atan.c */ -extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, +extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, u_char st1_tag); /* poly_l2.c */ extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign); extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1, - FPU_REG *d); + FPU_REG * d); /* poly_sin.c */ extern void poly_sine(FPU_REG *st0_ptr); extern void poly_cos(FPU_REG *st0_ptr); @@ -117,10 +119,13 @@ extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data); extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data); extern int FPU_load_bcd(u_char __user *s); extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, - long double __user *d); -extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat); -extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single); -extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d); + long double __user * d); +extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, + double __user * dfloat); +extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, + float __user * single); +extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, + long long __user * d); extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d); extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); @@ -137,4 +142,3 @@ extern int FPU_div(int flags, int regrm, int control_w); /* reg_convert.c */ extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x); #endif /* _FPU_PROTO_H */ - diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c index cb436fe20e4..d9c657cd774 100644 --- a/arch/x86/math-emu/fpu_tags.c +++ b/arch/x86/math-emu/fpu_tags.c @@ -14,114 +14,102 @@ #include "fpu_system.h" #include "exception.h" - void FPU_pop(void) { - fpu_tag_word |= 3 << ((top & 7)*2); - top++; + fpu_tag_word |= 3 << ((top & 7) * 2); + top++; } - int FPU_gettag0(void) { - return (fpu_tag_word >> ((top & 7)*2)) & 3; + return (fpu_tag_word >> ((top & 7) * 2)) & 3; } - int FPU_gettagi(int stnr) { - return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3; + return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3; } - int FPU_gettag(int regnr) { - return (fpu_tag_word >> ((regnr & 7)*2)) & 3; + return (fpu_tag_word >> ((regnr & 7) * 2)) & 3; } - void FPU_settag0(int tag) { - int regnr = top; - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + int regnr = top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - void FPU_settagi(int stnr, int tag) { - int regnr = stnr+top; - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + int regnr = stnr + top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - void FPU_settag(int regnr, int tag) { - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - int FPU_Special(FPU_REG const *ptr) { - int exp = exponent(ptr); - - if ( exp == EXP_BIAS+EXP_UNDER ) - return TW_Denormal; - else if ( exp != EXP_BIAS+EXP_OVER ) - return TW_NaN; - else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) ) - return TW_Infinity; - return TW_NaN; + int exp = exponent(ptr); + + if (exp == EXP_BIAS + EXP_UNDER) + return TW_Denormal; + else if (exp != EXP_BIAS + EXP_OVER) + return TW_NaN; + else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) + return TW_Infinity; + return TW_NaN; } - int isNaN(FPU_REG const *ptr) { - return ( (exponent(ptr) == EXP_BIAS+EXP_OVER) - && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) ); + return ((exponent(ptr) == EXP_BIAS + EXP_OVER) + && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0))); } - int FPU_empty_i(int stnr) { - int regnr = (top+stnr) & 7; + int regnr = (top + stnr) & 7; - return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty; + return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty; } - -int FPU_stackoverflow(FPU_REG **st_new_ptr) +int FPU_stackoverflow(FPU_REG ** st_new_ptr) { - *st_new_ptr = &st(-1); + *st_new_ptr = &st(-1); - return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty; + return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty; } - void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr) { - reg_copy(r, &st(stnr)); - FPU_settagi(stnr, tag); + reg_copy(r, &st(stnr)); + FPU_settagi(stnr, tag); } void FPU_copy_to_reg1(FPU_REG const *r, u_char tag) { - reg_copy(r, &st(1)); - FPU_settagi(1, tag); + reg_copy(r, &st(1)); + FPU_settagi(1, tag); } void FPU_copy_to_reg0(FPU_REG const *r, u_char tag) { - int regnr = top; - regnr &= 7; + int regnr = top; + regnr &= 7; - reg_copy(r, &st(0)); + reg_copy(r, &st(0)); - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 403cbde1d42..ecd06680581 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -15,11 +15,10 @@ #include "fpu_emu.h" #include "status_w.h" #include "control_w.h" -#include "reg_constant.h" +#include "reg_constant.h" static void rem_kernel(unsigned long long st0, unsigned long long *y, - unsigned long long st1, - unsigned long long q, int n); + unsigned long long st1, unsigned long long q, int n); #define BETTER_THAN_486 @@ -33,788 +32,706 @@ static void rem_kernel(unsigned long long st0, unsigned long long *y, precision of the result sometimes degrades to about 63.9 bits */ static int trig_arg(FPU_REG *st0_ptr, int even) { - FPU_REG tmp; - u_char tmptag; - unsigned long long q; - int old_cw = control_word, saved_status = partial_status; - int tag, st0_tag = TAG_Valid; - - if ( exponent(st0_ptr) >= 63 ) - { - partial_status |= SW_C2; /* Reduction incomplete. */ - return -1; - } - - control_word &= ~CW_RC; - control_word |= RC_CHOP; - - setpositive(st0_ptr); - tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f, - SIGN_POS); - - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow - to 2^64 */ - q = significand(&tmp); - if ( q ) - { - rem_kernel(significand(st0_ptr), - &significand(&tmp), - significand(&CONST_PI2), - q, exponent(st0_ptr) - exponent(&CONST_PI2)); - setexponent16(&tmp, exponent(&CONST_PI2)); - st0_tag = FPU_normalize(&tmp); - FPU_copy_to_reg0(&tmp, st0_tag); - } - - if ( (even && !(q & 1)) || (!even && (q & 1)) ) - { - st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION); + FPU_REG tmp; + u_char tmptag; + unsigned long long q; + int old_cw = control_word, saved_status = partial_status; + int tag, st0_tag = TAG_Valid; + + if (exponent(st0_ptr) >= 63) { + partial_status |= SW_C2; /* Reduction incomplete. */ + return -1; + } -#ifdef BETTER_THAN_486 - /* So far, the results are exact but based upon a 64 bit - precision approximation to pi/2. The technique used - now is equivalent to using an approximation to pi/2 which - is accurate to about 128 bits. */ - if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) ) - { - /* This code gives the effect of having pi/2 to better than - 128 bits precision. */ - - significand(&tmp) = q + 1; - setexponent16(&tmp, 63); - FPU_normalize(&tmp); - tmptag = - FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS, - exponent(&CONST_PI2extra) + exponent(&tmp)); - setsign(&tmp, getsign(&CONST_PI2extra)); - st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); - if ( signnegative(st0_ptr) ) - { - /* CONST_PI2extra is negative, so the result of the addition - can be negative. This means that the argument is actually - in a different quadrant. The correction is always < pi/2, - so it can't overflow into yet another quadrant. */ - setpositive(st0_ptr); - q++; - } + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + setpositive(st0_ptr); + tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f, + SIGN_POS); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow + to 2^64 */ + q = significand(&tmp); + if (q) { + rem_kernel(significand(st0_ptr), + &significand(&tmp), + significand(&CONST_PI2), + q, exponent(st0_ptr) - exponent(&CONST_PI2)); + setexponent16(&tmp, exponent(&CONST_PI2)); + st0_tag = FPU_normalize(&tmp); + FPU_copy_to_reg0(&tmp, st0_tag); } + + if ((even && !(q & 1)) || (!even && (q & 1))) { + st0_tag = + FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2, + FULL_PRECISION); + +#ifdef BETTER_THAN_486 + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) + || (q > 1)) { + /* This code gives the effect of having pi/2 to better than + 128 bits precision. */ + + significand(&tmp) = q + 1; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); + tmptag = + FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, + FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI2extra) + + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); + if (signnegative(st0_ptr)) { + /* CONST_PI2extra is negative, so the result of the addition + can be negative. This means that the argument is actually + in a different quadrant. The correction is always < pi/2, + so it can't overflow into yet another quadrant. */ + setpositive(st0_ptr); + q++; + } + } #endif /* BETTER_THAN_486 */ - } + } #ifdef BETTER_THAN_486 - else - { - /* So far, the results are exact but based upon a 64 bit - precision approximation to pi/2. The technique used - now is equivalent to using an approximation to pi/2 which - is accurate to about 128 bits. */ - if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) - || (q > 1) ) - { - /* This code gives the effect of having p/2 to better than - 128 bits precision. */ - - significand(&tmp) = q; - setexponent16(&tmp, 63); - FPU_normalize(&tmp); /* This must return TAG_Valid */ - tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, - SIGN_POS, - exponent(&CONST_PI2extra) + exponent(&tmp)); - setsign(&tmp, getsign(&CONST_PI2extra)); - st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp, - FULL_PRECISION); - if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) && - ((st0_ptr->sigh > CONST_PI2.sigh) - || ((st0_ptr->sigh == CONST_PI2.sigh) - && (st0_ptr->sigl > CONST_PI2.sigl))) ) - { - /* CONST_PI2extra is negative, so the result of the - subtraction can be larger than pi/2. This means - that the argument is actually in a different quadrant. - The correction is always < pi/2, so it can't overflow - into yet another quadrant. */ - st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, - FULL_PRECISION); - q++; - } + else { + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if (((q > 0) + && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) + || (q > 1)) { + /* This code gives the effect of having p/2 to better than + 128 bits precision. */ + + significand(&tmp) = q; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); /* This must return TAG_Valid */ + tmptag = + FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, + FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI2extra) + + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp, + FULL_PRECISION); + if ((exponent(st0_ptr) == exponent(&CONST_PI2)) && + ((st0_ptr->sigh > CONST_PI2.sigh) + || ((st0_ptr->sigh == CONST_PI2.sigh) + && (st0_ptr->sigl > CONST_PI2.sigl)))) { + /* CONST_PI2extra is negative, so the result of the + subtraction can be larger than pi/2. This means + that the argument is actually in a different quadrant. + The correction is always < pi/2, so it can't overflow + into yet another quadrant. */ + st0_tag = + FPU_sub(REV | LOADED | TAG_Valid, + (int)&CONST_PI2, FULL_PRECISION); + q++; + } + } } - } #endif /* BETTER_THAN_486 */ - FPU_settag0(st0_tag); - control_word = old_cw; - partial_status = saved_status & ~SW_C2; /* Reduction complete. */ + FPU_settag0(st0_tag); + control_word = old_cw; + partial_status = saved_status & ~SW_C2; /* Reduction complete. */ - return (q & 3) | even; + return (q & 3) | even; } - /* Convert a long to register */ static void convert_l2reg(long const *arg, int deststnr) { - int tag; - long num = *arg; - u_char sign; - FPU_REG *dest = &st(deststnr); - - if (num == 0) - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - return; - } - - if (num > 0) - { sign = SIGN_POS; } - else - { num = -num; sign = SIGN_NEG; } - - dest->sigh = num; - dest->sigl = 0; - setexponent16(dest, 31); - tag = FPU_normalize(dest); - FPU_settagi(deststnr, tag); - setsign(dest, sign); - return; -} + int tag; + long num = *arg; + u_char sign; + FPU_REG *dest = &st(deststnr); + if (num == 0) { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + return; + } + + if (num > 0) { + sign = SIGN_POS; + } else { + num = -num; + sign = SIGN_NEG; + } + + dest->sigh = num; + dest->sigl = 0; + setexponent16(dest, 31); + tag = FPU_normalize(dest); + FPU_settagi(deststnr, tag); + setsign(dest, sign); + return; +} static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag) { - if ( st0_tag == TAG_Empty ) - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - else if ( st0_tag == TW_NaN ) - real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ + if (st0_tag == TAG_Empty) + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + else if (st0_tag == TW_NaN) + real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL|0x0112); + else + EXCEPTION(EX_INTERNAL | 0x0112); #endif /* PARANOID */ } - static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag) { - int isNaN; - - switch ( st0_tag ) - { - case TW_NaN: - isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000); - if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */ - { - EXCEPTION(EX_Invalid); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Convert to a QNaN */ - st0_ptr->sigh |= 0x40000000; - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - } - } - else if ( isNaN ) - { - /* A QNaN */ - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - } - else - { - /* pseudoNaN or other unsupported */ - EXCEPTION(EX_Invalid); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - push(); - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } - } - break; /* return with a NaN in st(0) */ + int isNaN; + + switch (st0_tag) { + case TW_NaN: + isNaN = (exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000); + if (isNaN && !(st0_ptr->sigh & 0x40000000)) { /* Signaling ? */ + EXCEPTION(EX_Invalid); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Convert to a QNaN */ + st0_ptr->sigh |= 0x40000000; + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } + } else if (isNaN) { + /* A QNaN */ + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } else { + /* pseudoNaN or other unsupported */ + EXCEPTION(EX_Invalid); + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + push(); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } + } + break; /* return with a NaN in st(0) */ #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x0112); + default: + EXCEPTION(EX_INTERNAL | 0x0112); #endif /* PARANOID */ - } + } } - /*---------------------------------------------------------------------------*/ static void f2xm1(FPU_REG *st0_ptr, u_char tag) { - FPU_REG a; + FPU_REG a; - clear_C1(); + clear_C1(); - if ( tag == TAG_Valid ) - { - /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ - if ( exponent(st0_ptr) < 0 ) - { - denormal_arg: + if (tag == TAG_Valid) { + /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ + if (exponent(st0_ptr) < 0) { + denormal_arg: - FPU_to_exp16(st0_ptr, &a); + FPU_to_exp16(st0_ptr, &a); - /* poly_2xm1(x) requires 0 < st(0) < 1. */ - poly_2xm1(getsign(st0_ptr), &a, st0_ptr); + /* poly_2xm1(x) requires 0 < st(0) < 1. */ + poly_2xm1(getsign(st0_ptr), &a, st0_ptr); + } + set_precision_flag_up(); /* 80486 appears to always do this */ + return; } - set_precision_flag_up(); /* 80486 appears to always do this */ - return; - } - if ( tag == TAG_Zero ) - return; + if (tag == TAG_Zero) + return; - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); - switch ( tag ) - { - case TW_Denormal: - if ( denormal_operand() < 0 ) - return; - goto denormal_arg; - case TW_Infinity: - if ( signnegative(st0_ptr) ) - { - /* -infinity gives -1 (p16-10) */ - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setnegative(st0_ptr); + switch (tag) { + case TW_Denormal: + if (denormal_operand() < 0) + return; + goto denormal_arg; + case TW_Infinity: + if (signnegative(st0_ptr)) { + /* -infinity gives -1 (p16-10) */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setnegative(st0_ptr); + } + return; + default: + single_arg_error(st0_ptr, tag); } - return; - default: - single_arg_error(st0_ptr, tag); - } } - static void fptan(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - int q; - u_char arg_sign = getsign(st0_ptr); - - /* Stack underflow has higher priority */ - if ( st0_tag == TAG_Empty ) - { - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - if ( control_word & CW_Invalid ) - { - st_new_ptr = &st(-1); - push(); - FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + FPU_REG *st_new_ptr; + int q; + u_char arg_sign = getsign(st0_ptr); + + /* Stack underflow has higher priority */ + if (st0_tag == TAG_Empty) { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if (control_word & CW_Invalid) { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; } - return; - } - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - if ( st0_tag == TAG_Valid ) - { - if ( exponent(st0_ptr) > -40 ) - { - if ( (q = trig_arg(st0_ptr, 0)) == -1 ) - { - /* Operand is out of range */ - return; - } - - poly_tan(st0_ptr); - setsign(st0_ptr, (q & 1) ^ (arg_sign != 0)); - set_precision_flag_up(); /* We do not really know if up or down */ + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - else - { - /* For a small arg, the result == the argument */ - /* Underflow may happen */ - denormal_arg: + if (st0_tag == TAG_Valid) { + if (exponent(st0_ptr) > -40) { + if ((q = trig_arg(st0_ptr, 0)) == -1) { + /* Operand is out of range */ + return; + } + + poly_tan(st0_ptr); + setsign(st0_ptr, (q & 1) ^ (arg_sign != 0)); + set_precision_flag_up(); /* We do not really know if up or down */ + } else { + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + + denormal_arg: + + FPU_to_exp16(st0_ptr, st0_ptr); - FPU_to_exp16(st0_ptr, st0_ptr); - - st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); - FPU_settag0(st0_tag); + st0_tag = + FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + FPU_settag0(st0_tag); + } + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; } - push(); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - return; - } - - if ( st0_tag == TAG_Zero ) - { - push(); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setcc(0); - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return; - goto denormal_arg; - } - - if ( st0_tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - if ( arith_invalid(0) >= 0 ) - { - st_new_ptr = &st(-1); - push(); - arith_invalid(0); + if (st0_tag == TAG_Zero) { + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return; + } + + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + + goto denormal_arg; } - return; - } - single_arg_2_error(st0_ptr, st0_tag); -} + if (st0_tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + if (arith_invalid(0) >= 0) { + st_new_ptr = &st(-1); + push(); + arith_invalid(0); + } + return; + } + single_arg_2_error(st0_ptr, st0_tag); +} static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - u_char sign; - register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - clear_C1(); - - if ( st0_tag == TAG_Valid ) - { - long e; - - push(); - sign = getsign(st1_ptr); - reg_copy(st1_ptr, st_new_ptr); - setexponent16(st_new_ptr, exponent(st_new_ptr)); - - denormal_arg: - - e = exponent16(st_new_ptr); - convert_l2reg(&e, 1); - setexponentpos(st_new_ptr, 0); - setsign(st_new_ptr, sign); - FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */ - return; - } - else if ( st0_tag == TAG_Zero ) - { - sign = getsign(st0_ptr); - - if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 ) - return; + FPU_REG *st_new_ptr; + u_char sign; + register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ - push(); - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(st_new_ptr, sign); - return; - } + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); + clear_C1(); - if ( st0_tag == TW_Denormal ) - { - if (denormal_operand() < 0 ) - return; + if (st0_tag == TAG_Valid) { + long e; - push(); - sign = getsign(st1_ptr); - FPU_to_exp16(st1_ptr, st_new_ptr); - goto denormal_arg; - } - else if ( st0_tag == TW_Infinity ) - { - sign = getsign(st0_ptr); - setpositive(st0_ptr); - push(); - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - setsign(st_new_ptr, sign); - return; - } - else if ( st0_tag == TW_NaN ) - { - if ( real_1op_NaN(st0_ptr) < 0 ) - return; + push(); + sign = getsign(st1_ptr); + reg_copy(st1_ptr, st_new_ptr); + setexponent16(st_new_ptr, exponent(st_new_ptr)); + + denormal_arg: + + e = exponent16(st_new_ptr); + convert_l2reg(&e, 1); + setexponentpos(st_new_ptr, 0); + setsign(st_new_ptr, sign); + FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */ + return; + } else if (st0_tag == TAG_Zero) { + sign = getsign(st0_ptr); + + if (FPU_divide_by_zero(0, SIGN_NEG) < 0) + return; - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - return; - } - else if ( st0_tag == TAG_Empty ) - { - /* Is this the correct behaviour? */ - if ( control_word & EX_Invalid ) - { - FPU_stack_underflow(); - push(); - FPU_stack_underflow(); + push(); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st_new_ptr, sign); + return; + } + + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + + push(); + sign = getsign(st1_ptr); + FPU_to_exp16(st1_ptr, st_new_ptr); + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + sign = getsign(st0_ptr); + setpositive(st0_ptr); + push(); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + setsign(st_new_ptr, sign); + return; + } else if (st0_tag == TW_NaN) { + if (real_1op_NaN(st0_ptr) < 0) + return; + + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + return; + } else if (st0_tag == TAG_Empty) { + /* Is this the correct behaviour? */ + if (control_word & EX_Invalid) { + FPU_stack_underflow(); + push(); + FPU_stack_underflow(); + } else + EXCEPTION(EX_StackUnder); } - else - EXCEPTION(EX_StackUnder); - } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL | 0x119); + else + EXCEPTION(EX_INTERNAL | 0x119); #endif /* PARANOID */ } - static void fdecstp(void) { - clear_C1(); - top--; + clear_C1(); + top--; } static void fincstp(void) { - clear_C1(); - top++; + clear_C1(); + top++; } - static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag) { - int expon; - - clear_C1(); - - if ( st0_tag == TAG_Valid ) - { - u_char tag; - - if (signnegative(st0_ptr)) - { - arith_invalid(0); /* sqrt(negative) is invalid */ - return; - } + int expon; + + clear_C1(); - /* make st(0) in [1.0 .. 4.0) */ - expon = exponent(st0_ptr); - - denormal_arg: - - setexponent16(st0_ptr, (expon & 1)); - - /* Do the computation, the sign of the result will be positive. */ - tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); - addexponent(st0_ptr, expon >> 1); - FPU_settag0(tag); - return; - } - - if ( st0_tag == TAG_Zero ) - return; - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Infinity ) - { - if ( signnegative(st0_ptr) ) - arith_invalid(0); /* sqrt(-Infinity) is invalid */ - return; - } - else if ( st0_tag == TW_Denormal ) - { - if (signnegative(st0_ptr)) - { - arith_invalid(0); /* sqrt(negative) is invalid */ - return; + if (st0_tag == TAG_Valid) { + u_char tag; + + if (signnegative(st0_ptr)) { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } + + /* make st(0) in [1.0 .. 4.0) */ + expon = exponent(st0_ptr); + + denormal_arg: + + setexponent16(st0_ptr, (expon & 1)); + + /* Do the computation, the sign of the result will be positive. */ + tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); + addexponent(st0_ptr, expon >> 1); + FPU_settag0(tag); + return; } - if ( denormal_operand() < 0 ) - return; + if (st0_tag == TAG_Zero) + return; - FPU_to_exp16(st0_ptr, st0_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); - expon = exponent16(st0_ptr); + if (st0_tag == TW_Infinity) { + if (signnegative(st0_ptr)) + arith_invalid(0); /* sqrt(-Infinity) is invalid */ + return; + } else if (st0_tag == TW_Denormal) { + if (signnegative(st0_ptr)) { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } - goto denormal_arg; - } + if (denormal_operand() < 0) + return; - single_arg_error(st0_ptr, st0_tag); + FPU_to_exp16(st0_ptr, st0_ptr); -} + expon = exponent16(st0_ptr); + + goto denormal_arg; + } + single_arg_error(st0_ptr, st0_tag); + +} static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) { - int flags, tag; + int flags, tag; - if ( st0_tag == TAG_Valid ) - { - u_char sign; + if (st0_tag == TAG_Valid) { + u_char sign; - denormal_arg: + denormal_arg: - sign = getsign(st0_ptr); + sign = getsign(st0_ptr); - if (exponent(st0_ptr) > 63) - return; + if (exponent(st0_ptr) > 63) + return; + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + } + + /* Fortunately, this can't overflow to 2^64 */ + if ((flags = FPU_round_to_int(st0_ptr, st0_tag))) + set_precision_flag(flags); - if ( st0_tag == TW_Denormal ) - { - if (denormal_operand() < 0 ) - return; + setexponent16(st0_ptr, 63); + tag = FPU_normalize(st0_ptr); + setsign(st0_ptr, sign); + FPU_settag0(tag); + return; } - /* Fortunately, this can't overflow to 2^64 */ - if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) ) - set_precision_flag(flags); - - setexponent16(st0_ptr, 63); - tag = FPU_normalize(st0_ptr); - setsign(st0_ptr, sign); - FPU_settag0(tag); - return; - } - - if ( st0_tag == TAG_Zero ) - return; - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Denormal ) - goto denormal_arg; - else if ( st0_tag == TW_Infinity ) - return; - else - single_arg_error(st0_ptr, st0_tag); -} + if (st0_tag == TAG_Zero) + return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) + goto denormal_arg; + else if (st0_tag == TW_Infinity) + return; + else + single_arg_error(st0_ptr, st0_tag); +} static int fsin(FPU_REG *st0_ptr, u_char tag) { - u_char arg_sign = getsign(st0_ptr); - - if ( tag == TAG_Valid ) - { - int q; - - if ( exponent(st0_ptr) > -40 ) - { - if ( (q = trig_arg(st0_ptr, 0)) == -1 ) - { - /* Operand is out of range */ - return 1; - } - - poly_sine(st0_ptr); - - if (q & 2) - changesign(st0_ptr); - - setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); - - /* We do not really know if up or down */ - set_precision_flag_up(); - return 0; + u_char arg_sign = getsign(st0_ptr); + + if (tag == TAG_Valid) { + int q; + + if (exponent(st0_ptr) > -40) { + if ((q = trig_arg(st0_ptr, 0)) == -1) { + /* Operand is out of range */ + return 1; + } + + poly_sine(st0_ptr); + + if (q & 2) + changesign(st0_ptr); + + setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); + + /* We do not really know if up or down */ + set_precision_flag_up(); + return 0; + } else { + /* For a small arg, the result == the argument */ + set_precision_flag_up(); /* Must be up. */ + return 0; + } } - else - { - /* For a small arg, the result == the argument */ - set_precision_flag_up(); /* Must be up. */ - return 0; + + if (tag == TAG_Zero) { + setcc(0); + return 0; } - } - - if ( tag == TAG_Zero ) - { - setcc(0); - return 0; - } - - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - - if ( tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return 1; - - /* For a small arg, the result == the argument */ - /* Underflow may happen */ - FPU_to_exp16(st0_ptr, st0_ptr); - - tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); - - FPU_settag0(tag); - - return 0; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - arith_invalid(0); - return 1; - } - else - { - single_arg_error(st0_ptr, tag); - return 1; - } -} + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + + if (tag == TW_Denormal) { + if (denormal_operand() < 0) + return 1; + + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + FPU_to_exp16(st0_ptr, st0_ptr); + + tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + + FPU_settag0(tag); + + return 0; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } else { + single_arg_error(st0_ptr, tag); + return 1; + } +} static int f_cos(FPU_REG *st0_ptr, u_char tag) { - u_char st0_sign; - - st0_sign = getsign(st0_ptr); - - if ( tag == TAG_Valid ) - { - int q; - - if ( exponent(st0_ptr) > -40 ) - { - if ( (exponent(st0_ptr) < 0) - || ((exponent(st0_ptr) == 0) - && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) ) - { - poly_cos(st0_ptr); - - /* We do not really know if up or down */ - set_precision_flag_down(); - - return 0; - } - else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 ) - { - poly_sine(st0_ptr); - - if ((q+1) & 2) - changesign(st0_ptr); - - /* We do not really know if up or down */ - set_precision_flag_down(); - - return 0; - } - else - { - /* Operand is out of range */ - return 1; - } - } - else - { - denormal_arg: + u_char st0_sign; + + st0_sign = getsign(st0_ptr); - setcc(0); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); + if (tag == TAG_Valid) { + int q; + + if (exponent(st0_ptr) > -40) { + if ((exponent(st0_ptr) < 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) <= + 0xc90fdaa22168c234LL))) { + poly_cos(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) { + poly_sine(st0_ptr); + + if ((q + 1) & 2) + changesign(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } else { + /* Operand is out of range */ + return 1; + } + } else { + denormal_arg: + + setcc(0); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); #ifdef PECULIAR_486 - set_precision_flag_down(); /* 80486 appears to do this. */ + set_precision_flag_down(); /* 80486 appears to do this. */ #else - set_precision_flag_up(); /* Must be up. */ + set_precision_flag_up(); /* Must be up. */ #endif /* PECULIAR_486 */ - return 0; + return 0; + } + } else if (tag == TAG_Zero) { + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return 0; } - } - else if ( tag == TAG_Zero ) - { - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setcc(0); - return 0; - } - - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - - if ( tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return 1; - - goto denormal_arg; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - arith_invalid(0); - return 1; - } - else - { - single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */ - return 1; - } -} + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + + if (tag == TW_Denormal) { + if (denormal_operand() < 0) + return 1; + + goto denormal_arg; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } else { + single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */ + return 1; + } +} static void fcos(FPU_REG *st0_ptr, u_char st0_tag) { - f_cos(st0_ptr, st0_tag); + f_cos(st0_ptr, st0_tag); } - static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - FPU_REG arg; - u_char tag; - - /* Stack underflow has higher priority */ - if ( st0_tag == TAG_Empty ) - { - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - if ( control_word & CW_Invalid ) - { - st_new_ptr = &st(-1); - push(); - FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + FPU_REG *st_new_ptr; + FPU_REG arg; + u_char tag; + + /* Stack underflow has higher priority */ + if (st0_tag == TAG_Empty) { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if (control_word & CW_Invalid) { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; } - return; - } - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - if ( st0_tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - else - tag = st0_tag; - - if ( tag == TW_NaN ) - { - single_arg_2_error(st0_ptr, TW_NaN); - return; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - if ( arith_invalid(0) >= 0 ) - { - /* Masked response */ - push(); - arith_invalid(0); + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - return; - } - - reg_copy(st0_ptr, &arg); - if ( !fsin(st0_ptr, st0_tag) ) - { - push(); - FPU_copy_to_reg0(&arg, st0_tag); - f_cos(&st(0), st0_tag); - } - else - { - /* An error, so restore st(0) */ - FPU_copy_to_reg0(&arg, st0_tag); - } -} + if (st0_tag == TAG_Special) + tag = FPU_Special(st0_ptr); + else + tag = st0_tag; + + if (tag == TW_NaN) { + single_arg_2_error(st0_ptr, TW_NaN); + return; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + if (arith_invalid(0) >= 0) { + /* Masked response */ + push(); + arith_invalid(0); + } + return; + } + + reg_copy(st0_ptr, &arg); + if (!fsin(st0_ptr, st0_tag)) { + push(); + FPU_copy_to_reg0(&arg, st0_tag); + f_cos(&st(0), st0_tag); + } else { + /* An error, so restore st(0) */ + FPU_copy_to_reg0(&arg, st0_tag); + } +} /*---------------------------------------------------------------------------*/ /* The following all require two arguments: st(0) and st(1) */ @@ -826,1020 +743,901 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) result must be zero. */ static void rem_kernel(unsigned long long st0, unsigned long long *y, - unsigned long long st1, - unsigned long long q, int n) + unsigned long long st1, unsigned long long q, int n) { - int dummy; - unsigned long long x; - - x = st0 << n; - - /* Do the required multiplication and subtraction in the one operation */ - - /* lsw x -= lsw st1 * lsw q */ - asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1" - :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]), - "=a" (dummy) - :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0]) - :"%dx"); - /* msw x -= msw st1 * lsw q */ - asm volatile ("mull %3; subl %%eax,%0" - :"=m" (((unsigned *)&x)[1]), "=a" (dummy) - :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0]) - :"%dx"); - /* msw x -= lsw st1 * msw q */ - asm volatile ("mull %3; subl %%eax,%0" - :"=m" (((unsigned *)&x)[1]), "=a" (dummy) - :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1]) - :"%dx"); - - *y = x; + int dummy; + unsigned long long x; + + x = st0 << n; + + /* Do the required multiplication and subtraction in the one operation */ + + /* lsw x -= lsw st1 * lsw q */ + asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m" + (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]), + "=a"(dummy) + :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= msw st1 * lsw q */ + asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]), + "=a"(dummy) + :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= lsw st1 * msw q */ + asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]), + "=a"(dummy) + :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1]) + :"%dx"); + + *y = x; } - /* Remainder of st(0) / st(1) */ /* This routine produces exact results, i.e. there is never any rounding or truncation, etc of the result. */ static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - FPU_REG tmp, st0, st1; - u_char st0_sign, st1_sign; - u_char tmptag; - int tag; - int old_cw; - int expdif; - long long q; - unsigned short saved_status; - int cc; - - fprem_valid: - /* Convert registers for internal use. */ - st0_sign = FPU_to_exp16(st0_ptr, &st0); - st1_sign = FPU_to_exp16(st1_ptr, &st1); - expdif = exponent16(&st0) - exponent16(&st1); - - old_cw = control_word; - cc = 0; - - /* We want the status following the denorm tests, but don't want - the status changed by the arithmetic operations. */ - saved_status = partial_status; - control_word &= ~CW_RC; - control_word |= RC_CHOP; - - if ( expdif < 64 ) - { - /* This should be the most common case */ - - if ( expdif > -2 ) - { - u_char sign = st0_sign ^ st1_sign; - tag = FPU_u_div(&st0, &st1, &tmp, - PR_64_BITS | RC_CHOP | 0x3f, - sign); - setsign(&tmp, sign); - - if ( exponent(&tmp) >= 0 ) - { - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't - overflow to 2^64 */ - q = significand(&tmp); - - rem_kernel(significand(&st0), - &significand(&tmp), - significand(&st1), - q, expdif); - - setexponent16(&tmp, exponent16(&st1)); - } - else - { - reg_copy(&st0, &tmp); - q = 0; - } - - if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) ) - { - /* We may need to subtract st(1) once more, - to get a result <= 1/2 of st(1). */ - unsigned long long x; - expdif = exponent16(&st1) - exponent16(&tmp); - if ( expdif <= 1 ) - { - if ( expdif == 0 ) - x = significand(&st1) - significand(&tmp); - else /* expdif is 1 */ - x = (significand(&st1) << 1) - significand(&tmp); - if ( (x < significand(&tmp)) || - /* or equi-distant (from 0 & st(1)) and q is odd */ - ((x == significand(&tmp)) && (q & 1) ) ) - { - st0_sign = ! st0_sign; - significand(&tmp) = x; - q++; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + FPU_REG tmp, st0, st1; + u_char st0_sign, st1_sign; + u_char tmptag; + int tag; + int old_cw; + int expdif; + long long q; + unsigned short saved_status; + int cc; + + fprem_valid: + /* Convert registers for internal use. */ + st0_sign = FPU_to_exp16(st0_ptr, &st0); + st1_sign = FPU_to_exp16(st1_ptr, &st1); + expdif = exponent16(&st0) - exponent16(&st1); + + old_cw = control_word; + cc = 0; + + /* We want the status following the denorm tests, but don't want + the status changed by the arithmetic operations. */ + saved_status = partial_status; + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + if (expdif < 64) { + /* This should be the most common case */ + + if (expdif > -2) { + u_char sign = st0_sign ^ st1_sign; + tag = FPU_u_div(&st0, &st1, &tmp, + PR_64_BITS | RC_CHOP | 0x3f, + sign); + setsign(&tmp, sign); + + if (exponent(&tmp) >= 0) { + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + q = significand(&tmp); + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + q, expdif); + + setexponent16(&tmp, exponent16(&st1)); + } else { + reg_copy(&st0, &tmp); + q = 0; + } + + if ((round == RC_RND) + && (tmp.sigh & 0xc0000000)) { + /* We may need to subtract st(1) once more, + to get a result <= 1/2 of st(1). */ + unsigned long long x; + expdif = + exponent16(&st1) - exponent16(&tmp); + if (expdif <= 1) { + if (expdif == 0) + x = significand(&st1) - + significand(&tmp); + else /* expdif is 1 */ + x = (significand(&st1) + << 1) - + significand(&tmp); + if ((x < significand(&tmp)) || + /* or equi-distant (from 0 & st(1)) and q is odd */ + ((x == significand(&tmp)) + && (q & 1))) { + st0_sign = !st0_sign; + significand(&tmp) = x; + q++; + } + } + } + + if (q & 4) + cc |= SW_C0; + if (q & 2) + cc |= SW_C3; + if (q & 1) + cc |= SW_C1; + } else { + control_word = old_cw; + setcc(0); + return; } - } - } - - if (q & 4) cc |= SW_C0; - if (q & 2) cc |= SW_C3; - if (q & 1) cc |= SW_C1; - } - else - { - control_word = old_cw; - setcc(0); - return; - } - } - else - { - /* There is a large exponent difference ( >= 64 ) */ - /* To make much sense, the code in this section should - be done at high precision. */ - int exp_1, N; - u_char sign; - - /* prevent overflow here */ - /* N is 'a number between 32 and 63' (p26-113) */ - reg_copy(&st0, &tmp); - tmptag = st0_tag; - N = (expdif & 0x0000001f) + 32; /* This choice gives results - identical to an AMD 486 */ - setexponent16(&tmp, N); - exp_1 = exponent16(&st1); - setexponent16(&st1, 0); - expdif -= N; - - sign = getsign(&tmp) ^ st1_sign; - tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f, - sign); - setsign(&tmp, sign); - - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't - overflow to 2^64 */ - - rem_kernel(significand(&st0), - &significand(&tmp), - significand(&st1), - significand(&tmp), - exponent(&tmp) - ); - setexponent16(&tmp, exp_1 + expdif); - - /* It is possible for the operation to be complete here. - What does the IEEE standard say? The Intel 80486 manual - implies that the operation will never be completed at this - point, and the behaviour of a real 80486 confirms this. - */ - if ( !(tmp.sigh | tmp.sigl) ) - { - /* The result is zero */ - control_word = old_cw; - partial_status = saved_status; - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(&st0, st0_sign); + } else { + /* There is a large exponent difference ( >= 64 ) */ + /* To make much sense, the code in this section should + be done at high precision. */ + int exp_1, N; + u_char sign; + + /* prevent overflow here */ + /* N is 'a number between 32 and 63' (p26-113) */ + reg_copy(&st0, &tmp); + tmptag = st0_tag; + N = (expdif & 0x0000001f) + 32; /* This choice gives results + identical to an AMD 486 */ + setexponent16(&tmp, N); + exp_1 = exponent16(&st1); + setexponent16(&st1, 0); + expdif -= N; + + sign = getsign(&tmp) ^ st1_sign; + tag = + FPU_u_div(&tmp, &st1, &tmp, + PR_64_BITS | RC_CHOP | 0x3f, sign); + setsign(&tmp, sign); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + significand(&tmp), exponent(&tmp) + ); + setexponent16(&tmp, exp_1 + expdif); + + /* It is possible for the operation to be complete here. + What does the IEEE standard say? The Intel 80486 manual + implies that the operation will never be completed at this + point, and the behaviour of a real 80486 confirms this. + */ + if (!(tmp.sigh | tmp.sigl)) { + /* The result is zero */ + control_word = old_cw; + partial_status = saved_status; + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(&st0, st0_sign); #ifdef PECULIAR_486 - setcc(SW_C2); + setcc(SW_C2); #else - setcc(0); + setcc(0); #endif /* PECULIAR_486 */ - return; - } - cc = SW_C2; - } + return; + } + cc = SW_C2; + } - control_word = old_cw; - partial_status = saved_status; - tag = FPU_normalize_nuo(&tmp); - reg_copy(&tmp, st0_ptr); - - /* The only condition to be looked for is underflow, - and it can occur here only if underflow is unmasked. */ - if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) - && !(control_word & CW_Underflow) ) - { - setcc(cc); - tag = arith_underflow(st0_ptr); - setsign(st0_ptr, st0_sign); - FPU_settag0(tag); - return; - } - else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) ) - { - stdexp(st0_ptr); - setsign(st0_ptr, st0_sign); - } - else - { - tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign); - } - FPU_settag0(tag); - setcc(cc); + control_word = old_cw; + partial_status = saved_status; + tag = FPU_normalize_nuo(&tmp); + reg_copy(&tmp, st0_ptr); + + /* The only condition to be looked for is underflow, + and it can occur here only if underflow is unmasked. */ + if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) + && !(control_word & CW_Underflow)) { + setcc(cc); + tag = arith_underflow(st0_ptr); + setsign(st0_ptr, st0_sign); + FPU_settag0(tag); + return; + } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) { + stdexp(st0_ptr); + setsign(st0_ptr, st0_sign); + } else { + tag = + FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign); + } + FPU_settag0(tag); + setcc(cc); - return; - } + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; - goto fprem_valid; - } - else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow(); - return; - } - else if ( st0_tag == TAG_Zero ) - { - if ( st1_tag == TAG_Valid ) - { - setcc(0); return; - } - else if ( st1_tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return; - setcc(0); return; - } - else if ( st1_tag == TAG_Zero ) - { arith_invalid(0); return; } /* fprem(?,0) always invalid */ - else if ( st1_tag == TW_Infinity ) - { setcc(0); return; } - } - else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - if ( st1_tag == TAG_Zero ) - { - arith_invalid(0); /* fprem(Valid,Zero) is invalid */ - return; - } - else if ( st1_tag != TW_NaN ) - { - if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal)) - && (denormal_operand() < 0) ) - return; - - if ( st1_tag == TW_Infinity ) - { - /* fprem(Valid,Infinity) is o.k. */ - setcc(0); return; - } - } - } - else if ( st0_tag == TW_Infinity ) - { - if ( st1_tag != TW_NaN ) - { - arith_invalid(0); /* fprem(Infinity,?) is invalid */ - return; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; + goto fprem_valid; + } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow(); + return; + } else if (st0_tag == TAG_Zero) { + if (st1_tag == TAG_Valid) { + setcc(0); + return; + } else if (st1_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + setcc(0); + return; + } else if (st1_tag == TAG_Zero) { + arith_invalid(0); + return; + } /* fprem(?,0) always invalid */ + else if (st1_tag == TW_Infinity) { + setcc(0); + return; + } + } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + if (st1_tag == TAG_Zero) { + arith_invalid(0); /* fprem(Valid,Zero) is invalid */ + return; + } else if (st1_tag != TW_NaN) { + if (((st0_tag == TW_Denormal) + || (st1_tag == TW_Denormal)) + && (denormal_operand() < 0)) + return; + + if (st1_tag == TW_Infinity) { + /* fprem(Valid,Infinity) is o.k. */ + setcc(0); + return; + } + } + } else if (st0_tag == TW_Infinity) { + if (st1_tag != TW_NaN) { + arith_invalid(0); /* fprem(Infinity,?) is invalid */ + return; + } } - } - /* One of the registers must contain a NaN if we got here. */ + /* One of the registers must contain a NaN if we got here. */ #ifdef PARANOID - if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) ) - EXCEPTION(EX_INTERNAL | 0x118); + if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN)) + EXCEPTION(EX_INTERNAL | 0x118); #endif /* PARANOID */ - real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); + real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); } - /* ST(1) <- ST(1) * log ST; pop ST */ static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1), exponent; - u_char st1_tag = FPU_gettagi(1); - u_char sign; - int e, tag; - - clear_C1(); - - if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) ) - { - both_valid: - /* Both regs are Valid or Denormal */ - if ( signpositive(st0_ptr) ) - { - if ( st0_tag == TW_Denormal ) - FPU_to_exp16(st0_ptr, st0_ptr); - else - /* Convert st(0) for internal use. */ - setexponent16(st0_ptr, exponent(st0_ptr)); - - if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) ) - { - /* Special case. The result can be precise. */ - u_char esign; - e = exponent16(st0_ptr); - if ( e >= 0 ) - { - exponent.sigh = e; - esign = SIGN_POS; - } - else - { - exponent.sigh = -e; - esign = SIGN_NEG; + FPU_REG *st1_ptr = &st(1), exponent; + u_char st1_tag = FPU_gettagi(1); + u_char sign; + int e, tag; + + clear_C1(); + + if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) { + both_valid: + /* Both regs are Valid or Denormal */ + if (signpositive(st0_ptr)) { + if (st0_tag == TW_Denormal) + FPU_to_exp16(st0_ptr, st0_ptr); + else + /* Convert st(0) for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + if ((st0_ptr->sigh == 0x80000000) + && (st0_ptr->sigl == 0)) { + /* Special case. The result can be precise. */ + u_char esign; + e = exponent16(st0_ptr); + if (e >= 0) { + exponent.sigh = e; + esign = SIGN_POS; + } else { + exponent.sigh = -e; + esign = SIGN_NEG; + } + exponent.sigl = 0; + setexponent16(&exponent, 31); + tag = FPU_normalize_nuo(&exponent); + stdexp(&exponent); + setsign(&exponent, esign); + tag = + FPU_mul(&exponent, tag, 1, FULL_PRECISION); + if (tag >= 0) + FPU_settagi(1, tag); + } else { + /* The usual case */ + sign = getsign(st1_ptr); + if (st1_tag == TW_Denormal) + FPU_to_exp16(st1_ptr, st1_ptr); + else + /* Convert st(1) for internal use. */ + setexponent16(st1_ptr, + exponent(st1_ptr)); + poly_l2(st0_ptr, st1_ptr, sign); + } + } else { + /* negative */ + if (arith_invalid(1) < 0) + return; } - exponent.sigl = 0; - setexponent16(&exponent, 31); - tag = FPU_normalize_nuo(&exponent); - stdexp(&exponent); - setsign(&exponent, esign); - tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION); - if ( tag >= 0 ) - FPU_settagi(1, tag); - } - else - { - /* The usual case */ - sign = getsign(st1_ptr); - if ( st1_tag == TW_Denormal ) - FPU_to_exp16(st1_ptr, st1_ptr); - else - /* Convert st(1) for internal use. */ - setexponent16(st1_ptr, exponent(st1_ptr)); - poly_l2(st0_ptr, st1_ptr, sign); - } - } - else - { - /* negative */ - if ( arith_invalid(1) < 0 ) - return; - } - FPU_pop(); - - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); - - if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) ) - { - if ( st0_tag == TAG_Zero ) - { - if ( st1_tag == TAG_Zero ) - { - /* Both args zero is invalid */ - if ( arith_invalid(1) < 0 ) - return; - } - else - { - u_char sign; - sign = getsign(st1_ptr)^SIGN_NEG; - if ( FPU_divide_by_zero(1, sign) < 0 ) - return; + FPU_pop(); - setsign(st1_ptr, sign); - } - } - else if ( st1_tag == TAG_Zero ) - { - /* st(1) contains zero, st(0) valid <> 0 */ - /* Zero is the valid answer */ - sign = getsign(st1_ptr); - - if ( signnegative(st0_ptr) ) - { - /* log(negative) */ - if ( arith_invalid(1) < 0 ) return; - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - { - if ( exponent(st0_ptr) < 0 ) - sign ^= SIGN_NEG; - - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - setsign(st1_ptr, sign); - } } - else - { - /* One or both operands are denormals. */ - if ( denormal_operand() < 0 ) - return; - goto both_valid; - } - } - else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - /* One or both arg must be an infinity */ - else if ( st0_tag == TW_Infinity ) - { - if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) ) - { - /* log(-infinity) or 0*log(infinity) */ - if ( arith_invalid(1) < 0 ) - return; - } - else - { - u_char sign = getsign(st1_ptr); - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - FPU_copy_to_reg1(&CONST_INF, TAG_Special); - setsign(st1_ptr, sign); - } - } - /* st(1) must be infinity here */ - else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) - && ( signpositive(st0_ptr) ) ) - { - if ( exponent(st0_ptr) >= 0 ) - { - if ( (exponent(st0_ptr) == 0) && - (st0_ptr->sigh == 0x80000000) && - (st0_ptr->sigl == 0) ) - { - /* st(0) holds 1.0 */ - /* infinity*log(1) */ - if ( arith_invalid(1) < 0 ) + if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); return; - } - /* else st(0) is positive and > 1.0 */ + } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) { + if (st0_tag == TAG_Zero) { + if (st1_tag == TAG_Zero) { + /* Both args zero is invalid */ + if (arith_invalid(1) < 0) + return; + } else { + u_char sign; + sign = getsign(st1_ptr) ^ SIGN_NEG; + if (FPU_divide_by_zero(1, sign) < 0) + return; + + setsign(st1_ptr, sign); + } + } else if (st1_tag == TAG_Zero) { + /* st(1) contains zero, st(0) valid <> 0 */ + /* Zero is the valid answer */ + sign = getsign(st1_ptr); + + if (signnegative(st0_ptr)) { + /* log(negative) */ + if (arith_invalid(1) < 0) + return; + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else { + if (exponent(st0_ptr) < 0) + sign ^= SIGN_NEG; + + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); + } + } else { + /* One or both operands are denormals. */ + if (denormal_operand() < 0) + return; + goto both_valid; + } + } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } + /* One or both arg must be an infinity */ + else if (st0_tag == TW_Infinity) { + if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) { + /* log(-infinity) or 0*log(infinity) */ + if (arith_invalid(1) < 0) + return; + } else { + u_char sign = getsign(st1_ptr); + + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } } - else - { - /* st(0) is positive and < 1.0 */ + /* st(1) must be infinity here */ + else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) + && (signpositive(st0_ptr))) { + if (exponent(st0_ptr) >= 0) { + if ((exponent(st0_ptr) == 0) && + (st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0)) { + /* st(0) holds 1.0 */ + /* infinity*log(1) */ + if (arith_invalid(1) < 0) + return; + } + /* else st(0) is positive and > 1.0 */ + } else { + /* st(0) is positive and < 1.0 */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; - changesign(st1_ptr); - } - } - else - { - /* st(0) must be zero or negative */ - if ( st0_tag == TAG_Zero ) - { - /* This should be invalid, but a real 80486 is happy with it. */ + changesign(st1_ptr); + } + } else { + /* st(0) must be zero or negative */ + if (st0_tag == TAG_Zero) { + /* This should be invalid, but a real 80486 is happy with it. */ #ifndef PECULIAR_486 - sign = getsign(st1_ptr); - if ( FPU_divide_by_zero(1, sign) < 0 ) - return; + sign = getsign(st1_ptr); + if (FPU_divide_by_zero(1, sign) < 0) + return; #endif /* PECULIAR_486 */ - changesign(st1_ptr); + changesign(st1_ptr); + } else if (arith_invalid(1) < 0) /* log(negative) */ + return; } - else if ( arith_invalid(1) < 0 ) /* log(negative) */ - return; - } - FPU_pop(); + FPU_pop(); } - static void fpatan(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - int tag; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int tag; - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - valid_atan: + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + valid_atan: - poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); + poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); - FPU_pop(); + FPU_pop(); - return; - } + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; - goto valid_atan; - } - else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 ) - FPU_pop(); - return; - } - else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) ) - { - u_char sign = getsign(st1_ptr); - if ( st0_tag == TW_Infinity ) - { - if ( st1_tag == TW_Infinity ) - { - if ( signpositive(st0_ptr) ) - { - FPU_copy_to_reg1(&CONST_PI4, TAG_Valid); - } - else - { - setpositive(st1_ptr); - tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr, - FULL_PRECISION, SIGN_POS, - exponent(&CONST_PI4), exponent(&CONST_PI2)); - if ( tag >= 0 ) - FPU_settagi(1, tag); - } - } - else - { - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + goto valid_atan; + } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); + return; + } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0) + FPU_pop(); return; + } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) { + u_char sign = getsign(st1_ptr); + if (st0_tag == TW_Infinity) { + if (st1_tag == TW_Infinity) { + if (signpositive(st0_ptr)) { + FPU_copy_to_reg1(&CONST_PI4, TAG_Valid); + } else { + setpositive(st1_ptr); + tag = + FPU_u_add(&CONST_PI4, &CONST_PI2, + st1_ptr, FULL_PRECISION, + SIGN_POS, + exponent(&CONST_PI4), + exponent(&CONST_PI2)); + if (tag >= 0) + FPU_settagi(1, tag); + } + } else { + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + if (signpositive(st0_ptr)) { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); /* An 80486 preserves the sign */ + FPU_pop(); + return; + } else { + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + } + } + } else { + /* st(1) is infinity, st(0) not infinity */ + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; - if ( signpositive(st0_ptr) ) - { - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - setsign(st1_ptr, sign); /* An 80486 preserves the sign */ - FPU_pop(); - return; + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); } - else - { - FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + setsign(st1_ptr, sign); + } else if (st1_tag == TAG_Zero) { + /* st(0) must be valid or zero */ + u_char sign = getsign(st1_ptr); + + if ((st0_tag == TW_Denormal) && (denormal_operand() < 0)) + return; + + if (signpositive(st0_ptr)) { + /* An 80486 preserves the sign */ + FPU_pop(); + return; } - } - } - else - { - /* st(1) is infinity, st(0) not infinity */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); - } - setsign(st1_ptr, sign); - } - else if ( st1_tag == TAG_Zero ) - { - /* st(0) must be valid or zero */ - u_char sign = getsign(st1_ptr); - - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + setsign(st1_ptr, sign); + } else if (st0_tag == TAG_Zero) { + /* st(1) must be TAG_Valid here */ + u_char sign = getsign(st1_ptr); - if ( signpositive(st0_ptr) ) - { - /* An 80486 preserves the sign */ - FPU_pop(); - return; - } + if ((st1_tag == TW_Denormal) && (denormal_operand() < 0)) + return; - FPU_copy_to_reg1(&CONST_PI, TAG_Valid); - setsign(st1_ptr, sign); - } - else if ( st0_tag == TAG_Zero ) - { - /* st(1) must be TAG_Valid here */ - u_char sign = getsign(st1_ptr); - - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); - setsign(st1_ptr, sign); - } + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); + setsign(st1_ptr, sign); + } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL | 0x125); + else + EXCEPTION(EX_INTERNAL | 0x125); #endif /* PARANOID */ - FPU_pop(); - set_precision_flag_up(); /* We do not really know if up or down */ + FPU_pop(); + set_precision_flag_up(); /* We do not really know if up or down */ } - static void fprem(FPU_REG *st0_ptr, u_char st0_tag) { - do_fprem(st0_ptr, st0_tag, RC_CHOP); + do_fprem(st0_ptr, st0_tag, RC_CHOP); } - static void fprem1(FPU_REG *st0_ptr, u_char st0_tag) { - do_fprem(st0_ptr, st0_tag, RC_RND); + do_fprem(st0_ptr, st0_tag, RC_RND); } - static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) { - u_char sign, sign1; - FPU_REG *st1_ptr = &st(1), a, b; - u_char st1_tag = FPU_gettagi(1); + u_char sign, sign1; + FPU_REG *st1_ptr = &st(1), a, b; + u_char st1_tag = FPU_gettagi(1); - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - valid_yl2xp1: + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + valid_yl2xp1: - sign = getsign(st0_ptr); - sign1 = getsign(st1_ptr); + sign = getsign(st0_ptr); + sign1 = getsign(st1_ptr); - FPU_to_exp16(st0_ptr, &a); - FPU_to_exp16(st1_ptr, &b); + FPU_to_exp16(st0_ptr, &a); + FPU_to_exp16(st1_ptr, &b); - if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) ) - return; + if (poly_l2p1(sign, sign1, &a, &b, st1_ptr)) + return; - FPU_pop(); - return; - } + FPU_pop(); + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; - - goto valid_yl2xp1; - } - else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( st0_tag == TAG_Zero ) - { - switch ( st1_tag ) - { - case TW_Denormal: - if ( denormal_operand() < 0 ) - return; - - case TAG_Zero: - case TAG_Valid: - setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); - FPU_copy_to_reg1(st0_ptr, st0_tag); - break; - - case TW_Infinity: - /* Infinity*log(1) */ - if ( arith_invalid(1) < 0 ) - return; - break; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; - case TW_NaN: - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - break; - - default: + goto valid_yl2xp1; + } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); + return; + } else if (st0_tag == TAG_Zero) { + switch (st1_tag) { + case TW_Denormal: + if (denormal_operand() < 0) + return; + + case TAG_Zero: + case TAG_Valid: + setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); + FPU_copy_to_reg1(st0_ptr, st0_tag); + break; + + case TW_Infinity: + /* Infinity*log(1) */ + if (arith_invalid(1) < 0) + return; + break; + + case TW_NaN: + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + break; + + default: #ifdef PARANOID - EXCEPTION(EX_INTERNAL | 0x116); - return; + EXCEPTION(EX_INTERNAL | 0x116); + return; #endif /* PARANOID */ - break; - } - } - else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - switch ( st1_tag ) - { - case TAG_Zero: - if ( signnegative(st0_ptr) ) - { - if ( exponent(st0_ptr) >= 0 ) - { - /* st(0) holds <= -1.0 */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + break; + } + } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + switch (st1_tag) { + case TAG_Zero: + if (signnegative(st0_ptr)) { + if (exponent(st0_ptr) >= 0) { + /* st(0) holds <= -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) - return; + if (arith_invalid(1) < 0) + return; #endif /* PECULIAR_486 */ - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - changesign(st1_ptr); - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - break; - - case TW_Infinity: - if ( signnegative(st0_ptr) ) - { - if ( (exponent(st0_ptr) >= 0) && - !((st0_ptr->sigh == 0x80000000) && - (st0_ptr->sigl == 0)) ) - { - /* st(0) holds < -1.0 */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else + changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + break; + + case TW_Infinity: + if (signnegative(st0_ptr)) { + if ((exponent(st0_ptr) >= 0) && + !((st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0))) { + /* st(0) holds < -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) return; + if (arith_invalid(1) < 0) + return; #endif /* PECULIAR_486 */ + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else + changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + break; + + case TW_NaN: + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - changesign(st1_ptr); - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - break; - - case TW_NaN: - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - } - else if ( st0_tag == TW_NaN ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - else if ( st0_tag == TW_Infinity ) - { - if ( st1_tag == TW_NaN ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - else if ( signnegative(st0_ptr) ) - { + } else if (st0_tag == TW_NaN) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } else if (st0_tag == TW_Infinity) { + if (st1_tag == TW_NaN) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } else if (signnegative(st0_ptr)) { #ifndef PECULIAR_486 - /* This should have higher priority than denormals, but... */ - if ( arith_invalid(1) < 0 ) /* log(-infinity) */ - return; + /* This should have higher priority than denormals, but... */ + if (arith_invalid(1) < 0) /* log(-infinity) */ + return; #endif /* PECULIAR_486 */ - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; #ifdef PECULIAR_486 - /* Denormal operands actually get higher priority */ - if ( arith_invalid(1) < 0 ) /* log(-infinity) */ - return; + /* Denormal operands actually get higher priority */ + if (arith_invalid(1) < 0) /* log(-infinity) */ + return; #endif /* PECULIAR_486 */ - } - else if ( st1_tag == TAG_Zero ) - { - /* log(infinity) */ - if ( arith_invalid(1) < 0 ) - return; - } - - /* st(1) must be valid here. */ + } else if (st1_tag == TAG_Zero) { + /* log(infinity) */ + if (arith_invalid(1) < 0) + return; + } - else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + /* st(1) must be valid here. */ + + else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0)) + return; - /* The Manual says that log(Infinity) is invalid, but a real - 80486 sensibly says that it is o.k. */ - else - { - u_char sign = getsign(st1_ptr); - FPU_copy_to_reg1(&CONST_INF, TAG_Special); - setsign(st1_ptr, sign); + /* The Manual says that log(Infinity) is invalid, but a real + 80486 sensibly says that it is o.k. */ + else { + u_char sign = getsign(st1_ptr); + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } } - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL | 0x117); - return; - } + else { + EXCEPTION(EX_INTERNAL | 0x117); + return; + } #endif /* PARANOID */ - FPU_pop(); - return; + FPU_pop(); + return; } - static void fscale(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - int old_cw = control_word; - u_char sign = getsign(st0_ptr); - - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - long scale; - FPU_REG tmp; - - /* Convert register for internal use. */ - setexponent16(st0_ptr, exponent(st0_ptr)); - - valid_scale: - - if ( exponent(st1_ptr) > 30 ) - { - /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ - - if ( signpositive(st1_ptr) ) - { - EXCEPTION(EX_Overflow); - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - } - else - { - EXCEPTION(EX_Underflow); - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - } - setsign(st0_ptr, sign); - return; - } - - control_word &= ~CW_RC; - control_word |= RC_CHOP; - reg_copy(st1_ptr, &tmp); - FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */ - control_word = old_cw; - scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl; - scale += exponent16(st0_ptr); - - setexponent16(st0_ptr, scale); - - /* Use FPU_round() to properly detect under/overflow etc */ - FPU_round(st0_ptr, 0, 0, control_word, sign); - - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); - - if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - switch ( st1_tag ) - { - case TAG_Valid: - /* st(0) must be a denormal */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */ - goto valid_scale; - - case TAG_Zero: - if ( st0_tag == TW_Denormal ) - denormal_operand(); - return; - - case TW_Denormal: - denormal_operand(); - return; - - case TW_Infinity: - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - if ( signpositive(st1_ptr) ) - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - else - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(st0_ptr, sign); - return; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int old_cw = control_word; + u_char sign = getsign(st0_ptr); + + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + long scale; + FPU_REG tmp; + + /* Convert register for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + valid_scale: + + if (exponent(st1_ptr) > 30) { + /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ + + if (signpositive(st1_ptr)) { + EXCEPTION(EX_Overflow); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + } else { + EXCEPTION(EX_Underflow); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + } + setsign(st0_ptr, sign); + return; + } - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; - } - } - else if ( st0_tag == TAG_Zero ) - { - switch ( st1_tag ) - { - case TAG_Valid: - case TAG_Zero: - return; + control_word &= ~CW_RC; + control_word |= RC_CHOP; + reg_copy(st1_ptr, &tmp); + FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */ + control_word = old_cw; + scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl; + scale += exponent16(st0_ptr); - case TW_Denormal: - denormal_operand(); - return; + setexponent16(st0_ptr, scale); - case TW_Infinity: - if ( signpositive(st1_ptr) ) - arith_invalid(0); /* Zero scaled by +Infinity */ - return; + /* Use FPU_round() to properly detect under/overflow etc */ + FPU_round(st0_ptr, 0, 0, control_word, sign); - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; + return; } - } - else if ( st0_tag == TW_Infinity ) - { - switch ( st1_tag ) - { - case TAG_Valid: - case TAG_Zero: - return; - - case TW_Denormal: - denormal_operand(); - return; - case TW_Infinity: - if ( signnegative(st1_ptr) ) - arith_invalid(0); /* Infinity scaled by -Infinity */ - return; - - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); + + if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + switch (st1_tag) { + case TAG_Valid: + /* st(0) must be a denormal */ + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */ + goto valid_scale; + + case TAG_Zero: + if (st0_tag == TW_Denormal) + denormal_operand(); + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + if (signpositive(st1_ptr)) + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + else + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st0_ptr, sign); + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TAG_Zero) { + switch (st1_tag) { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if (signpositive(st1_ptr)) + arith_invalid(0); /* Zero scaled by +Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TW_Infinity) { + switch (st1_tag) { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if (signnegative(st1_ptr)) + arith_invalid(0); /* Infinity scaled by -Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TW_NaN) { + if (st1_tag != TAG_Empty) { + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } } - } - else if ( st0_tag == TW_NaN ) - { - if ( st1_tag != TAG_Empty ) - { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; } - } - #ifdef PARANOID - if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) ) - { - EXCEPTION(EX_INTERNAL | 0x115); - return; - } + if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) { + EXCEPTION(EX_INTERNAL | 0x115); + return; + } #endif - /* At least one of st(0), st(1) must be empty */ - FPU_stack_underflow(); + /* At least one of st(0), st(1) must be empty */ + FPU_stack_underflow(); } - /*---------------------------------------------------------------------------*/ static FUNC_ST0 const trig_table_a[] = { - f2xm1, fyl2x, fptan, fpatan, - fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp + f2xm1, fyl2x, fptan, fpatan, + fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp }; void FPU_triga(void) { - (trig_table_a[FPU_rm])(&st(0), FPU_gettag0()); + (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0()); } - -static FUNC_ST0 const trig_table_b[] = - { - fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos - }; +static FUNC_ST0 const trig_table_b[] = { + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos +}; void FPU_trigb(void) { - (trig_table_b[FPU_rm])(&st(0), FPU_gettag0()); + (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0()); } diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 2e2c51a8bd3..d701e2b39e4 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -17,7 +17,6 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ - #include <linux/stddef.h> #include <asm/uaccess.h> @@ -27,31 +26,30 @@ #include "exception.h" #include "fpu_emu.h" - #define FPU_WRITE_BIT 0x10 static int reg_offset[] = { - offsetof(struct info,___eax), - offsetof(struct info,___ecx), - offsetof(struct info,___edx), - offsetof(struct info,___ebx), - offsetof(struct info,___esp), - offsetof(struct info,___ebp), - offsetof(struct info,___esi), - offsetof(struct info,___edi) + offsetof(struct info, ___eax), + offsetof(struct info, ___ecx), + offsetof(struct info, ___edx), + offsetof(struct info, ___ebx), + offsetof(struct info, ___esp), + offsetof(struct info, ___ebp), + offsetof(struct info, ___esi), + offsetof(struct info, ___edi) }; #define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) static int reg_offset_vm86[] = { - offsetof(struct info,___cs), - offsetof(struct info,___vm86_ds), - offsetof(struct info,___vm86_es), - offsetof(struct info,___vm86_fs), - offsetof(struct info,___vm86_gs), - offsetof(struct info,___ss), - offsetof(struct info,___vm86_ds) - }; + offsetof(struct info, ___cs), + offsetof(struct info, ___vm86_ds), + offsetof(struct info, ___vm86_es), + offsetof(struct info, ___vm86_fs), + offsetof(struct info, ___vm86_gs), + offsetof(struct info, ___ss), + offsetof(struct info, ___vm86_ds) +}; #define VM86_REG_(x) (*(unsigned short *) \ (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) @@ -60,158 +58,141 @@ static int reg_offset_vm86[] = { #define ___GS ___ds static int reg_offset_pm[] = { - offsetof(struct info,___cs), - offsetof(struct info,___ds), - offsetof(struct info,___es), - offsetof(struct info,___fs), - offsetof(struct info,___GS), - offsetof(struct info,___ss), - offsetof(struct info,___ds) - }; + offsetof(struct info, ___cs), + offsetof(struct info, ___ds), + offsetof(struct info, ___es), + offsetof(struct info, ___fs), + offsetof(struct info, ___GS), + offsetof(struct info, ___ss), + offsetof(struct info, ___ds) +}; #define PM_REG_(x) (*(unsigned short *) \ (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) - /* Decode the SIB byte. This function assumes mod != 0 */ static int sib(int mod, unsigned long *fpu_eip) { - u_char ss,index,base; - long offset; - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - ss = base >> 6; - index = (base >> 3) & 7; - base &= 7; - - if ((mod == 0) && (base == 5)) - offset = 0; /* No base register */ - else - offset = REG_(base); - - if (index == 4) - { - /* No index register */ - /* A non-zero ss is illegal */ - if ( ss ) - EXCEPTION(EX_Invalid); - } - else - { - offset += (REG_(index)) << ss; - } - - if (mod == 1) - { - /* 8 bit signed displacement */ - long displacement; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(displacement, (signed char __user *) (*fpu_eip)); - offset += displacement; - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - } - else if (mod == 2 || base == 5) /* The second condition also has mod==0 */ - { - /* 32 bit displacement */ - long displacement; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(displacement, (long __user *) (*fpu_eip)); - offset += displacement; - RE_ENTRANT_CHECK_ON; - (*fpu_eip) += 4; - } - - return offset; -} + u_char ss, index, base; + long offset; + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + ss = base >> 6; + index = (base >> 3) & 7; + base &= 7; + + if ((mod == 0) && (base == 5)) + offset = 0; /* No base register */ + else + offset = REG_(base); + + if (index == 4) { + /* No index register */ + /* A non-zero ss is illegal */ + if (ss) + EXCEPTION(EX_Invalid); + } else { + offset += (REG_(index)) << ss; + } + + if (mod == 1) { + /* 8 bit signed displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(displacement, (signed char __user *)(*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + } else if (mod == 2 || base == 5) { /* The second condition also has mod==0 */ + /* 32 bit displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(displacement, (long __user *)(*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip) += 4; + } + return offset; +} -static unsigned long vm86_segment(u_char segment, - struct address *addr) +static unsigned long vm86_segment(u_char segment, struct address *addr) { - segment--; + segment--; #ifdef PARANOID - if ( segment > PREFIX_SS_ ) - { - EXCEPTION(EX_INTERNAL|0x130); - math_abort(FPU_info,SIGSEGV); - } + if (segment > PREFIX_SS_) { + EXCEPTION(EX_INTERNAL | 0x130); + math_abort(FPU_info, SIGSEGV); + } #endif /* PARANOID */ - addr->selector = VM86_REG_(segment); - return (unsigned long)VM86_REG_(segment) << 4; + addr->selector = VM86_REG_(segment); + return (unsigned long)VM86_REG_(segment) << 4; } - /* This should work for 16 and 32 bit protected mode. */ static long pm_address(u_char FPU_modrm, u_char segment, struct address *addr, long offset) -{ - struct desc_struct descriptor; - unsigned long base_address, limit, address, seg_top; +{ + struct desc_struct descriptor; + unsigned long base_address, limit, address, seg_top; - segment--; + segment--; #ifdef PARANOID - /* segment is unsigned, so this also detects if segment was 0: */ - if ( segment > PREFIX_SS_ ) - { - EXCEPTION(EX_INTERNAL|0x132); - math_abort(FPU_info,SIGSEGV); - } + /* segment is unsigned, so this also detects if segment was 0: */ + if (segment > PREFIX_SS_) { + EXCEPTION(EX_INTERNAL | 0x132); + math_abort(FPU_info, SIGSEGV); + } #endif /* PARANOID */ - switch ( segment ) - { - /* gs isn't used by the kernel, so it still has its - user-space value. */ - case PREFIX_GS_-1: - /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ - savesegment(gs, addr->selector); - break; - default: - addr->selector = PM_REG_(segment); - } - - descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); - base_address = SEG_BASE_ADDR(descriptor); - address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1; - if ( limit < base_address ) limit = 0xffffffff; - - if ( SEG_EXPAND_DOWN(descriptor) ) - { - if ( SEG_G_BIT(descriptor) ) - seg_top = 0xffffffff; - else - { - seg_top = base_address + (1 << 20); - if ( seg_top < base_address ) seg_top = 0xffffffff; + switch (segment) { + /* gs isn't used by the kernel, so it still has its + user-space value. */ + case PREFIX_GS_ - 1: + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ + savesegment(gs, addr->selector); + break; + default: + addr->selector = PM_REG_(segment); } - access_limit = - (address <= limit) || (address >= seg_top) ? 0 : - ((seg_top-address) >= 255 ? 255 : seg_top-address); - } - else - { - access_limit = - (address > limit) || (address < base_address) ? 0 : - ((limit-address) >= 254 ? 255 : limit-address+1); - } - if ( SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) ) - { - access_limit = 0; - } - return address; -} + descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); + base_address = SEG_BASE_ADDR(descriptor); + address = base_address + offset; + limit = base_address + + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + if (limit < base_address) + limit = 0xffffffff; + + if (SEG_EXPAND_DOWN(descriptor)) { + if (SEG_G_BIT(descriptor)) + seg_top = 0xffffffff; + else { + seg_top = base_address + (1 << 20); + if (seg_top < base_address) + seg_top = 0xffffffff; + } + access_limit = + (address <= limit) || (address >= seg_top) ? 0 : + ((seg_top - address) >= 255 ? 255 : seg_top - address); + } else { + access_limit = + (address > limit) || (address < base_address) ? 0 : + ((limit - address) >= 254 ? 255 : limit - address + 1); + } + if (SEG_EXECUTE_ONLY(descriptor) || + (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + access_limit = 0; + } + return address; +} /* MOD R/M byte: MOD == 3 has a special use for the FPU @@ -221,7 +202,6 @@ static long pm_address(u_char FPU_modrm, u_char segment, ..... ......... ......... MOD OPCODE(2) R/M - SIB byte 7 6 5 4 3 2 1 0 @@ -231,208 +211,194 @@ static long pm_address(u_char FPU_modrm, u_char segment, */ void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, - fpu_addr_modes addr_modes) + struct address *addr, fpu_addr_modes addr_modes) +{ + u_char mod; + unsigned rm = FPU_modrm & 7; + long *cpu_reg_ptr; + int address = 0; /* Initialized just to stop compiler warnings. */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. */ + if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_)) { + math_abort(FPU_info, SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ + + mod = (FPU_modrm >> 6) & 3; + + if (rm == 4 && mod != 3) { + address = sib(mod, fpu_eip); + } else { + cpu_reg_ptr = ®_(rm); + switch (mod) { + case 0: + if (rm == 5) { + /* Special case: disp32 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, + (unsigned long __user + *)(*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + addr->offset = address; + return (void __user *)address; + } else { + address = *cpu_reg_ptr; /* Just return the contents + of the cpu register */ + addr->offset = address; + return (void __user *)address; + } + case 1: + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *)(*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; + case 2: + /* 32 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, (long __user *)(*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + break; + case 3: + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + } + address += *cpu_reg_ptr; + } + + addr->offset = address; + + switch (addr_modes.default_mode) { + case 0: + break; + case VM86: + address += vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL | 0x133); + } + + return (void __user *)address; +} + +void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, fpu_addr_modes addr_modes) { - u_char mod; - unsigned rm = FPU_modrm & 7; - long *cpu_reg_ptr; - int address = 0; /* Initialized just to stop compiler warnings. */ - - /* Memory accessed via the cs selector is write protected - in `non-segmented' 32 bit protected mode. */ - if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) - && (addr_modes.override.segment == PREFIX_CS_) ) - { - math_abort(FPU_info,SIGSEGV); - } - - addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ - - mod = (FPU_modrm >> 6) & 3; - - if (rm == 4 && mod != 3) - { - address = sib(mod, fpu_eip); - } - else - { - cpu_reg_ptr = & REG_(rm); - switch (mod) - { + u_char mod; + unsigned rm = FPU_modrm & 7; + int address = 0; /* Default used for mod == 0 */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. */ + if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_)) { + math_abort(FPU_info, SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ + + mod = (FPU_modrm >> 6) & 3; + + switch (mod) { case 0: - if (rm == 5) - { - /* Special case: disp32 */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(address, (unsigned long __user *) (*fpu_eip)); - (*fpu_eip) += 4; - RE_ENTRANT_CHECK_ON; - addr->offset = address; - return (void __user *) address; - } - else - { - address = *cpu_reg_ptr; /* Just return the contents - of the cpu register */ - addr->offset = address; - return (void __user *) address; - } + if (rm == 6) { + /* Special case: disp16 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, + (unsigned short __user *)(*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + goto add_segment; + } + break; case 1: - /* 8 bit signed displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(address, (signed char __user *) (*fpu_eip)); - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - break; + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *)(*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; case 2: - /* 32 bit displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(address, (long __user *) (*fpu_eip)); - (*fpu_eip) += 4; - RE_ENTRANT_CHECK_ON; - break; + /* 16 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, (unsigned short __user *)(*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + break; case 3: - /* Not legal for the FPU */ - EXCEPTION(EX_Invalid); + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + break; + } + switch (rm) { + case 0: + address += FPU_info->___ebx + FPU_info->___esi; + break; + case 1: + address += FPU_info->___ebx + FPU_info->___edi; + break; + case 2: + address += FPU_info->___ebp + FPU_info->___esi; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 3: + address += FPU_info->___ebp + FPU_info->___edi; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 4: + address += FPU_info->___esi; + break; + case 5: + address += FPU_info->___edi; + break; + case 6: + address += FPU_info->___ebp; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 7: + address += FPU_info->___ebx; + break; } - address += *cpu_reg_ptr; - } - - addr->offset = address; - - switch ( addr_modes.default_mode ) - { - case 0: - break; - case VM86: - address += vm86_segment(addr_modes.override.segment, addr); - break; - case PM16: - case SEG32: - address = pm_address(FPU_modrm, addr_modes.override.segment, - addr, address); - break; - default: - EXCEPTION(EX_INTERNAL|0x133); - } - - return (void __user *)address; -} + add_segment: + address &= 0xffff; -void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, - fpu_addr_modes addr_modes) -{ - u_char mod; - unsigned rm = FPU_modrm & 7; - int address = 0; /* Default used for mod == 0 */ - - /* Memory accessed via the cs selector is write protected - in `non-segmented' 32 bit protected mode. */ - if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) - && (addr_modes.override.segment == PREFIX_CS_) ) - { - math_abort(FPU_info,SIGSEGV); - } - - addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ - - mod = (FPU_modrm >> 6) & 3; - - switch (mod) - { - case 0: - if (rm == 6) - { - /* Special case: disp16 */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(2); - FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); - (*fpu_eip) += 2; - RE_ENTRANT_CHECK_ON; - goto add_segment; + addr->offset = address; + + switch (addr_modes.default_mode) { + case 0: + break; + case VM86: + address += vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL | 0x131); } - break; - case 1: - /* 8 bit signed displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(address, (signed char __user *) (*fpu_eip)); - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - break; - case 2: - /* 16 bit displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(2); - FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); - (*fpu_eip) += 2; - RE_ENTRANT_CHECK_ON; - break; - case 3: - /* Not legal for the FPU */ - EXCEPTION(EX_Invalid); - break; - } - switch ( rm ) - { - case 0: - address += FPU_info->___ebx + FPU_info->___esi; - break; - case 1: - address += FPU_info->___ebx + FPU_info->___edi; - break; - case 2: - address += FPU_info->___ebp + FPU_info->___esi; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 3: - address += FPU_info->___ebp + FPU_info->___edi; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 4: - address += FPU_info->___esi; - break; - case 5: - address += FPU_info->___edi; - break; - case 6: - address += FPU_info->___ebp; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 7: - address += FPU_info->___ebx; - break; - } - - add_segment: - address &= 0xffff; - - addr->offset = address; - - switch ( addr_modes.default_mode ) - { - case 0: - break; - case VM86: - address += vm86_segment(addr_modes.override.segment, addr); - break; - case PM16: - case SEG32: - address = pm_address(FPU_modrm, addr_modes.override.segment, - addr, address); - break; - default: - EXCEPTION(EX_INTERNAL|0x131); - } - - return (void __user *)address ; + + return (void __user *)address; } diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index eebd6fb1c8a..2931ff35521 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -26,247 +26,257 @@ #include "status_w.h" #include "control_w.h" - -#define _NONE_ 0 /* st0_ptr etc not needed */ -#define _REG0_ 1 /* Will be storing st(0) */ -#define _PUSH_ 3 /* Need to check for space to push onto stack */ -#define _null_ 4 /* Function illegal or not implemented */ +#define _NONE_ 0 /* st0_ptr etc not needed */ +#define _REG0_ 1 /* Will be storing st(0) */ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ #define pop_0() { FPU_settag0(TAG_Empty); top++; } - static u_char const type_table[32] = { - _PUSH_, _PUSH_, _PUSH_, _PUSH_, - _null_, _null_, _null_, _null_, - _REG0_, _REG0_, _REG0_, _REG0_, - _REG0_, _REG0_, _REG0_, _REG0_, - _NONE_, _null_, _NONE_, _PUSH_, - _NONE_, _PUSH_, _null_, _PUSH_, - _NONE_, _null_, _NONE_, _REG0_, - _NONE_, _REG0_, _NONE_, _REG0_ - }; + _PUSH_, _PUSH_, _PUSH_, _PUSH_, + _null_, _null_, _null_, _null_, + _REG0_, _REG0_, _REG0_, _REG0_, + _REG0_, _REG0_, _REG0_, _REG0_, + _NONE_, _null_, _NONE_, _PUSH_, + _NONE_, _PUSH_, _null_, _PUSH_, + _NONE_, _null_, _NONE_, _REG0_, + _NONE_, _REG0_, _NONE_, _REG0_ +}; u_char const data_sizes_16[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, - 14, 0, 94, 10, 2, 10, 0, 8, - 14, 0, 94, 10, 2, 10, 2, 8 + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 14, 0, 94, 10, 2, 10, 0, 8, + 14, 0, 94, 10, 2, 10, 2, 8 }; static u_char const data_sizes_32[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, - 28, 0,108, 10, 2, 10, 0, 8, - 28, 0,108, 10, 2, 10, 2, 8 + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 28, 0, 108, 10, 2, 10, 0, 8, + 28, 0, 108, 10, 2, 10, 2, 8 }; int FPU_load_store(u_char type, fpu_addr_modes addr_modes, - void __user *data_address) + void __user * data_address) { - FPU_REG loaded_data; - FPU_REG *st0_ptr; - u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ - u_char loaded_tag; + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ + u_char loaded_tag; - st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ + st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ - if ( addr_modes.default_mode & PROTECTED ) - { - if ( addr_modes.default_mode == SEG32 ) - { - if ( access_limit < data_sizes_32[type] ) - math_abort(FPU_info,SIGSEGV); - } - else if ( addr_modes.default_mode == PM16 ) - { - if ( access_limit < data_sizes_16[type] ) - math_abort(FPU_info,SIGSEGV); - } + if (addr_modes.default_mode & PROTECTED) { + if (addr_modes.default_mode == SEG32) { + if (access_limit < data_sizes_32[type]) + math_abort(FPU_info, SIGSEGV); + } else if (addr_modes.default_mode == PM16) { + if (access_limit < data_sizes_16[type]) + math_abort(FPU_info, SIGSEGV); + } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL|0x140); + else + EXCEPTION(EX_INTERNAL | 0x140); #endif /* PARANOID */ - } + } - switch ( type_table[type] ) - { - case _NONE_: - break; - case _REG0_: - st0_ptr = &st(0); /* Some of these instructions pop after - storing */ - st0_tag = FPU_gettag0(); - break; - case _PUSH_: - { - if ( FPU_gettagi(-1) != TAG_Empty ) - { FPU_stack_overflow(); return 0; } - top--; - st0_ptr = &st(0); - } - break; - case _null_: - FPU_illegal(); - return 0; + switch (type_table[type]) { + case _NONE_: + break; + case _REG0_: + st0_ptr = &st(0); /* Some of these instructions pop after + storing */ + st0_tag = FPU_gettag0(); + break; + case _PUSH_: + { + if (FPU_gettagi(-1) != TAG_Empty) { + FPU_stack_overflow(); + return 0; + } + top--; + st0_ptr = &st(0); + } + break; + case _null_: + FPU_illegal(); + return 0; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x141); - return 0; + default: + EXCEPTION(EX_INTERNAL | 0x141); + return 0; #endif /* PARANOID */ - } - - switch ( type ) - { - case 000: /* fld m32real */ - clear_C1(); - loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); - if ( (loaded_tag == TAG_Special) - && isNaN(&loaded_data) - && (real_1op_NaN(&loaded_data) < 0) ) - { - top++; - break; - } - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 001: /* fild m32int */ - clear_C1(); - loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 002: /* fld m64real */ - clear_C1(); - loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data); - if ( (loaded_tag == TAG_Special) - && isNaN(&loaded_data) - && (real_1op_NaN(&loaded_data) < 0) ) - { - top++; - break; } - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 003: /* fild m16int */ - clear_C1(); - loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 010: /* fst m32real */ - clear_C1(); - FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address); - break; - case 011: /* fist m32int */ - clear_C1(); - FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); - break; - case 012: /* fst m64real */ - clear_C1(); - FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address); - break; - case 013: /* fist m16int */ - clear_C1(); - FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); - break; - case 014: /* fstp m32real */ - clear_C1(); - if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 015: /* fistp m32int */ - clear_C1(); - if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 016: /* fstp m64real */ - clear_C1(); - if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 017: /* fistp m16int */ - clear_C1(); - if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 020: /* fldenv m14/28byte */ - fldenv(addr_modes, (u_char __user *)data_address); - /* Ensure that the values just loaded are not changed by - fix-up operations. */ - return 1; - case 022: /* frstor m94/108byte */ - frstor(addr_modes, (u_char __user *)data_address); - /* Ensure that the values just loaded are not changed by - fix-up operations. */ - return 1; - case 023: /* fbld m80dec */ - clear_C1(); - loaded_tag = FPU_load_bcd((u_char __user *)data_address); - FPU_settag0(loaded_tag); - break; - case 024: /* fldcw */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, data_address, 2); - FPU_get_user(control_word, (unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - else - partial_status &= ~(SW_Summary | SW_Backward); + + switch (type) { + case 000: /* fld m32real */ + clear_C1(); + loaded_tag = + FPU_load_single((float __user *)data_address, &loaded_data); + if ((loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0)) { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 001: /* fild m32int */ + clear_C1(); + loaded_tag = + FPU_load_int32((long __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 002: /* fld m64real */ + clear_C1(); + loaded_tag = + FPU_load_double((double __user *)data_address, + &loaded_data); + if ((loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0)) { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 003: /* fild m16int */ + clear_C1(); + loaded_tag = + FPU_load_int16((short __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 010: /* fst m32real */ + clear_C1(); + FPU_store_single(st0_ptr, st0_tag, + (float __user *)data_address); + break; + case 011: /* fist m32int */ + clear_C1(); + FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); + break; + case 012: /* fst m64real */ + clear_C1(); + FPU_store_double(st0_ptr, st0_tag, + (double __user *)data_address); + break; + case 013: /* fist m16int */ + clear_C1(); + FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); + break; + case 014: /* fstp m32real */ + clear_C1(); + if (FPU_store_single + (st0_ptr, st0_tag, (float __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 015: /* fistp m32int */ + clear_C1(); + if (FPU_store_int32 + (st0_ptr, st0_tag, (long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 016: /* fstp m64real */ + clear_C1(); + if (FPU_store_double + (st0_ptr, st0_tag, (double __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 017: /* fistp m16int */ + clear_C1(); + if (FPU_store_int16 + (st0_ptr, st0_tag, (short __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 020: /* fldenv m14/28byte */ + fldenv(addr_modes, (u_char __user *) data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. */ + return 1; + case 022: /* frstor m94/108byte */ + frstor(addr_modes, (u_char __user *) data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. */ + return 1; + case 023: /* fbld m80dec */ + clear_C1(); + loaded_tag = FPU_load_bcd((u_char __user *) data_address); + FPU_settag0(loaded_tag); + break; + case 024: /* fldcw */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, data_address, 2); + FPU_get_user(control_word, + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); #ifdef PECULIAR_486 - control_word |= 0x40; /* An 80486 appears to always set this bit */ + control_word |= 0x40; /* An 80486 appears to always set this bit */ #endif /* PECULIAR_486 */ - return 1; - case 025: /* fld m80real */ - clear_C1(); - loaded_tag = FPU_load_extended((long double __user *)data_address, 0); - FPU_settag0(loaded_tag); - break; - case 027: /* fild m64int */ - clear_C1(); - loaded_tag = FPU_load_int64((long long __user *)data_address); - if (loaded_tag == TAG_Error) + return 1; + case 025: /* fld m80real */ + clear_C1(); + loaded_tag = + FPU_load_extended((long double __user *)data_address, 0); + FPU_settag0(loaded_tag); + break; + case 027: /* fild m64int */ + clear_C1(); + loaded_tag = FPU_load_int64((long long __user *)data_address); + if (loaded_tag == TAG_Error) + return 0; + FPU_settag0(loaded_tag); + break; + case 030: /* fstenv m14/28byte */ + fstenv(addr_modes, (u_char __user *) data_address); + return 1; + case 032: /* fsave */ + fsave(addr_modes, (u_char __user *) data_address); + return 1; + case 033: /* fbstp m80dec */ + clear_C1(); + if (FPU_store_bcd + (st0_ptr, st0_tag, (u_char __user *) data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 034: /* fstcw m16int */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_put_user(control_word, + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 035: /* fstp m80real */ + clear_C1(); + if (FPU_store_extended + (st0_ptr, st0_tag, (long double __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 036: /* fstsw m2byte */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_put_user(status_word(), + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 037: /* fistp m64int */ + clear_C1(); + if (FPU_store_int64 + (st0_ptr, st0_tag, (long long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + } return 0; - FPU_settag0(loaded_tag); - break; - case 030: /* fstenv m14/28byte */ - fstenv(addr_modes, (u_char __user *)data_address); - return 1; - case 032: /* fsave */ - fsave(addr_modes, (u_char __user *)data_address); - return 1; - case 033: /* fbstp m80dec */ - clear_C1(); - if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 034: /* fstcw m16int */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,data_address,2); - FPU_put_user(control_word, (unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - return 1; - case 035: /* fstp m80real */ - clear_C1(); - if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 036: /* fstsw m2byte */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,data_address,2); - FPU_put_user(status_word(),(unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - return 1; - case 037: /* fistp m64int */ - clear_C1(); - if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - } - return 0; } diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h index 4db79811492..168eb44c93c 100644 --- a/arch/x86/math-emu/poly.h +++ b/arch/x86/math-emu/poly.h @@ -21,9 +21,9 @@ allows. 9-byte would probably be sufficient. */ typedef struct { - unsigned long lsw; - unsigned long midw; - unsigned long msw; + unsigned long lsw; + unsigned long midw; + unsigned long msw; } Xsig; asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b, @@ -49,7 +49,6 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); /* Macro to access the 8 ms bytes of an Xsig as a long long */ #define XSIG_LL(x) (*(unsigned long long *)&x.midw) - /* Need to run gcc with optimizations on to get these to actually be in-line. @@ -63,59 +62,53 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); static inline unsigned long mul_32_32(const unsigned long arg1, const unsigned long arg2) { - int retval; - asm volatile ("mull %2; movl %%edx,%%eax" \ - :"=a" (retval) \ - :"0" (arg1), "g" (arg2) \ - :"dx"); - return retval; + int retval; + asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval) + :"0"(arg1), "g"(arg2) + :"dx"); + return retval; } - /* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */ static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2) { - asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" - "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" - "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" - "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n" - :"=g" (*dest):"g" (dest), "g" (x2) - :"ax","si","di"); + asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g" + (*dest):"g"(dest), "g"(x2) + :"ax", "si", "di"); } - /* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */ /* Note: the constraints in the asm statement didn't always work properly with gcc 2.5.8. Changing from using edi to using ecx got around the problem, but keep fingers crossed! */ static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp) { - asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" - "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" - "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" - "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" - "jnc 0f;\n" - "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" - "movl %4,%%ecx; incl (%%ecx)\n" - "movl $1,%%eax; jmp 1f;\n" - "0: xorl %%eax,%%eax;\n" - "1:\n" - :"=g" (*exp), "=g" (*dest) - :"g" (dest), "g" (x2), "g" (exp) - :"cx","si","ax"); + asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" + "jnc 0f;\n" + "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" + "movl %4,%%ecx; incl (%%ecx)\n" + "movl $1,%%eax; jmp 1f;\n" + "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest) + :"g"(dest), "g"(x2), "g"(exp) + :"cx", "si", "ax"); } - /* Negate (subtract from 1.0) the 12 byte Xsig */ /* This is faster in a loop on my 386 than using the "neg" instruction. */ static inline void negate_Xsig(Xsig *x) { - asm volatile("movl %1,%%esi;\n" - "xorl %%ecx,%%ecx;\n" - "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" - "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" - "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n" - :"=g" (*x):"g" (x):"si","ax","cx"); + asm volatile ("movl %1,%%esi;\n" + "xorl %%ecx,%%ecx;\n" + "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" + "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" + "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g" + (*x):"g"(x):"si", "ax", "cx"); } #endif /* _POLY_H */ diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c index 9766ad5e974..b00e9e10cdc 100644 --- a/arch/x86/math-emu/poly_2xm1.c +++ b/arch/x86/math-emu/poly_2xm1.c @@ -17,21 +17,19 @@ #include "control_w.h" #include "poly.h" - #define HIPOWER 11 -static const unsigned long long lterms[HIPOWER] = -{ - 0x0000000000000000LL, /* This term done separately as 12 bytes */ - 0xf5fdeffc162c7543LL, - 0x1c6b08d704a0bfa6LL, - 0x0276556df749cc21LL, - 0x002bb0ffcf14f6b8LL, - 0x0002861225ef751cLL, - 0x00001ffcbfcd5422LL, - 0x00000162c005d5f1LL, - 0x0000000da96ccb1bLL, - 0x0000000078d1b897LL, - 0x000000000422b029LL +static const unsigned long long lterms[HIPOWER] = { + 0x0000000000000000LL, /* This term done separately as 12 bytes */ + 0xf5fdeffc162c7543LL, + 0x1c6b08d704a0bfa6LL, + 0x0276556df749cc21LL, + 0x002bb0ffcf14f6b8LL, + 0x0002861225ef751cLL, + 0x00001ffcbfcd5422LL, + 0x00000162c005d5f1LL, + 0x0000000da96ccb1bLL, + 0x0000000078d1b897LL, + 0x000000000422b029LL }; static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194); @@ -45,112 +43,103 @@ static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3); static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9); static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1, - &shiftterm2, &shiftterm3 }; - + &shiftterm2, &shiftterm3 +}; /*--- poly_2xm1() -----------------------------------------------------------+ | Requires st(0) which is TAG_Valid and < 1. | +---------------------------------------------------------------------------*/ -int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) +int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) { - long int exponent, shift; - unsigned long long Xll; - Xsig accumulator, Denom, argSignif; - u_char tag; + long int exponent, shift; + unsigned long long Xll; + Xsig accumulator, Denom, argSignif; + u_char tag; - exponent = exponent16(arg); + exponent = exponent16(arg); #ifdef PARANOID - if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */ - { - /* Number negative, too large, or not Valid. */ - EXCEPTION(EX_INTERNAL|0x127); - return 1; - } + if (exponent >= 0) { /* Don't want a |number| >= 1.0 */ + /* Number negative, too large, or not Valid. */ + EXCEPTION(EX_INTERNAL | 0x127); + return 1; + } #endif /* PARANOID */ - argSignif.lsw = 0; - XSIG_LL(argSignif) = Xll = significand(arg); - - if ( exponent == -1 ) - { - shift = (argSignif.msw & 0x40000000) ? 3 : 2; - /* subtract 0.5 or 0.75 */ - exponent -= 2; - XSIG_LL(argSignif) <<= 2; - Xll <<= 2; - } - else if ( exponent == -2 ) - { - shift = 1; - /* subtract 0.25 */ - exponent--; - XSIG_LL(argSignif) <<= 1; - Xll <<= 1; - } - else - shift = 0; - - if ( exponent < -2 ) - { - /* Shift the argument right by the required places. */ - if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U ) - Xll++; /* round up */ - } - - accumulator.lsw = accumulator.midw = accumulator.msw = 0; - polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1); - mul_Xsig_Xsig(&accumulator, &argSignif); - shr_Xsig(&accumulator, 3); - - mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ - add_two_Xsig(&accumulator, &argSignif, &exponent); - - if ( shift ) - { - /* The argument is large, use the identity: - f(x+a) = f(a) * (f(x) + 1) - 1; - */ - shr_Xsig(&accumulator, - exponent); - accumulator.msw |= 0x80000000; /* add 1.0 */ - mul_Xsig_Xsig(&accumulator, shiftterm[shift]); - accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ - exponent = 1; - } - - if ( sign != SIGN_POS ) - { - /* The argument is negative, use the identity: - f(-x) = -f(x) / (1 + f(x)) - */ - Denom.lsw = accumulator.lsw; - XSIG_LL(Denom) = XSIG_LL(accumulator); - if ( exponent < 0 ) - shr_Xsig(&Denom, - exponent); - else if ( exponent > 0 ) - { - /* exponent must be 1 here */ - XSIG_LL(Denom) <<= 1; - if ( Denom.lsw & 0x80000000 ) - XSIG_LL(Denom) |= 1; - (Denom.lsw) <<= 1; + argSignif.lsw = 0; + XSIG_LL(argSignif) = Xll = significand(arg); + + if (exponent == -1) { + shift = (argSignif.msw & 0x40000000) ? 3 : 2; + /* subtract 0.5 or 0.75 */ + exponent -= 2; + XSIG_LL(argSignif) <<= 2; + Xll <<= 2; + } else if (exponent == -2) { + shift = 1; + /* subtract 0.25 */ + exponent--; + XSIG_LL(argSignif) <<= 1; + Xll <<= 1; + } else + shift = 0; + + if (exponent < -2) { + /* Shift the argument right by the required places. */ + if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U) + Xll++; /* round up */ + } + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1); + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 3); + + mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ + add_two_Xsig(&accumulator, &argSignif, &exponent); + + if (shift) { + /* The argument is large, use the identity: + f(x+a) = f(a) * (f(x) + 1) - 1; + */ + shr_Xsig(&accumulator, -exponent); + accumulator.msw |= 0x80000000; /* add 1.0 */ + mul_Xsig_Xsig(&accumulator, shiftterm[shift]); + accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ + exponent = 1; + } + + if (sign != SIGN_POS) { + /* The argument is negative, use the identity: + f(-x) = -f(x) / (1 + f(x)) + */ + Denom.lsw = accumulator.lsw; + XSIG_LL(Denom) = XSIG_LL(accumulator); + if (exponent < 0) + shr_Xsig(&Denom, -exponent); + else if (exponent > 0) { + /* exponent must be 1 here */ + XSIG_LL(Denom) <<= 1; + if (Denom.lsw & 0x80000000) + XSIG_LL(Denom) |= 1; + (Denom.lsw) <<= 1; + } + Denom.msw |= 0x80000000; /* add 1.0 */ + div_Xsig(&accumulator, &Denom, &accumulator); } - Denom.msw |= 0x80000000; /* add 1.0 */ - div_Xsig(&accumulator, &Denom, &accumulator); - } - /* Convert to 64 bit signed-compatible */ - exponent += round_Xsig(&accumulator); + /* Convert to 64 bit signed-compatible */ + exponent += round_Xsig(&accumulator); - result = &st(0); - significand(result) = XSIG_LL(accumulator); - setexponent16(result, exponent); + result = &st(0); + significand(result) = XSIG_LL(accumulator); + setexponent16(result, exponent); - tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); + tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); - setsign(result, sign); - FPU_settag0(tag); + setsign(result, sign); + FPU_settag0(tag); - return 0; + return 0; } diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c index 82f702952f6..20c28e58e2d 100644 --- a/arch/x86/math-emu/poly_atan.c +++ b/arch/x86/math-emu/poly_atan.c @@ -18,28 +18,25 @@ #include "control_w.h" #include "poly.h" - #define HIPOWERon 6 /* odd poly, negative terms */ -static const unsigned long long oddnegterms[HIPOWERon] = -{ - 0x0000000000000000LL, /* Dummy (not for - 1.0) */ - 0x015328437f756467LL, - 0x0005dda27b73dec6LL, - 0x0000226bf2bfb91aLL, - 0x000000ccc439c5f7LL, - 0x0000000355438407LL -} ; +static const unsigned long long oddnegterms[HIPOWERon] = { + 0x0000000000000000LL, /* Dummy (not for - 1.0) */ + 0x015328437f756467LL, + 0x0005dda27b73dec6LL, + 0x0000226bf2bfb91aLL, + 0x000000ccc439c5f7LL, + 0x0000000355438407LL +}; #define HIPOWERop 6 /* odd poly, positive terms */ -static const unsigned long long oddplterms[HIPOWERop] = -{ +static const unsigned long long oddplterms[HIPOWERop] = { /* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */ - 0x0db55a71875c9ac2LL, - 0x0029fce2d67880b0LL, - 0x0000dfd3908b4596LL, - 0x00000550fd61dab4LL, - 0x0000001c9422b3f9LL, - 0x000000003e3301e1LL + 0x0db55a71875c9ac2LL, + 0x0029fce2d67880b0LL, + 0x0000dfd3908b4596LL, + 0x00000550fd61dab4LL, + 0x0000001c9422b3f9LL, + 0x000000003e3301e1LL }; static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL; @@ -48,182 +45,164 @@ static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa); static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b); - /*--- poly_atan() -----------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, - FPU_REG *st1_ptr, u_char st1_tag) +void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, + FPU_REG *st1_ptr, u_char st1_tag) { - u_char transformed, inverted, - sign1, sign2; - int exponent; - long int dummy_exp; - Xsig accumulator, Numer, Denom, accumulatore, argSignif, - argSq, argSqSq; - u_char tag; - - sign1 = getsign(st0_ptr); - sign2 = getsign(st1_ptr); - if ( st0_tag == TAG_Valid ) - { - exponent = exponent(st0_ptr); - } - else - { - /* This gives non-compatible stack contents... */ - FPU_to_exp16(st0_ptr, st0_ptr); - exponent = exponent16(st0_ptr); - } - if ( st1_tag == TAG_Valid ) - { - exponent -= exponent(st1_ptr); - } - else - { - /* This gives non-compatible stack contents... */ - FPU_to_exp16(st1_ptr, st1_ptr); - exponent -= exponent16(st1_ptr); - } - - if ( (exponent < 0) || ((exponent == 0) && - ((st0_ptr->sigh < st1_ptr->sigh) || - ((st0_ptr->sigh == st1_ptr->sigh) && - (st0_ptr->sigl < st1_ptr->sigl))) ) ) - { - inverted = 1; - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = significand(st0_ptr); - XSIG_LL(Denom) = significand(st1_ptr); - } - else - { - inverted = 0; - exponent = -exponent; - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = significand(st1_ptr); - XSIG_LL(Denom) = significand(st0_ptr); - } - div_Xsig(&Numer, &Denom, &argSignif); - exponent += norm_Xsig(&argSignif); - - if ( (exponent >= -1) - || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) ) - { - /* The argument is greater than sqrt(2)-1 (=0.414213562...) */ - /* Convert the argument by an identity for atan */ - transformed = 1; - - if ( exponent >= 0 ) - { + u_char transformed, inverted, sign1, sign2; + int exponent; + long int dummy_exp; + Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq; + u_char tag; + + sign1 = getsign(st0_ptr); + sign2 = getsign(st1_ptr); + if (st0_tag == TAG_Valid) { + exponent = exponent(st0_ptr); + } else { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st0_ptr, st0_ptr); + exponent = exponent16(st0_ptr); + } + if (st1_tag == TAG_Valid) { + exponent -= exponent(st1_ptr); + } else { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st1_ptr, st1_ptr); + exponent -= exponent16(st1_ptr); + } + + if ((exponent < 0) || ((exponent == 0) && + ((st0_ptr->sigh < st1_ptr->sigh) || + ((st0_ptr->sigh == st1_ptr->sigh) && + (st0_ptr->sigl < st1_ptr->sigl))))) { + inverted = 1; + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st0_ptr); + XSIG_LL(Denom) = significand(st1_ptr); + } else { + inverted = 0; + exponent = -exponent; + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st1_ptr); + XSIG_LL(Denom) = significand(st0_ptr); + } + div_Xsig(&Numer, &Denom, &argSignif); + exponent += norm_Xsig(&argSignif); + + if ((exponent >= -1) + || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) { + /* The argument is greater than sqrt(2)-1 (=0.414213562...) */ + /* Convert the argument by an identity for atan */ + transformed = 1; + + if (exponent >= 0) { #ifdef PARANOID - if ( !( (exponent == 0) && - (argSignif.lsw == 0) && (argSignif.midw == 0) && - (argSignif.msw == 0x80000000) ) ) - { - EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */ - return; - } + if (!((exponent == 0) && + (argSignif.lsw == 0) && (argSignif.midw == 0) && + (argSignif.msw == 0x80000000))) { + EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */ + return; + } #endif /* PARANOID */ - argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ + argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ + } else { + Numer.lsw = Denom.lsw = argSignif.lsw; + XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); + + if (exponent < -1) + shr_Xsig(&Numer, -1 - exponent); + negate_Xsig(&Numer); + + shr_Xsig(&Denom, -exponent); + Denom.msw |= 0x80000000; + + div_Xsig(&Numer, &Denom, &argSignif); + + exponent = -1 + norm_Xsig(&argSignif); + } + } else { + transformed = 0; + } + + argSq.lsw = argSignif.lsw; + argSq.midw = argSignif.midw; + argSq.msw = argSignif.msw; + mul_Xsig_Xsig(&argSq, &argSq); + + argSqSq.lsw = argSq.lsw; + argSqSq.midw = argSq.midw; + argSqSq.msw = argSq.msw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + accumulatore.lsw = argSq.lsw; + XSIG_LL(accumulatore) = XSIG_LL(argSq); + + shr_Xsig(&argSq, 2 * (-1 - exponent - 1)); + shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1)); + + /* Now have argSq etc with binary point at the left + .1xxxxxxxx */ + + /* Do the basic fixed point polynomial evaluation */ + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), + oddplterms, HIPOWERop - 1); + mul64_Xsig(&accumulator, &XSIG_LL(argSq)); + negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, + HIPOWERon - 1); + negate_Xsig(&accumulator); + add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp); + + mul64_Xsig(&accumulatore, &denomterm); + shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent)); + accumulatore.msw |= 0x80000000; + + div_Xsig(&accumulator, &accumulatore, &accumulator); + + mul_Xsig_Xsig(&accumulator, &argSignif); + mul_Xsig_Xsig(&accumulator, &argSq); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &argSignif); + + if (transformed) { + /* compute pi/4 - accumulator */ + shr_Xsig(&accumulator, -1 - exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = -1; + } + + if (inverted) { + /* compute pi/2 - accumulator */ + shr_Xsig(&accumulator, -exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 0; } - else - { - Numer.lsw = Denom.lsw = argSignif.lsw; - XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); - - if ( exponent < -1 ) - shr_Xsig(&Numer, -1-exponent); - negate_Xsig(&Numer); - - shr_Xsig(&Denom, -exponent); - Denom.msw |= 0x80000000; - - div_Xsig(&Numer, &Denom, &argSignif); - - exponent = -1 + norm_Xsig(&argSignif); + + if (sign1) { + /* compute pi - accumulator */ + shr_Xsig(&accumulator, 1 - exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 1; } - } - else - { - transformed = 0; - } - - argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw; - argSq.msw = argSignif.msw; - mul_Xsig_Xsig(&argSq, &argSq); - - argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw; - mul_Xsig_Xsig(&argSqSq, &argSqSq); - - accumulatore.lsw = argSq.lsw; - XSIG_LL(accumulatore) = XSIG_LL(argSq); - - shr_Xsig(&argSq, 2*(-1-exponent-1)); - shr_Xsig(&argSqSq, 4*(-1-exponent-1)); - - /* Now have argSq etc with binary point at the left - .1xxxxxxxx */ - - /* Do the basic fixed point polynomial evaluation */ - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), - oddplterms, HIPOWERop-1); - mul64_Xsig(&accumulator, &XSIG_LL(argSq)); - negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1); - negate_Xsig(&accumulator); - add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp); - - mul64_Xsig(&accumulatore, &denomterm); - shr_Xsig(&accumulatore, 1 + 2*(-1-exponent)); - accumulatore.msw |= 0x80000000; - - div_Xsig(&accumulator, &accumulatore, &accumulator); - - mul_Xsig_Xsig(&accumulator, &argSignif); - mul_Xsig_Xsig(&accumulator, &argSq); - - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &argSignif); - - if ( transformed ) - { - /* compute pi/4 - accumulator */ - shr_Xsig(&accumulator, -1-exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = -1; - } - - if ( inverted ) - { - /* compute pi/2 - accumulator */ - shr_Xsig(&accumulator, -exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = 0; - } - - if ( sign1 ) - { - /* compute pi - accumulator */ - shr_Xsig(&accumulator, 1 - exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = 1; - } - - exponent += round_Xsig(&accumulator); - - significand(st1_ptr) = XSIG_LL(accumulator); - setexponent16(st1_ptr, exponent); - - tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2); - FPU_settagi(1, tag); - - set_precision_flag_up(); /* We do not really know if up or down, - use this as the default. */ + + exponent += round_Xsig(&accumulator); + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, exponent); + + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* We do not really know if up or down, + use this as the default. */ } diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c index dd00e1d5b07..8e2ff4b28a0 100644 --- a/arch/x86/math-emu/poly_l2.c +++ b/arch/x86/math-emu/poly_l2.c @@ -10,7 +10,6 @@ | | +---------------------------------------------------------------------------*/ - #include "exception.h" #include "reg_constant.h" #include "fpu_emu.h" @@ -18,184 +17,163 @@ #include "control_w.h" #include "poly.h" - static void log2_kernel(FPU_REG const *arg, u_char argsign, - Xsig *accum_result, long int *expon); - + Xsig * accum_result, long int *expon); /*--- poly_l2() -------------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | +---------------------------------------------------------------------------*/ -void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) +void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) { - long int exponent, expon, expon_expon; - Xsig accumulator, expon_accum, yaccum; - u_char sign, argsign; - FPU_REG x; - int tag; - - exponent = exponent16(st0_ptr); - - /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ - if ( st0_ptr->sigh > (unsigned)0xb504f334 ) - { - /* Treat as sqrt(2)/2 < st0_ptr < 1 */ - significand(&x) = - significand(st0_ptr); - setexponent16(&x, -1); - exponent++; - argsign = SIGN_NEG; - } - else - { - /* Treat as 1 <= st0_ptr < sqrt(2) */ - x.sigh = st0_ptr->sigh - 0x80000000; - x.sigl = st0_ptr->sigl; - setexponent16(&x, 0); - argsign = SIGN_POS; - } - tag = FPU_normalize_nuo(&x); - - if ( tag == TAG_Zero ) - { - expon = 0; - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - } - else - { - log2_kernel(&x, argsign, &accumulator, &expon); - } - - if ( exponent < 0 ) - { - sign = SIGN_NEG; - exponent = -exponent; - } - else - sign = SIGN_POS; - expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0; - if ( exponent ) - { - expon_expon = 31 + norm_Xsig(&expon_accum); - shr_Xsig(&accumulator, expon_expon - expon); - - if ( sign ^ argsign ) - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &expon_accum); - } - else - { - expon_expon = expon; - sign = argsign; - } - - yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr); - mul_Xsig_Xsig(&accumulator, &yaccum); - - expon_expon += round_Xsig(&accumulator); - - if ( accumulator.msw == 0 ) - { - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - return; - } - - significand(st1_ptr) = XSIG_LL(accumulator); - setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1); - - tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign); - FPU_settagi(1, tag); - - set_precision_flag_up(); /* 80486 appears to always do this */ - - return; + long int exponent, expon, expon_expon; + Xsig accumulator, expon_accum, yaccum; + u_char sign, argsign; + FPU_REG x; + int tag; + + exponent = exponent16(st0_ptr); + + /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ + if (st0_ptr->sigh > (unsigned)0xb504f334) { + /* Treat as sqrt(2)/2 < st0_ptr < 1 */ + significand(&x) = -significand(st0_ptr); + setexponent16(&x, -1); + exponent++; + argsign = SIGN_NEG; + } else { + /* Treat as 1 <= st0_ptr < sqrt(2) */ + x.sigh = st0_ptr->sigh - 0x80000000; + x.sigl = st0_ptr->sigl; + setexponent16(&x, 0); + argsign = SIGN_POS; + } + tag = FPU_normalize_nuo(&x); -} + if (tag == TAG_Zero) { + expon = 0; + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + } else { + log2_kernel(&x, argsign, &accumulator, &expon); + } + + if (exponent < 0) { + sign = SIGN_NEG; + exponent = -exponent; + } else + sign = SIGN_POS; + expon_accum.msw = exponent; + expon_accum.midw = expon_accum.lsw = 0; + if (exponent) { + expon_expon = 31 + norm_Xsig(&expon_accum); + shr_Xsig(&accumulator, expon_expon - expon); + + if (sign ^ argsign) + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &expon_accum); + } else { + expon_expon = expon; + sign = argsign; + } + + yaccum.lsw = 0; + XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); + + expon_expon += round_Xsig(&accumulator); + + if (accumulator.msw == 0) { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + return; + } + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1); + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* 80486 appears to always do this */ + + return; + +} /*--- poly_l2p1() -----------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | | log2(x+1) | +---------------------------------------------------------------------------*/ -int poly_l2p1(u_char sign0, u_char sign1, - FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest) +int poly_l2p1(u_char sign0, u_char sign1, + FPU_REG * st0_ptr, FPU_REG * st1_ptr, FPU_REG * dest) { - u_char tag; - long int exponent; - Xsig accumulator, yaccum; + u_char tag; + long int exponent; + Xsig accumulator, yaccum; - if ( exponent16(st0_ptr) < 0 ) - { - log2_kernel(st0_ptr, sign0, &accumulator, &exponent); + if (exponent16(st0_ptr) < 0) { + log2_kernel(st0_ptr, sign0, &accumulator, &exponent); - yaccum.lsw = 0; - XSIG_LL(yaccum) = significand(st1_ptr); - mul_Xsig_Xsig(&accumulator, &yaccum); + yaccum.lsw = 0; + XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); - exponent += round_Xsig(&accumulator); + exponent += round_Xsig(&accumulator); - exponent += exponent16(st1_ptr) + 1; - if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER; + exponent += exponent16(st1_ptr) + 1; + if (exponent < EXP_WAY_UNDER) + exponent = EXP_WAY_UNDER; - significand(dest) = XSIG_LL(accumulator); - setexponent16(dest, exponent); + significand(dest) = XSIG_LL(accumulator); + setexponent16(dest, exponent); - tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); - FPU_settagi(1, tag); + tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); + FPU_settagi(1, tag); - if ( tag == TAG_Valid ) - set_precision_flag_up(); /* 80486 appears to always do this */ - } - else - { - /* The magnitude of st0_ptr is far too large. */ + if (tag == TAG_Valid) + set_precision_flag_up(); /* 80486 appears to always do this */ + } else { + /* The magnitude of st0_ptr is far too large. */ - if ( sign0 != SIGN_POS ) - { - /* Trying to get the log of a negative number. */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + if (sign0 != SIGN_POS) { + /* Trying to get the log of a negative number. */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) - return 1; + if (arith_invalid(1) < 0) + return 1; #endif /* PECULIAR_486 */ - } + } - /* 80486 appears to do this */ - if ( sign0 == SIGN_NEG ) - set_precision_flag_down(); - else - set_precision_flag_up(); - } + /* 80486 appears to do this */ + if (sign0 == SIGN_NEG) + set_precision_flag_down(); + else + set_precision_flag_up(); + } - if ( exponent(dest) <= EXP_UNDER ) - EXCEPTION(EX_Underflow); + if (exponent(dest) <= EXP_UNDER) + EXCEPTION(EX_Underflow); - return 0; + return 0; } - - - #undef HIPOWER #define HIPOWER 10 -static const unsigned long long logterms[HIPOWER] = -{ - 0x2a8eca5705fc2ef0LL, - 0xf6384ee1d01febceLL, - 0x093bb62877cdf642LL, - 0x006985d8a9ec439bLL, - 0x0005212c4f55a9c8LL, - 0x00004326a16927f0LL, - 0x0000038d1d80a0e7LL, - 0x0000003141cc80c6LL, - 0x00000002b1668c9fLL, - 0x000000002c7a46aaLL +static const unsigned long long logterms[HIPOWER] = { + 0x2a8eca5705fc2ef0LL, + 0xf6384ee1d01febceLL, + 0x093bb62877cdf642LL, + 0x006985d8a9ec439bLL, + 0x0005212c4f55a9c8LL, + 0x00004326a16927f0LL, + 0x0000038d1d80a0e7LL, + 0x0000003141cc80c6LL, + 0x00000002b1668c9fLL, + 0x000000002c7a46aaLL }; static const unsigned long leadterm = 0xb8000000; - /*--- log2_kernel() ---------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | | log2(x+1) | @@ -203,70 +181,64 @@ static const unsigned long leadterm = 0xb8000000; static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result, long int *expon) { - long int exponent, adj; - unsigned long long Xsq; - Xsig accumulator, Numer, Denom, argSignif, arg_signif; - - exponent = exponent16(arg); - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); - if ( argsign == SIGN_POS ) - { - shr_Xsig(&Denom, 2 - (1 + exponent)); - Denom.msw |= 0x80000000; - div_Xsig(&Numer, &Denom, &argSignif); - } - else - { - shr_Xsig(&Denom, 1 - (1 + exponent)); - negate_Xsig(&Denom); - if ( Denom.msw & 0x80000000 ) - { - div_Xsig(&Numer, &Denom, &argSignif); - exponent ++; - } - else - { - /* Denom must be 1.0 */ - argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw; - argSignif.msw = Numer.msw; + long int exponent, adj; + unsigned long long Xsq; + Xsig accumulator, Numer, Denom, argSignif, arg_signif; + + exponent = exponent16(arg); + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); + if (argsign == SIGN_POS) { + shr_Xsig(&Denom, 2 - (1 + exponent)); + Denom.msw |= 0x80000000; + div_Xsig(&Numer, &Denom, &argSignif); + } else { + shr_Xsig(&Denom, 1 - (1 + exponent)); + negate_Xsig(&Denom); + if (Denom.msw & 0x80000000) { + div_Xsig(&Numer, &Denom, &argSignif); + exponent++; + } else { + /* Denom must be 1.0 */ + argSignif.lsw = Numer.lsw; + argSignif.midw = Numer.midw; + argSignif.msw = Numer.msw; + } } - } #ifndef PECULIAR_486 - /* Should check here that |local_arg| is within the valid range */ - if ( exponent >= -2 ) - { - if ( (exponent > -2) || - (argSignif.msw > (unsigned)0xafb0ccc0) ) - { - /* The argument is too large */ + /* Should check here that |local_arg| is within the valid range */ + if (exponent >= -2) { + if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) { + /* The argument is too large */ + } } - } #endif /* PECULIAR_486 */ - arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif); - adj = norm_Xsig(&argSignif); - accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif); - mul_Xsig_Xsig(&accumulator, &accumulator); - shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj))); - Xsq = XSIG_LL(accumulator); - if ( accumulator.lsw & 0x80000000 ) - Xsq++; - - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - /* Do the basic fixed point polynomial evaluation */ - polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1); - - mul_Xsig_Xsig(&accumulator, &argSignif); - shr_Xsig(&accumulator, 6 - adj); - - mul32_Xsig(&arg_signif, leadterm); - add_two_Xsig(&accumulator, &arg_signif, &exponent); - - *expon = exponent + 1; - accum_result->lsw = accumulator.lsw; - accum_result->midw = accumulator.midw; - accum_result->msw = accumulator.msw; + arg_signif.lsw = argSignif.lsw; + XSIG_LL(arg_signif) = XSIG_LL(argSignif); + adj = norm_Xsig(&argSignif); + accumulator.lsw = argSignif.lsw; + XSIG_LL(accumulator) = XSIG_LL(argSignif); + mul_Xsig_Xsig(&accumulator, &accumulator); + shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj))); + Xsq = XSIG_LL(accumulator); + if (accumulator.lsw & 0x80000000) + Xsq++; + + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + /* Do the basic fixed point polynomial evaluation */ + polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1); + + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 6 - adj); + + mul32_Xsig(&arg_signif, leadterm); + add_two_Xsig(&accumulator, &arg_signif, &exponent); + + *expon = exponent + 1; + accum_result->lsw = accumulator.lsw; + accum_result->midw = accumulator.midw; + accum_result->msw = accumulator.msw; } diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c index a36313fb06f..b862039c728 100644 --- a/arch/x86/math-emu/poly_sin.c +++ b/arch/x86/math-emu/poly_sin.c @@ -11,7 +11,6 @@ | | +---------------------------------------------------------------------------*/ - #include "exception.h" #include "reg_constant.h" #include "fpu_emu.h" @@ -19,379 +18,361 @@ #include "control_w.h" #include "poly.h" - #define N_COEFF_P 4 #define N_COEFF_N 4 -static const unsigned long long pos_terms_l[N_COEFF_P] = -{ - 0xaaaaaaaaaaaaaaabLL, - 0x00d00d00d00cf906LL, - 0x000006b99159a8bbLL, - 0x000000000d7392e6LL +static const unsigned long long pos_terms_l[N_COEFF_P] = { + 0xaaaaaaaaaaaaaaabLL, + 0x00d00d00d00cf906LL, + 0x000006b99159a8bbLL, + 0x000000000d7392e6LL }; -static const unsigned long long neg_terms_l[N_COEFF_N] = -{ - 0x2222222222222167LL, - 0x0002e3bc74aab624LL, - 0x0000000b09229062LL, - 0x00000000000c7973LL +static const unsigned long long neg_terms_l[N_COEFF_N] = { + 0x2222222222222167LL, + 0x0002e3bc74aab624LL, + 0x0000000b09229062LL, + 0x00000000000c7973LL }; - - #define N_COEFF_PH 4 #define N_COEFF_NH 4 -static const unsigned long long pos_terms_h[N_COEFF_PH] = -{ - 0x0000000000000000LL, - 0x05b05b05b05b0406LL, - 0x000049f93edd91a9LL, - 0x00000000c9c9ed62LL +static const unsigned long long pos_terms_h[N_COEFF_PH] = { + 0x0000000000000000LL, + 0x05b05b05b05b0406LL, + 0x000049f93edd91a9LL, + 0x00000000c9c9ed62LL }; -static const unsigned long long neg_terms_h[N_COEFF_NH] = -{ - 0xaaaaaaaaaaaaaa98LL, - 0x001a01a01a019064LL, - 0x0000008f76c68a77LL, - 0x0000000000d58f5eLL +static const unsigned long long neg_terms_h[N_COEFF_NH] = { + 0xaaaaaaaaaaaaaa98LL, + 0x001a01a01a019064LL, + 0x0000008f76c68a77LL, + 0x0000000000d58f5eLL }; - /*--- poly_sine() -----------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_sine(FPU_REG *st0_ptr) +void poly_sine(FPU_REG *st0_ptr) { - int exponent, echange; - Xsig accumulator, argSqrd, argTo4; - unsigned long fix_up, adj; - unsigned long long fixed_arg; - FPU_REG result; + int exponent, echange; + Xsig accumulator, argSqrd, argTo4; + unsigned long fix_up, adj; + unsigned long long fixed_arg; + FPU_REG result; - exponent = exponent(st0_ptr); + exponent = exponent(st0_ptr); - accumulator.lsw = accumulator.midw = accumulator.msw = 0; + accumulator.lsw = accumulator.midw = accumulator.msw = 0; - /* Split into two ranges, for arguments below and above 1.0 */ - /* The boundary between upper and lower is approx 0.88309101259 */ - if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) ) - { - /* The argument is <= 0.88309101259 */ + /* Split into two ranges, for arguments below and above 1.0 */ + /* The boundary between upper and lower is approx 0.88309101259 */ + if ((exponent < -1) + || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) { + /* The argument is <= 0.88309101259 */ + + argSqrd.msw = st0_ptr->sigh; + argSqrd.midw = st0_ptr->sigl; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); - argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &significand(st0_ptr)); - shr_Xsig(&argSqrd, 2*(-1-exponent)); - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, - N_COEFF_N-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P - 1); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, - N_COEFF_P-1); + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ - shr_Xsig(&accumulator, 2); /* Divide by four */ - accumulator.msw |= 0x80000000; /* Add 1.0 */ + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3 * exponent; - /* Divide by four, FPU_REG compatible, etc */ - exponent = 3*exponent; + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); - /* The minimum exponent difference is 3 */ - shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += significand(st0_ptr); - negate_Xsig(&accumulator); - XSIG_LL(accumulator) += significand(st0_ptr); + echange = round_Xsig(&accumulator); - echange = round_Xsig(&accumulator); + setexponentpos(&result, exponent(st0_ptr) + echange); + } else { + /* The argument is > 0.88309101259 */ + /* We use sin(st(0)) = cos(pi/2-st(0)) */ - setexponentpos(&result, exponent(st0_ptr) + echange); - } - else - { - /* The argument is > 0.88309101259 */ - /* We use sin(st(0)) = cos(pi/2-st(0)) */ + fixed_arg = significand(st0_ptr); - fixed_arg = significand(st0_ptr); + if (exponent == 0) { + /* The argument is >= 1.0 */ - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here. */ + if (fixed_arg == 0xffffffffffffffffLL) + fixed_arg = 0; - /* Put the binary point at the left. */ - fixed_arg <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - fixed_arg = 0x921fb54442d18469LL - fixed_arg; - /* There is a special case which arises due to rounding, to fix here. */ - if ( fixed_arg == 0xffffffffffffffffLL ) - fixed_arg = 0; + XSIG_LL(argSqrd) = fixed_arg; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); - XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &fixed_arg); + XSIG_LL(argTo4) = XSIG_LL(argSqrd); + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); - XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, - N_COEFF_NH-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH - 1); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, - N_COEFF_PH-1); - negate_Xsig(&accumulator); + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &argSqrd); - add_Xsig_Xsig(&accumulator, &argSqrd); + shr_Xsig(&accumulator, 1); - shr_Xsig(&accumulator, 1); + accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ + negate_Xsig(&accumulator); - accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ - negate_Xsig(&accumulator); + /* The basic computation is complete. Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ - /* The basic computation is complete. Now fix the answer to - compensate for the error due to the approximation used for - pi/2 - */ + /* This has an exponent of -65 */ + fix_up = 0x898cc517; + /* The fix-up needs to be improved for larger args */ + if (argSqrd.msw & 0xffc00000) { + /* Get about 32 bit precision in these: */ + fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6; + } + fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg)); - /* This has an exponent of -65 */ - fix_up = 0x898cc517; - /* The fix-up needs to be improved for larger args */ - if ( argSqrd.msw & 0xffc00000 ) - { - /* Get about 32 bit precision in these: */ - fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6; - } - fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg)); + adj = accumulator.lsw; /* temp save */ + accumulator.lsw -= fix_up; + if (accumulator.lsw > adj) + XSIG_LL(accumulator)--; - adj = accumulator.lsw; /* temp save */ - accumulator.lsw -= fix_up; - if ( accumulator.lsw > adj ) - XSIG_LL(accumulator) --; + echange = round_Xsig(&accumulator); - echange = round_Xsig(&accumulator); - - setexponentpos(&result, echange - 1); - } + setexponentpos(&result, echange - 1); + } - significand(&result) = XSIG_LL(accumulator); - setsign(&result, getsign(st0_ptr)); - FPU_copy_to_reg0(&result, TAG_Valid); + significand(&result) = XSIG_LL(accumulator); + setsign(&result, getsign(st0_ptr)); + FPU_copy_to_reg0(&result, TAG_Valid); #ifdef PARANOID - if ( (exponent(&result) >= 0) - && (significand(&result) > 0x8000000000000000LL) ) - { - EXCEPTION(EX_INTERNAL|0x150); - } + if ((exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL)) { + EXCEPTION(EX_INTERNAL | 0x150); + } #endif /* PARANOID */ } - - /*--- poly_cos() ------------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_cos(FPU_REG *st0_ptr) +void poly_cos(FPU_REG *st0_ptr) { - FPU_REG result; - long int exponent, exp2, echange; - Xsig accumulator, argSqrd, fix_up, argTo4; - unsigned long long fixed_arg; + FPU_REG result; + long int exponent, exp2, echange; + Xsig accumulator, argSqrd, fix_up, argTo4; + unsigned long long fixed_arg; #ifdef PARANOID - if ( (exponent(st0_ptr) > 0) - || ((exponent(st0_ptr) == 0) - && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) ) - { - EXCEPTION(EX_Invalid); - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - return; - } -#endif /* PARANOID */ - - exponent = exponent(st0_ptr); - - accumulator.lsw = accumulator.midw = accumulator.msw = 0; - - if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) ) - { - /* arg is < 0.687705 */ - - argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; - argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &significand(st0_ptr)); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - shr_Xsig(&argSqrd, 2*(-1-exponent)); - } - - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, - N_COEFF_NH-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, - N_COEFF_PH-1); - negate_Xsig(&accumulator); - - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - shr_Xsig(&accumulator, -2*(1+exponent)); - - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); - - add_Xsig_Xsig(&accumulator, &argSqrd); - - shr_Xsig(&accumulator, 1); - - /* It doesn't matter if accumulator is all zero here, the - following code will work ok */ - negate_Xsig(&accumulator); - - if ( accumulator.lsw & 0x80000000 ) - XSIG_LL(accumulator) ++; - if ( accumulator.msw == 0 ) - { - /* The result is 1.0 */ - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - return; - } - else - { - significand(&result) = XSIG_LL(accumulator); - - /* will be a valid positive nr with expon = -1 */ - setexponentpos(&result, -1); - } - } - else - { - fixed_arg = significand(st0_ptr); - - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ - - /* Put the binary point at the left. */ - fixed_arg <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - fixed_arg = 0x921fb54442d18469LL - fixed_arg; - /* There is a special case which arises due to rounding, to fix here. */ - if ( fixed_arg == 0xffffffffffffffffLL ) - fixed_arg = 0; - - exponent = -1; - exp2 = -1; - - /* A shift is needed here only for a narrow range of arguments, - i.e. for fixed_arg approx 2^-32, but we pick up more... */ - if ( !(LL_MSW(fixed_arg) & 0xffff0000) ) - { - fixed_arg <<= 16; - exponent -= 16; - exp2 -= 16; + if ((exponent(st0_ptr) > 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) { + EXCEPTION(EX_Invalid); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + return; } +#endif /* PARANOID */ - XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &fixed_arg); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - shr_Xsig(&argSqrd, 2*(-1-exponent)); - } - - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, - N_COEFF_N-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, - N_COEFF_P-1); - - shr_Xsig(&accumulator, 2); /* Divide by four */ - accumulator.msw |= 0x80000000; /* Add 1.0 */ - - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - - /* Divide by four, FPU_REG compatible, etc */ - exponent = 3*exponent; - - /* The minimum exponent difference is 3 */ - shr_Xsig(&accumulator, exp2 - exponent); - - negate_Xsig(&accumulator); - XSIG_LL(accumulator) += fixed_arg; - - /* The basic computation is complete. Now fix the answer to - compensate for the error due to the approximation used for - pi/2 - */ - - /* This has an exponent of -65 */ - XSIG_LL(fix_up) = 0x898cc51701b839a2ll; - fix_up.lsw = 0; - - /* The fix-up needs to be improved for larger args */ - if ( argSqrd.msw & 0xffc00000 ) - { - /* Get about 32 bit precision in these: */ - fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; - fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; + exponent = exponent(st0_ptr); + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + + if ((exponent < -1) + || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) { + /* arg is < 0.687705 */ + + argSqrd.msw = st0_ptr->sigh; + argSqrd.midw = st0_ptr->sigl; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + + if (exponent < -1) { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + } + + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH - 1); + negate_Xsig(&accumulator); + + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + shr_Xsig(&accumulator, -2 * (1 + exponent)); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + + add_Xsig_Xsig(&accumulator, &argSqrd); + + shr_Xsig(&accumulator, 1); + + /* It doesn't matter if accumulator is all zero here, the + following code will work ok */ + negate_Xsig(&accumulator); + + if (accumulator.lsw & 0x80000000) + XSIG_LL(accumulator)++; + if (accumulator.msw == 0) { + /* The result is 1.0 */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; + } else { + significand(&result) = XSIG_LL(accumulator); + + /* will be a valid positive nr with expon = -1 */ + setexponentpos(&result, -1); + } + } else { + fixed_arg = significand(st0_ptr); + + if (exponent == 0) { + /* The argument is >= 1.0 */ + + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here. */ + if (fixed_arg == 0xffffffffffffffffLL) + fixed_arg = 0; + + exponent = -1; + exp2 = -1; + + /* A shift is needed here only for a narrow range of arguments, + i.e. for fixed_arg approx 2^-32, but we pick up more... */ + if (!(LL_MSW(fixed_arg) & 0xffff0000)) { + fixed_arg <<= 16; + exponent -= 16; + exp2 -= 16; + } + + XSIG_LL(argSqrd) = fixed_arg; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); + + if (exponent < -1) { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + } + + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P - 1); + + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ + + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3 * exponent; + + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exp2 - exponent); + + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += fixed_arg; + + /* The basic computation is complete. Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ + + /* This has an exponent of -65 */ + XSIG_LL(fix_up) = 0x898cc51701b839a2ll; + fix_up.lsw = 0; + + /* The fix-up needs to be improved for larger args */ + if (argSqrd.msw & 0xffc00000) { + /* Get about 32 bit precision in these: */ + fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; + fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; + } + + exp2 += norm_Xsig(&accumulator); + shr_Xsig(&accumulator, 1); /* Prevent overflow */ + exp2++; + shr_Xsig(&fix_up, 65 + exp2); + + add_Xsig_Xsig(&accumulator, &fix_up); + + echange = round_Xsig(&accumulator); + + setexponentpos(&result, exp2 + echange); + significand(&result) = XSIG_LL(accumulator); } - exp2 += norm_Xsig(&accumulator); - shr_Xsig(&accumulator, 1); /* Prevent overflow */ - exp2++; - shr_Xsig(&fix_up, 65 + exp2); - - add_Xsig_Xsig(&accumulator, &fix_up); - - echange = round_Xsig(&accumulator); - - setexponentpos(&result, exp2 + echange); - significand(&result) = XSIG_LL(accumulator); - } - - FPU_copy_to_reg0(&result, TAG_Valid); + FPU_copy_to_reg0(&result, TAG_Valid); #ifdef PARANOID - if ( (exponent(&result) >= 0) - && (significand(&result) > 0x8000000000000000LL) ) - { - EXCEPTION(EX_INTERNAL|0x151); - } + if ((exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL)) { + EXCEPTION(EX_INTERNAL | 0x151); + } #endif /* PARANOID */ } diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c index 8df3e03b6e6..1875763e0c0 100644 --- a/arch/x86/math-emu/poly_tan.c +++ b/arch/x86/math-emu/poly_tan.c @@ -17,206 +17,196 @@ #include "control_w.h" #include "poly.h" - #define HiPOWERop 3 /* odd poly, positive terms */ -static const unsigned long long oddplterm[HiPOWERop] = -{ - 0x0000000000000000LL, - 0x0051a1cf08fca228LL, - 0x0000000071284ff7LL +static const unsigned long long oddplterm[HiPOWERop] = { + 0x0000000000000000LL, + 0x0051a1cf08fca228LL, + 0x0000000071284ff7LL }; #define HiPOWERon 2 /* odd poly, negative terms */ -static const unsigned long long oddnegterm[HiPOWERon] = -{ - 0x1291a9a184244e80LL, - 0x0000583245819c21LL +static const unsigned long long oddnegterm[HiPOWERon] = { + 0x1291a9a184244e80LL, + 0x0000583245819c21LL }; #define HiPOWERep 2 /* even poly, positive terms */ -static const unsigned long long evenplterm[HiPOWERep] = -{ - 0x0e848884b539e888LL, - 0x00003c7f18b887daLL +static const unsigned long long evenplterm[HiPOWERep] = { + 0x0e848884b539e888LL, + 0x00003c7f18b887daLL }; #define HiPOWERen 2 /* even poly, negative terms */ -static const unsigned long long evennegterm[HiPOWERen] = -{ - 0xf1f0200fd51569ccLL, - 0x003afb46105c4432LL +static const unsigned long long evennegterm[HiPOWERen] = { + 0xf1f0200fd51569ccLL, + 0x003afb46105c4432LL }; static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL; - /*--- poly_tan() ------------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_tan(FPU_REG *st0_ptr) +void poly_tan(FPU_REG *st0_ptr) { - long int exponent; - int invert; - Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, - argSignif, fix_up; - unsigned long adj; + long int exponent; + int invert; + Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, + argSignif, fix_up; + unsigned long adj; - exponent = exponent(st0_ptr); + exponent = exponent(st0_ptr); #ifdef PARANOID - if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */ - { arith_invalid(0); return; } /* Need a positive number */ + if (signnegative(st0_ptr)) { /* Can't hack a number < 0.0 */ + arith_invalid(0); + return; + } /* Need a positive number */ #endif /* PARANOID */ - /* Split the problem into two domains, smaller and larger than pi/4 */ - if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) ) - { - /* The argument is greater than (approx) pi/4 */ - invert = 1; - accum.lsw = 0; - XSIG_LL(accum) = significand(st0_ptr); - - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ - /* Put the binary point at the left. */ - XSIG_LL(accum) <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); - /* This is a special case which arises due to rounding. */ - if ( XSIG_LL(accum) == 0xffffffffffffffffLL ) - { - FPU_settag0(TAG_Valid); - significand(st0_ptr) = 0x8a51e04daabda360LL; - setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative); - return; + /* Split the problem into two domains, smaller and larger than pi/4 */ + if ((exponent == 0) + || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) { + /* The argument is greater than (approx) pi/4 */ + invert = 1; + accum.lsw = 0; + XSIG_LL(accum) = significand(st0_ptr); + + if (exponent == 0) { + /* The argument is >= 1.0 */ + /* Put the binary point at the left. */ + XSIG_LL(accum) <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); + /* This is a special case which arises due to rounding. */ + if (XSIG_LL(accum) == 0xffffffffffffffffLL) { + FPU_settag0(TAG_Valid); + significand(st0_ptr) = 0x8a51e04daabda360LL; + setexponent16(st0_ptr, + (0x41 + EXTENDED_Ebias) | SIGN_Negative); + return; + } + + argSignif.lsw = accum.lsw; + XSIG_LL(argSignif) = XSIG_LL(accum); + exponent = -1 + norm_Xsig(&argSignif); + } else { + invert = 0; + argSignif.lsw = 0; + XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); + + if (exponent < -1) { + /* shift the argument right by the required places */ + if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >= + 0x80000000U) + XSIG_LL(accum)++; /* round up */ + } } - argSignif.lsw = accum.lsw; - XSIG_LL(argSignif) = XSIG_LL(accum); - exponent = -1 + norm_Xsig(&argSignif); - } - else - { - invert = 0; - argSignif.lsw = 0; - XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U ) - XSIG_LL(accum) ++; /* round up */ - } - } - - XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw; - mul_Xsig_Xsig(&argSq, &argSq); - XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw; - mul_Xsig_Xsig(&argSqSq, &argSqSq); - - /* Compute the negative terms for the numerator polynomial */ - accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; - polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1); - mul_Xsig_Xsig(&accumulatoro, &argSq); - negate_Xsig(&accumulatoro); - /* Add the positive terms */ - polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1); - - - /* Compute the positive terms for the denominator polynomial */ - accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; - polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1); - mul_Xsig_Xsig(&accumulatore, &argSq); - negate_Xsig(&accumulatore); - /* Add the negative terms */ - polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1); - /* Multiply by arg^2 */ - mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); - mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); - /* de-normalize and divide by 2 */ - shr_Xsig(&accumulatore, -2*(1+exponent) + 1); - negate_Xsig(&accumulatore); /* This does 1 - accumulator */ - - /* Now find the ratio. */ - if ( accumulatore.msw == 0 ) - { - /* accumulatoro must contain 1.0 here, (actually, 0) but it - really doesn't matter what value we use because it will - have negligible effect in later calculations - */ - XSIG_LL(accum) = 0x8000000000000000LL; - accum.lsw = 0; - } - else - { - div_Xsig(&accumulatoro, &accumulatore, &accum); - } - - /* Multiply by 1/3 * arg^3 */ - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &twothirds); - shr_Xsig(&accum, -2*(exponent+1)); - - /* tan(arg) = arg + accum */ - add_two_Xsig(&accum, &argSignif, &exponent); - - if ( invert ) - { - /* We now have the value of tan(pi_2 - arg) where pi_2 is an - approximation for pi/2 - */ - /* The next step is to fix the answer to compensate for the - error due to the approximation used for pi/2 - */ - - /* This is (approx) delta, the error in our approx for pi/2 - (see above). It has an exponent of -65 - */ - XSIG_LL(fix_up) = 0x898cc51701b839a2LL; - fix_up.lsw = 0; - - if ( exponent == 0 ) - adj = 0xffffffff; /* We want approx 1.0 here, but - this is close enough. */ - else if ( exponent > -30 ) - { - adj = accum.msw >> -(exponent+1); /* tan */ - adj = mul_32_32(adj, adj); /* tan^2 */ + XSIG_LL(argSq) = XSIG_LL(accum); + argSq.lsw = accum.lsw; + mul_Xsig_Xsig(&argSq, &argSq); + XSIG_LL(argSqSq) = XSIG_LL(argSq); + argSqSq.lsw = argSq.lsw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + /* Compute the negative terms for the numerator polynomial */ + accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, + HiPOWERon - 1); + mul_Xsig_Xsig(&accumulatoro, &argSq); + negate_Xsig(&accumulatoro); + /* Add the positive terms */ + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, + HiPOWERop - 1); + + /* Compute the positive terms for the denominator polynomial */ + accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, + HiPOWERep - 1); + mul_Xsig_Xsig(&accumulatore, &argSq); + negate_Xsig(&accumulatore); + /* Add the negative terms */ + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, + HiPOWERen - 1); + /* Multiply by arg^2 */ + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + /* de-normalize and divide by 2 */ + shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1); + negate_Xsig(&accumulatore); /* This does 1 - accumulator */ + + /* Now find the ratio. */ + if (accumulatore.msw == 0) { + /* accumulatoro must contain 1.0 here, (actually, 0) but it + really doesn't matter what value we use because it will + have negligible effect in later calculations + */ + XSIG_LL(accum) = 0x8000000000000000LL; + accum.lsw = 0; + } else { + div_Xsig(&accumulatoro, &accumulatore, &accum); } - else - adj = 0; - adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ - - fix_up.msw += adj; - if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */ - { - /* Yes, we need to add an msb */ - shr_Xsig(&fix_up, 1); - fix_up.msw |= 0x80000000; - shr_Xsig(&fix_up, 64 + exponent); + + /* Multiply by 1/3 * arg^3 */ + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &twothirds); + shr_Xsig(&accum, -2 * (exponent + 1)); + + /* tan(arg) = arg + accum */ + add_two_Xsig(&accum, &argSignif, &exponent); + + if (invert) { + /* We now have the value of tan(pi_2 - arg) where pi_2 is an + approximation for pi/2 + */ + /* The next step is to fix the answer to compensate for the + error due to the approximation used for pi/2 + */ + + /* This is (approx) delta, the error in our approx for pi/2 + (see above). It has an exponent of -65 + */ + XSIG_LL(fix_up) = 0x898cc51701b839a2LL; + fix_up.lsw = 0; + + if (exponent == 0) + adj = 0xffffffff; /* We want approx 1.0 here, but + this is close enough. */ + else if (exponent > -30) { + adj = accum.msw >> -(exponent + 1); /* tan */ + adj = mul_32_32(adj, adj); /* tan^2 */ + } else + adj = 0; + adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ + + fix_up.msw += adj; + if (!(fix_up.msw & 0x80000000)) { /* did fix_up overflow ? */ + /* Yes, we need to add an msb */ + shr_Xsig(&fix_up, 1); + fix_up.msw |= 0x80000000; + shr_Xsig(&fix_up, 64 + exponent); + } else + shr_Xsig(&fix_up, 65 + exponent); + + add_two_Xsig(&accum, &fix_up, &exponent); + + /* accum now contains tan(pi/2 - arg). + Use tan(arg) = 1.0 / tan(pi/2 - arg) + */ + accumulatoro.lsw = accumulatoro.midw = 0; + accumulatoro.msw = 0x80000000; + div_Xsig(&accumulatoro, &accum, &accum); + exponent = -exponent - 1; } - else - shr_Xsig(&fix_up, 65 + exponent); - - add_two_Xsig(&accum, &fix_up, &exponent); - - /* accum now contains tan(pi/2 - arg). - Use tan(arg) = 1.0 / tan(pi/2 - arg) - */ - accumulatoro.lsw = accumulatoro.midw = 0; - accumulatoro.msw = 0x80000000; - div_Xsig(&accumulatoro, &accum, &accum); - exponent = - exponent - 1; - } - - /* Transfer the result */ - round_Xsig(&accum); - FPU_settag0(TAG_Valid); - significand(st0_ptr) = XSIG_LL(accum); - setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */ + + /* Transfer the result */ + round_Xsig(&accum); + FPU_settag0(TAG_Valid); + significand(st0_ptr) = XSIG_LL(accum); + setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */ } diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c index 7cd3b37ac08..deea48b9f13 100644 --- a/arch/x86/math-emu/reg_add_sub.c +++ b/arch/x86/math-emu/reg_add_sub.c @@ -27,7 +27,7 @@ static int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, FPU_REG const *b, u_char tagb, u_char signb, - FPU_REG *dest, int deststnr, int control_w); + FPU_REG * dest, int deststnr, int control_w); /* Operates on st(0) and st(n), or on st(0) and temporary data. @@ -35,340 +35,299 @@ int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, */ int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w) { - FPU_REG *a = &st(0); - FPU_REG *dest = &st(deststnr); - u_char signb = getsign(b); - u_char taga = FPU_gettag0(); - u_char signa = getsign(a); - u_char saved_sign = getsign(dest); - int diff, tag, expa, expb; - - if ( !(taga | tagb) ) - { - expa = exponent(a); - expb = exponent(b); - - valid_add: - /* Both registers are valid */ - if (!(signa ^ signb)) - { - /* signs are the same */ - tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb); - } - else - { - /* The signs are different, so do a subtraction */ - diff = expa - expb; - if (!diff) - { - diff = a->sigh - b->sigh; /* This works only if the ms bits - are identical. */ - if (!diff) - { - diff = a->sigl > b->sigl; - if (!diff) - diff = -(a->sigl < b->sigl); + FPU_REG *a = &st(0); + FPU_REG *dest = &st(deststnr); + u_char signb = getsign(b); + u_char taga = FPU_gettag0(); + u_char signa = getsign(a); + u_char saved_sign = getsign(dest); + int diff, tag, expa, expb; + + if (!(taga | tagb)) { + expa = exponent(a); + expb = exponent(b); + + valid_add: + /* Both registers are valid */ + if (!(signa ^ signb)) { + /* signs are the same */ + tag = + FPU_u_add(a, b, dest, control_w, signa, expa, expb); + } else { + /* The signs are different, so do a subtraction */ + diff = expa - expb; + if (!diff) { + diff = a->sigh - b->sigh; /* This works only if the ms bits + are identical. */ + if (!diff) { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + if (diff > 0) { + tag = + FPU_u_sub(a, b, dest, control_w, signa, + expa, expb); + } else if (diff < 0) { + tag = + FPU_u_sub(b, a, dest, control_w, signb, + expb, expa); + } else { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + return TAG_Zero; + } } - } - - if (diff > 0) - { - tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); - } - else if ( diff < 0 ) - { - tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa); - } - else - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - /* sign depends upon rounding mode */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - return TAG_Zero; - } - } - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; + + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); + goto valid_add; + } - if ( denormal_operand() < 0 ) - return FPU_Exception; + if ((taga == TW_NaN) || (tagb == TW_NaN)) { + if (deststnr == 0) + return real_2op_NaN(b, tagb, deststnr, a); + else + return real_2op_NaN(a, taga, deststnr, a); + } - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - a = &x; - b = &y; - expa = exponent16(a); - expb = exponent16(b); - goto valid_add; - } - - if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - if ( deststnr == 0 ) - return real_2op_NaN(b, tagb, deststnr, a); - else - return real_2op_NaN(a, taga, deststnr, a); - } - - return add_sub_specials(a, taga, signa, b, tagb, signb, - dest, deststnr, control_w); + return add_sub_specials(a, taga, signa, b, tagb, signb, + dest, deststnr, control_w); } - /* Subtract b from a. (a-b) -> dest */ int FPU_sub(int flags, int rm, int control_w) { - FPU_REG const *a, *b; - FPU_REG *dest; - u_char taga, tagb, signa, signb, saved_sign, sign; - int diff, tag = 0, expa, expb, deststnr; - - a = &st(0); - taga = FPU_gettag0(); - - deststnr = 0; - if ( flags & LOADED ) - { - b = (FPU_REG *)rm; - tagb = flags & 0x0f; - } - else - { - b = &st(rm); - tagb = FPU_gettagi(rm); - - if ( flags & DEST_RM ) - deststnr = rm; - } - - signa = getsign(a); - signb = getsign(b); - - if ( flags & REV ) - { - signa ^= SIGN_NEG; - signb ^= SIGN_NEG; - } - - dest = &st(deststnr); - saved_sign = getsign(dest); - - if ( !(taga | tagb) ) - { - expa = exponent(a); - expb = exponent(b); - - valid_subtract: - /* Both registers are valid */ - - diff = expa - expb; - - if (!diff) - { - diff = a->sigh - b->sigh; /* Works only if ms bits are identical */ - if (!diff) - { - diff = a->sigl > b->sigl; - if (!diff) - diff = -(a->sigl < b->sigl); - } + FPU_REG const *a, *b; + FPU_REG *dest; + u_char taga, tagb, signa, signb, saved_sign, sign; + int diff, tag = 0, expa, expb, deststnr; + + a = &st(0); + taga = FPU_gettag0(); + + deststnr = 0; + if (flags & LOADED) { + b = (FPU_REG *) rm; + tagb = flags & 0x0f; + } else { + b = &st(rm); + tagb = FPU_gettagi(rm); + + if (flags & DEST_RM) + deststnr = rm; } - switch ( (((int)signa)*2 + signb) / SIGN_NEG ) - { - case 0: /* P - P */ - case 3: /* N - N */ - if (diff > 0) - { - /* |a| > |b| */ - tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); - } - else if ( diff == 0 ) - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - - /* sign depends upon rounding mode */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - return TAG_Zero; - } - else - { - sign = signa ^ SIGN_NEG; - tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa); - } - break; - case 1: /* P - N */ - tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb); - break; - case 2: /* N - P */ - tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb); - break; + signa = getsign(a); + signb = getsign(b); + + if (flags & REV) { + signa ^= SIGN_NEG; + signb ^= SIGN_NEG; + } + + dest = &st(deststnr); + saved_sign = getsign(dest); + + if (!(taga | tagb)) { + expa = exponent(a); + expb = exponent(b); + + valid_subtract: + /* Both registers are valid */ + + diff = expa - expb; + + if (!diff) { + diff = a->sigh - b->sigh; /* Works only if ms bits are identical */ + if (!diff) { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + switch ((((int)signa) * 2 + signb) / SIGN_NEG) { + case 0: /* P - P */ + case 3: /* N - N */ + if (diff > 0) { + /* |a| > |b| */ + tag = + FPU_u_sub(a, b, dest, control_w, signa, + expa, expb); + } else if (diff == 0) { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + return TAG_Zero; + } else { + sign = signa ^ SIGN_NEG; + tag = + FPU_u_sub(b, a, dest, control_w, sign, expb, + expa); + } + break; + case 1: /* P - N */ + tag = + FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, + expb); + break; + case 2: /* N - P */ + tag = + FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, + expb); + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x111); - return -1; + default: + EXCEPTION(EX_INTERNAL | 0x111); + return -1; #endif + } + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; - } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; - if ( denormal_operand() < 0 ) - return FPU_Exception; + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - a = &x; - b = &y; - expa = exponent16(a); - expb = exponent16(b); - - goto valid_subtract; - } - - if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - FPU_REG const *d1, *d2; - if ( flags & REV ) - { - d1 = b; - d2 = a; + goto valid_subtract; } - else - { - d1 = a; - d2 = b; + + if ((taga == TW_NaN) || (tagb == TW_NaN)) { + FPU_REG const *d1, *d2; + if (flags & REV) { + d1 = b; + d2 = a; + } else { + d1 = a; + d2 = b; + } + if (flags & LOADED) + return real_2op_NaN(b, tagb, deststnr, d1); + if (flags & DEST_RM) + return real_2op_NaN(a, taga, deststnr, d2); + else + return real_2op_NaN(b, tagb, deststnr, d2); } - if ( flags & LOADED ) - return real_2op_NaN(b, tagb, deststnr, d1); - if ( flags & DEST_RM ) - return real_2op_NaN(a, taga, deststnr, d2); - else - return real_2op_NaN(b, tagb, deststnr, d2); - } - - return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG, - dest, deststnr, control_w); -} + return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG, + dest, deststnr, control_w); +} static int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, FPU_REG const *b, u_char tagb, u_char signb, - FPU_REG *dest, int deststnr, int control_w) + FPU_REG * dest, int deststnr, int control_w) { - if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) - && (denormal_operand() < 0) ) - return FPU_Exception; - - if (taga == TAG_Zero) - { - if (tagb == TAG_Zero) - { - /* Both are zero, result will be zero. */ - u_char different_signs = signa ^ signb; - - FPU_copy_to_regi(a, TAG_Zero, deststnr); - if ( different_signs ) - { - /* Signs are different. */ - /* Sign of answer depends upon rounding mode. */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - } - else - setsign(dest, signa); /* signa may differ from the sign of a. */ - return TAG_Zero; - } - else - { - reg_copy(b, dest); - if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) ) - { - /* A pseudoDenormal, convert it. */ - addexponent(dest, 1); - tagb = TAG_Valid; - } - else if ( tagb > TAG_Empty ) - tagb = TAG_Special; - setsign(dest, signb); /* signb may differ from the sign of b. */ - FPU_settagi(deststnr, tagb); - return tagb; - } - } - else if (tagb == TAG_Zero) - { - reg_copy(a, dest); - if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) ) - { - /* A pseudoDenormal */ - addexponent(dest, 1); - taga = TAG_Valid; - } - else if ( taga > TAG_Empty ) - taga = TAG_Special; - setsign(dest, signa); /* signa may differ from the sign of a. */ - FPU_settagi(deststnr, taga); - return taga; - } - else if (taga == TW_Infinity) - { - if ( (tagb != TW_Infinity) || (signa == signb) ) - { - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, signa); /* signa may differ from the sign of a. */ - return taga; + if (((taga == TW_Denormal) || (tagb == TW_Denormal)) + && (denormal_operand() < 0)) + return FPU_Exception; + + if (taga == TAG_Zero) { + if (tagb == TAG_Zero) { + /* Both are zero, result will be zero. */ + u_char different_signs = signa ^ signb; + + FPU_copy_to_regi(a, TAG_Zero, deststnr); + if (different_signs) { + /* Signs are different. */ + /* Sign of answer depends upon rounding mode. */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + } else + setsign(dest, signa); /* signa may differ from the sign of a. */ + return TAG_Zero; + } else { + reg_copy(b, dest); + if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) { + /* A pseudoDenormal, convert it. */ + addexponent(dest, 1); + tagb = TAG_Valid; + } else if (tagb > TAG_Empty) + tagb = TAG_Special; + setsign(dest, signb); /* signb may differ from the sign of b. */ + FPU_settagi(deststnr, tagb); + return tagb; + } + } else if (tagb == TAG_Zero) { + reg_copy(a, dest); + if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) { + /* A pseudoDenormal */ + addexponent(dest, 1); + taga = TAG_Valid; + } else if (taga > TAG_Empty) + taga = TAG_Special; + setsign(dest, signa); /* signa may differ from the sign of a. */ + FPU_settagi(deststnr, taga); + return taga; + } else if (taga == TW_Infinity) { + if ((tagb != TW_Infinity) || (signa == signb)) { + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, signa); /* signa may differ from the sign of a. */ + return taga; + } + /* Infinity-Infinity is undefined. */ + return arith_invalid(deststnr); + } else if (tagb == TW_Infinity) { + FPU_copy_to_regi(b, TAG_Special, deststnr); + setsign(dest, signb); /* signb may differ from the sign of b. */ + return tagb; } - /* Infinity-Infinity is undefined. */ - return arith_invalid(deststnr); - } - else if (tagb == TW_Infinity) - { - FPU_copy_to_regi(b, TAG_Special, deststnr); - setsign(dest, signb); /* signb may differ from the sign of b. */ - return tagb; - } - #ifdef PARANOID - EXCEPTION(EX_INTERNAL|0x101); + EXCEPTION(EX_INTERNAL | 0x101); #endif - return FPU_Exception; + return FPU_Exception; } - diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index f37c5b5a35a..ecce55fc2e2 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -20,362 +20,331 @@ #include "control_w.h" #include "status_w.h" - static int compare(FPU_REG const *b, int tagb) { - int diff, exp0, expb; - u_char st0_tag; - FPU_REG *st0_ptr; - FPU_REG x, y; - u_char st0_sign, signb = getsign(b); - - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - st0_sign = getsign(st0_ptr); - - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) - || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) ) - { - if ( st0_tag == TAG_Zero ) - { - if ( tagb == TAG_Zero ) return COMP_A_eq_B; - if ( tagb == TAG_Valid ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); - if ( tagb == TW_Denormal ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | COMP_Denormal; - } - else if ( tagb == TAG_Zero ) - { - if ( st0_tag == TAG_Valid ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - if ( st0_tag == TW_Denormal ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | COMP_Denormal; + int diff, exp0, expb; + u_char st0_tag; + FPU_REG *st0_ptr; + FPU_REG x, y; + u_char st0_sign, signb = getsign(b); + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + st0_sign = getsign(st0_ptr); + + if (tagb == TAG_Special) + tagb = FPU_Special(b); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) + || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) { + if (st0_tag == TAG_Zero) { + if (tagb == TAG_Zero) + return COMP_A_eq_B; + if (tagb == TAG_Valid) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if (tagb == TW_Denormal) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + } else if (tagb == TAG_Zero) { + if (st0_tag == TAG_Valid) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + if (st0_tag == TW_Denormal) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + } + + if (st0_tag == TW_Infinity) { + if ((tagb == TAG_Valid) || (tagb == TAG_Zero)) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + else if (tagb == TW_Denormal) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + else if (tagb == TW_Infinity) { + /* The 80486 book says that infinities can be equal! */ + return (st0_sign == signb) ? COMP_A_eq_B : + ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + } + /* Fall through to the NaN code */ + } else if (tagb == TW_Infinity) { + if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero)) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if (st0_tag == TW_Denormal) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + /* Fall through to the NaN code */ + } + + /* The only possibility now should be that one of the arguments + is a NaN */ + if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) { + int signalling = 0, unsupported = 0; + if (st0_tag == TW_NaN) { + signalling = + (st0_ptr->sigh & 0xc0000000) == 0x80000000; + unsupported = !((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr-> + sigh & 0x80000000)); + } + if (tagb == TW_NaN) { + signalling |= + (b->sigh & 0xc0000000) == 0x80000000; + unsupported |= !((exponent(b) == EXP_OVER) + && (b->sigh & 0x80000000)); + } + if (signalling || unsupported) + return COMP_No_Comp | COMP_SNaN | COMP_NaN; + else + /* Neither is a signaling NaN */ + return COMP_No_Comp | COMP_NaN; + } + + EXCEPTION(EX_Invalid); } - if ( st0_tag == TW_Infinity ) - { - if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - else if ( tagb == TW_Denormal ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | COMP_Denormal; - else if ( tagb == TW_Infinity ) - { - /* The 80486 book says that infinities can be equal! */ - return (st0_sign == signb) ? COMP_A_eq_B : - ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - } - /* Fall through to the NaN code */ - } - else if ( tagb == TW_Infinity ) - { - if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); - if ( st0_tag == TW_Denormal ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | COMP_Denormal; - /* Fall through to the NaN code */ + if (st0_sign != signb) { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); } - /* The only possibility now should be that one of the arguments - is a NaN */ - if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) ) - { - int signalling = 0, unsupported = 0; - if ( st0_tag == TW_NaN ) - { - signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000; - unsupported = !((exponent(st0_ptr) == EXP_OVER) - && (st0_ptr->sigh & 0x80000000)); - } - if ( tagb == TW_NaN ) - { - signalling |= (b->sigh & 0xc0000000) == 0x80000000; - unsupported |= !((exponent(b) == EXP_OVER) - && (b->sigh & 0x80000000)); - } - if ( signalling || unsupported ) - return COMP_No_Comp | COMP_SNaN | COMP_NaN; - else - /* Neither is a signaling NaN */ - return COMP_No_Comp | COMP_NaN; + if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) { + FPU_to_exp16(st0_ptr, &x); + FPU_to_exp16(b, &y); + st0_ptr = &x; + b = &y; + exp0 = exponent16(st0_ptr); + expb = exponent16(b); + } else { + exp0 = exponent(st0_ptr); + expb = exponent(b); } - - EXCEPTION(EX_Invalid); - } - - if (st0_sign != signb) - { - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - - if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) ) - { - FPU_to_exp16(st0_ptr, &x); - FPU_to_exp16(b, &y); - st0_ptr = &x; - b = &y; - exp0 = exponent16(st0_ptr); - expb = exponent16(b); - } - else - { - exp0 = exponent(st0_ptr); - expb = exponent(b); - } #ifdef PARANOID - if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid); - if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid); + if (!(st0_ptr->sigh & 0x80000000)) + EXCEPTION(EX_Invalid); + if (!(b->sigh & 0x80000000)) + EXCEPTION(EX_Invalid); #endif /* PARANOID */ - diff = exp0 - expb; - if ( diff == 0 ) - { - diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are - identical */ - if ( diff == 0 ) - { - diff = st0_ptr->sigl > b->sigl; - if ( diff == 0 ) - diff = -(st0_ptr->sigl < b->sigl); + diff = exp0 - expb; + if (diff == 0) { + diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are + identical */ + if (diff == 0) { + diff = st0_ptr->sigl > b->sigl; + if (diff == 0) + diff = -(st0_ptr->sigl < b->sigl); + } } - } - - if ( diff > 0 ) - { - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - if ( diff < 0 ) - { - return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - - return COMP_A_eq_B - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); -} + if (diff > 0) { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + if (diff < 0) { + return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + return COMP_A_eq_B + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + +} /* This function requires that st(0) is not empty */ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) { - int f = 0, c; - - c = compare(loaded_data, loaded_tag); - - if (c & COMP_NaN) - { - EXCEPTION(EX_Invalid); - f = SW_C3 | SW_C2 | SW_C0; - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + int f = 0, c; + + c = compare(loaded_data, loaded_tag); + + if (c & COMP_NaN) { + EXCEPTION(EX_Invalid); + f = SW_C3 | SW_C2 | SW_C0; + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x121); - f = SW_C3 | SW_C2 | SW_C0; - break; + default: + EXCEPTION(EX_INTERNAL | 0x121); + f = SW_C3 | SW_C2 | SW_C0; + break; #endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } - static int compare_st_st(int nr) { - int f = 0, c; - FPU_REG *st_ptr; - - if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) - { - setcc(SW_C3 | SW_C2 | SW_C0); - /* Stack fault */ - EXCEPTION(EX_StackUnder); - return !(control_word & CW_Invalid); - } - - st_ptr = &st(nr); - c = compare(st_ptr, FPU_gettagi(nr)); - if (c & COMP_NaN) - { - setcc(SW_C3 | SW_C2 | SW_C0); - EXCEPTION(EX_Invalid); - return !(control_word & CW_Invalid); - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + setcc(SW_C3 | SW_C2 | SW_C0); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x122); - f = SW_C3 | SW_C2 | SW_C0; - break; + default: + EXCEPTION(EX_INTERNAL | 0x122); + f = SW_C3 | SW_C2 | SW_C0; + break; #endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } - static int compare_u_st_st(int nr) { - int f = 0, c; - FPU_REG *st_ptr; - - if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) - { - setcc(SW_C3 | SW_C2 | SW_C0); - /* Stack fault */ - EXCEPTION(EX_StackUnder); - return !(control_word & CW_Invalid); - } - - st_ptr = &st(nr); - c = compare(st_ptr, FPU_gettagi(nr)); - if (c & COMP_NaN) - { - setcc(SW_C3 | SW_C2 | SW_C0); - if (c & COMP_SNaN) /* This is the only difference between - un-ordered and ordinary comparisons */ - { - EXCEPTION(EX_Invalid); - return !(control_word & CW_Invalid); + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); } - return 0; - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + setcc(SW_C3 | SW_C2 | SW_C0); + if (c & COMP_SNaN) { /* This is the only difference between + un-ordered and ordinary comparisons */ + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x123); - f = SW_C3 | SW_C2 | SW_C0; - break; -#endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + default: + EXCEPTION(EX_INTERNAL | 0x123); + f = SW_C3 | SW_C2 | SW_C0; + break; +#endif /* PARANOID */ + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } /*---------------------------------------------------------------------------*/ void fcom_st(void) { - /* fcom st(i) */ - compare_st_st(FPU_rm); + /* fcom st(i) */ + compare_st_st(FPU_rm); } - void fcompst(void) { - /* fcomp st(i) */ - if ( !compare_st_st(FPU_rm) ) - FPU_pop(); + /* fcomp st(i) */ + if (!compare_st_st(FPU_rm)) + FPU_pop(); } - void fcompp(void) { - /* fcompp */ - if (FPU_rm != 1) - { - FPU_illegal(); - return; - } - if ( !compare_st_st(1) ) - poppop(); + /* fcompp */ + if (FPU_rm != 1) { + FPU_illegal(); + return; + } + if (!compare_st_st(1)) + poppop(); } - void fucom_(void) { - /* fucom st(i) */ - compare_u_st_st(FPU_rm); + /* fucom st(i) */ + compare_u_st_st(FPU_rm); } - void fucomp(void) { - /* fucomp st(i) */ - if ( !compare_u_st_st(FPU_rm) ) - FPU_pop(); + /* fucomp st(i) */ + if (!compare_u_st_st(FPU_rm)) + FPU_pop(); } - void fucompp(void) { - /* fucompp */ - if (FPU_rm == 1) - { - if ( !compare_u_st_st(1) ) - poppop(); - } - else - FPU_illegal(); + /* fucompp */ + if (FPU_rm == 1) { + if (!compare_u_st_st(1)) + poppop(); + } else + FPU_illegal(); } diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index a8501580196..04869e64b18 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -16,29 +16,28 @@ #include "reg_constant.h" #include "control_w.h" - #define MAKE_REG(s,e,l,h) { l, h, \ ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } -FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); +FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 -FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); +FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000); -#endif /* 0 */ -static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); -static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); -FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); -FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); -FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); -static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); -static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); +#endif /* 0 */ +static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); +static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); +FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); +static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); +static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); /* Extra bits to take pi/2 to more than 128 bits precision. */ FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, - 0xfc8f8cbb, 0xece675d1); + 0xfc8f8cbb, 0xece675d1); /* Only the sign (and tag) is used in internal zeroes */ -FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); +FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); /* Only the sign and significand (and tag) are used in internal NaNs */ /* The 80486 never generates one of these @@ -48,24 +47,22 @@ FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); /* Only the sign (and tag) is used in internal infinities */ -FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); - +FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); static void fld_const(FPU_REG const *c, int adj, u_char tag) { - FPU_REG *st_new_ptr; - - if ( STACK_OVERFLOW ) - { - FPU_stack_overflow(); - return; - } - push(); - reg_copy(c, st_new_ptr); - st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to - borrow or carry. */ - FPU_settag0(tag); - clear_C1(); + FPU_REG *st_new_ptr; + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; + } + push(); + reg_copy(c, st_new_ptr); + st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to + borrow or carry. */ + FPU_settag0(tag); + clear_C1(); } /* A fast way to find out whether x is one of RC_DOWN or RC_CHOP @@ -75,46 +72,46 @@ static void fld_const(FPU_REG const *c, int adj, u_char tag) static void fld1(int rc) { - fld_const(&CONST_1, 0, TAG_Valid); + fld_const(&CONST_1, 0, TAG_Valid); } static void fldl2t(int rc) { - fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); + fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); } static void fldl2e(int rc) { - fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldpi(int rc) { - fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldlg2(int rc) { - fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldln2(int rc) { - fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldz(int rc) { - fld_const(&CONST_Z, 0, TAG_Zero); + fld_const(&CONST_Z, 0, TAG_Zero); } -typedef void (*FUNC_RC)(int); +typedef void (*FUNC_RC) (int); static FUNC_RC constants_table[] = { - fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal + fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal }; void fconst(void) { - (constants_table[FPU_rm])(control_word & CW_RC); + (constants_table[FPU_rm]) (control_word & CW_RC); } diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c index 45a25875270..10806077997 100644 --- a/arch/x86/math-emu/reg_convert.c +++ b/arch/x86/math-emu/reg_convert.c @@ -13,41 +13,34 @@ #include "exception.h" #include "fpu_emu.h" - int FPU_to_exp16(FPU_REG const *a, FPU_REG *x) { - int sign = getsign(a); - - *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); - - /* Set up the exponent as a 16 bit quantity. */ - setexponent16(x, exponent(a)); - - if ( exponent16(x) == EXP_UNDER ) - { - /* The number is a de-normal or pseudodenormal. */ - /* We only deal with the significand and exponent. */ - - if (x->sigh & 0x80000000) - { - /* Is a pseudodenormal. */ - /* This is non-80486 behaviour because the number - loses its 'denormal' identity. */ - addexponent(x, 1); - } - else - { - /* Is a denormal. */ - addexponent(x, 1); - FPU_normalize_nuo(x); + int sign = getsign(a); + + *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); + + /* Set up the exponent as a 16 bit quantity. */ + setexponent16(x, exponent(a)); + + if (exponent16(x) == EXP_UNDER) { + /* The number is a de-normal or pseudodenormal. */ + /* We only deal with the significand and exponent. */ + + if (x->sigh & 0x80000000) { + /* Is a pseudodenormal. */ + /* This is non-80486 behaviour because the number + loses its 'denormal' identity. */ + addexponent(x, 1); + } else { + /* Is a denormal. */ + addexponent(x, 1); + FPU_normalize_nuo(x); + } } - } - if ( !(x->sigh & 0x80000000) ) - { - EXCEPTION(EX_INTERNAL | 0x180); - } + if (!(x->sigh & 0x80000000)) { + EXCEPTION(EX_INTERNAL | 0x180); + } - return sign; + return sign; } - diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c index 5cee7ff920d..6827012db34 100644 --- a/arch/x86/math-emu/reg_divide.c +++ b/arch/x86/math-emu/reg_divide.c @@ -26,182 +26,157 @@ */ int FPU_div(int flags, int rm, int control_w) { - FPU_REG x, y; - FPU_REG const *a, *b, *st0_ptr, *st_ptr; - FPU_REG *dest; - u_char taga, tagb, signa, signb, sign, saved_sign; - int tag, deststnr; - - if ( flags & DEST_RM ) - deststnr = rm; - else - deststnr = 0; - - if ( flags & REV ) - { - b = &st(0); - st0_ptr = b; - tagb = FPU_gettag0(); - if ( flags & LOADED ) - { - a = (FPU_REG *)rm; - taga = flags & 0x0f; + FPU_REG x, y; + FPU_REG const *a, *b, *st0_ptr, *st_ptr; + FPU_REG *dest; + u_char taga, tagb, signa, signb, sign, saved_sign; + int tag, deststnr; + + if (flags & DEST_RM) + deststnr = rm; + else + deststnr = 0; + + if (flags & REV) { + b = &st(0); + st0_ptr = b; + tagb = FPU_gettag0(); + if (flags & LOADED) { + a = (FPU_REG *) rm; + taga = flags & 0x0f; + } else { + a = &st(rm); + st_ptr = a; + taga = FPU_gettagi(rm); + } + } else { + a = &st(0); + st0_ptr = a; + taga = FPU_gettag0(); + if (flags & LOADED) { + b = (FPU_REG *) rm; + tagb = flags & 0x0f; + } else { + b = &st(rm); + st_ptr = b; + tagb = FPU_gettagi(rm); + } } - else - { - a = &st(rm); - st_ptr = a; - taga = FPU_gettagi(rm); - } - } - else - { - a = &st(0); - st0_ptr = a; - taga = FPU_gettag0(); - if ( flags & LOADED ) - { - b = (FPU_REG *)rm; - tagb = flags & 0x0f; - } - else - { - b = &st(rm); - st_ptr = b; - tagb = FPU_gettagi(rm); - } - } - signa = getsign(a); - signb = getsign(b); + signa = getsign(a); + signb = getsign(b); - sign = signa ^ signb; + sign = signa ^ signb; - dest = &st(deststnr); - saved_sign = getsign(dest); + dest = &st(deststnr); + saved_sign = getsign(dest); - if ( !(taga | tagb) ) - { - /* Both regs Valid, this should be the most common case. */ - reg_copy(a, &x); - reg_copy(b, &y); - setpositive(&x); - setpositive(&y); - tag = FPU_u_div(&x, &y, dest, control_w, sign); + if (!(taga | tagb)) { + /* Both regs Valid, this should be the most common case. */ + reg_copy(a, &x); + reg_copy(b, &y); + setpositive(&x); + setpositive(&y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); - if ( tag < 0 ) - return tag; + if (tag < 0) + return tag; - FPU_settagi(deststnr, tag); - return tag; - } + FPU_settagi(deststnr, tag); + return tag; + } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return FPU_Exception; - - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - tag = FPU_u_div(&x, &y, dest, control_w, sign); - if ( tag < 0 ) - return tag; - - FPU_settagi(deststnr, tag); - return tag; - } - else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) - { - if ( tagb != TAG_Zero ) - { - /* Want to find Zero/Valid */ - if ( tagb == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return FPU_Exception; - } - - /* The result is zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - setsign(dest, sign); - return TAG_Zero; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); + if (tag < 0) + return tag; + + FPU_settagi(deststnr, tag); + return tag; + } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) { + if (tagb != TAG_Zero) { + /* Want to find Zero/Valid */ + if (tagb == TW_Denormal) { + if (denormal_operand() < 0) + return FPU_Exception; + } + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; + } + /* We have an exception condition, either 0/0 or Valid/Zero. */ + if (taga == TAG_Zero) { + /* 0/0 */ + return arith_invalid(deststnr); + } + /* Valid/Zero */ + return FPU_divide_by_zero(deststnr, sign); } - /* We have an exception condition, either 0/0 or Valid/Zero. */ - if ( taga == TAG_Zero ) - { - /* 0/0 */ - return arith_invalid(deststnr); + /* Must have infinities, NaNs, etc */ + else if ((taga == TW_NaN) || (tagb == TW_NaN)) { + if (flags & LOADED) + return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0, + st0_ptr); + + if (flags & DEST_RM) { + int tag; + tag = FPU_gettag0(); + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + return real_2op_NaN(st0_ptr, tag, rm, + (flags & REV) ? st0_ptr : &st(rm)); + } else { + int tag; + tag = FPU_gettagi(rm); + if (tag == TAG_Special) + tag = FPU_Special(&st(rm)); + return real_2op_NaN(&st(rm), tag, 0, + (flags & REV) ? st0_ptr : &st(rm)); + } + } else if (taga == TW_Infinity) { + if (tagb == TW_Infinity) { + /* infinity/infinity */ + return arith_invalid(deststnr); + } else { + /* tagb must be Valid or Zero */ + if ((tagb == TW_Denormal) && (denormal_operand() < 0)) + return FPU_Exception; + + /* Infinity divided by Zero or Valid does + not raise and exception, but returns Infinity */ + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, sign); + return taga; + } + } else if (tagb == TW_Infinity) { + if ((taga == TW_Denormal) && (denormal_operand() < 0)) + return FPU_Exception; + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; } - /* Valid/Zero */ - return FPU_divide_by_zero(deststnr, sign); - } - /* Must have infinities, NaNs, etc */ - else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - if ( flags & LOADED ) - return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr); - - if ( flags & DEST_RM ) - { - int tag; - tag = FPU_gettag0(); - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm)); - } - else - { - int tag; - tag = FPU_gettagi(rm); - if ( tag == TAG_Special ) - tag = FPU_Special(&st(rm)); - return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm)); - } - } - else if (taga == TW_Infinity) - { - if (tagb == TW_Infinity) - { - /* infinity/infinity */ - return arith_invalid(deststnr); - } - else - { - /* tagb must be Valid or Zero */ - if ( (tagb == TW_Denormal) && (denormal_operand() < 0) ) - return FPU_Exception; - - /* Infinity divided by Zero or Valid does - not raise and exception, but returns Infinity */ - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, sign); - return taga; - } - } - else if (tagb == TW_Infinity) - { - if ( (taga == TW_Denormal) && (denormal_operand() < 0) ) - return FPU_Exception; - - /* The result is zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - setsign(dest, sign); - return TAG_Zero; - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x102); - return FPU_Exception; - } -#endif /* PARANOID */ + else { + EXCEPTION(EX_INTERNAL | 0x102); + return FPU_Exception; + } +#endif /* PARANOID */ return 0; } diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index e976caef649..799d4af5be6 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -27,1084 +27,938 @@ #include "control_w.h" #include "status_w.h" - -#define DOUBLE_Emax 1023 /* largest valid exponent */ +#define DOUBLE_Emax 1023 /* largest valid exponent */ #define DOUBLE_Ebias 1023 -#define DOUBLE_Emin (-1022) /* smallest valid exponent */ +#define DOUBLE_Emin (-1022) /* smallest valid exponent */ -#define SINGLE_Emax 127 /* largest valid exponent */ +#define SINGLE_Emax 127 /* largest valid exponent */ #define SINGLE_Ebias 127 -#define SINGLE_Emin (-126) /* smallest valid exponent */ - +#define SINGLE_Emin (-126) /* smallest valid exponent */ static u_char normalize_no_excep(FPU_REG *r, int exp, int sign) { - u_char tag; + u_char tag; - setexponent16(r, exp); + setexponent16(r, exp); - tag = FPU_normalize_nuo(r); - stdexp(r); - if ( sign ) - setnegative(r); + tag = FPU_normalize_nuo(r); + stdexp(r); + if (sign) + setnegative(r); - return tag; + return tag; } - int FPU_tagof(FPU_REG *ptr) { - int exp; - - exp = exponent16(ptr) & 0x7fff; - if ( exp == 0 ) - { - if ( !(ptr->sigh | ptr->sigl) ) - { - return TAG_Zero; + int exp; + + exp = exponent16(ptr) & 0x7fff; + if (exp == 0) { + if (!(ptr->sigh | ptr->sigl)) { + return TAG_Zero; + } + /* The number is a de-normal or pseudodenormal. */ + return TAG_Special; + } + + if (exp == 0x7fff) { + /* Is an Infinity, a NaN, or an unsupported data type. */ + return TAG_Special; } - /* The number is a de-normal or pseudodenormal. */ - return TAG_Special; - } - - if ( exp == 0x7fff ) - { - /* Is an Infinity, a NaN, or an unsupported data type. */ - return TAG_Special; - } - - if ( !(ptr->sigh & 0x80000000) ) - { - /* Unsupported data type. */ - /* Valid numbers have the ms bit set to 1. */ - /* Unnormal. */ - return TAG_Special; - } - - return TAG_Valid; -} + if (!(ptr->sigh & 0x80000000)) { + /* Unsupported data type. */ + /* Valid numbers have the ms bit set to 1. */ + /* Unnormal. */ + return TAG_Special; + } + + return TAG_Valid; +} /* Get a long double from user memory */ int FPU_load_extended(long double __user *s, int stnr) { - FPU_REG *sti_ptr = &st(stnr); + FPU_REG *sti_ptr = &st(stnr); - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); - __copy_from_user(sti_ptr, s, 10); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + __copy_from_user(sti_ptr, s, 10); + RE_ENTRANT_CHECK_ON; - return FPU_tagof(sti_ptr); + return FPU_tagof(sti_ptr); } - /* Get a double from user memory */ int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) { - int exp, tag, negative; - unsigned m64, l64; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, dfloat, 8); - FPU_get_user(m64, 1 + (unsigned long __user *) dfloat); - FPU_get_user(l64, (unsigned long __user *) dfloat); - RE_ENTRANT_CHECK_ON; - - negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; - exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; - m64 &= 0xfffff; - if ( exp > DOUBLE_Emax + EXTENDED_Ebias ) - { - /* Infinity or NaN */ - if ((m64 == 0) && (l64 == 0)) - { - /* +- infinity */ - loaded_data->sigh = 0x80000000; - loaded_data->sigl = 0x00000000; - exp = EXP_Infinity + EXTENDED_Ebias; - tag = TAG_Special; - } - else - { - /* Must be a signaling or quiet NaN */ - exp = EXP_NaN + EXTENDED_Ebias; - loaded_data->sigh = (m64 << 11) | 0x80000000; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; - tag = TAG_Special; /* The calling function must look for NaNs */ - } - } - else if ( exp < DOUBLE_Emin + EXTENDED_Ebias ) - { - /* Zero or de-normal */ - if ((m64 == 0) && (l64 == 0)) - { - /* Zero */ - reg_copy(&CONST_Z, loaded_data); - exp = 0; - tag = TAG_Zero; - } - else - { - /* De-normal */ - loaded_data->sigh = m64 << 11; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; - - return normalize_no_excep(loaded_data, DOUBLE_Emin, negative) - | (denormal_operand() < 0 ? FPU_Exception : 0); - } - } - else - { - loaded_data->sigh = (m64 << 11) | 0x80000000; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; + int exp, tag, negative; + unsigned m64, l64; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, dfloat, 8); + FPU_get_user(m64, 1 + (unsigned long __user *)dfloat); + FPU_get_user(l64, (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + + negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; + exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; + m64 &= 0xfffff; + if (exp > DOUBLE_Emax + EXTENDED_Ebias) { + /* Infinity or NaN */ + if ((m64 == 0) && (l64 == 0)) { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } else { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) { + /* Zero or de-normal */ + if ((m64 == 0) && (l64 == 0)) { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + exp = 0; + tag = TAG_Zero; + } else { + /* De-normal */ + loaded_data->sigh = m64 << 11; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + + return normalize_no_excep(loaded_data, DOUBLE_Emin, + negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } + } else { + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; - tag = TAG_Valid; - } + tag = TAG_Valid; + } - setexponent16(loaded_data, exp | negative); + setexponent16(loaded_data, exp | negative); - return tag; + return tag; } - /* Get a float from user memory */ int FPU_load_single(float __user *single, FPU_REG *loaded_data) { - unsigned m32; - int exp, tag, negative; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, single, 4); - FPU_get_user(m32, (unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; - - negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; - - if (!(m32 & 0x7fffffff)) - { - /* Zero */ - reg_copy(&CONST_Z, loaded_data); - addexponent(loaded_data, negative); - return TAG_Zero; - } - exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias; - m32 = (m32 & 0x7fffff) << 8; - if ( exp < SINGLE_Emin + EXTENDED_Ebias ) - { - /* De-normals */ - loaded_data->sigh = m32; - loaded_data->sigl = 0; - - return normalize_no_excep(loaded_data, SINGLE_Emin, negative) - | (denormal_operand() < 0 ? FPU_Exception : 0); - } - else if ( exp > SINGLE_Emax + EXTENDED_Ebias ) - { - /* Infinity or NaN */ - if ( m32 == 0 ) - { - /* +- infinity */ - loaded_data->sigh = 0x80000000; - loaded_data->sigl = 0x00000000; - exp = EXP_Infinity + EXTENDED_Ebias; - tag = TAG_Special; + unsigned m32; + int exp, tag, negative; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, single, 4); + FPU_get_user(m32, (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; + + negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; + + if (!(m32 & 0x7fffffff)) { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + addexponent(loaded_data, negative); + return TAG_Zero; } - else - { - /* Must be a signaling or quiet NaN */ - exp = EXP_NaN + EXTENDED_Ebias; - loaded_data->sigh = m32 | 0x80000000; - loaded_data->sigl = 0; - tag = TAG_Special; /* The calling function must look for NaNs */ + exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias; + m32 = (m32 & 0x7fffff) << 8; + if (exp < SINGLE_Emin + EXTENDED_Ebias) { + /* De-normals */ + loaded_data->sigh = m32; + loaded_data->sigl = 0; + + return normalize_no_excep(loaded_data, SINGLE_Emin, negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } else if (exp > SINGLE_Emax + EXTENDED_Ebias) { + /* Infinity or NaN */ + if (m32 == 0) { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } else { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } else { + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Valid; } - } - else - { - loaded_data->sigh = m32 | 0x80000000; - loaded_data->sigl = 0; - tag = TAG_Valid; - } - setexponent16(loaded_data, exp | negative); /* Set the sign. */ + setexponent16(loaded_data, exp | negative); /* Set the sign. */ - return tag; + return tag; } - /* Get a long long from user memory */ int FPU_load_int64(long long __user *_s) { - long long s; - int sign; - FPU_REG *st0_ptr = &st(0); - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 8); - if (copy_from_user(&s,_s,8)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - - if (s == 0) - { - reg_copy(&CONST_Z, st0_ptr); - return TAG_Zero; - } - - if (s > 0) - sign = SIGN_Positive; - else - { - s = -s; - sign = SIGN_Negative; - } - - significand(st0_ptr) = s; - - return normalize_no_excep(st0_ptr, 63, sign); -} + long long s; + int sign; + FPU_REG *st0_ptr = &st(0); + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 8); + if (copy_from_user(&s, _s, 8)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + + if (s == 0) { + reg_copy(&CONST_Z, st0_ptr); + return TAG_Zero; + } + + if (s > 0) + sign = SIGN_Positive; + else { + s = -s; + sign = SIGN_Negative; + } + significand(st0_ptr) = s; + + return normalize_no_excep(st0_ptr, 63, sign); +} /* Get a long from user memory */ int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) { - long s; - int negative; + long s; + int negative; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 4); - FPU_get_user(s, _s); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 4); + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; - if (s == 0) - { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + if (s == 0) { + reg_copy(&CONST_Z, loaded_data); + return TAG_Zero; + } - if (s > 0) - negative = SIGN_Positive; - else - { - s = -s; - negative = SIGN_Negative; - } + if (s > 0) + negative = SIGN_Positive; + else { + s = -s; + negative = SIGN_Negative; + } - loaded_data->sigh = s; - loaded_data->sigl = 0; + loaded_data->sigh = s; + loaded_data->sigl = 0; - return normalize_no_excep(loaded_data, 31, negative); + return normalize_no_excep(loaded_data, 31, negative); } - /* Get a short from user memory */ int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) { - int s, negative; + int s, negative; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 2); - /* Cast as short to get the sign extended. */ - FPU_get_user(s, _s); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 2); + /* Cast as short to get the sign extended. */ + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; - if (s == 0) - { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + if (s == 0) { + reg_copy(&CONST_Z, loaded_data); + return TAG_Zero; + } - if (s > 0) - negative = SIGN_Positive; - else - { - s = -s; - negative = SIGN_Negative; - } + if (s > 0) + negative = SIGN_Positive; + else { + s = -s; + negative = SIGN_Negative; + } - loaded_data->sigh = s << 16; - loaded_data->sigl = 0; + loaded_data->sigh = s << 16; + loaded_data->sigl = 0; - return normalize_no_excep(loaded_data, 15, negative); + return normalize_no_excep(loaded_data, 15, negative); } - /* Get a packed bcd array from user memory */ int FPU_load_bcd(u_char __user *s) { - FPU_REG *st0_ptr = &st(0); - int pos; - u_char bcd; - long long l=0; - int sign; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); - RE_ENTRANT_CHECK_ON; - for ( pos = 8; pos >= 0; pos--) - { - l *= 10; - RE_ENTRANT_CHECK_OFF; - FPU_get_user(bcd, s+pos); - RE_ENTRANT_CHECK_ON; - l += bcd >> 4; - l *= 10; - l += bcd & 0x0f; - } - - RE_ENTRANT_CHECK_OFF; - FPU_get_user(sign, s+9); - sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; - RE_ENTRANT_CHECK_ON; - - if ( l == 0 ) - { - reg_copy(&CONST_Z, st0_ptr); - addexponent(st0_ptr, sign); /* Set the sign. */ - return TAG_Zero; - } - else - { - significand(st0_ptr) = l; - return normalize_no_excep(st0_ptr, 63, sign); - } + FPU_REG *st0_ptr = &st(0); + int pos; + u_char bcd; + long long l = 0; + int sign; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + RE_ENTRANT_CHECK_ON; + for (pos = 8; pos >= 0; pos--) { + l *= 10; + RE_ENTRANT_CHECK_OFF; + FPU_get_user(bcd, s + pos); + RE_ENTRANT_CHECK_ON; + l += bcd >> 4; + l *= 10; + l += bcd & 0x0f; + } + + RE_ENTRANT_CHECK_OFF; + FPU_get_user(sign, s + 9); + sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; + RE_ENTRANT_CHECK_ON; + + if (l == 0) { + reg_copy(&CONST_Z, st0_ptr); + addexponent(st0_ptr, sign); /* Set the sign. */ + return TAG_Zero; + } else { + significand(st0_ptr) = l; + return normalize_no_excep(st0_ptr, 63, sign); + } } /*===========================================================================*/ /* Put a long double into user memory */ -int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d) +int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, + long double __user * d) { - /* - The only exception raised by an attempt to store to an - extended format is the Invalid Stack exception, i.e. - attempting to store from an empty register. - */ - - if ( st0_tag != TAG_Empty ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); - - FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d); - FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4)); - FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8)); - RE_ENTRANT_CHECK_ON; - - return 1; - } - - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - FPU_put_user(0, (unsigned long __user *) d); - FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d); - FPU_put_user(0xffff, 4 + (short __user *) d); - RE_ENTRANT_CHECK_ON; - return 1; - } - else - return 0; + /* + The only exception raised by an attempt to store to an + extended format is the Invalid Stack exception, i.e. + attempting to store from an empty register. + */ + + if (st0_tag != TAG_Empty) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + + FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d); + FPU_put_user(st0_ptr->sigh, + (unsigned long __user *)((u_char __user *) d + 4)); + FPU_put_user(exponent16(st0_ptr), + (unsigned short __user *)((u_char __user *) d + + 8)); + RE_ENTRANT_CHECK_ON; + + return 1; + } -} + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_put_user(0, (unsigned long __user *)d); + FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d); + FPU_put_user(0xffff, 4 + (short __user *)d); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; +} /* Put a double into user memory */ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat) { - unsigned long l[2]; - unsigned long increment = 0; /* avoid gcc warnings */ - int precision_loss; - int exp; - FPU_REG tmp; + unsigned long l[2]; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; - if ( st0_tag == TAG_Valid ) - { - reg_copy(st0_ptr, &tmp); - exp = exponent(&tmp); + if (st0_tag == TAG_Valid) { + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); - if ( exp < DOUBLE_Emin ) /* It may be a denormal */ - { - addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */ + if (exp < DOUBLE_Emin) { /* It may be a denormal */ + addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */ - denormal_arg: + denormal_arg: - if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) - { + if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) { #ifdef PECULIAR_486 - /* Did it round to a non-denormal ? */ - /* This behaviour might be regarded as peculiar, it appears - that the 80486 rounds to the dest precision, then - converts to decide underflow. */ - if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) && - (st0_ptr->sigl & 0x000007ff)) ) + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. */ + if (! + ((tmp.sigh == 0x00100000) && (tmp.sigl == 0) + && (st0_ptr->sigl & 0x000007ff))) #endif /* PECULIAR_486 */ - { - EXCEPTION(EX_Underflow); - /* This is a special case: see sec 16.2.5.1 of - the 80486 book */ - if ( !(control_word & CW_Underflow) ) - return 0; - } - EXCEPTION(precision_loss); - if ( !(control_word & CW_Precision) ) - return 0; - } - l[0] = tmp.sigl; - l[1] = tmp.sigh; - } - else - { - if ( tmp.sigl & 0x000007ff ) - { - precision_loss = 1; - switch (control_word & CW_RC) - { - case RC_RND: - /* Rounding can get a little messy.. */ - increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */ - ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */ - break; - case RC_DOWN: /* towards -infinity */ - increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff; - break; - case RC_UP: /* towards +infinity */ - increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0; - break; - case RC_CHOP: - increment = 0; - break; - } - - /* Truncate the mantissa */ - tmp.sigl &= 0xfffff800; - - if ( increment ) - { - if ( tmp.sigl >= 0xfffff800 ) - { - /* the sigl part overflows */ - if ( tmp.sigh == 0xffffffff ) - { - /* The sigh part overflows */ - tmp.sigh = 0x80000000; - exp++; - if (exp >= EXP_OVER) - goto overflow; + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if (!(control_word & CW_Underflow)) + return 0; + } + EXCEPTION(precision_loss); + if (!(control_word & CW_Precision)) + return 0; } - else - { - tmp.sigh ++; + l[0] = tmp.sigl; + l[1] = tmp.sigh; + } else { + if (tmp.sigl & 0x000007ff) { + precision_loss = 1; + switch (control_word & CW_RC) { + case RC_RND: + /* Rounding can get a little messy.. */ + increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */ + ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = + signpositive(&tmp) ? 0 : tmp. + sigl & 0x7ff; + break; + case RC_UP: /* towards +infinity */ + increment = + signpositive(&tmp) ? tmp. + sigl & 0x7ff : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate the mantissa */ + tmp.sigl &= 0xfffff800; + + if (increment) { + if (tmp.sigl >= 0xfffff800) { + /* the sigl part overflows */ + if (tmp.sigh == 0xffffffff) { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if (exp >= EXP_OVER) + goto overflow; + } else { + tmp.sigh++; + } + tmp.sigl = 0x00000000; + } else { + /* We only need to increment sigl */ + tmp.sigl += 0x00000800; + } + } + } else + precision_loss = 0; + + l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21); + l[1] = ((tmp.sigh >> 11) & 0xfffff); + + if (exp > DOUBLE_Emax) { + overflow: + EXCEPTION(EX_Overflow); + if (!(control_word & CW_Overflow)) + return 0; + set_precision_flag_up(); + if (!(control_word & CW_Precision)) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + /* Overflow to infinity */ + l[0] = 0x00000000; /* Set to */ + l[1] = 0x7ff00000; /* + INF */ + } else { + if (precision_loss) { + if (increment) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* Add the exponent */ + l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20); } - tmp.sigl = 0x00000000; - } - else - { - /* We only need to increment sigl */ - tmp.sigl += 0x00000800; - } - } - } - else - precision_loss = 0; - - l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21); - l[1] = ((tmp.sigh >> 11) & 0xfffff); - - if ( exp > DOUBLE_Emax ) - { - overflow: - EXCEPTION(EX_Overflow); - if ( !(control_word & CW_Overflow) ) - return 0; - set_precision_flag_up(); - if ( !(control_word & CW_Precision) ) - return 0; - - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - /* Overflow to infinity */ - l[0] = 0x00000000; /* Set to */ - l[1] = 0x7ff00000; /* + INF */ - } - else - { - if ( precision_loss ) - { - if ( increment ) - set_precision_flag_up(); - else - set_precision_flag_down(); } - /* Add the exponent */ - l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20); - } - } - } - else if (st0_tag == TAG_Zero) - { - /* Number is zero */ - l[0] = 0; - l[1] = 0; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( st0_tag == TW_Denormal ) - { - /* A denormal will always underflow. */ + } else if (st0_tag == TAG_Zero) { + /* Number is zero */ + l[0] = 0; + l[1] = 0; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if (st0_tag == TW_Denormal) { + /* A denormal will always underflow. */ #ifndef PECULIAR_486 - /* An 80486 is supposed to be able to generate - a denormal exception here, but... */ - /* Underflow has priority. */ - if ( control_word & CW_Underflow ) - denormal_operand(); + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if (control_word & CW_Underflow) + denormal_operand(); #endif /* PECULIAR_486 */ - reg_copy(st0_ptr, &tmp); - goto denormal_arg; - } - else if (st0_tag == TW_Infinity) - { - l[0] = 0; - l[1] = 0x7ff00000; - } - else if (st0_tag == TW_NaN) - { - /* Is it really a NaN ? */ - if ( (exponent(st0_ptr) == EXP_OVER) - && (st0_ptr->sigh & 0x80000000) ) - { - /* See if we can get a valid NaN from the FPU_REG */ - l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21); - l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); - if ( !(st0_ptr->sigh & 0x40000000) ) - { - /* It is a signalling NaN */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - l[1] |= (0x40000000 >> 11); + reg_copy(st0_ptr, &tmp); + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + l[0] = 0; + l[1] = 0x7ff00000; + } else if (st0_tag == TW_NaN) { + /* Is it really a NaN ? */ + if ((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000)) { + /* See if we can get a valid NaN from the FPU_REG */ + l[0] = + (st0_ptr->sigl >> 11) | (st0_ptr-> + sigh << 21); + l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); + if (!(st0_ptr->sigh & 0x40000000)) { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + l[1] |= (0x40000000 >> 11); + } + l[1] |= 0x7ff00000; + } else { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + l[0] = 0; + l[1] = 0xfff80000; + } } - l[1] |= 0x7ff00000; - } - else - { - /* It is an unsupported data type */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - l[0] = 0; - l[1] = 0xfff80000; - } + } else if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_put_user(0, (unsigned long __user *)dfloat); + FPU_put_user(0xfff80000, + 1 + (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; } - } - else if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,dfloat,8); - FPU_put_user(0, (unsigned long __user *) dfloat); - FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat); - RE_ENTRANT_CHECK_ON; - return 1; - } - else - return 0; - } - if ( getsign(st0_ptr) ) - l[1] |= 0x80000000; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,dfloat,8); - FPU_put_user(l[0], (unsigned long __user *)dfloat); - FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat); - RE_ENTRANT_CHECK_ON; - - return 1; -} + if (getsign(st0_ptr)) + l[1] |= 0x80000000; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_put_user(l[0], (unsigned long __user *)dfloat); + FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a float into user memory */ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) { - long templ = 0; - unsigned long increment = 0; /* avoid gcc warnings */ - int precision_loss; - int exp; - FPU_REG tmp; + long templ = 0; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; - if ( st0_tag == TAG_Valid ) - { + if (st0_tag == TAG_Valid) { - reg_copy(st0_ptr, &tmp); - exp = exponent(&tmp); + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); - if ( exp < SINGLE_Emin ) - { - addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */ + if (exp < SINGLE_Emin) { + addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */ - denormal_arg: + denormal_arg: - if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) - { + if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) { #ifdef PECULIAR_486 - /* Did it round to a non-denormal ? */ - /* This behaviour might be regarded as peculiar, it appears - that the 80486 rounds to the dest precision, then - converts to decide underflow. */ - if ( !((tmp.sigl == 0x00800000) && - ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) ) + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. */ + if (!((tmp.sigl == 0x00800000) && + ((st0_ptr->sigh & 0x000000ff) + || st0_ptr->sigl))) #endif /* PECULIAR_486 */ - { - EXCEPTION(EX_Underflow); - /* This is a special case: see sec 16.2.5.1 of - the 80486 book */ - if ( !(control_word & CW_Underflow) ) - return 0; - } - EXCEPTION(precision_loss); - if ( !(control_word & CW_Precision) ) - return 0; - } - templ = tmp.sigl; - } - else - { - if ( tmp.sigl | (tmp.sigh & 0x000000ff) ) - { - unsigned long sigh = tmp.sigh; - unsigned long sigl = tmp.sigl; - - precision_loss = 1; - switch (control_word & CW_RC) - { - case RC_RND: - increment = ((sigh & 0xff) > 0x80) /* more than half */ - || (((sigh & 0xff) == 0x80) && sigl) /* more than half */ - || ((sigh & 0x180) == 0x180); /* round to even */ - break; - case RC_DOWN: /* towards -infinity */ - increment = signpositive(&tmp) - ? 0 : (sigl | (sigh & 0xff)); - break; - case RC_UP: /* towards +infinity */ - increment = signpositive(&tmp) - ? (sigl | (sigh & 0xff)) : 0; - break; - case RC_CHOP: - increment = 0; - break; - } - - /* Truncate part of the mantissa */ - tmp.sigl = 0; - - if (increment) - { - if ( sigh >= 0xffffff00 ) - { - /* The sigh part overflows */ - tmp.sigh = 0x80000000; - exp++; - if ( exp >= EXP_OVER ) - goto overflow; - } - else - { - tmp.sigh &= 0xffffff00; - tmp.sigh += 0x100; - } - } - else - { - tmp.sigh &= 0xffffff00; /* Finish the truncation */ - } - } - else - precision_loss = 0; - - templ = (tmp.sigh >> 8) & 0x007fffff; - - if ( exp > SINGLE_Emax ) - { - overflow: - EXCEPTION(EX_Overflow); - if ( !(control_word & CW_Overflow) ) - return 0; - set_precision_flag_up(); - if ( !(control_word & CW_Precision) ) - return 0; - - /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ - /* Masked response is overflow to infinity. */ - templ = 0x7f800000; - } - else - { - if ( precision_loss ) - { - if ( increment ) - set_precision_flag_up(); - else - set_precision_flag_down(); + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if (!(control_word & CW_Underflow)) + return 0; + } + EXCEPTION(precision_loss); + if (!(control_word & CW_Precision)) + return 0; + } + templ = tmp.sigl; + } else { + if (tmp.sigl | (tmp.sigh & 0x000000ff)) { + unsigned long sigh = tmp.sigh; + unsigned long sigl = tmp.sigl; + + precision_loss = 1; + switch (control_word & CW_RC) { + case RC_RND: + increment = ((sigh & 0xff) > 0x80) /* more than half */ + ||(((sigh & 0xff) == 0x80) && sigl) /* more than half */ + ||((sigh & 0x180) == 0x180); /* round to even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = signpositive(&tmp) + ? 0 : (sigl | (sigh & 0xff)); + break; + case RC_UP: /* towards +infinity */ + increment = signpositive(&tmp) + ? (sigl | (sigh & 0xff)) : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate part of the mantissa */ + tmp.sigl = 0; + + if (increment) { + if (sigh >= 0xffffff00) { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if (exp >= EXP_OVER) + goto overflow; + } else { + tmp.sigh &= 0xffffff00; + tmp.sigh += 0x100; + } + } else { + tmp.sigh &= 0xffffff00; /* Finish the truncation */ + } + } else + precision_loss = 0; + + templ = (tmp.sigh >> 8) & 0x007fffff; + + if (exp > SINGLE_Emax) { + overflow: + EXCEPTION(EX_Overflow); + if (!(control_word & CW_Overflow)) + return 0; + set_precision_flag_up(); + if (!(control_word & CW_Precision)) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ + /* Masked response is overflow to infinity. */ + templ = 0x7f800000; + } else { + if (precision_loss) { + if (increment) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* Add the exponent */ + templ |= ((exp + SINGLE_Ebias) & 0xff) << 23; + } } - /* Add the exponent */ - templ |= ((exp+SINGLE_Ebias) & 0xff) << 23; - } - } - } - else if (st0_tag == TAG_Zero) - { - templ = 0; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if (st0_tag == TW_Denormal) - { - reg_copy(st0_ptr, &tmp); - - /* A denormal will always underflow. */ + } else if (st0_tag == TAG_Zero) { + templ = 0; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if (st0_tag == TW_Denormal) { + reg_copy(st0_ptr, &tmp); + + /* A denormal will always underflow. */ #ifndef PECULIAR_486 - /* An 80486 is supposed to be able to generate - a denormal exception here, but... */ - /* Underflow has priority. */ - if ( control_word & CW_Underflow ) - denormal_operand(); -#endif /* PECULIAR_486 */ - goto denormal_arg; - } - else if (st0_tag == TW_Infinity) - { - templ = 0x7f800000; - } - else if (st0_tag == TW_NaN) - { - /* Is it really a NaN ? */ - if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) ) - { - /* See if we can get a valid NaN from the FPU_REG */ - templ = st0_ptr->sigh >> 8; - if ( !(st0_ptr->sigh & 0x40000000) ) - { - /* It is a signalling NaN */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - templ |= (0x40000000 >> 8); + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if (control_word & CW_Underflow) + denormal_operand(); +#endif /* PECULIAR_486 */ + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + templ = 0x7f800000; + } else if (st0_tag == TW_NaN) { + /* Is it really a NaN ? */ + if ((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000)) { + /* See if we can get a valid NaN from the FPU_REG */ + templ = st0_ptr->sigh >> 8; + if (!(st0_ptr->sigh & 0x40000000)) { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + templ |= (0x40000000 >> 8); + } + templ |= 0x7f800000; + } else { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + templ = 0xffc00000; + } } - templ |= 0x7f800000; - } - else - { - /* It is an unsupported data type */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - templ = 0xffc00000; - } - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x164); - return 0; - } + else { + EXCEPTION(EX_INTERNAL | 0x164); + return 0; + } #endif - } - else if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & EX_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,single,4); - FPU_put_user(0xffc00000, (unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; - return 1; + } else if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & EX_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_put_user(0xffc00000, + (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; } - else - return 0; - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x163); - return 0; - } + else { + EXCEPTION(EX_INTERNAL | 0x163); + return 0; + } #endif - if ( getsign(st0_ptr) ) - templ |= 0x80000000; + if (getsign(st0_ptr)) + templ |= 0x80000000; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,single,4); - FPU_put_user(templ,(unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_put_user(templ, (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; - return 1; + return 1; } - /* Put a long long into user memory */ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) { - FPU_REG t; - long long tll; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + long long tll; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - ((long *)&tll)[0] = t.sigl; - ((long *)&tll)[1] = t.sigh; - if ( (precision_loss == 1) || - ((t.sigh & 0x80000000) && - !((t.sigh == 0x80000000) && (t.sigl == 0) && - signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - tll = 0x8000000000000000LL; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ((long *)&tll)[0] = t.sigl; + ((long *)&tll)[1] = t.sigh; + if ((precision_loss == 1) || + ((t.sigh & 0x80000000) && + !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + tll = 0x8000000000000000LL; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + tll = -tll; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - tll = - tll; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,8); - if (copy_to_user(d, &tll, 8)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 8); + if (copy_to_user(d, &tll, 8)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a long into user memory */ int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) { - FPU_REG t; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - if (t.sigh || - ((t.sigl & 0x80000000) && - !((t.sigl == 0x80000000) && signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - t.sigl = 0x80000000; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0x80000000) && + !((t.sigl == 0x80000000) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x80000000; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + t.sigl = -(long)t.sigl; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - t.sigl = -(long)t.sigl; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,4); - FPU_put_user(t.sigl, (unsigned long __user *) d); - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 4); + FPU_put_user(t.sigl, (unsigned long __user *)d); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a short into user memory */ int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) { - FPU_REG t; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - if (t.sigh || - ((t.sigl & 0xffff8000) && - !((t.sigl == 0x8000) && signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - t.sigl = 0x8000; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0xffff8000) && + !((t.sigl == 0x8000) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x8000; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + t.sigl = -t.sigl; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - t.sigl = -t.sigl; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,2); - FPU_put_user((short)t.sigl, d); - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 2); + FPU_put_user((short)t.sigl, d); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a packed bcd array into user memory */ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) { - FPU_REG t; - unsigned long long ll; - u_char b; - int i, precision_loss; - u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + unsigned long long ll; + u_char b; + int i, precision_loss; + u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ll = significand(&t); + + /* Check for overflow, by comparing with 999999999999999999 decimal. */ + if ((t.sigh > 0x0de0b6b3) || + ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & CW_Invalid) { + /* Produce the QNaN "indefinite" */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + for (i = 0; i < 7; i++) + FPU_put_user(0, d + i); /* These bytes "undefined" */ + FPU_put_user(0xc0, d + 7); /* This byte "undefined" */ + FPU_put_user(0xff, d + 8); + FPU_put_user(0xff, d + 9); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; + } else if (precision_loss) { + /* Precision loss doesn't stop the data transfer */ + set_precision_flag(precision_loss); } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - ll = significand(&t); - - /* Check for overflow, by comparing with 999999999999999999 decimal. */ - if ( (t.sigh > 0x0de0b6b3) || - ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & CW_Invalid ) - { - /* Produce the QNaN "indefinite" */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - for ( i = 0; i < 7; i++) - FPU_put_user(0, d+i); /* These bytes "undefined" */ - FPU_put_user(0xc0, d+7); /* This byte "undefined" */ - FPU_put_user(0xff, d+8); - FPU_put_user(0xff, d+9); - RE_ENTRANT_CHECK_ON; - return 1; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + RE_ENTRANT_CHECK_ON; + for (i = 0; i < 9; i++) { + b = FPU_div_small(&ll, 10); + b |= (FPU_div_small(&ll, 10)) << 4; + RE_ENTRANT_CHECK_OFF; + FPU_put_user(b, d + i); + RE_ENTRANT_CHECK_ON; } - else - return 0; - } - else if ( precision_loss ) - { - /* Precision loss doesn't stop the data transfer */ - set_precision_flag(precision_loss); - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - RE_ENTRANT_CHECK_ON; - for ( i = 0; i < 9; i++) - { - b = FPU_div_small(&ll, 10); - b |= (FPU_div_small(&ll, 10)) << 4; - RE_ENTRANT_CHECK_OFF; - FPU_put_user(b, d+i); - RE_ENTRANT_CHECK_ON; - } - RE_ENTRANT_CHECK_OFF; - FPU_put_user(sign, d+9); - RE_ENTRANT_CHECK_ON; - - return 1; + RE_ENTRANT_CHECK_OFF; + FPU_put_user(sign, d + 9); + RE_ENTRANT_CHECK_ON; + + return 1; } /*===========================================================================*/ @@ -1119,59 +973,56 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) largest possible value */ int FPU_round_to_int(FPU_REG *r, u_char tag) { - u_char very_big; - unsigned eax; - - if (tag == TAG_Zero) - { - /* Make sure that zero is returned */ - significand(r) = 0; - return 0; /* o.k. */ - } - - if (exponent(r) > 63) - { - r->sigl = r->sigh = ~0; /* The largest representable number */ - return 1; /* overflow */ - } - - eax = FPU_shrxs(&r->sigl, 63 - exponent(r)); - very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */ + u_char very_big; + unsigned eax; + + if (tag == TAG_Zero) { + /* Make sure that zero is returned */ + significand(r) = 0; + return 0; /* o.k. */ + } + + if (exponent(r) > 63) { + r->sigl = r->sigh = ~0; /* The largest representable number */ + return 1; /* overflow */ + } + + eax = FPU_shrxs(&r->sigl, 63 - exponent(r)); + very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */ #define half_or_more (eax & 0x80000000) #define frac_part (eax) #define more_than_half ((eax & 0x80000001) == 0x80000001) - switch (control_word & CW_RC) - { - case RC_RND: - if ( more_than_half /* nearest */ - || (half_or_more && (r->sigl & 1)) ) /* odd -> even */ - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; - } - break; - case RC_DOWN: - if (frac_part && getsign(r)) - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; - } - break; - case RC_UP: - if (frac_part && !getsign(r)) - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; + switch (control_word & CW_RC) { + case RC_RND: + if (more_than_half /* nearest */ + || (half_or_more && (r->sigl & 1))) { /* odd -> even */ + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_DOWN: + if (frac_part && getsign(r)) { + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_UP: + if (frac_part && !getsign(r)) { + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_CHOP: + break; } - break; - case RC_CHOP: - break; - } - return eax ? PRECISION_LOST_DOWN : 0; + return eax ? PRECISION_LOST_DOWN : 0; } @@ -1179,197 +1030,195 @@ int FPU_round_to_int(FPU_REG *r, u_char tag) u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) { - unsigned short tag_word = 0; - u_char tag; - int i; - - if ( (addr_modes.default_mode == VM86) || - ((addr_modes.default_mode == PM16) - ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x0e); - FPU_get_user(control_word, (unsigned short __user *) s); - FPU_get_user(partial_status, (unsigned short __user *) (s+2)); - FPU_get_user(tag_word, (unsigned short __user *) (s+4)); - FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6)); - FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8)); - FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a)); - FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c)); - RE_ENTRANT_CHECK_ON; - s += 0x0e; - if ( addr_modes.default_mode == VM86 ) - { - instruction_address.offset - += (instruction_address.selector & 0xf000) << 4; - operand_address.offset += (operand_address.selector & 0xf000) << 4; + unsigned short tag_word = 0; + u_char tag; + int i; + + if ((addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x0e); + FPU_get_user(control_word, (unsigned short __user *)s); + FPU_get_user(partial_status, (unsigned short __user *)(s + 2)); + FPU_get_user(tag_word, (unsigned short __user *)(s + 4)); + FPU_get_user(instruction_address.offset, + (unsigned short __user *)(s + 6)); + FPU_get_user(instruction_address.selector, + (unsigned short __user *)(s + 8)); + FPU_get_user(operand_address.offset, + (unsigned short __user *)(s + 0x0a)); + FPU_get_user(operand_address.selector, + (unsigned short __user *)(s + 0x0c)); + RE_ENTRANT_CHECK_ON; + s += 0x0e; + if (addr_modes.default_mode == VM86) { + instruction_address.offset + += (instruction_address.selector & 0xf000) << 4; + operand_address.offset += + (operand_address.selector & 0xf000) << 4; + } + } else { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x1c); + FPU_get_user(control_word, (unsigned short __user *)s); + FPU_get_user(partial_status, (unsigned short __user *)(s + 4)); + FPU_get_user(tag_word, (unsigned short __user *)(s + 8)); + FPU_get_user(instruction_address.offset, + (unsigned long __user *)(s + 0x0c)); + FPU_get_user(instruction_address.selector, + (unsigned short __user *)(s + 0x10)); + FPU_get_user(instruction_address.opcode, + (unsigned short __user *)(s + 0x12)); + FPU_get_user(operand_address.offset, + (unsigned long __user *)(s + 0x14)); + FPU_get_user(operand_address.selector, + (unsigned long __user *)(s + 0x18)); + RE_ENTRANT_CHECK_ON; + s += 0x1c; } - } - else - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x1c); - FPU_get_user(control_word, (unsigned short __user *) s); - FPU_get_user(partial_status, (unsigned short __user *) (s+4)); - FPU_get_user(tag_word, (unsigned short __user *) (s+8)); - FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c)); - FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10)); - FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12)); - FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14)); - FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18)); - RE_ENTRANT_CHECK_ON; - s += 0x1c; - } #ifdef PECULIAR_486 - control_word &= ~0xe080; -#endif /* PECULIAR_486 */ - - top = (partial_status >> SW_Top_Shift) & 7; - - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - else - partial_status &= ~(SW_Summary | SW_Backward); - - for ( i = 0; i < 8; i++ ) - { - tag = tag_word & 3; - tag_word >>= 2; - - if ( tag == TAG_Empty ) - /* New tag is empty. Accept it */ - FPU_settag(i, TAG_Empty); - else if ( FPU_gettag(i) == TAG_Empty ) - { - /* Old tag is empty and new tag is not empty. New tag is determined - by old reg contents */ - if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias ) - { - if ( !(fpu_register(i).sigl | fpu_register(i).sigh) ) - FPU_settag(i, TAG_Zero); - else - FPU_settag(i, TAG_Special); - } - else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias ) - { - FPU_settag(i, TAG_Special); - } - else if ( fpu_register(i).sigh & 0x80000000 ) - FPU_settag(i, TAG_Valid); - else - FPU_settag(i, TAG_Special); /* An Un-normal */ - } - /* Else old tag is not empty and new tag is not empty. Old tag - remains correct */ - } - - return s; -} + control_word &= ~0xe080; +#endif /* PECULIAR_486 */ + + top = (partial_status >> SW_Top_Shift) & 7; + + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); + + for (i = 0; i < 8; i++) { + tag = tag_word & 3; + tag_word >>= 2; + + if (tag == TAG_Empty) + /* New tag is empty. Accept it */ + FPU_settag(i, TAG_Empty); + else if (FPU_gettag(i) == TAG_Empty) { + /* Old tag is empty and new tag is not empty. New tag is determined + by old reg contents */ + if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) { + if (! + (fpu_register(i).sigl | fpu_register(i). + sigh)) + FPU_settag(i, TAG_Zero); + else + FPU_settag(i, TAG_Special); + } else if (exponent(&fpu_register(i)) == + 0x7fff - EXTENDED_Ebias) { + FPU_settag(i, TAG_Special); + } else if (fpu_register(i).sigh & 0x80000000) + FPU_settag(i, TAG_Valid); + else + FPU_settag(i, TAG_Special); /* An Un-normal */ + } + /* Else old tag is not empty and new tag is not empty. Old tag + remains correct */ + } + return s; +} void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) { - int i, regnr; - u_char __user *s = fldenv(addr_modes, data_address); - int offset = (top & 7) * 10, other = 80 - offset; - - /* Copy all registers in stack order. */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ,s,80); - __copy_from_user(register_base+offset, s, other); - if ( offset ) - __copy_from_user(register_base, s+other, offset); - RE_ENTRANT_CHECK_ON; - - for ( i = 0; i < 8; i++ ) - { - regnr = (i+top) & 7; - if ( FPU_gettag(regnr) != TAG_Empty ) - /* The loaded data over-rides all other cases. */ - FPU_settag(regnr, FPU_tagof(&st(i))); - } + int i, regnr; + u_char __user *s = fldenv(addr_modes, data_address); + int offset = (top & 7) * 10, other = 80 - offset; + + /* Copy all registers in stack order. */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 80); + __copy_from_user(register_base + offset, s, other); + if (offset) + __copy_from_user(register_base, s + other, offset); + RE_ENTRANT_CHECK_ON; + + for (i = 0; i < 8; i++) { + regnr = (i + top) & 7; + if (FPU_gettag(regnr) != TAG_Empty) + /* The loaded data over-rides all other cases. */ + FPU_settag(regnr, FPU_tagof(&st(i))); + } } - u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) { - if ( (addr_modes.default_mode == VM86) || - ((addr_modes.default_mode == PM16) - ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,14); + if ((addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 14); #ifdef PECULIAR_486 - FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d); + FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d); #else - FPU_put_user(control_word, (unsigned short __user *) d); + FPU_put_user(control_word, (unsigned short __user *)d); #endif /* PECULIAR_486 */ - FPU_put_user(status_word(), (unsigned short __user *) (d+2)); - FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4)); - FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6)); - FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a)); - if ( addr_modes.default_mode == VM86 ) - { - FPU_put_user((instruction_address.offset & 0xf0000) >> 4, - (unsigned short __user *) (d+8)); - FPU_put_user((operand_address.offset & 0xf0000) >> 4, - (unsigned short __user *) (d+0x0c)); - } - else - { - FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8)); - FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c)); - } - RE_ENTRANT_CHECK_ON; - d += 0x0e; - } - else - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 7*4); + FPU_put_user(status_word(), (unsigned short __user *)(d + 2)); + FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4)); + FPU_put_user(instruction_address.offset, + (unsigned short __user *)(d + 6)); + FPU_put_user(operand_address.offset, + (unsigned short __user *)(d + 0x0a)); + if (addr_modes.default_mode == VM86) { + FPU_put_user((instruction_address. + offset & 0xf0000) >> 4, + (unsigned short __user *)(d + 8)); + FPU_put_user((operand_address.offset & 0xf0000) >> 4, + (unsigned short __user *)(d + 0x0c)); + } else { + FPU_put_user(instruction_address.selector, + (unsigned short __user *)(d + 8)); + FPU_put_user(operand_address.selector, + (unsigned short __user *)(d + 0x0c)); + } + RE_ENTRANT_CHECK_ON; + d += 0x0e; + } else { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 7 * 4); #ifdef PECULIAR_486 - control_word &= ~0xe080; - /* An 80486 sets nearly all of the reserved bits to 1. */ - control_word |= 0xffff0040; - partial_status = status_word() | 0xffff0000; - fpu_tag_word |= 0xffff0000; - I387.soft.fcs &= ~0xf8000000; - I387.soft.fos |= 0xffff0000; + control_word &= ~0xe080; + /* An 80486 sets nearly all of the reserved bits to 1. */ + control_word |= 0xffff0040; + partial_status = status_word() | 0xffff0000; + fpu_tag_word |= 0xffff0000; + I387.soft.fcs &= ~0xf8000000; + I387.soft.fos |= 0xffff0000; #endif /* PECULIAR_486 */ - if (__copy_to_user(d, &control_word, 7*4)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - d += 0x1c; - } - - control_word |= CW_Exceptions; - partial_status &= ~(SW_Summary | SW_Backward); - - return d; -} + if (__copy_to_user(d, &control_word, 7 * 4)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + d += 0x1c; + } + control_word |= CW_Exceptions; + partial_status &= ~(SW_Summary | SW_Backward); + + return d; +} void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) { - u_char __user *d; - int offset = (top & 7) * 10, other = 80 - offset; + u_char __user *d; + int offset = (top & 7) * 10, other = 80 - offset; - d = fstenv(addr_modes, data_address); + d = fstenv(addr_modes, data_address); - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,80); + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 80); - /* Copy all registers in stack order. */ - if (__copy_to_user(d, register_base+offset, other)) - FPU_abort; - if ( offset ) - if (__copy_to_user(d+other, register_base, offset)) - FPU_abort; - RE_ENTRANT_CHECK_ON; + /* Copy all registers in stack order. */ + if (__copy_to_user(d, register_base + offset, other)) + FPU_abort; + if (offset) + if (__copy_to_user(d + other, register_base, offset)) + FPU_abort; + RE_ENTRANT_CHECK_ON; - finit(); + finit(); } /*===========================================================================*/ diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c index 40f50b61bc6..36c37f71f71 100644 --- a/arch/x86/math-emu/reg_mul.c +++ b/arch/x86/math-emu/reg_mul.c @@ -20,7 +20,6 @@ #include "reg_constant.h" #include "fpu_system.h" - /* Multiply two registers to give a register result. The sources are st(deststnr) and (b,tagb,signb). @@ -29,104 +28,88 @@ /* This routine must be called with non-empty source registers */ int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w) { - FPU_REG *a = &st(deststnr); - FPU_REG *dest = a; - u_char taga = FPU_gettagi(deststnr); - u_char saved_sign = getsign(dest); - u_char sign = (getsign(a) ^ getsign(b)); - int tag; - + FPU_REG *a = &st(deststnr); + FPU_REG *dest = a; + u_char taga = FPU_gettagi(deststnr); + u_char saved_sign = getsign(dest); + u_char sign = (getsign(a) ^ getsign(b)); + int tag; - if ( !(taga | tagb) ) - { - /* Both regs Valid, this should be the most common case. */ + if (!(taga | tagb)) { + /* Both regs Valid, this should be the most common case. */ - tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b)); - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; + tag = + FPU_u_mul(a, b, dest, control_w, sign, + exponent(a) + exponent(b)); + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; - if ( denormal_operand() < 0 ) - return FPU_Exception; - - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - tag = FPU_u_mul(&x, &y, dest, control_w, sign, - exponent16(&x) + exponent16(&y)); - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; - } - FPU_settagi(deststnr, tag); - return tag; - } - else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) - { - if ( ((tagb == TW_Denormal) || (taga == TW_Denormal)) - && (denormal_operand() < 0) ) - return FPU_Exception; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; + if (denormal_operand() < 0) + return FPU_Exception; - /* Must have either both arguments == zero, or - one valid and the other zero. - The result is therefore zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - /* The 80486 book says that the answer is +0, but a real - 80486 behaves this way. - IEEE-754 apparently says it should be this way. */ - setsign(dest, sign); - return TAG_Zero; - } - /* Must have infinities, NaNs, etc */ - else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - return real_2op_NaN(b, tagb, deststnr, &st(0)); - } - else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero)) - || ((tagb == TW_Infinity) && (taga == TAG_Zero)) ) - { - return arith_invalid(deststnr); /* Zero*Infinity is invalid */ - } - else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) - && (denormal_operand() < 0) ) - { - return FPU_Exception; - } - else if (taga == TW_Infinity) - { - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, sign); - return TAG_Special; - } - else if (tagb == TW_Infinity) - { - FPU_copy_to_regi(b, TAG_Special, deststnr); - setsign(dest, sign); - return TAG_Special; - } + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + tag = FPU_u_mul(&x, &y, dest, control_w, sign, + exponent16(&x) + exponent16(&y)); + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; + } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) { + if (((tagb == TW_Denormal) || (taga == TW_Denormal)) + && (denormal_operand() < 0)) + return FPU_Exception; + /* Must have either both arguments == zero, or + one valid and the other zero. + The result is therefore zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + /* The 80486 book says that the answer is +0, but a real + 80486 behaves this way. + IEEE-754 apparently says it should be this way. */ + setsign(dest, sign); + return TAG_Zero; + } + /* Must have infinities, NaNs, etc */ + else if ((taga == TW_NaN) || (tagb == TW_NaN)) { + return real_2op_NaN(b, tagb, deststnr, &st(0)); + } else if (((taga == TW_Infinity) && (tagb == TAG_Zero)) + || ((tagb == TW_Infinity) && (taga == TAG_Zero))) { + return arith_invalid(deststnr); /* Zero*Infinity is invalid */ + } else if (((taga == TW_Denormal) || (tagb == TW_Denormal)) + && (denormal_operand() < 0)) { + return FPU_Exception; + } else if (taga == TW_Infinity) { + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, sign); + return TAG_Special; + } else if (tagb == TW_Infinity) { + FPU_copy_to_regi(b, TAG_Special, deststnr); + setsign(dest, sign); + return TAG_Special; + } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x102); - return FPU_Exception; - } -#endif /* PARANOID */ + else { + EXCEPTION(EX_INTERNAL | 0x102); + return FPU_Exception; + } +#endif /* PARANOID */ return 0; } diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h index 59e73302aa6..54a3f226982 100644 --- a/arch/x86/math-emu/status_w.h +++ b/arch/x86/math-emu/status_w.h @@ -10,7 +10,7 @@ #ifndef _STATUS_H_ #define _STATUS_H_ -#include "fpu_emu.h" /* for definition of PECULIAR_486 */ +#include "fpu_emu.h" /* for definition of PECULIAR_486 */ #ifdef __ASSEMBLY__ #define Const__(x) $##x @@ -34,7 +34,7 @@ #define SW_Denorm_Op Const__(0x0002) /* denormalized operand */ #define SW_Invalid Const__(0x0001) /* invalid operation */ -#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ +#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ #ifndef __ASSEMBLY__ @@ -50,8 +50,8 @@ ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) static inline void setcc(int cc) { - partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); - partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); + partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3); + partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3); } #ifdef PECULIAR_486 diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 index 362b4ad082d..c36ae88bb54 100644 --- a/arch/x86/mm/Makefile_32 +++ b/arch/x86/mm/Makefile_32 @@ -2,9 +2,8 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o +obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_NUMA) += discontig_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 index 6bcb47945b8..688c8c28ac8 100644 --- a/arch/x86/mm/Makefile_64 +++ b/arch/x86/mm/Makefile_64 @@ -2,9 +2,8 @@ # Makefile for the linux x86_64-specific parts of the memory manager. # -obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o +obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_64.o - diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c deleted file mode 100644 index f14da2a53ec..00000000000 --- a/arch/x86/mm/boot_ioremap_32.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * arch/i386/mm/boot_ioremap.c - * - * Re-map functions for early boot-time before paging_init() when the - * boot-time pagetables are still in use - * - * Written by Dave Hansen <haveblue@us.ibm.com> - */ - - -/* - * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE - * keeps that from happening. If anyone has a better way, I'm listening. - * - * boot_pte_t is defined only if this all works correctly - */ - -#undef CONFIG_X86_PAE -#undef CONFIG_PARAVIRT -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/init.h> -#include <linux/stddef.h> - -/* - * I'm cheating here. It is known that the two boot PTE pages are - * allocated next to each other. I'm pretending that they're just - * one big array. - */ - -#define BOOT_PTE_PTRS (PTRS_PER_PTE*2) - -static unsigned long boot_pte_index(unsigned long vaddr) -{ - return __pa(vaddr) >> PAGE_SHIFT; -} - -static inline boot_pte_t* boot_vaddr_to_pte(void *address) -{ - boot_pte_t* boot_pg = (boot_pte_t*)pg0; - return &boot_pg[boot_pte_index((unsigned long)address)]; -} - -/* - * This is only for a caller who is clever enough to page-align - * phys_addr and virtual_source, and who also has a preference - * about which virtual address from which to steal ptes - */ -static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, - void* virtual_source) -{ - boot_pte_t* pte; - int i; - char *vaddr = virtual_source; - - pte = boot_vaddr_to_pte(virtual_source); - for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { - set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); - __flush_tlb_one(&vaddr[i*PAGE_SIZE]); - } -} - -/* the virtual space we're going to remap comes from this array */ -#define BOOT_IOREMAP_PAGES 4 -#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) -static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); - -/* - * This only applies to things which need to ioremap before paging_init() - * bt_ioremap() and plain ioremap() are both useless at this point. - * - * When used, we're still using the boot-time pagetables, which only - * have 2 PTE pages mapping the first 8MB - * - * There is no unmap. The boot-time PTE pages aren't used after boot. - * If you really want the space back, just remap it yourself. - * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) - */ -__init void* boot_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr, offset; - unsigned int nrpages; - - last_addr = phys_addr + size - 1; - - /* page align the requested address */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - nrpages = size >> PAGE_SHIFT; - if (nrpages > BOOT_IOREMAP_PAGES) - return NULL; - - __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); - - return &boot_ioremap_space[offset]; -} diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 13a474d3c6e..04b1d20e261 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -32,6 +32,7 @@ #include <linux/kexec.h> #include <linux/pfn.h> #include <linux/swap.h> +#include <linux/acpi.h> #include <asm/e820.h> #include <asm/setup.h> @@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; static unsigned long kva_start_pfn; static unsigned long kva_pages; /* @@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid) } } +#ifdef CONFIG_DISCONTIGMEM +/* + * In the discontig memory model, a portion of the kernel virtual area (KVA) + * is reserved and portions of nodes are mapped using it. This is to allow + * node-local memory to be allocated for structures that would normally require + * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers + * should be prepared to allocate from the bootmem allocator instead. This KVA + * mechanism is incompatible with SPARSEMEM as it makes assumptions about the + * layout of memory that are broken if alloc_remap() succeeds for some of the + * map and fails for others + */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; + void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void) return reserve_pages; } +static void init_remap_allocator(int nid) +{ + node_remap_start_vaddr[nid] = pfn_to_kaddr( + kva_start_pfn + node_remap_offset[nid]); + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(highstart_pfn + + node_remap_offset[nid] + node_remap_size[nid])); +} +#else +void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} + +static unsigned long calculate_numa_remap_pages(void) +{ + return 0; +} + +static void init_remap_allocator(int nid) +{ +} + +void __init remap_numa_kva(void) +{ +} +#endif /* CONFIG_DISCONTIGMEM */ + extern void setup_bootmem_allocator(void); unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; + unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -288,11 +336,18 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD /* Numa kva area is below the initrd */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) - kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image) + if (initrd_start) + kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - kva_pages; #endif - kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); + + /* + * We waste pages past at the end of the KVA for no good reason other + * than how it is located. This is bad. + */ + wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); + kva_start_pfn -= wasted_pages; + kva_pages += wasted_pages; system_max_low_pfn = max_low_pfn = find_max_low_pfn(); printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", @@ -318,19 +373,9 @@ unsigned long __init setup_memory(void) printk("Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) { - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - /* Init the node remap allocator */ - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); + init_remap_allocator(nid); allocate_pgdat(nid); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); } printk("High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); @@ -345,7 +390,8 @@ unsigned long __init setup_memory(void) void __init numa_kva_reserve(void) { - reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages)); + if (kva_pages) + reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages)); } void __init zone_sizes_init(void) @@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT +/* + * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA + * + * These stub functions are needed to compile 32-bit NUMA when SRAT is + * not set. There are functions in srat_64.c for parsing this table + * and it may be possible to make them common functions. + */ +void acpi_numa_slit_init (struct acpi_table_slit *slit) +{ + printk(KERN_INFO "ACPI: No support for parsing SLIT table\n"); +} + +void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) +{ +} + +void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} +#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 00000000000..7e8db53528a --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,62 @@ +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> + + +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fixup; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + fixup = search_exception_tables(regs->ip); + if (fixup) { + regs->ip = fixup->fixup; + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Need to defined our own search_extable on X86_64 to work around + * a B stepping K8 bug. + */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + /* B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + + while (first <= last) { + const struct exception_table_entry *mid; + long diff; + + mid = (last - first) / 2 + first; + diff = mid->insn - value; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid+1; + else + last = mid-1; + } + return NULL; +} +#endif diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c deleted file mode 100644 index 0ce4f22a263..00000000000 --- a/arch/x86/mm/extable_32.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * linux/arch/i386/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <asm/uaccess.h> - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - -#ifdef CONFIG_PNPBIOS - if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) - { - extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; - extern u32 pnp_bios_is_utter_crap; - pnp_bios_is_utter_crap = 1; - printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); - __asm__ volatile( - "movl %0, %%esp\n\t" - "jmp *%1\n\t" - : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); - panic("do_trap: can't hit this"); - } -#endif - - fixup = search_exception_tables(regs->eip); - if (fixup) { - regs->eip = fixup->fixup; - return 1; - } - - return 0; -} diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c deleted file mode 100644 index 79ac6e7100a..00000000000 --- a/arch/x86/mm/extable_64.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * linux/arch/x86_64/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <asm/uaccess.h> - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* Work around a B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 00000000000..e28cc5277b1 --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,986 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/compiler.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> + +#include <asm/system.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm-generic/sections.h> + +/* + * Page fault error code bits + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch + */ +#define PF_PROT (1<<0) +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +static inline int notify_page_fault(struct pt_regs *regs) +{ +#ifdef CONFIG_KPROBES + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +#ifdef CONFIG_X86_32 + if (!user_mode_vm(regs)) { +#else + if (!user_mode(regs)) { +#endif + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +#else + return 0; +#endif +} + +/* + * X86_32 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * X86_64 + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner + */ +static int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + +#ifdef CONFIG_X86_32 + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; +#endif + + /* If it was a exec fault on NX page, ignore */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + scan_more = ((instr_lo & 7) == 0x6); + break; +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + + if (probe_kernel_address(instr, opcode)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + +#ifdef CONFIG_X86_64 +static int bad_address(void *p) +{ + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); +} +#endif + +void dump_pagetable(unsigned long address) +{ +#ifdef CONFIG_X86_32 + __typeof__(pte_val(__pte(0))) page; + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_CONT "*pde = %016Lx ", page); + page &= ~_PAGE_NX; + } +#else + printk("*pde = %08lx ", page); +#endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT) + && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); + } + + printk("\n"); +#else /* CONFIG_X86_64 */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +#endif +} + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} +#endif + +#ifdef CONFIG_X86_64 +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. + Does nothing for X86_32 + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + static int warned; + if (address != regs->ip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal + * addresses >4GB. We catch this in the page fault handler because these + * addresses are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return 1; +#endif + return 0; +} + +void do_invalid_op(struct pt_regs *, unsigned long); + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_X86_32 + if (!oops_may_print()) + return; +#endif + +#ifdef CONFIG_X86_PAE + if (error_code & PF_INSTR) { + int level; + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } +#endif + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); +#ifdef CONFIG_X86_32 + printk(KERN_CONT " at %08lx\n", address); +#else + printk(KERN_CONT " at %016lx\n", address); +#endif + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + dump_pagetable(address); +} + +#ifdef CONFIG_X86_64 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) + regs = NULL; + oops_end(flags, regs, SIGKILL); +} +#endif + +/* + * Handle a spurious fault caused by a stale TLB entry. This allows + * us to lazily refresh the TLB when increasing the permissions of a + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very + * expensive since that implies doing a full cross-processor TLB + * flush, even if no stale TLB entries exist on other processors. + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static int spurious_fault(unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * X86_32 + * Handle a fault on the vmalloc or module mapping area + * + * X86_64 + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ +#ifdef CONFIG_X86_32 + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +#else + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->mm ?: &init_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +#endif +} + +int show_unhandled_signals = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long address; + int write, si_code; + int fault; +#ifdef CONFIG_X86_64 + unsigned long flags; +#endif + + /* + * We can fault from pretty much anywhere, with unknown IRQ state. + */ + trace_hardirqs_fixup(); + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = read_cr2(); + + si_code = SEGV_MAPERR; + + if (notify_page_fault(regs)) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ +#ifdef CONFIG_X86_32 + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + vmalloc_fault(address) >= 0) + return; + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. */ + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; +#else /* CONFIG_X86_64 */ + if (unlikely(address >= TASK_SIZE64)) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { + if (vmalloc_fault(address) >= 0) + return; + } + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + if (likely(regs->flags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + /* + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; +again: +#endif + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535,$31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + +#ifdef CONFIG_X86_32 +survive: +#endif + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + +#ifdef CONFIG_X86_32 + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (v8086_mode(regs)) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } +#endif + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( +#ifdef CONFIG_X86_32 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", +#else + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", +#endif + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->ip, + regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + if (__die("Oops", regs, error_code)) + regs = NULL; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, SIGKILL); +#endif + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_global_init(tsk)) { + yield(); +#ifdef CONFIG_X86_32 + down_read(&mm->mmap_sem); + goto survive; +#else + goto again; +#endif + } + + printk("VM: killing process %s\n", tsk->comm); + if (error_code & PF_USER) + do_group_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ +#ifdef CONFIG_X86_32 + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = TASK_SIZE; + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(pgd_index(address), insync); + } + if (address == start && test_bit(pgd_index(address), insync)) + start = address + PGDIR_SIZE; + } +#else /* CONFIG_X86_64 */ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +#endif +} diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c deleted file mode 100644 index a2273d44aa2..00000000000 --- a/arch/x86/mm/fault_32.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/bootmem.h> /* for max_low_pfn */ -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/desc.h> -#include <asm/segment.h> - -extern void die(const char *,struct pt_regs *,long); - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. - * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { - base = seg << 4; - *eip_limit = base + 0xffff; - return base + (eip & 0xffff); - } - - /* The standard kernel/user address space limit. */ - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; - - /* By far the most common cases. */ - if (likely(SEGMENT_IS_FLAT_CODE(seg))) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - mutex_lock(¤t->mm->context.lock); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - mutex_unlock(¤t->mm->context.lock); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 6)) { - /* Catch an obscure case of prefetch inside an NX page. */ - if (nx_enabled && (error_code & 16)) - return 0; - return __is_prefetch(regs, addr); - } - return 0; -} - -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); -} - -fastcall void do_invalid_op(struct pt_regs *, unsigned long); - -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} - -/* - * Handle a fault on the vmalloc or module mapping area - * - * This assumes no large pages in there. - */ -static inline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - int write, si_code; - int fault; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - /* get the address */ - address = read_cr2(); - - tsk = current; - - si_code = SEGV_MAPERR; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) - return; - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - - mm = tsk->mm; - - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault.. - */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * Accessing the stack below %esp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk("%s%s[%d]: segfault at %08lx eip %08lx " - "esp %08lx error %lx\n", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->eip, - regs->esp, error_code); - } - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (oops_may_print()) { - __typeof__(pte_val(__pte(0))) page; - -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); - } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL " - "pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging" - " request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT "printing eip: %08lx ", regs->eip); - - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; -#ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); -#endif - - /* - * We must not directly access the pte in the highpte - * case if the page table is located in highmem. - * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. - */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } - - printk("\n"); - } - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -void vmalloc_sync_all(void) -{ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { - BUG_ON(page != pgd_list); - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); - } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; - } -} diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c deleted file mode 100644 index 0e26230669c..00000000000 --- a/arch/x86/mm/fault_64.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm-generic/sections.h> - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 1c3bf95f735..3d936f23270 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -18,6 +18,49 @@ void kunmap(struct page *page) kunmap_high(page); } +static void debug_kmap_atomic_prot(enum km_type type) +{ +#ifdef CONFIG_DEBUG_HIGHMEM + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +#endif +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + + debug_kmap_atomic_prot(type); + pagefault_disable(); if (!PageHighMem(page)) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 6c06d9c0488..4fbafb4bc2f 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -15,6 +15,7 @@ #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, @@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_lock(&mm->page_table_lock); if (pud_none(*pud)) - pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); + pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3c76d194fd2..da524fb2242 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -27,7 +27,6 @@ #include <linux/bootmem.h> #include <linux/slab.h> #include <linux/proc_fs.h> -#include <linux/efi.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> @@ -40,8 +39,10 @@ #include <asm/fixmap.h> #include <asm/e820.h> #include <asm/apic.h> +#include <asm/bugs.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/paravirt.h> @@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; -static int noinline do_test_wp_bit(void); +static noinline int do_test_wp_bit(void); /* * Creates a middle page table and puts a pointer to it in the @@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) { pud_t *pud; pmd_t *pmd_table; - + #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); + BUG_ON(pmd_table != pmd_offset(pud, 0)); } #endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); + return pmd_table; } /* * Create a page table and place a pointer to it in a middle page - * directory entry. + * directory entry: */ static pte_t * __init one_page_table_init(pmd_t *pmd) { @@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) + if (!page_table) { page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) } /* - * This function initializes a certain range of kernel virtual memory + * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without * checking the pgd every time. */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); pmd = pmd + pmd_index(vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { one_page_table_init(pmd); vaddr += PMD_SIZE; @@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr) } /* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) { + int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd = one_md_table_init(pgd); if (pfn >= max_low_pfn) continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - /* Map with big pages if possible, otherwise create normal page tables. */ + for (pmd_idx = 0; + pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } + continue; + } + pte = one_page_table_init(pmd); + + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(pfn, prot)); } } } @@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} static void __init kmap_init(void) { unsigned long kmap_vstart; - /* cache the first kmap pte */ + /* + * Cache the first kmap pte: + */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); @@ -259,11 +241,11 @@ static void __init kmap_init(void) static void __init permanent_kmaps_init(pgd_t *pgd_base) { + unsigned long vaddr; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr; vaddr = PKMAP_BASE; page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); @@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = pte; } static void __meminit free_new_highpage(struct page *page) @@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) +static int __meminit +add_one_highpage_hotplug(struct page *page, unsigned long pfn) { free_new_highpage(page); totalram_pages++; @@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p max_mapnr = max(pfn, max_mapnr); #endif num_physpages++; + return 0; } @@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p * Not currently handling the NUMA case. * Assuming single node and all memory that * has been added dynamically that would be - * onlined here is in HIGHMEM + * onlined here is in HIGHMEM. */ void __meminit online_page(struct page *page) { @@ -314,13 +298,11 @@ void __meminit online_page(struct page *page) add_one_highpage_hotplug(page, page_to_pfn(page)); } - -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else +#ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { /* * Holes under sparsemem might not have no mem_map[]: @@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro) } totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ +#endif /* !CONFIG_NUMA */ #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) +# define kmap_init() do { } while (0) +# define permanent_kmaps_init(pgd_base) do { } while (0) +# define set_highmem_pages_init(bad_ppro) do { } while (0) #endif /* CONFIG_HIGHMEM */ -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +pteval_t __PAGE_KERNEL = _PAGE_KERNEL; EXPORT_SYMBOL(__PAGE_KERNEL); -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { @@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base) memset(&base[USER_PTRS_PER_PGD], 0, KERNEL_PGD_PTRS * sizeof(pgd_t)); #else - paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); #endif } @@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init (void) +static void __init pagetable_init(void) { - unsigned long vaddr, end; pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; paravirt_pagetable_setup_start(pgd_base); @@ -435,9 +412,11 @@ static void __init pagetable_init (void) * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ + early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); permanent_kmaps_init(pgd_base); @@ -450,7 +429,7 @@ static void __init pagetable_init (void) * driver might have split up a kernel 4MB mapping. */ char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); + __attribute__ ((aligned(PAGE_SIZE))); static inline void save_pg_dir(void) { @@ -462,7 +441,7 @@ static inline void save_pg_dir(void) } #endif -void zap_low_mappings (void) +void zap_low_mappings(void) { int i; @@ -474,22 +453,24 @@ void zap_low_mappings (void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) + for (i = 0; i < USER_PTRS_PER_PGD; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); #endif + } flush_tlb_all(); } -int nx_enabled = 0; +int nx_enabled; + +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE -static int disable_nx __initdata = 0; -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; -EXPORT_SYMBOL_GPL(__supported_pte_mask); +static int disable_nx __initdata; /* * noexec = on|off @@ -506,11 +487,14 @@ static int __init noexec_setup(char *str) __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } - } else if (!strcmp(str,"off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else - return -EINVAL; + } else { + if (!strcmp(str, "off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } else { + return -EINVAL; + } + } return 0; } @@ -522,6 +506,7 @@ static void __init set_nx(void) if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { rdmsr(MSR_EFER, l, h); l |= EFER_NX; @@ -531,35 +516,6 @@ static void __init set_nx(void) } } } - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - pte_update_defer(&init_mm, vaddr, pte); - __flush_tlb_all(); -out: - return ret; -} - #endif /* @@ -574,9 +530,8 @@ void __init paging_init(void) #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #endif - pagetable_init(); load_cr3(swapper_pg_dir); @@ -600,10 +555,10 @@ void __init paging_init(void) * used to involve black magic jumps to work around some nasty CPU bugs, * but fortunately the switch to using exceptions got rid of all that. */ - static void __init test_wp_bit(void) { - printk("Checking if this processor honours the WP bit even in supervisor mode... "); + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); @@ -611,47 +566,46 @@ static void __init test_wp_bit(void) clear_fixmap(FIX_WP_TEST); if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); + printk(KERN_CONT "No.\n"); #ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); + panic( + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); #endif } else { - printk("Ok.\n"); + printk(KERN_CONT "Ok.\n"); } } -static struct kcore_list kcore_mem, kcore_vmalloc; +static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { - extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; + int tmp, bad_ppro; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR + "fixmap and kmap areas overlap - this will crash\n"); printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, + FIXADDR_START); BUG(); } #endif - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* - * Only count reserved RAM pages + * Only count reserved RAM pages: */ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; @@ -662,11 +616,12 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, @@ -677,45 +632,46 @@ void __init mem_init(void) ); #if 1 /* double-sanity-check paranoia */ - printk("virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, #ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, #endif - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, - (unsigned long)__va(0), (unsigned long)high_memory, - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, - (unsigned long)&__init_begin, (unsigned long)&__init_end, - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, - (unsigned long)&_etext, (unsigned long)&_edata, - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, - (unsigned long)&_text, (unsigned long)&_etext, - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); #ifdef CONFIG_HIGHMEM - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); - BUG_ON(VMALLOC_END > PKMAP_BASE); + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); - BUG_ON((unsigned long)high_memory > VMALLOC_START); + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ #ifdef CONFIG_X86_PAE @@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } - #endif -struct kmem_cache *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); -} - /* * This function cannot be __init, since exceptions don't work in that * section. Put this after the callers, so that it cannot be inlined. */ -static int noinline do_test_wp_bit(void) +static noinline int do_test_wp_bit(void) { char tmp_reg; int flag; __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" "2: \n" - ".section __ex_table,\"a\"\n" + ".section __ex_table, \"a\"\n" " .align 4 \n" - " .long 1b,2b \n" + " .long 1b, 2b \n" ".previous \n" :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) :"2" (1) :"memory"); - + return flag; } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -801,32 +746,58 @@ void mark_rodata_ro(void) if (num_possible_cpus() <= 1) #endif { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif } #endif start += size; size = (unsigned long)__end_rodata - start; - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk("Write protecting the kernel read-only data: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); - /* - * change_page_attr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif } #endif void free_init_pages(char *what, unsigned long begin, unsigned long end) { +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else unsigned long addr; + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) totalram_pages++; } printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); +#endif } void free_initmem(void) @@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0f9c8c89065..cc50a13ce8d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -43,12 +43,10 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; @@ -65,22 +63,26 @@ void show_mem(void) { long i, total = 0, reserved = 0; long shared = 0, cached = 0; - pg_data_t *pgdat; struct page *page; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", + nr_swap_pages << (PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* + * This loop can take a while with 256 GB and + * 4k pages so defer the NMI watchdog: + */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) touch_nmi_watchdog(); - } + if (!pfn_valid(pgdat->node_start_pfn + i)) continue; + page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -89,51 +91,58 @@ void show_mem(void) cached++; else if (page_count(page)) shared += page_count(page) - 1; - } + } } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n", reserved); + printk(KERN_INFO "%lu pages shared\n", shared); + printk(KERN_INFO "%lu pages swap cached\n", cached); } int after_bootmem; static __init void *spp_getpage(void) -{ +{ void *ptr; + if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - Dprintk("spp_getpage %p\n", ptr); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + return ptr; -} +} -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot) +static __init void +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte, new_pte; - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); return; } } @@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr, pte = (pte_t *) spp_getpage(); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; } } @@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr, } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); + printk(KERN_ERR "Invalid __set_fixmap\n"); return; } set_pte_phys(address, phys, prot); } -unsigned long __meminitdata table_start, table_end; +static unsigned long __initdata table_start; +static unsigned long __meminitdata table_end; static __meminit void *alloc_low_page(unsigned long *phys) -{ +{ unsigned long pfn = table_end++; void *adr; if (after_bootmem) { adr = (void *)get_zeroed_page(GFP_ATOMIC); *phys = __pa(adr); + return adr; } - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); + if (pfn >= end_pfn) + panic("alloc_low_page: ran out of memory"); adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); @@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys) } static __meminit void unmap_low_page(void *adr) -{ - +{ if (after_bootmem) return; early_iounmap(adr, PAGE_SIZE); -} +} /* Must run before zap_low_mappings */ __meminit void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long vaddr; pmd_t *pmd, *last_pmd; + unsigned long vaddr; int i, pmds; pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; vaddr = __START_KERNEL_map; pmd = level2_kernel_pgt; last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { for (i = 0; i < pmds; i++) { if (pmd_present(pmd[i])) - goto next; + goto continue_outer_loop; } vaddr += addr & ~PMD_MASK; addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + __flush_tlb_all(); + return (void *)vaddr; - next: +continue_outer_loop: ; } - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } -/* To avoid virtual aliases later */ +/* + * To avoid virtual aliases later: + */ __meminit void early_iounmap(void *addr, unsigned long size) { unsigned long vaddr; @@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size) vaddr = (unsigned long)addr; pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) pmd_clear(pmd + i); - __flush_tlb(); + + __flush_tlb_all(); } static void __meminit @@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { - if (!after_bootmem) + if (!after_bootmem) { for (; i < PTRS_PER_PMD; i++, pmd++) set_pmd(pmd, __pmd(0)); + } break; } if (pmd_val(*pmd)) continue; - entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; + entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); } @@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud,0); + pmd_t *pmd = pmd_offset(pud, 0); spin_lock(&init_mm.page_table_lock); phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ +static void __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +{ int i = pud_index(addr); - - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { - set_pud(pud, __pud(0)); + if (!after_bootmem && + !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); continue; - } + } if (pud_val(*pud)) { phys_pmd_update(pud, addr, end); @@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne } pmd = alloc_low_page(&pmd_phys); + spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); + unmap_low_page(pmd); } - __flush_tlb(); -} + __flush_tlb_all(); +} static void __init find_early_table_space(unsigned long end) { @@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end) tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - /* RED-PEN putting page tables only on node 0 could - cause a hotspot and fill up ZONE_DMA. The page tables - need roughly 0.5KB per GB. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables); + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ + start = 0x8000; + table_start = find_e820_area(start, end, tables); if (table_start == -1UL) panic("Cannot find space for the kernel page tables"); + /* + * When you have a lot of RAM like 256GB, early_table will not fit + * into 0x8000 range, find_e820_area() will find area after kernel + * bss but the table_start is not page aligned, so need to round it + * up to avoid overlap with bss: + */ + table_start = round_up(table_start, PAGE_SIZE); table_start >>= PAGE_SHIFT; table_end = table_start; @@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end) (table_start << PAGE_SHIFT) + tables); } -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; +{ + unsigned long next; - Dprintk("init_memory_mapping\n"); + pr_debug("init_memory_mapping\n"); - /* + /* * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. */ if (!after_bootmem) find_early_table_space(end); @@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; pud_t *pud; if (after_bootmem) @@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; - if (next > end) - next = end; + if (next > end) + next = end; phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); - } + } if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); + + reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); } #ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -402,39 +439,48 @@ void __init paging_init(void) } #endif -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) +/* + * Unmap a kernel mapping if it exists. This is useful to avoid + * prefetches from the CPU leading to inconsistent cache lines. + * address and size must be aligned to 2MB boundaries. + * Does nothing when the mapping doesn't exist. + */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) { unsigned long end = address + size; BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; + if (pgd_none(*pgd)) continue; + pud = pud_offset(pgd, address); if (pud_none(*pud)) - continue; + continue; + pmd = pmd_offset(pud, address); if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); + continue; + + if (!(pmd_val(*pmd) & _PAGE_PSE)) { + /* + * Could handle this, but it should not happen + * currently: + */ + printk(KERN_ERR "clear_kernel_mapping: " + "mapping has been split. will leak memory\n"); + pmd_ERROR(*pmd); } - set_pmd(pmd, __pmd(0)); + set_pmd(pmd, __pmd(0)); } __flush_tlb_all(); -} +} /* * Memory hotplug specific functions @@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, (start + size -1)); + init_memory_mapping(start, start + size-1); ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; + WARN_ON(1); return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. - */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, + kcore_modules, kcore_vsyscall; void __init mem_init(void) { @@ -521,8 +535,15 @@ void __init mem_init(void) pci_iommu_alloc(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + /* clear_bss() already clear the empty_zero_page */ + + /* temporary debugging - double check it's true: */ + { + int i; + + for (i = 0; i < 1024; i++) + WARN_ON_ONCE(empty_zero_page[i]); + } reservedpages = 0; @@ -534,7 +555,6 @@ void __init mem_init(void) #endif reservedpages = end_pfn - totalram_pages - absent_pages_in_range(0, end_pfn); - after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -542,15 +562,16 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); kclist_add(&kcore_kernel, &_stext, _end - _stext); kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START); - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), end_pfn << (PAGE_SHIFT-10), codesize >> 10, @@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) - change_page_attr_addr(addr, 1, __pgprot(0)); free_page(addr); totalram_pages++; } - if (addr > __START_KERNEL_map) - global_flush_tlb(); +#endif } void free_initmem(void) @@ -589,6 +618,8 @@ void free_initmem(void) } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -603,25 +634,27 @@ void mark_rodata_ro(void) #ifdef CONFIG_KPROBES start = (unsigned long)__start_rodata; #endif - + end = (unsigned long)__end_rodata; start = (start + PAGE_SIZE - 1) & PAGE_MASK; end &= PAGE_MASK; if (end <= start) return; - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); - /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif } #endif @@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); #endif unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ + /* + * This can happen with kdump kernels when accessing + * firmware tables: + */ if (pfn < end_pfn_map) return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); return; @@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) } } -int kern_addr_valid(unsigned long addr) -{ +int kern_addr_valid(unsigned long addr) +{ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; if (above != 0 && above != -1UL) - return 0; - + return 0; + pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) return 0; pud = pud_offset(pgd, addr); if (pud_none(*pud)) - return 0; + return 0; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; + if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) return 0; + return pfn_valid(pte_pfn(*pte)); } -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 32bit has a real VMA now and does - not need special handling anymore. */ - +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) @@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); } -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. +/* + * Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives: */ int in_gate_area_no_task(unsigned long addr) { @@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long size, int node) +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) { unsigned long addr = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + size); @@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page, pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); if (!pud) return -ENOMEM; @@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page, pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { pte_t entry; - void *p = vmemmap_alloc_block(PMD_SIZE, node); + void *p; + + p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); - mk_pte_huge(entry); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", addr, addr + PMD_SIZE - 1, p, node); - } else + } else { vmemmap_verify((pte_t *)pmd, node, addr, next); + } } - return 0; } #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 00000000000..ed795721ca8 --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,501 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> + +enum ioremap_mode { + IOR_MODE_UNCACHED, + IOR_MODE_CACHED, +}; + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + +#endif + +int page_is_ram(unsigned long pagenr) +{ + unsigned long addr, end; + int i; + + for (i = 0; i < e820.nr_map; i++) { + /* + * Not usable memory: + */ + if (e820.map[i].type != E820_RAM) + continue; + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; + + /* + * Sanity check: Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area, which is the + * PCI BIOS area. + */ + if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) && + end < (BIOS_END >> PAGE_SHIFT)) + continue; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int ioremap_change_attr(unsigned long paddr, unsigned long size, + enum ioremap_mode mode) +{ + unsigned long vaddr = (unsigned long)__va(paddr); + unsigned long nrpages = size >> PAGE_SHIFT; + int err, level; + + /* No change for pages after the last mapping */ + if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) + return 0; + + /* + * If there is no identity map for this address, + * change_page_attr_addr is unnecessary + */ + if (!lookup_address(vaddr, &level)) + return 0; + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + err = set_memory_uc(vaddr, nrpages); + break; + case IOR_MODE_CACHED: + err = set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, + enum ioremap_mode mode) +{ + void __iomem *addr; + struct vm_struct *area; + unsigned long offset, last_addr; + pgprot_t prot; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)phys_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && + (offset << PAGE_SHIFT) < last_addr; offset++) { + if (page_is_ram(offset)) + return NULL; + } + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + prot = PAGE_KERNEL_NOCACHE; + break; + case IOR_MODE_CACHED: + prot = PAGE_KERNEL; + break; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, + phys_addr, prot)) { + remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + return NULL; + } + + if (ioremap_change_attr(phys_addr, size, mode) < 0) { + vunmap(addr); + return NULL; + } + + return (void __iomem *) (offset + (char __iomem *)addr); +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); +} +EXPORT_SYMBOL(ioremap_nocache); + +void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_CACHED); +} +EXPORT_SYMBOL(ioremap_cache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +#ifdef CONFIG_X86_32 + +int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static __initdata unsigned long bm_pte[1024] + __attribute__((aligned(PAGE_SIZE))); + +static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +{ + return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); +} + +static inline unsigned long * __init early_ioremap_pte(unsigned long addr) +{ + return bm_pte + ((addr >> PAGE_SHIFT) & 1023); +} + +void __init early_ioremap_init(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = __pa(bm_pte) | _PAGE_TABLE; + memset(bm_pte, 0, sizeof(bm_pte)); + /* + * The boot-ioremap range spans multiple pgds, for which + * we are not prepared: + */ + if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pgd %p != %p\n", + pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init early_ioremap_clear(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_clear()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = 0; + paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + __flush_tlb_all(); +} + +void __init early_ioremap_reset(void) +{ + enum fixed_addresses idx; + unsigned long *pte, phys, addr; + + after_paging_init = 1; + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { + addr = fix_to_virt(idx); + pte = early_ioremap_pte(addr); + if (!*pte & _PAGE_PRESENT) { + phys = *pte & PAGE_MASK; + set_fixmap(idx, phys); + } + } +} + +static void __init __early_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + unsigned long *pte, addr = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) + *pte = (phys & PAGE_MASK) | pgprot_val(flags); + else + *pte = 0; + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + unsigned long phys) +{ + if (after_paging_init) + set_fixmap(idx, phys); + else + __early_set_fixmap(idx, phys, PAGE_KERNEL); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + + +int __initdata early_ioremap_nested; + +static int __init check_early_ioremap_leak(void) +{ + if (!early_ioremap_nested) + return 0; + + printk(KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + early_ioremap_nested); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + WARN_ON(1); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages, nesting; + enum fixed_addresses idx0, idx; + + WARN_ON(system_state != SYSTEM_BOOTING); + + nesting = early_ioremap_nested; + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", + phys_addr, size, nesting); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + if (nesting >= FIX_BTMAPS_NESTING) { + WARN_ON(1); + return NULL; + } + early_ioremap_nested++; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. + */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + + return (void *) (offset + fix_to_virt(idx0)); +} + +void __init early_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + unsigned int nesting; + + nesting = --early_ioremap_nested; + WARN_ON(nesting < 0); + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, nesting); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c deleted file mode 100644 index 0b278315d73..00000000000 --- a/arch/x86/mm/ioremap_32.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t prot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) phys_to_virt(phys_addr); - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (phys_addr <= virt_to_phys(high_memory - 1)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } - - prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY - | _PAGE_ACCESSED | flags); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, prot)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (last_addr < virt_to_phys(high_memory) - 1) { - struct page *ppage = virt_to_page(__va(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if ((void __force *)addr <= high_memory) - return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. - */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { - change_page_attr(virt_to_page(__va(p->phys_addr)), - get_vm_area_size(p) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return phys_to_virt(phys_addr); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c deleted file mode 100644 index 6cac90aa503..00000000000 --- a/arch/x86/mm/ioremap_64.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> - -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> -#include <asm/proto.h> - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Fix up the linear direct mapping of the kernel to avoid cache attribute - * conflicts. - */ -static int -ioremap_change_attr(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - int err = 0; - if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vaddr = (unsigned long) __va(phys_addr); - - /* - * Must use a address here and not struct page because the phys addr - * can be a in hole between nodes and not have an memmap entry. - */ - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); - if (!err) - global_flush_tlb(); - } - return err; -} - -/* - * Generic mapping function - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (__force void __iomem *)phys_to_virt(phys_addr); - -#ifdef CONFIG_FLATMEM - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (last_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } -#endif - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL - | _PAGE_DIRTY | _PAGE_ACCESSED | flags); - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); - return NULL; - } - if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { - area->flags &= 0xffffff; - vunmap(addr); - return NULL; - } - return (__force void __iomem *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return __ioremap(phys_addr, size, _PAGE_PCD); -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if (addr <= high_memory) - return; - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if (p->flags >> 20) - ioremap_change_attr(p->phys_addr, p->size, 0); - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index a96006f7ae0..7a2ebce87df 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -1,9 +1,9 @@ -/* +/* * AMD K8 NUMA support. * Discover the memory map and associated nodes. - * + * * This version reads it directly from the K8 northbridge. - * + * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -22,132 +22,135 @@ static __init int find_northbridge(void) { - int num; + int num; - for (num = 0; num < 32; num++) { + for (num = 0; num < 32; num++) { u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) - continue; - return num; - } - - return -1; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -1; } int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ +{ unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, j, nb; + int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; - unsigned num_cores; + unsigned cores; + unsigned bits; + int j; if (!early_pci_allowed()) return -1; - nb = find_northbridge(); - if (nb < 0) + nb = find_northbridge(); + if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - reg = read_pci_config(0, nb, 0, 0x60); + reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); - memset(&nodes,0,sizeof(nodes)); + memset(&nodes, 0, sizeof(nodes)); prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base,limit; + for (i = 0; i < 8; i++) { + unsigned long base, limit; u32 nodeid; - + base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeid = limit & 7; + nodeid = limit & 7; nodeids[i] = nodeid; - if ((base & 3) == 0) { + if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + printk("Skipping disabled node %d\n", i); continue; - } + } if (nodeid >= numnodes) { printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + base, limit); continue; - } + } - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, - base); + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", + printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; } - limit >>= 16; - limit <<= 24; + limit >>= 16; + limit <<= 24; limit |= (1<<24)-1; limit++; if (limit > end_pfn << PAGE_SHIFT) limit = end_pfn << PAGE_SHIFT; if (limit <= base) - continue; - + continue; + base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; } - if (limit < base) { + if (limit < base) { printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); + nodeid, base, limit); continue; - } - + } + /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { + if (prevbase > base) { printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase,base); + prevbase, base); return -1; } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + found++; - - nodes[nodeid].start = base; + + nodes[nodeid].start = base; nodes[nodeid].end = limit; e820_register_active_regions(nodeid, nodes[nodeid].start >> PAGE_SHIFT, @@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; node_set(nodeid, node_possible_map); - } + } if (!found) - return -1; + return -1; memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + /* use the coreid bits from early_identify_cpu */ + bits = boot_cpu_data.x86_coreid_bits; + cores = (1<<bits); for (i = 0; i < 8; i++) { - if (nodes[i].start != nodes[i].end) { + if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - for (j = 0; j < num_cores; j++) - apicid_to_node[(nodeid * num_cores) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } + for (j = 0; j < cores; j++) + apicid_to_node[(nodeid << bits) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } } numa_init_array(); return 0; -} +} diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c index 552e0847375..56fe7124fbe 100644 --- a/arch/x86/mm/mmap_32.c +++ b/arch/x86/mm/mmap.c @@ -1,10 +1,13 @@ /* - * linux/arch/i386/mm/mmap.c + * Flexible mmap layout support * - * flexible mmap layout support + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,14 +22,12 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> */ #include <linux/personality.h> #include <linux/mm.h> #include <linux/random.h> +#include <linux/limits.h> #include <linux/sched.h> /* @@ -37,20 +38,71 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline unsigned long mmap_base(struct mm_struct *mm) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static int mmap_is_ia32(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long mmap_rnd(void) +{ + unsigned long rnd = 0; + + /* + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (current->flags & PF_RANDOMIZE) { + if (mmap_is_ia32()) + rnd = (long)get_random_int() % (1<<8); + else + rnd = (long)(get_random_int() % (1<<28)); + } + return rnd << PAGE_SHIFT; +} + +static unsigned long mmap_base(void) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); + return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); +} + +/* + * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 + * does, but not when emulating X86_32 + */ +static unsigned long mmap_legacy_base(void) +{ + if (mmap_is_ia32()) + return TASK_UNMAPPED_BASE; + else + return TASK_UNMAPPED_BASE + mmap_rnd(); } /* @@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm) */ void arch_pick_mmap_layout(struct mm_struct *mm) { - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; + if (mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; } else { - mm->mmap_base = mmap_base(mm); + mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; mm->unmap_area = arch_unmap_area_topdown; } diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c deleted file mode 100644 index 80bba0dc000..00000000000 --- a/arch/x86/mm/mmap_64.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2005 Andi Kleen, SuSE Labs. - * Licensed under GPL, v.2 - */ -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/random.h> -#include <asm/ia32.h> - -/* Notebook: move the mmap code from sys_x86_64.c over here. */ - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->flags & _TIF_IA32) - return ia32_pick_mmap_layout(mm); -#endif - mm->mmap_base = TASK_UNMAPPED_BASE; - if (current->flags & PF_RANDOMIZE) { - /* Add 28bit randomness which is about 40bits of address space - because mmap base has to be page aligned. - or ~1/128 of the total user VM - (total user address space is 47bits) */ - unsigned rnd = get_random_int() & 0xfffffff; - mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; - } - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d6926ba899..dc3b1f7e145 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,7 +1,7 @@ -/* +/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ + */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> @@ -11,35 +11,45 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/nodemask.h> +#include <linux/sched.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/numa.h> #include <asm/acpi.h> +#include <asm/k8.h> #ifndef Dprintk #define Dprintk(x...) #endif struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { +int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +void *x86_cpu_to_node_map_early_ptr; +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); + +s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_to_cpumask_map); int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +static int __init populate_memnodemap(const struct bootnode *nodes, + int numnodes, int shift) { - int i; - int res = -1; unsigned long addr, end; + int i, res = -1; - memset(memnodemap, 0xff, memnodemapsize); + memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; @@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) if ((end >> shift) >= memnodemapsize) return 0; do { - if (memnodemap[addr >> shift] != 0xff) + if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; memnodemap[addr >> shift] = i; addr += (1UL << shift); } while (addr < end); res = 1; - } + } return res; } @@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void) unsigned long pad, pad_addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) + if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) return 0; pad = L1_CACHE_BYTES - 1; pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; + nodemap_size = pad + sizeof(s16) * memnodemapsize; nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, nodemap_size); if (nodemap_addr == -1UL) { @@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void) } pad_addr = (nodemap_addr + pad) & ~pad; memnodemap = phys_to_virt(pad_addr); + reserve_early(nodemap_addr, nodemap_addr + nodemap_size); printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", nodemap_addr, nodemap_addr + nodemap_size); @@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. */ -static int __init -extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +static int __init extract_lsb_from_nodes(const struct bootnode *nodes, + int numnodes) { int i, nodes_used = 0; unsigned long start, end; @@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) shift); if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); + printk(KERN_INFO "Your memory is not aligned you need to " + "rebuild your kernel with a bigger NODEMAPSIZE " + "shift=%d\n", shift); return -1; } return shift; } -#ifdef CONFIG_SPARSEMEM int early_pfn_to_nid(unsigned long pfn) { return phys_to_nid(pfn << PAGE_SHIFT); } -#endif -static void * __init -early_node_mem(int nodeid, unsigned long start, unsigned long end, - unsigned long size) +static void * __init early_node_mem(int nodeid, unsigned long start, + unsigned long end, unsigned long size) { unsigned long mem = find_e820_area(start, end, size); void *ptr; + if (mem != -1L) return __va(mem); ptr = __alloc_bootmem_nopanic(size, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); if (ptr == NULL) { printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); + size, nodeid); return NULL; } return ptr; } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; - unsigned long nodedata_phys; +void __init setup_node_bootmem(int nodeid, unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); - start = round_up(start, ZONE_ALIGN); + start = round_up(start, ZONE_ALIGN); - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, + start, end); start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; @@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT); if (bootmap == NULL) { if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + free_bootmem((unsigned long)node_data[nodeid], + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - + Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + bootmap_start >> PAGE_SHIFT, + start_pfn, end_pfn); free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, + bootmap_pages<<PAGE_SHIFT); #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif node_set_online(nodeid); -} - -/* Initialize final allocator for a zone */ -void __init setup_node_zones(int nodeid) -{ - unsigned long start_pfn, end_pfn, memmapsize, limit; - - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); - - Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", - nodeid, start_pfn, end_pfn); - - /* Try to allocate mem_map at end to not fill up precious <4GB - memory. */ - memmapsize = sizeof(struct page) * (end_pfn-start_pfn); - limit = end_pfn << PAGE_SHIFT; -#ifdef CONFIG_FLAT_NODE_MEM_MAP - NODE_DATA(nodeid)->node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} +} +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ void __init numa_init_array(void) { int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ + rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) != NUMA_NO_NODE) + if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; - numa_set_node(i, rr); + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); } - } #ifdef CONFIG_NUMA_EMU @@ -276,15 +265,17 @@ void __init numa_init_array(void) char *cmdline __initdata; /* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. + * Setups up nid to range from addr to addr + size. If the end + * boundary is greater than max_addr, then max_addr is used instead. + * The return value is 0 if there is additional memory left for + * allocation past addr and -1 otherwise. addr is adjusted to be at + * the end of the node. */ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, u64 size, u64 max_addr) { int ret = 0; + nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, for (i = node_start; i < num_nodes + node_start; i++) { u64 end = *addr + size; + if (i < big) end += FAKE_NODE_MIN_SIZE; /* @@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; + u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; + int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); /* @@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) * system RAM into N fake nodes. */ if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); + long n = simple_strtol(cmdline, NULL, 0); + + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); if (num_nodes < 0) return num_nodes; goto out; @@ -483,46 +471,47 @@ out: for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); } acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; + numa_init_array(); + return 0; } #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ +{ int i; nodes_clear(node_possible_map); #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) + if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT)) return; nodes_clear(node_possible_map); #endif printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; + end_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ + memnode_shift = 63; memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); @@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - node_to_cpumask[0] = cpumask_of_cpu(0); + /* cpumask_of_cpu() may not be available during early startup */ + memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); + cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } __cpuinit void numa_add_cpu(int cpu) { - set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} + set_bit(cpu, + (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); +} void __cpuinit numa_set_node(int cpu, int node) { + int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; + cpu_pda(cpu)->nodenumber = node; - cpu_to_node(cpu) = node; + + if(cpu_to_node_map) + cpu_to_node_map[cpu] = node; + else if(per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); } -unsigned long __init numa_free_all_bootmem(void) -{ - int i; +unsigned long __init numa_free_all_bootmem(void) +{ unsigned long pages = 0; - for_each_online_node(i) { + int i; + + for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - } + return pages; -} +} void __init paging_init(void) -{ - int i; +{ unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -568,32 +569,27 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - for_each_online_node(i) { - setup_node_zones(i); - } - free_area_init_nodes(max_zone_pfns); -} +} static __init int numa_setup(char *opt) -{ +{ if (!opt) return -EINVAL; - if (!strncmp(opt,"off",3)) + if (!strncmp(opt, "off", 3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; + if (!strncmp(opt, "hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; -} - +} early_param("numa", numa_setup); /* @@ -611,38 +607,16 @@ early_param("numa", numa_setup); void __init init_cpu_to_node(void) { int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid_init[i]; + + for (i = 0; i < NR_CPUS; i++) { + u16 apicid = x86_cpu_to_apicid_init[i]; + if (apicid == BAD_APICID) continue; if (apicid_to_node[apicid] == NUMA_NO_NODE) continue; - numa_set_node(i,apicid_to_node[apicid]); + numa_set_node(i, apicid_to_node[apicid]); } } -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); - -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c new file mode 100644 index 00000000000..06353d43f72 --- /dev/null +++ b/arch/x86/mm/pageattr-test.c @@ -0,0 +1,224 @@ +/* + * self test for change_page_attr. + * + * Clears the global bit on random pages in the direct mapping, then reverts + * and compares page tables forwards and afterwards. + */ +#include <linux/bootmem.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +enum { + NTEST = 4000, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static __init int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int printed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + if (!printed) { + dump_pagetable(addr); + printk(KERN_INFO "CPA %lx no pte level %d\n", + addr, level); + printed = 1; + } + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if (!(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + printk(KERN_INFO + "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long __initdata addr[NTEST]; +static unsigned int __initdata len[NTEST]; + +/* Change the global bit on random pages in the direct mapping */ +static __init int exercise_pageattr(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + int level; + int i, k; + int err; + + printk(KERN_INFO "CPA exercising pageattr\n"); + + bm = vmalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + memset(bm, 0, (max_pfn_mapped + 7) / 8); + + failed += print_split(&sa); + srandom32(100); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = random32() % max_pfn_mapped; + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = random32() % 100; + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; + } + __set_bit(pfn + k, bm); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + err = change_page_attr_clear(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || pte_global(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + printk(KERN_INFO "CPA reverting everything\n"); + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + err = change_page_attr_set(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || !pte_global(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + WARN_ON(1); + } else { + printk(KERN_INFO "CPA selftests PASSED\n"); + } + + return 0; +} +module_init(exercise_pageattr); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c new file mode 100644 index 00000000000..1cc6607eacb --- /dev/null +++ b/arch/x86/mm/pageattr.c @@ -0,0 +1,564 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/e820.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} + +static void __cpa_flush_all(void *arg) +{ + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (boot_cpu_data.x86_model >= 4) + wbinvd(); +} + +static void cpa_flush_all(void) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, NULL, 1, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1, 1); + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && pte_present(*pte)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) +{ + pgprot_t forbidden = __pgprot(0); + + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ + if (within(__pa(address), BIOS_BEGIN, BIOS_END)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + +#ifdef CONFIG_DEBUG_RODATA + /* The .rodata section needs to be read-only */ + if (within(address, (unsigned long)__start_rodata, + (unsigned long)__end_rodata)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +pte_t *lookup_address(unsigned long address, int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); +} + +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + gfp_t gfp_flags = GFP_KERNEL; + unsigned long flags; + unsigned long addr; + pte_t *pbase, *tmp; + struct page *base; + unsigned int i, level; + +#ifdef CONFIG_DEBUG_PAGEALLOC + gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; + gfp_flags = GFP_ATOMIC | __GFP_NOWARN; +#endif + base = alloc_pages(gfp_flags, 0); + if (!base) + return -ENOMEM; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) { + WARN_ON_ONCE(1); + goto out_unlock; + } + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); +#ifdef CONFIG_X86_32 + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); +#endif + + pgprot_val(ref_prot) &= ~_PAGE_NX; + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) + set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); + + /* + * Install the new, split up pagetable. Important detail here: + * + * On Intel the NX bit of all levels must be cleared to make a + * page executable. See section 4.13.2 of Intel 64 and IA-32 + * Architectures Software Developer's Manual). + */ + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); + base = NULL; + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + if (base) + __free_pages(base, 0); + + return 0; +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, + pgprot_t mask_set, pgprot_t mask_clr) +{ + struct page *kpte_page; + int level, err = 0; + pte_t *kpte; + +#ifdef CONFIG_X86_32 + BUG_ON(pfn > max_low_pfn); +#endif + +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return -EINVAL; + + kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + + if (level == PG_LEVEL_4K) { + pgprot_t new_prot = pte_pgprot(*kpte); + pte_t new_pte, old_pte = *kpte; + + pgprot_val(new_prot) &= ~pgprot_val(mask_clr); + pgprot_val(new_prot) |= pgprot_val(mask_set); + + new_prot = static_protections(new_prot, address); + + new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); + + set_pte_atomic(kpte, new_pte); + } else { + err = split_large_page(kpte, address); + if (!err) + goto repeat; + } + return err; +} + +/** + * change_page_attr_addr - Change page table attributes in linear mapping + * @address: Virtual address in linear mapping. + * @prot: New page table attribute (PAGE_*) + * + * Change page attributes of a page in the direct mapping. This is a variant + * of change_page_attr() that also works on memory holes that do not have + * mem_map entry (pfn_valid() is false). + * + * See change_page_attr() documentation for more details. + * + * Modules and drivers should use the set_memory_* APIs instead. + */ + +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + +static int +change_page_attr_addr(unsigned long address, pgprot_t mask_set, + pgprot_t mask_clr) +{ + unsigned long phys_addr = __pa(address); + unsigned long pfn = phys_addr >> PAGE_SHIFT; + int err; + +#ifdef CONFIG_X86_64 + /* + * If we are inside the high mapped kernel range, then we + * fixup the low mapping first. __va() returns the virtual + * address in the linear mapping: + */ + if (within(address, HIGH_MAP_START, HIGH_MAP_END)) + address = (unsigned long) __va(phys_addr); +#endif + + err = __change_page_attr(address, pfn, mask_set, mask_clr); + if (err) + return err; + +#ifdef CONFIG_X86_64 + /* + * If the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) { + /* + * Calc the high mapping address. See __phys_addr() + * for the non obvious details. + */ + address = phys_addr + HIGH_MAP_START - phys_base; + /* Make sure the kernel mappings stay executable */ + pgprot_val(mask_clr) |= _PAGE_NX; + + /* + * Our high aliases are imprecise, because we check + * everything between 0 and KERNEL_TEXT_SIZE, so do + * not propagate lookup failures back to users: + */ + __change_page_attr(address, pfn, mask_set, mask_clr); + } +#endif + return err; +} + +static int __change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + unsigned int i; + int ret; + + for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { + ret = change_page_attr_addr(addr, mask_set, mask_clr); + if (ret) + return ret; + } + + return 0; +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + int ret = __change_page_attr_set_clr(addr, numpages, mask_set, + mask_clr); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) + cpa_flush_range(addr, numpages); + else + cpa_flush_all(); + + return ret; +} + +static inline int change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_uc); + +int set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) +static inline int __change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int __change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PRESENT | _PAGE_RW)); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PRESENT)); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + + /* + * The return value is ignored - the calls cannot fail, + * large pages are disabled at boot time: + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); +} +#endif + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c deleted file mode 100644 index 260073c0760..00000000000 --- a/arch/x86/mm/pageattr_32.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/sections.h> - -static DEFINE_SPINLOCK(cpa_lock); -static struct list_head df_list = LIST_HEAD_INIT(df_list); - - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - return pte_offset_kernel(pmd, address); -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base; - pte_t *pbase; - - spin_unlock_irq(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); - spin_lock_irq(&cpa_lock); - if (!base) - return NULL; - - /* - * page_private is used to track the number of entries in - * the page table page that have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot)); - } - return base; -} - -static void cache_flush_page(struct page *p) -{ - void *adr = page_address(p); - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *lh = (struct list_head *)arg; - struct page *p; - - /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { - list_for_each_entry (p, lh, lru) - cache_flush_page(p); - } else if (boot_cpu_data.x86_model >= 4) - wbinvd(); - - /* Flush all to work around Errata in early athlons regarding - * large page flushing. - */ - __flush_tlb_all(); -} - -static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) -{ - struct page *page; - unsigned long flags; - - set_pte_atomic(kpte, pte); /* change init_mm */ - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = (struct page *)page->index) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static inline void revert_page(struct page *kpte_page, unsigned long address) -{ - pgprot_t ref_prot; - pte_t *linear; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE; - - linear = (pte_t *) - pmd_offset(pud_offset(pgd_offset_k(address), address), address); - set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, - ref_prot)); -} - -static inline void save_page(struct page *kpte_page) -{ - if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) - list_add(&kpte_page->lru, &df_list); -} - -static int -__change_page_attr(struct page *page, pgprot_t prot) -{ - pte_t *kpte; - unsigned long address; - struct page *kpte_page; - - BUG_ON(PageHighMem(page)); - address = (unsigned long)page_address(page); - - kpte = lookup_address(address); - if (!kpte) - return -EINVAL; - kpte_page = virt_to_page(kpte); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - - if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, prot)); - } else { - pgprot_t ref_prot; - struct page *split; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_EXEC : PAGE_KERNEL; - split = split_large_page(address, prot, ref_prot); - if (!split) - return -ENOMEM; - set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* - * If the pte was reserved, it means it was created at boot - * time (not via split_large_page) and in turn we must not - * replace it with a largepage. - */ - - save_page(kpte_page); - if (!PageReserved(kpte_page)) { - if (cpu_has_pse && (page_private(kpte_page) == 0)) { - paravirt_release_pt(page_to_pfn(kpte_page)); - revert_page(kpte_page, address); - } - } - return 0; -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - int err = 0; - int i; - unsigned long flags; - - spin_lock_irqsave(&cpa_lock, flags); - for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); - if (err) - break; - } - spin_unlock_irqrestore(&cpa_lock, flags); - return err; -} - -void global_flush_tlb(void) -{ - struct list_head l; - struct page *pg, *next; - - BUG_ON(irqs_disabled()); - - spin_lock_irq(&cpa_lock); - list_replace_init(&df_list, &l); - spin_unlock_irq(&cpa_lock); - flush_map(&l); - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (PageHighMem(page)) - return; - if (!enable) - debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); - - /* the return value is ignored - the calls cannot fail, - * large pages are disabled at boot time. - */ - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); - /* we should perform an IPI and flush all tlbs, - * but that can deadlock->flush only current cpu. - */ - __flush_tlb_all(); -} -#endif - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c deleted file mode 100644 index c40afbaaf93..00000000000 --- a/arch/x86/mm/pageattr_64.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -void clflush_cache_range(void *adr, int size) -{ - int i; - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - clflush_cache_range(adr, PAGE_SIZE); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - pgprot_val(ref_prot2) &= ~_PAGE_NX; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - BUG_ON(PageReserved(kpte_page)); - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - /* - * Write-protect the semaphore, to exclude two contexts - * doing a list_replace_init() call in parallel and to - * exclude new additions to the deferred_pages list: - */ - down_write(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_write(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index be61a1d845a..2ae5999a795 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void pmd_ctor(struct kmem_cache *cache, void *pmd) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the @@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd) * vmalloc faults work because attached pagetables are never freed. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); - pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + + list_add(&page->lru, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page_private(page); - *pprev = next; - if (next) - set_page_private(next, (unsigned long)pprev); + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); } @@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd) if (SHARED_KERNEL_PMD) return; - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -/* If we allocate a pmd for part of the kernel address space, then - make sure its initialized with the appropriate kernel mappings. - Otherwise use a cached zeroed pmd. */ -static pmd_t *pmd_cache_alloc(int idx) +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(pgd_t *pgdp) { - pmd_t *pmd; + int i; - if (idx >= USER_PTRS_PER_PGD) { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; - if (pmd) - memcpy(pmd, - (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(pgd); + return 0; + } + + if (i >= USER_PTRS_PER_PGD) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); - } else - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - return pmd; + pud_populate(mm, pud, pmd); + } + + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; } -static void pmd_cache_free(pmd_t *pmd, int idx) +static void pgd_mop_up_pmds(pgd_t *pgd) { - if (idx >= USER_PTRS_PER_PGD) - free_page((unsigned long)pmd); - else - kmem_cache_free(pmd_cache, pmd); } +#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + mm->pgd = pgd; /* so that alloc_pd can use it */ - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pmd_t *pmd = pmd_cache_alloc(i); - - if (!pmd) - goto out_oom; - - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + quicklist_free(0, pgd_dtor, pgd); + pgd = NULL; } - return pgd; -out_oom: - for (i--; i >= 0; i--) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - quicklist_free(0, pgd_dtor, pgd); - return NULL; + return pgd; } void pgd_free(pgd_t *pgd) { - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - /* in the non-PAE case, free_pgtables() clears user pgd entries */ + pgd_mop_up_pmds(pgd); quicklist_free(0, pgd_dtor, pgd); } @@ -372,4 +376,3 @@ void check_pgt_cache(void) { quicklist_trim(0, pgd_dtor, 25, 16); } - diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index ea85172fc0c..65416f843e5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -130,6 +130,9 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { int pxm, node; + int apic_id; + + apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } - apicid_to_node[pa->apic_id] = node; + apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Protect against too large hotadd areas that would fill up memory. - */ -static int hotadd_enough_memory(struct bootnode *nd) -{ - static unsigned long allocated; - static unsigned long last_area_end; - unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; - long mem = pages * sizeof(struct page); - unsigned long addr; - unsigned long allowed; - unsigned long oldpages = pages; - - if (mem < 0) - return 0; - allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; - allowed = (allowed / 100) * hotadd_percent; - if (allocated + mem > allowed) { - unsigned long range; - /* Give them at least part of their hotadd memory upto hotadd_percent - It would be better to spread the limit out - over multiple hotplug areas, but that is too complicated - right now */ - if (allocated >= allowed) - return 0; - range = allowed - allocated; - pages = (range / PAGE_SIZE); - mem = pages * sizeof(struct page); - nd->end = nd->start + range; - } - /* Not completely fool proof, but a good sanity check */ - addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); - if (addr == -1UL) - return 0; - if (pages != oldpages) - printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", - pages << PAGE_SHIFT); - last_area_end = addr + mem; - allocated += mem; - return 1; -} - -static int update_end_of_memory(unsigned long end) -{ - found_add_area = 1; - if ((end >> PAGE_SHIFT) > end_pfn) - end_pfn = end >> PAGE_SHIFT; - return 1; + pxm, apic_id, node); } -static inline int save_add_info(void) -{ - return hotadd_percent > 0; -} -#else int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE @@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -#endif /* * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add infomation. + * Both SPARSE and RESERVE need nodes_add information. * This code supports one contiguous hot add area per node. */ static int reserve_hotadd(int node, unsigned long start, unsigned long end) @@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void unparse_node(int node) +static void __init unparse_node(int node) { int i; node_clear(node, nodes_parsed); @@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + /* + * don't confuse VM with a node that doesn't have the + * minimum memory. + */ + if (nodes[i].end && + (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { unparse_node(i); node_set_offline(i); } @@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) setup_node_bootmem(i, nodes[i].start, nodes[i].end); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) == NUMA_NO_NODE) + int node = early_cpu_to_node(i); + + if (node == NUMA_NO_NODE) continue; - if (!node_isset(cpu_to_node(i), node_possible_map)) + if (!node_isset(node, node_possible_map)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); @@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } #ifdef CONFIG_NUMA_EMU +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { + [0 ... MAX_NUMNODES-1] = PXM_INVAL +}; +static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; static int __init find_node_by_addr(unsigned long addr) { int ret = NUMA_NO_NODE; @@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr) break; } } - return i; + return ret; } /* @@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr) void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; - int fake_node_to_pxm_map[MAX_NUMNODES] = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL - }; - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE - }; printk(KERN_INFO "Faking PXM affinity for fake nodes on real " "topology.\n"); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 0ed046a187f..e2095cba409 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -32,7 +32,7 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr) +static void backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; @@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = { }; struct frame_head { - struct frame_head *ebp; + struct frame_head *bp; unsigned long ret; } __attribute__((packed)); @@ -67,21 +67,21 @@ dump_user_backtrace(struct frame_head * head) /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].ebp) + if (head >= bufhead[0].bp) return NULL; - return bufhead[0].ebp; + return bufhead[0].bp; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { struct frame_head *head = (struct frame_head *)frame_pointer(regs); - unsigned long stack = stack_pointer(regs); + unsigned long stack = kernel_trap_sp(regs); if (!user_mode_vm(regs)) { if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, + dump_trace(NULL, regs, (unsigned long *)stack, 0, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c8ab79ef427..1f11cf0a307 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -18,11 +18,11 @@ #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> - + #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const * model; +static struct op_x86_model_spec const *model; static struct op_msrs cpu_msrs[NR_CPUS]; static unsigned long saved_lvtpc[NR_CPUS]; @@ -41,7 +41,6 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state) return 0; } - static int nmi_resume(struct sys_device *dev) { if (nmi_enabled == 1) @@ -49,29 +48,27 @@ static int nmi_resume(struct sys_device *dev) return 0; } - static struct sysdev_class oprofile_sysclass = { .name = "oprofile", .resume = nmi_resume, .suspend = nmi_suspend, }; - static struct sys_device device_oprofile = { .id = 0, .cls = &oprofile_sysclass, }; - static int __init init_sysfs(void) { int error; - if (!(error = sysdev_class_register(&oprofile_sysclass))) + + error = sysdev_class_register(&oprofile_sysclass); + if (!error) error = sysdev_register(&device_oprofile); return error; } - static void exit_sysfs(void) { sysdev_unregister(&device_oprofile); @@ -90,7 +87,7 @@ static int profile_exceptions_notify(struct notifier_block *self, int ret = NOTIFY_DONE; int cpu = smp_processor_id(); - switch(val) { + switch (val) { case DIE_NMI: if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) ret = NOTIFY_STOP; @@ -101,24 +98,24 @@ static int profile_exceptions_notify(struct notifier_block *self, return ret; } -static void nmi_cpu_save_registers(struct op_msrs * msrs) +static void nmi_cpu_save_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; - struct op_msr * counters = msrs->counters; - struct op_msr * controls = msrs->controls; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr){ + if (counters[i].addr) { rdmsr(counters[i].addr, counters[i].saved.low, counters[i].saved.high); } } - + for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr){ + if (controls[i].addr) { rdmsr(controls[i].addr, controls[i].saved.low, controls[i].saved.high); @@ -126,15 +123,13 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs) } } - -static void nmi_save_registers(void * dummy) +static void nmi_save_registers(void *dummy) { int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; + struct op_msrs *msrs = &cpu_msrs[cpu]; nmi_cpu_save_registers(msrs); } - static void free_msrs(void) { int i; @@ -146,7 +141,6 @@ static void free_msrs(void) } } - static int allocate_msrs(void) { int success = 1; @@ -173,11 +167,10 @@ static int allocate_msrs(void) return success; } - -static void nmi_cpu_setup(void * dummy) +static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; + struct op_msrs *msrs = &cpu_msrs[cpu]; spin_lock(&oprofilefs_lock); model->setup_ctrs(msrs); spin_unlock(&oprofilefs_lock); @@ -193,13 +186,14 @@ static struct notifier_block profile_exceptions_nb = { static int nmi_setup(void) { - int err=0; + int err = 0; int cpu; if (!allocate_msrs()) return -ENOMEM; - if ((err = register_die_notifier(&profile_exceptions_nb))){ + err = register_die_notifier(&profile_exceptions_nb); + if (err) { free_msrs(); return err; } @@ -210,7 +204,7 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&cpu_msrs[0]); - for_each_possible_cpu (cpu) { + for_each_possible_cpu(cpu) { if (cpu != 0) { memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, sizeof(struct op_msr) * model->num_counters); @@ -226,39 +220,37 @@ static int nmi_setup(void) return 0; } - -static void nmi_restore_registers(struct op_msrs * msrs) +static void nmi_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; - struct op_msr * counters = msrs->counters; - struct op_msr * controls = msrs->controls; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr){ + if (controls[i].addr) { wrmsr(controls[i].addr, controls[i].saved.low, controls[i].saved.high); } } - + for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr){ + if (counters[i].addr) { wrmsr(counters[i].addr, counters[i].saved.low, counters[i].saved.high); } } } - -static void nmi_cpu_shutdown(void * dummy) +static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; - + struct op_msrs *msrs = &cpu_msrs[cpu]; + /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. That's by design: on * power on apic lvt contain a zero vector nr which are legal only for @@ -271,7 +263,6 @@ static void nmi_cpu_shutdown(void * dummy) nmi_restore_registers(msrs); } - static void nmi_shutdown(void) { nmi_enabled = 0; @@ -281,45 +272,40 @@ static void nmi_shutdown(void) free_msrs(); } - -static void nmi_cpu_start(void * dummy) +static void nmi_cpu_start(void *dummy) { - struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; model->start(msrs); } - static int nmi_start(void) { on_each_cpu(nmi_cpu_start, NULL, 0, 1); return 0; } - - -static void nmi_cpu_stop(void * dummy) + +static void nmi_cpu_stop(void *dummy) { - struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; model->stop(msrs); } - - + static void nmi_stop(void) { on_each_cpu(nmi_cpu_stop, NULL, 0, 1); } - struct op_counter_config counter_config[OP_MAX_COUNTER]; -static int nmi_create_files(struct super_block * sb, struct dentry * root) +static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; for (i = 0; i < model->num_counters; ++i) { - struct dentry * dir; + struct dentry *dir; char buf[4]; - - /* quick little hack to _not_ expose a counter if it is not + + /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). @@ -329,21 +315,21 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root) snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); } return 0; } - + static int p4force; module_param(p4force, int, 0); - -static int __init p4_init(char ** cpu_type) + +static int __init p4_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; @@ -356,15 +342,15 @@ static int __init p4_init(char ** cpu_type) return 1; #else switch (smp_num_siblings) { - case 1: - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; - - case 2: - *cpu_type = "i386/p4-ht"; - model = &op_p4_ht2_spec; - return 1; + case 1: + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + model = &op_p4_ht2_spec; + return 1; } #endif @@ -373,8 +359,7 @@ static int __init p4_init(char ** cpu_type) return 0; } - -static int __init ppro_init(char ** cpu_type) +static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; @@ -409,52 +394,52 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!cpu_has_apic) return -ENODEV; - + switch (vendor) { - case X86_VENDOR_AMD: - /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ - switch (family) { - default: + switch (family) { + default: + return -ENODEV; + case 6: + model = &op_athlon_spec; + cpu_type = "i386/athlon"; + break; + case 0xf: + model = &op_athlon_spec; + /* Actually it could be i386/hammer too, but give + user space an consistent name. */ + cpu_type = "x86-64/hammer"; + break; + case 0x10: + model = &op_athlon_spec; + cpu_type = "x86-64/family10"; + break; + } + break; + + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init(&cpu_type)) return -ENODEV; - case 6: - model = &op_athlon_spec; - cpu_type = "i386/athlon"; - break; - case 0xf: - model = &op_athlon_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ - cpu_type = "x86-64/hammer"; - break; - case 0x10: - model = &op_athlon_spec; - cpu_type = "x86-64/family10"; - break; - } break; - - case X86_VENDOR_INTEL: - switch (family) { - /* Pentium IV */ - case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; - break; - - /* A P6-class processor */ - case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; - break; - - default: - return -ENODEV; - } + + /* A P6-class processor */ + case 6: + if (!ppro_init(&cpu_type)) + return -ENODEV; break; default: return -ENODEV; + } + break; + + default: + return -ENODEV; } init_sysfs(); @@ -469,7 +454,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) return 0; } - void op_nmi_exit(void) { if (using_nmi) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 86274639066..52deabc72a6 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -109,6 +109,19 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b) } } +static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +{ + struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + + if (rom_r->parent) + return; + if (rom_r->start) + /* we deal with BIOS assigned ROM later */ + return; + if (!(pci_probe & PCI_ASSIGN_ROMS)) + rom_r->start = rom_r->end = rom_r->flags = 0; +} + /* * Called after each bus is probed, but before its children * are examined. @@ -116,8 +129,12 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b) void __devinit pcibios_fixup_bus(struct pci_bus *b) { + struct pci_dev *dev; + pcibios_fixup_ghosts(b); pci_read_bridge_bases(b); + list_for_each_entry(dev, &b->devices, bus_list) + pcibios_fixup_device_resources(dev); } /* diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6cff66dd0c9..cb63007e20b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -19,7 +19,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); reg = 0xd0; - for(pxb=0; pxb<2; pxb++) { + for(pxb = 0; pxb < 2; pxb++) { pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); @@ -56,7 +56,7 @@ static void __devinit pci_fixup_umc_ide(struct pci_dev *d) int i; printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d)); - for(i=0; i<4; i++) + for(i = 0; i < 4; i++) d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); @@ -127,7 +127,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d) NB latency to zero */ pci_write_config_byte(d, PCI_LATENCY_TIMER, 0); - where = 0x95; /* the memory write queue timer register is + where = 0x95; /* the memory write queue timer register is different for the KT266x's: 0x95 not 0x55 */ } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 && (d->revision == VIA_8363_KL133_REVISION_ID || @@ -230,7 +230,7 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh if ((offset) && (where == offset)) value = value & 0xfffffffc; - + return raw_pci_ops->write(0, bus->number, devfn, where, size, value); } @@ -271,8 +271,8 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev) * after hot-remove, the pbus->devices is empty and this code * will set the offsets to zero and the bus ops to parent's bus * ops, which is unmodified. - */ - for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) + */ + for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) quirk_aspm_offset[i] = 0; pbus->ops = pbus->parent->ops; @@ -286,17 +286,17 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev) list_for_each_entry(dev, &pbus->devices, bus_list) { /* There are 0 to 8 devices attached to this bus */ cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP); - quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10; + quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] = cap_base + 0x10; } pbus->ops = &quirk_pcie_aspm_ops; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk); /* * Fixup to mark boot BIOS video selected by BIOS before it changes @@ -336,8 +336,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) * PCI header type NORMAL. */ if (bridge - &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) - ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) + || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &config); if (!(config & PCI_BRIDGE_CTL_VGA)) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 88d8f5c0ecb..ed07ce6c171 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -200,6 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + WARN_ON_ONCE(pirq >= 16); return irqmap[read_config_nybble(router, 0x48, pirq-1)]; } @@ -207,7 +208,8 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i { static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; unsigned int val = irqmap[irq]; - + + WARN_ON_ONCE(pirq >= 16); if (val) { write_config_nybble(router, 0x48, pirq-1, val); return 1; @@ -257,12 +259,16 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } @@ -275,12 +281,16 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); return read_config_nybble(router,0x43, pirqmap[pirq-1]); } static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); return 1; } @@ -419,6 +429,7 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -428,6 +439,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -449,14 +461,14 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, */ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - outb_p(pirq, 0xc00); + outb(pirq, 0xc00); return inb(0xc01) & 0xf; } static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - outb_p(pirq, 0xc00); - outb_p(irq, 0xc01); + outb(pirq, 0xc00); + outb(irq, 0xc01); return 1; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 998fd3ec0d6..efcf620d143 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -19,7 +19,7 @@ unsigned long saved_context_esp, saved_context_ebp; unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; -void __save_processor_state(struct saved_context *ctxt) +static void __save_processor_state(struct saved_context *ctxt) { mtrr_save_fixed_ranges(NULL); kernel_fpu_begin(); @@ -74,19 +74,19 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg[7]){ - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); + if (current->thread.debugreg7) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); /* no 4 and 5 */ - set_debugreg(current->thread.debugreg[6], 6); - set_debugreg(current->thread.debugreg[7], 7); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); } } -void __restore_processor_state(struct saved_context *ctxt) +static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore index f8b69d84238..60274d5746e 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/vdso/.gitignore @@ -1 +1,6 @@ vdso.lds +vdso-syms.lds +vdso32-syms.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index e7bff0fbac2..d28dda57470 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -1,39 +1,37 @@ # -# x86-64 vDSO. +# Building vDSO images for x86. # +VDSO64-$(CONFIG_X86_64) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_COMPAT) := y + +vdso-install-$(VDSO64-y) += vdso.so +vdso-install-$(VDSO32-y) += $(vdso32-y:=.so) + + # files to link into the vdso -# vdso-start.o has to be first -vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o # files to link into kernel -obj-y := vma.o vdso.o vdso-syms.o +obj-$(VDSO64-y) += vma.o vdso.o +obj-$(VDSO32-y) += vdso32.o vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so -targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ +targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C -vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 -SYSCFLAGS_vdso.so = $(vdso-flags) -SYSCFLAGS_vdso.so.dbg = $(vdso-flags) +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so -$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE - $(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) + $(call if_changed,vdso) $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE @@ -41,24 +39,96 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 -$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL) -$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL) +$(vobjs): KBUILD_CFLAGS = $(CFL) + +targets += vdso-syms.lds +obj-$(VDSO64-y) += vdso-syms.lds + +# +# Match symbols in the DSO that look like VDSO*; produce a file of constants. +# +sed-vdsosym := -e 's/^00*/0/' \ + -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ + +$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE + $(call if_changed,vdsosym) + +# +# Build multiple 32-bit vDSO images to choose from at boot time. +# +obj-$(VDSO32-y) += vdso32-syms.lds +vdso32.so-$(CONFIG_X86_32) += int80 +vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(VDSO32-y) += sysenter + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 + +# This makes sure the $(obj) subdirectory exists even though vdso32/ +# is not a kbuild sub-make subdirectory. +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vdso-syms.o -$(obj)/built-in.o: $(obj)/vdso-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o +targets += vdso32/vdso32.lds +targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so) +targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) -SYSCFLAGS_vdso-syms.o = -r -d -$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) +extra-y += $(vdso32.so-y:%=vdso32-%.so) +$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so) + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32 + +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o + $(call if_changed,vdso) + +# Make vdso32-*-syms.lds from each image, and then make sure they match. +# The only difference should be that some do not define VDSO32_SYSENTER_RETURN. + +targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds) + +quiet_cmd_vdso32sym = VDSOSYM $@ +define cmd_vdso32sym + if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \ + $(foreach H,$(filter-out FORCE,$^),\ + if grep -q VDSO32_SYSENTER_RETURN $H; \ + then diff -u $(@D)/.tmp_$(@F) $H; \ + else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \ + diff -u - $H; fi &&) : ;\ + then mv -f $(@D)/.tmp_$(@F) $@; \ + else rm -f $(@D)/.tmp_$(@F); exit 1; \ + fi +endef + +$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE + $(call if_changed,vdso32sym) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + +VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) + +# +# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). +# quiet_cmd_vdso_install = INSTALL $@ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ -vdso.so: +$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE @mkdir -p $(MODLIB)/vdso $(call cmd,vdso_install) -vdso_install: vdso.so +PHONY += vdso_install $(vdso-install-y) +vdso_install: $(vdso-install-y) + +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 5b54cdfb2b0..23476c2ebfc 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -19,7 +19,6 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> -#include <asm/vgtod.h> #include "vextern.h" #define gtod vdso_vsyscall_gtod_data diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S new file mode 100644 index 00000000000..634a2cf6204 --- /dev/null +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -0,0 +1,64 @@ +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + /* + * Align the actual code well away from the non-instruction data. + * This is the best thing for the I-cache. + */ + . = ALIGN(0x100); + + .text : { *(.text*) } :text =0x90909090 +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S deleted file mode 100644 index 2dc2cdb84d6..00000000000 --- a/arch/x86/vdso/vdso-start.S +++ /dev/null @@ -1,2 +0,0 @@ - .globl vdso_kernel_start -vdso_kernel_start: diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 667d3245d97..4e5dd3b4de7 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -1,79 +1,37 @@ /* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. */ -#include <asm/asm-offsets.h> -#include "voffset.h" #define VDSO_PRELINK 0xffffffffff700000 - -SECTIONS -{ - . = VDSO_PRELINK + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + VDSO_TEXT_OFFSET; - - .text : { *(.text*) } :text - .rodata : { *(.rodata*) } :text - .data : { - *(.data*) - *(.sdata*) - *(.bss*) - *(.dynbss*) - } :text - - .altinstructions : { *(.altinstructions) } :text - .altinstr_replacement : { *(.altinstr_replacement) } :text - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.gnu.linkonce.b.*) - } :text -} +#include "vdso-layout.lds.S" /* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. + * This controls what userland symbols we export from the vDSO. */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; } +VDSO64_PRELINK = VDSO_PRELINK; + /* - * This controls what symbols we export from the DSO. + * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL. */ -VERSION -{ - LINUX_2.6 { - global: - clock_gettime; - __vdso_clock_gettime; - gettimeofday; - __vdso_gettimeofday; - getcpu; - __vdso_getcpu; - local: *; - }; -} +#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; +#include "vextern.h" +#undef VEXTERN diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/vdso/vdso32-setup.c index 5a2d951e260..348f1341e1c 100644 --- a/arch/x86/kernel/sysenter_32.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -23,6 +23,8 @@ #include <asm/unistd.h> #include <asm/elf.h> #include <asm/tlbflush.h> +#include <asm/vdso.h> +#include <asm/proto.h> enum { VDSO_DISABLED = 0, @@ -36,14 +38,24 @@ enum { #define VDSO_DEFAULT VDSO_ENABLED #endif +#ifdef CONFIG_X86_64 +#define vdso_enabled sysctl_vsyscall32 +#define arch_setup_additional_pages syscall32_setup_pages +#endif + +/* + * This is the difference between the prelinked addresses in the vDSO images + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO + * in the user address space. + */ +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) + /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; -EXPORT_SYMBOL_GPL(vdso_enabled); - static int __init vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); @@ -51,9 +63,18 @@ static int __init vdso_setup(char *s) return 1; } -__setup("vdso=", vdso_setup); +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0); -extern asmlinkage void sysenter_entry(void); +EXPORT_SYMBOL_GPL(vdso_enabled); +#endif static __init void reloc_symtab(Elf32_Ehdr *ehdr, unsigned offset, unsigned size) @@ -78,7 +99,7 @@ static __init void reloc_symtab(Elf32_Ehdr *ehdr, case STT_FUNC: case STT_SECTION: case STT_FILE: - sym->st_value += VDSO_HIGH_BASE; + sym->st_value += VDSO_ADDR_ADJUST; } } } @@ -104,7 +125,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) case DT_VERNEED: case DT_ADDRRNGLO ... DT_ADDRRNGHI: /* definitely pointers needing relocation */ - dyn->d_un.d_ptr += VDSO_HIGH_BASE; + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; break; case DT_ENCODING ... OLD_DT_LOOS-1: @@ -113,7 +134,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) they're even */ if (dyn->d_tag >= DT_ENCODING && (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += VDSO_HIGH_BASE; + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; break; case DT_VERDEFNUM: @@ -142,15 +163,15 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) int i; BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || - !elf_check_arch(ehdr) || + !elf_check_arch_ia32(ehdr) || ehdr->e_type != ET_DYN); - ehdr->e_entry += VDSO_HIGH_BASE; + ehdr->e_entry += VDSO_ADDR_ADJUST; /* rebase phdrs */ phdr = (void *)ehdr + ehdr->e_phoff; for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += VDSO_HIGH_BASE; + phdr[i].p_vaddr += VDSO_ADDR_ADJUST; /* relocate dynamic stuff */ if (phdr[i].p_type == PT_DYNAMIC) @@ -163,7 +184,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) if (!(shdr[i].sh_flags & SHF_ALLOC)) continue; - shdr[i].sh_addr += VDSO_HIGH_BASE; + shdr[i].sh_addr += VDSO_ADDR_ADJUST; if (shdr[i].sh_type == SHT_SYMTAB || shdr[i].sh_type == SHT_DYNSYM) @@ -172,6 +193,45 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } +/* + * These symbols are defined by vdso32.S to mark the bounds + * of the ELF DSO images included therein. + */ +extern const char vdso32_default_start, vdso32_default_end; +extern const char vdso32_sysenter_start, vdso32_sysenter_end; +static struct page *vdso32_pages[1]; + +#ifdef CONFIG_X86_64 + +static int use_sysenter __read_mostly = -1; + +#define vdso32_sysenter() (use_sysenter > 0) + +/* May not be __init: called during resume */ +void syscall32_cpu_init(void) +{ + if (use_sysenter < 0) + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); + + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); + checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} + +#define compat_uses_vma 1 + +static inline void map_compat_vdso(int map) +{ +} + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -183,10 +243,10 @@ void enable_sep_cpu(void) } tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); put_cpu(); } @@ -209,13 +269,7 @@ static int __init gate_vma_init(void) return 0; } -/* - * These symbols are defined by vsyscall.o to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vsyscall_int80_start, vsyscall_int80_end; -extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static struct page *syscall_pages[1]; +#define compat_uses_vma 0 static void map_compat_vdso(int map) { @@ -226,31 +280,35 @@ static void map_compat_vdso(int map) vdso_mapped = map; - __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, map ? PAGE_READONLY_EXEC : PAGE_NONE); /* flush stray tlbs */ flush_tlb_all(); } +#endif /* CONFIG_X86_64 */ + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); const void *vsyscall; size_t vsyscall_len; - syscall_pages[0] = virt_to_page(syscall_page); + vdso32_pages[0] = virt_to_page(syscall_page); +#ifdef CONFIG_X86_32 gate_vma_init(); printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#endif - if (!boot_cpu_has(X86_FEATURE_SEP)) { - vsyscall = &vsyscall_int80_start; - vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + if (!vdso32_sysenter()) { + vsyscall = &vdso32_default_start; + vsyscall_len = &vdso32_default_end - &vdso32_default_start; } else { - vsyscall = &vsyscall_sysenter_start; - vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; + vsyscall = &vdso32_sysenter_start; + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; } memcpy(syscall_page, vsyscall, vsyscall_len); @@ -259,9 +317,6 @@ int __init sysenter_setup(void) return 0; } -/* Defined in vsyscall-sysenter.S */ -extern void SYSENTER_RETURN; - /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { @@ -286,7 +341,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) ret = addr; goto up_fail; } + } + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints * @@ -300,7 +357,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, - syscall_pages); + vdso32_pages); if (ret) goto up_fail; @@ -308,7 +365,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); + VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: up_write(&mm->mmap_sem); @@ -316,6 +373,45 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) return ret; } +#ifdef CONFIG_X86_64 + +__initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include <linux/sysctl.h> + +static ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &sysctl_vsyscall32, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table abi_root_table2[] = { + { + .ctl_name = CTL_ABI, + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif + +#else /* CONFIG_X86_32 */ + const char *arch_vma_name(struct vm_area_struct *vma) { if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) @@ -344,3 +440,5 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S new file mode 100644 index 00000000000..1e36f72cab8 --- /dev/null +++ b/arch/x86/vdso/vdso32.S @@ -0,0 +1,19 @@ +#include <linux/init.h> + +__INITDATA + + .globl vdso32_default_start, vdso32_default_end +vdso32_default_start: +#ifdef CONFIG_X86_32 + .incbin "arch/x86/vdso/vdso32-int80.so" +#else + .incbin "arch/x86/vdso/vdso32-syscall.so" +#endif +vdso32_default_end: + + .globl vdso32_sysenter_start, vdso32_sysenter_end +vdso32_sysenter_start: + .incbin "arch/x86/vdso/vdso32-sysenter.so" +vdso32_sysenter_end: + +__FINIT diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore new file mode 100644 index 00000000000..e45fba9d0ce --- /dev/null +++ b/arch/x86/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/vdso/vdso32/int80.S index 103cab6aa7c..b15b7c01aed 100644 --- a/arch/x86/kernel/vsyscall-int80_32.S +++ b/arch/x86/vdso/vdso32/int80.S @@ -1,15 +1,15 @@ /* - * Code for the vsyscall page. This version uses the old int $0x80 method. + * Code for the vDSO. This version uses the old int $0x80 method. * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. + * First get the common code for the sigreturn entry points. + * This must come first. */ +#include "sigreturn.S" .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: int $0x80 @@ -47,7 +47,10 @@ __kernel_vsyscall: .LENDFDEDLSI: .previous -/* - * Get the common code for the sigreturn entry points. - */ -#include "vsyscall-sigreturn_32.S" + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/vdso/vdso32/note.S index fcf376a37f7..c83f2573469 100644 --- a/arch/x86/kernel/vsyscall-note_32.S +++ b/arch/x86/vdso/vdso32/note.S @@ -33,12 +33,11 @@ ELFNOTE_END * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. */ -#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - .globl VDSO_NOTE_MASK ELFNOTE_START(GNU, 2, "a") .long 1 /* ncaps */ -VDSO_NOTE_MASK: +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ .long 0 /* mask */ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ ELFNOTE_END diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/vdso/vdso32/sigreturn.S index a92262f4165..31776d0efc8 100644 --- a/arch/x86/kernel/vsyscall-sigreturn_32.S +++ b/arch/x86/vdso/vdso32/sigreturn.S @@ -1,41 +1,42 @@ /* - * Common code for the sigreturn entry points on the vsyscall page. + * Common code for the sigreturn entry points in vDSO images. * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The kernel assumes that the addresses of these - * routines are constant for all vsyscall implementations. + * This file is #include'd by int80.S et al to define them first thing. + * The kernel assumes that the addresses of these routines are constant + * for all vDSO implementations. */ -#include <asm/unistd.h> +#include <linux/linkage.h> +#include <asm/unistd_32.h> #include <asm/asm-offsets.h> - -/* XXX - Should these be named "_sigtramp" or something? -*/ +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif .text - .org __kernel_vsyscall+32,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function + ALIGN __kernel_sigreturn: .LSTART_sigreturn: popl %eax /* XXX does this mean it needs unwind info? */ movl $__NR_sigreturn, %eax - int $0x80 + SYSCALL_ENTER_KERNEL .LEND_sigreturn: + nop .size __kernel_sigreturn,.-.LSTART_sigreturn - .balign 32 .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function + ALIGN __kernel_rt_sigreturn: .LSTART_rt_sigreturn: movl $__NR_rt_sigreturn, %eax - int $0x80 + SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: + nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .balign 32 .previous .section .eh_frame,"a",@progbits @@ -70,9 +71,9 @@ __kernel_rt_sigreturn: be the value of the stack pointer in the caller. This means that we must define the CFA of this body of code to be the saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other + also means that there is no fixed relation to the other saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we + to compute their addresses. It also means that when we adjust the stack with the popl, we have to do it all over again. */ #define do_cfa_expr(offset) \ @@ -91,27 +92,27 @@ __kernel_rt_sigreturn: .sleb128 offset; /* offset */ \ 1: - do_cfa_expr(SIGCONTEXT_esp+4) - do_expr(0, SIGCONTEXT_eax+4) - do_expr(1, SIGCONTEXT_ecx+4) - do_expr(2, SIGCONTEXT_edx+4) - do_expr(3, SIGCONTEXT_ebx+4) - do_expr(5, SIGCONTEXT_ebp+4) - do_expr(6, SIGCONTEXT_esi+4) - do_expr(7, SIGCONTEXT_edi+4) - do_expr(8, SIGCONTEXT_eip+4) + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - do_cfa_expr(SIGCONTEXT_esp) - do_expr(0, SIGCONTEXT_eax) - do_expr(1, SIGCONTEXT_ecx) - do_expr(2, SIGCONTEXT_edx) - do_expr(3, SIGCONTEXT_ebx) - do_expr(5, SIGCONTEXT_ebp) - do_expr(6, SIGCONTEXT_esi) - do_expr(7, SIGCONTEXT_edi) - do_expr(8, SIGCONTEXT_eip) + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) .align 4 .LENDFDEDLSI1: @@ -128,15 +129,15 @@ __kernel_rt_sigreturn: slightly less complicated than the above, since we don't modify the stack pointer in the process. */ - do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) - do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) - do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) - do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) - do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) - do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) - do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) - do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) - do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) .align 4 .LENDFDEDLSI2: diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/vdso/vdso32/syscall.S index cf9ef678de3..5415b5613d5 100644 --- a/arch/x86/ia32/vsyscall-syscall.S +++ b/arch/x86/vdso/vdso32/syscall.S @@ -1,16 +1,18 @@ /* - * Code for the vsyscall page. This version uses the syscall instruction. + * Code for the vDSO. This version uses the syscall instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. */ +#define SYSCALL_ENTER_KERNEL syscall +#include "sigreturn.S" -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> #include <asm/segment.h> - .code32 .text - .section .text.vsyscall,"ax" .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: push %ebp @@ -64,6 +66,12 @@ __kernel_vsyscall: .uleb128 4 .align 4 .LENDFDE1: + .previous -#define SYSCALL_ENTER_KERNEL syscall -#include "vsyscall-sigreturn.S" + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 + .previous diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/vdso/vdso32/sysenter.S index ed879bf4299..e2800affa75 100644 --- a/arch/x86/kernel/vsyscall-sysenter_32.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -1,11 +1,10 @@ /* - * Code for the vsyscall page. This version uses the sysenter instruction. + * Code for the vDSO. This version uses the sysenter instruction. * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. + * First get the common code for the sigreturn entry points. + * This must come first. */ +#include "sigreturn.S" /* * The caller puts arg2 in %ecx, which gets pushed. The kernel will use @@ -23,11 +22,12 @@ * arg6 from the stack. * * You can not use this vsyscall for the clone() syscall because the - * three dwords on the parent stack do not get copied to the child. + * three words on the parent stack do not get copied to the child. */ .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: push %ecx @@ -45,8 +45,7 @@ __kernel_vsyscall: /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ -SYSENTER_RETURN: +VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp .Lpop_ebp: pop %edx @@ -85,38 +84,33 @@ SYSENTER_RETURN: .uleb128 0 /* What follows are the instructions for the table generation. We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall + .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx + .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx + .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x10 /* RA at offset 16 now */ .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel + .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x0c /* RA at offset 12 now */ .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp + .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx + .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x04 /* RA at offset 4 now */ .align 4 .LENDFDEDLSI: .previous -/* - * Get the common code for the sigreturn entry points. - */ -#include "vsyscall-sigreturn_32.S" + /* + * Emit a symbol with the size of this .eh_frame data, + * to verify it matches the other versions. + */ +VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S new file mode 100644 index 00000000000..976124bb5f9 --- /dev/null +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,37 @@ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0 +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} + +/* + * Symbols we define here called VDSO* get their values into vdso32-syms.h. + */ +VDSO32_PRELINK = VDSO_PRELINK; +VDSO32_vsyscall = __kernel_vsyscall; +VDSO32_sigreturn = __kernel_sigreturn; +VDSO32_rt_sigreturn = __kernel_rt_sigreturn; diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 3b1ae1abfba..c8097f17f8a 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -15,11 +15,11 @@ long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int dummy, p; + unsigned int p; if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ff9333e5fb0..3fdd51497a8 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -11,23 +11,20 @@ #include <asm/vsyscall.h> #include <asm/vgtod.h> #include <asm/proto.h> -#include "voffset.h" +#include <asm/vdso.h> -int vdso_enabled = 1; - -#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; -#include "vextern.h" +#include "vextern.h" /* Just for VMAGIC. */ #undef VEXTERN -extern char vdso_kernel_start[], vdso_start[], vdso_end[]; +int vdso_enabled = 1; + +extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; struct page **vdso_pages; -static inline void *var_ref(void *vbase, char *var, char *name) +static inline void *var_ref(void *p, char *name) { - unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; - void *p = vbase + offset; if (*(void **)p != (void *)VMAGIC) { printk("VDSO: variable %s broken\n", name); vdso_enabled = 0; @@ -62,9 +59,8 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; } -#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) #define VEXTERN(x) \ - V(vdso_ ## x) = &__ ## x; + *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" #undef VEXTERN return 0; diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h deleted file mode 100644 index 4af67c79085..00000000000 --- a/arch/x86/vdso/voffset.h +++ /dev/null @@ -1 +0,0 @@ -#define VDSO_TEXT_OFFSET 0x600 diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index fbfa55ce0d5..4d5f2649bee 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 79ad1525215..de647bc6e74 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -141,8 +141,8 @@ static void __init xen_banner(void) printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } -static void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { unsigned maskedx = ~0; @@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx, * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*eax == 1) + if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); - *edx &= maskedx; + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx)); + *dx &= maskedx; } static void xen_set_debugreg(int reg, unsigned long val) @@ -275,19 +275,12 @@ static unsigned long xen_store_tr(void) static void xen_set_ldt(const void *addr, unsigned entries) { - unsigned long linear_addr = (unsigned long)addr; struct mmuext_op *op; struct multicall_space mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_SET_LDT; - if (linear_addr) { - /* ldt my be vmalloced, use arbitrary_virt_to_machine */ - xmaddr_t maddr; - maddr = arbitrary_virt_to_machine((unsigned long)addr); - linear_addr = (unsigned long)maddr.maddr; - } - op->arg1.linear_addr = linear_addr; + op->arg1.linear_addr = (unsigned long)addr; op->arg2.nr_ents = entries; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); @@ -295,7 +288,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) xen_mc_issue(PARAVIRT_LAZY_CPU); } -static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long *frames; unsigned long va = dtr->address; @@ -357,11 +350,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) } static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) + const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; xmaddr_t mach_lp = virt_to_machine(lp); - u64 entry = (u64)high << 32 | low; + u64 entry = *(u64 *)ptr; preempt_disable(); @@ -395,12 +388,11 @@ static int cvt_gate_to_trap(int vector, u32 low, u32 high, } /* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); +static DEFINE_PER_CPU(struct desc_ptr, idt_desc); /* Set an IDT entry. If the entry is part of the current IDT, then also update Xen. */ -static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) +static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { unsigned long p = (unsigned long)&dt[entrynum]; unsigned long start, end; @@ -412,14 +404,15 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, xen_mc_flush(); - write_dt_entry(dt, entrynum, low, high); + native_write_idt_entry(dt, entrynum, g); if (p >= start && (p + 8) <= end) { struct trap_info info[2]; + u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -427,7 +420,7 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, +static void xen_convert_trap_info(const struct desc_ptr *desc, struct trap_info *traps) { unsigned in, out, count; @@ -446,7 +439,7 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, void xen_copy_trap_info(struct trap_info *traps) { - const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); + const struct desc_ptr *desc = &__get_cpu_var(idt_desc); xen_convert_trap_info(desc, traps); } @@ -454,7 +447,7 @@ void xen_copy_trap_info(struct trap_info *traps) /* Load a new IDT into Xen. In principle this can be per-CPU, so we hold a spinlock to protect the static traps[] array (static because it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct Xgt_desc_struct *desc) +static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; @@ -475,22 +468,21 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc) /* Write a GDT descriptor entry. Ignore LDT descriptors, since they're handled differently. */ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - u32 low, u32 high) + const void *desc, int type) { preempt_disable(); - switch ((high >> 8) & 0xff) { - case DESCTYPE_LDT: - case DESCTYPE_TSS: + switch (type) { + case DESC_LDT: + case DESC_TSS: /* ignore */ break; default: { xmaddr_t maddr = virt_to_machine(&dt[entry]); - u64 desc = (u64)high << 32 | low; xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) BUG(); } @@ -499,11 +491,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, preempt_enable(); } -static void xen_load_esp0(struct tss_struct *tss, +static void xen_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); } @@ -521,12 +513,12 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static unsigned long xen_apic_read(unsigned long reg) +static u32 xen_apic_read(unsigned long reg) { return 0; } -static void xen_apic_write(unsigned long reg, unsigned long val) +static void xen_apic_write(unsigned long reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); @@ -666,6 +658,13 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } +/* Early release_pt assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void xen_release_pt_init(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + static void pin_pagetable_pfn(unsigned level, unsigned long pfn) { struct mmuext_op op; @@ -677,7 +676,7 @@ static void pin_pagetable_pfn(unsigned level, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -686,7 +685,7 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + pin_pagetable_pfn(level, pfn); } else /* make sure there are no stray mappings of this page */ @@ -694,6 +693,16 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) } } +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE); +} + +static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE); +} + /* This should never happen until we're OK to use struct page */ static void xen_release_pt(u32 pfn) { @@ -796,6 +805,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* This will work as long as patching hasn't happened yet (which it hasn't) */ pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.alloc_pd = xen_alloc_pd; + pv_mmu_ops.release_pt = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pt; pv_mmu_ops.set_pte = xen_set_pte; if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -953,7 +965,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = (void *)&hypercall_page[__HYPERVISOR_iret], - .irq_enable_sysexit = NULL, /* never called */ + .irq_enable_syscall_ret = NULL, /* never called */ .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -968,7 +980,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .write_ldt_entry = xen_write_ldt_entry, .write_gdt_entry = xen_write_gdt_entry, .write_idt_entry = xen_write_idt_entry, - .load_esp0 = xen_load_esp0, + .load_sp0 = xen_load_sp0, .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, @@ -1019,10 +1031,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update_defer = paravirt_nop, .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt, - .alloc_pd = paravirt_nop, + .release_pt = xen_release_pt_init, + .alloc_pd = xen_alloc_pt_init, .alloc_pd_clone = paravirt_nop, - .release_pd = paravirt_nop, + .release_pd = xen_release_pt_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index 6d1da5809e6..dcf613e1758 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -465,7 +465,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +void xen_evtchn_do_upcall(struct pt_regs *regs) { int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; @@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) int irq = evtchn_to_irq[port]; if (irq != -1) { - regs->orig_eax = ~irq; + regs->orig_ax = ~irq; do_IRQ(regs); } } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ac6c5dc49b..45aa771e73a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,7 +58,8 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { - pte_t *pte = lookup_address(address); + int level; + pte_t *pte = lookup_address(address, &level); unsigned offset = address & PAGE_MASK; BUG_ON(pte == NULL); @@ -70,8 +71,9 @@ void make_lowmem_page_readonly(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; + int level; - pte = lookup_address(address); + pte = lookup_address(address, &level); BUG_ON(pte == NULL); ptev = pte_wrprotect(*pte); @@ -84,8 +86,9 @@ void make_lowmem_page_readwrite(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; + int level; - pte = lookup_address(address); + pte = lookup_address(address, &level); BUG_ON(pte == NULL); ptev = pte_mkwrite(*pte); @@ -241,12 +244,12 @@ unsigned long long xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(unsigned long long pte) { - if (pte & 1) + if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } - pte &= ~_PAGE_PCD; - - return (pte_t){ pte, pte >> 32 }; + return (pte_t){ .pte = pte }; } pmd_t xen_make_pmd(unsigned long long pmd) @@ -290,10 +293,10 @@ unsigned long xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(unsigned long pte) { - if (pte & _PAGE_PRESENT) + if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; - - pte &= ~_PAGE_PCD; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } return (pte_t){ pte }; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f84e7722664..3bad4773a2f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -10,6 +10,7 @@ #include <linux/pm.h> #include <asm/elf.h> +#include <asm/vdso.h> #include <asm/e820.h> #include <asm/setup.h> #include <asm/xen/hypervisor.h> @@ -59,12 +60,10 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. */ -static void fiddle_vdso(void) +static void __init fiddle_vdso(void) { - extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ - extern char vsyscall_int80_start; - u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + - &vsyscall_int80_start); + extern const char vdso32_default_start; + u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c1b131bcdcb..aafc5443740 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -146,7 +146,7 @@ void __init xen_smp_prepare_boot_cpu(void) old memory can be recycled */ make_lowmem_page_readwrite(&per_cpu__gdt_page); - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); /* * cpu_core_map lives in a per cpu area that is cleared @@ -163,7 +163,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); /* * cpu_core_ map will be zeroed when the per @@ -239,10 +239,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); ctxt->user_regs.cs = __KERNEL_CS; - ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.esp0; + ctxt->kernel_sp = idle->thread.sp0; ctxt->event_callback_cs = __KERNEL_CS; ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index d083ff5ef08..b3721fd6877 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -592,7 +592,7 @@ __init void xen_time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - tsc_disable = 0; + setup_force_cpu_cap(X86_FEATURE_TSC); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index f8d6937db2e..288d587ce73 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -4,16 +4,18 @@ #ifdef CONFIG_XEN #include <linux/elfnote.h> +#include <linux/init.h> #include <asm/boot.h> #include <xen/interface/elfnote.h> -.pushsection .init.text + __INIT ENTRY(startup_xen) movl %esi,xen_start_info cld movl $(init_thread_union+THREAD_SIZE),%esp jmp xen_start_kernel -.popsection + + __FINIT .pushsection .bss.page_aligned .align PAGE_SIZE_asm |