aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig41
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/boot.h10
-rw-r--r--arch/x86/boot/cpu.c3
-rw-r--r--arch/x86/boot/cpucheck.c10
-rw-r--r--arch/x86/boot/main.c5
-rw-r--r--arch/x86/boot/memory.c1
-rw-r--r--arch/x86/ia32/ia32_aout.c6
-rw-r--r--arch/x86/kernel/acpi/boot.c16
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/amd_iommu.c36
-rw-r--r--arch/x86/kernel/amd_iommu_init.c28
-rw-r--r--arch/x86/kernel/apic_32.c22
-rw-r--r--arch/x86/kernel/apic_64.c7
-rw-r--r--arch/x86/kernel/cpu/bugs.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c10
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c125
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c12
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c3
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c3
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c8
-rw-r--r--arch/x86/kernel/efi_32.c4
-rw-r--r--arch/x86/kernel/genapic_64.c1
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_32.S8
-rw-r--r--arch/x86/kernel/hpet.c24
-rw-r--r--arch/x86/kernel/io_apic_32.c6
-rw-r--r--arch/x86/kernel/io_apic_64.c25
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c57
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c52
-rw-r--r--arch/x86/kernel/microcode.c17
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c17
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c28
-rw-r--r--arch/x86/kernel/pci-calgary_64.c75
-rw-r--r--arch/x86/kernel/pci-dma.c157
-rw-r--r--arch/x86/kernel/pci-gart_64.c14
-rw-r--r--arch/x86/kernel/pci-nommu.c14
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c2
-rw-r--r--arch/x86/kernel/process_32.c5
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/reboot.c11
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S178
-rw-r--r--arch/x86/kernel/setup.c37
-rw-r--r--arch/x86/kernel/setup_percpu.c21
-rw-r--r--arch/x86/kernel/signal_64.c11
-rw-r--r--arch/x86/kernel/smpboot.c65
-rw-r--r--arch/x86/kernel/smpcommon.c17
-rw-r--r--arch/x86/kernel/traps_64.c9
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/visws_quirks.c6
-rw-r--r--arch/x86/kernel/vmi_32.c3
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S8
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/mmu.c107
-rw-r--r--arch/x86/kvm/paging_tmpl.h12
-rw-r--r--arch/x86/kvm/svm.c10
-rw-r--r--arch/x86/kvm/vmx.c22
-rw-r--r--arch/x86/kvm/x86.c133
-rw-r--r--arch/x86/lguest/boot.c3
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S3
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/gup.c298
-rw-r--r--arch/x86/mm/init_64.c49
-rw-r--r--arch/x86/mm/ioremap.c2
-rw-r--r--arch/x86/mm/pageattr-test.c3
-rw-r--r--arch/x86/mm/pageattr.c21
-rw-r--r--arch/x86/mm/pgtable.c3
-rw-r--r--arch/x86/mm/pgtable_32.c47
-rw-r--r--arch/x86/mm/srat_32.c12
-rw-r--r--arch/x86/oprofile/op_model_p4.c175
-rw-r--r--arch/x86/pci/fixup.c3
-rw-r--r--arch/x86/pci/i386.c26
-rw-r--r--arch/x86/pci/irq.c106
-rw-r--r--arch/x86/pci/mmconfig-shared.c2
-rw-r--r--arch/x86/pci/numaq_32.c5
-rw-r--r--arch/x86/power/cpu_32.c6
-rw-r--r--arch/x86/power/hibernate_asm_32.S26
87 files changed, 1422 insertions, 924 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cba0b4560..68d91c8233f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,12 +23,13 @@ config X86
select HAVE_OPROFILE
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
- select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X
+ select ARCH_WANT_OPTIONAL_GPIOLIB
select HAVE_KRETPROBES
select HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
+ select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
config ARCH_DEFCONFIG
@@ -332,20 +333,6 @@ config X86_BIGSMP
endif
-config X86_RDC321X
- bool "RDC R-321x SoC"
- depends on X86_32
- select M486
- select X86_REBOOTFIXUPS
- select GENERIC_GPIO
- select LEDS_CLASS
- select LEDS_GPIO
- select NEW_LEDS
- help
- This option is needed for RDC R-321x system-on-chip, also known
- as R-8610-(G).
- If you don't have one of these chips, you should say N here.
-
config X86_VSMP
bool "Support for ScaleMP vSMP"
select PARAVIRT
@@ -369,6 +356,16 @@ config X86_VISWS
A kernel compiled for the Visual Workstation will run on general
PCs as well. See <file:Documentation/sgi-visws.txt> for details.
+config X86_RDC321X
+ bool "RDC R-321x SoC"
+ depends on X86_32
+ select M486
+ select X86_REBOOTFIXUPS
+ help
+ This option is needed for RDC R-321x system-on-chip, also known
+ as R-8610-(G).
+ If you don't have one of these chips, you should say N here.
+
config SCHED_NO_NO_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
@@ -954,9 +951,9 @@ config NUMA
local memory controller of the CPU and add some more
NUMA awareness to the kernel.
- For i386 this is currently highly experimental and should be only
+ For 32-bit this is currently highly experimental and should be only
used for kernel development. It might also cause boot failures.
- For x86_64 this is recommended on all multiprocessor Opteron systems.
+ For 64-bit this is recommended on all multiprocessor Opteron systems.
If the system is EM64T, you should say N unless your system is
EM64T NUMA.
@@ -1266,7 +1263,7 @@ config KEXEC
strongly in flux, so no good recommendation can be made.
config CRASH_DUMP
- bool "kernel crash dumps (EXPERIMENTAL)"
+ bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
help
Generate crash dump after being started by kexec.
@@ -1279,6 +1276,14 @@ config CRASH_DUMP
(CONFIG_RELOCATABLE=y).
For more details see Documentation/kdump/kdump.txt
+config KEXEC_JUMP
+ bool "kexec jump (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on KEXEC && HIBERNATION && X86_32
+ help
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea65..f5631da585b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
-# RDC R-321x subarch support
-mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
-mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
-core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
-
# default subarch .h files
mflags-y += -Iinclude/asm-x86/mach-default
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7c..cc0ef13fba7 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,10 +24,14 @@
#include <linux/edd.h>
#include <asm/boot.h>
#include <asm/setup.h>
+#include "bitops.h"
+#include <asm/cpufeature.h>
/* Useful macros */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
extern struct setup_header hdr;
extern struct boot_params boot_params;
@@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
/* cpu.c, cpucheck.c */
+struct cpu_features {
+ int level; /* Family, or 64 for x86-64 */
+ int model;
+ u32 flags[NCAPINTS];
+};
+extern struct cpu_features cpu;
int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
int validate_cpu(void);
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 92d6fd73dc7..75298fe2edc 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
*/
#include "boot.h"
-#include "bitops.h"
-#include <asm/cpufeature.h>
-
#include "cpustr.h"
static char *cpu_name(int level)
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee00..4b9ae7c5674 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,21 +22,13 @@
#ifdef _SETUP
# include "boot.h"
-# include "bitops.h"
#endif
#include <linux/types.h>
-#include <asm/cpufeature.h>
#include <asm/processor-flags.h>
#include <asm/required-features.h>
#include <asm/msr-index.h>
-struct cpu_features {
- int level; /* Family, or 64 for x86-64 */
- int model;
- u32 flags[NCAPINTS];
-};
-
-static struct cpu_features cpu;
+struct cpu_features cpu;
static u32 cpu_vendor[3];
static u32 err_flags[NCAPINTS];
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 2296164b54d..197421db1af 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,11 @@ static void keyboard_set_repeat(void)
*/
static void query_ist(void)
{
+ /* Some older BIOSes apparently crash on this call, so filter
+ it from machines too old to have SpeedStep at all. */
+ if (cpu.level < 6)
+ return;
+
asm("int $0x15"
: "=a" (boot_params.ist_info.signature),
"=b" (boot_params.ist_info.command),
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 53165c97336..8c3c25f3557 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,7 +13,6 @@
*/
#include "boot.h"
-#include <linux/kernel.h>
#define SMAP 0x534d4150 /* ASCII "SMAP" */
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b..a0e1dbe67dc 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
regs->r8 = regs->r9 = regs->r10 = regs->r11 =
regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
set_fs(USER_DS);
- if (unlikely(current->ptrace & PT_PTRACED)) {
- if (current->ptrace & PT_TRACE_EXEC)
- ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
- else
- send_sig(SIGTRAP, current, 0);
- }
return 0;
}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fa88a1d7129..bfd10fd211c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -97,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -158,6 +160,14 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+ if (!strcmp(mcfg->header.oem_id, "SGI"))
+ acpi_mcfg_64bit_base_addr = TRUE;
+
+ return 0;
+}
+
int __init acpi_parse_mcfg(struct acpi_table_header *header)
{
struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
}
memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+ acpi_mcfg_oem_check(mcfg);
+
for (i = 0; i < pci_mmcfg_config_num; ++i) {
- if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
+ if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+ !acpi_mcfg_64bit_base_addr) {
printk(KERN_ERR PREFIX
"MMCONFIG not in low 4GB of memory\n");
kfree(pci_mmcfg_config);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 9220cf46aa1..c2502eb9aa8 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,7 +73,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
cpumask_t saved_mask;
- cpumask_of_cpu_ptr(new_mask, cpu);
int retval;
unsigned int eax, ebx, ecx, edx;
unsigned int edx_part;
@@ -92,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
/* Make sure we are running on right CPU */
saved_mask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, new_mask);
+ retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (retval)
return -1;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fa2161d5003..426e5d91b63 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags;
/* address in low memory of the wakeup routine. */
static unsigned long acpi_realmode;
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
static char temp_stack[10240];
#endif
@@ -86,7 +86,7 @@ int acpi_save_state_mem(void)
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = read_cr4_safe();
header->realmode_flags = acpi_realmode_flags;
header->real_magic = 0x12345678;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c25210e6ac8..de39e1f2ede 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -29,9 +29,6 @@
#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-#define to_pages(addr, size) \
- (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
-
#define EXIT_LOOP_COUNT 10000000
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
@@ -104,16 +101,13 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret;
+ int ret, ready = 0;
+ unsigned status = 0;
struct iommu_cmd cmd;
- volatile u64 ready = 0;
- unsigned long ready_phys = virt_to_phys(&ready);
unsigned long i = 0;
memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
- cmd.data[1] = upper_32_bits(ready_phys);
- cmd.data[2] = 1; /* value written to 'ready' */
+ cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
iommu->need_sync = 0;
@@ -125,9 +119,15 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
while (!ready && (i < EXIT_LOOP_COUNT)) {
++i;
- cpu_relax();
+ /* wait for the bit to become one */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
}
+ /* set bit back to zero */
+ status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+ writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
@@ -164,7 +164,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
address &= PAGE_MASK;
CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
cmd.data[1] |= domid;
- cmd.data[2] = LOW_U32(address);
+ cmd.data[2] = lower_32_bits(address);
cmd.data[3] = upper_32_bits(address);
if (s) /* size bit - we flush more than one 4kb page */
cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
@@ -185,7 +185,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
u64 address, size_t size)
{
int s = 0;
- unsigned pages = to_pages(address, size);
+ unsigned pages = iommu_num_pages(address, size);
address &= PAGE_MASK;
@@ -557,8 +557,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
if (iommu->exclusion_start &&
iommu->exclusion_start < dma_dom->aperture_size) {
unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = to_pages(iommu->exclusion_start,
- iommu->exclusion_length);
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length);
dma_ops_reserve_addresses(dma_dom, startpage, pages);
}
@@ -667,7 +667,7 @@ static int get_device_resources(struct device *dev,
_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
/* device not translated by any IOMMU in the system? */
- if (_bdf >= amd_iommu_last_bdf) {
+ if (_bdf > amd_iommu_last_bdf) {
*iommu = NULL;
*domain = NULL;
*bdf = 0xffff;
@@ -767,7 +767,7 @@ static dma_addr_t __map_single(struct device *dev,
unsigned int pages;
int i;
- pages = to_pages(paddr, size);
+ pages = iommu_num_pages(paddr, size);
paddr &= PAGE_MASK;
address = dma_ops_alloc_addresses(dev, dma_dom, pages);
@@ -802,7 +802,7 @@ static void __unmap_single(struct amd_iommu *iommu,
if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
return;
- pages = to_pages(dma_addr, size);
+ pages = iommu_num_pages(dma_addr, size);
dma_addr &= PAGE_MASK;
start = dma_addr;
@@ -1085,7 +1085,7 @@ void prealloc_protection_domains(void)
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
devid = (dev->bus->number << 8) | dev->devfn;
- if (devid >= amd_iommu_last_bdf)
+ if (devid > amd_iommu_last_bdf)
continue;
devid = amd_iommu_alias_table[devid];
if (domain_for_device(devid))
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c9d8ff2eb13..a69cc0f5204 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -732,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
set_device_exclusion_range(m->devid, m);
break;
case ACPI_IVMD_TYPE_ALL:
- for (i = 0; i < amd_iommu_last_bdf; ++i)
+ for (i = 0; i <= amd_iommu_last_bdf; ++i)
set_device_exclusion_range(i, m);
break;
case ACPI_IVMD_TYPE_RANGE:
@@ -801,6 +801,21 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
}
/*
+ * Init the device table to not allow DMA access for devices and
+ * suppress all page faults
+ */
+static void init_device_table(void)
+{
+ u16 devid;
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+ set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+ set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+ set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
+ }
+}
+
+/*
* This function finally enables all IOMMUs found in the system after
* they have been initialized
*/
@@ -931,10 +946,13 @@ int __init amd_iommu_init(void)
if (amd_iommu_pd_alloc_bitmap == NULL)
goto free;
+ /* init the device table */
+ init_device_table();
+
/*
* let all alias entries point to itself
*/
- for (i = 0; i < amd_iommu_last_bdf; ++i)
+ for (i = 0; i <= amd_iommu_last_bdf; ++i)
amd_iommu_alias_table[i] = i;
/*
@@ -954,15 +972,15 @@ int __init amd_iommu_init(void)
if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
goto free;
- ret = amd_iommu_init_dma_ops();
+ ret = sysdev_class_register(&amd_iommu_sysdev_class);
if (ret)
goto free;
- ret = sysdev_class_register(&amd_iommu_sysdev_class);
+ ret = sysdev_register(&device_amd_iommu);
if (ret)
goto free;
- ret = sysdev_register(&device_amd_iommu);
+ ret = amd_iommu_init_dma_ops();
if (ret)
goto free;
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 0059e7a8a9e..0ff576d026a 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -1458,8 +1458,6 @@ void disconnect_bsp_APIC(int virt_wire_setup)
}
}
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
-
void __cpuinit generic_processor_info(int apicid, int version)
{
int cpu;
@@ -1486,12 +1484,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
return;
}
- if (num_processors >= maxcpus) {
- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
- " Processor ignored.\n", maxcpus);
- return;
- }
-
num_processors++;
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
@@ -1724,15 +1716,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
}
early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-static int __init apic_set_verbosity(char *str)
+static int __init apic_set_verbosity(char *arg)
{
- if (strcmp("debug", str) == 0)
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp(arg, "debug") == 0)
apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
+ else if (strcmp(arg, "verbose") == 0)
apic_verbosity = APIC_VERBOSE;
- return 1;
+
+ return 0;
}
-__setup("apic=", apic_set_verbosity);
+early_param("apic", apic_set_verbosity);
static int __init lapic_insert_resource(void)
{
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index e571351f2a9..57744f4a75b 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -90,7 +90,6 @@ static unsigned long apic_phys;
unsigned long mp_lapic_addr;
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
/*
* Get the LAPIC version
*/
@@ -1066,12 +1065,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
return;
}
- if (num_processors >= maxcpus) {
- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
- " Processor ignored.\n", maxcpus);
- return;
- }
-
num_processors++;
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c9b58a806e8..c8e315f1aa8 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
*/
static void __init check_fpu(void)
{
+ s32 fdiv_bug;
+
if (!boot_cpu_data.hard_math) {
#ifndef CONFIG_MATH_EMULATION
printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
"fistpl %0\n\t"
"fwait\n\t"
"fninit"
- : "=m" (*&boot_cpu_data.fdiv_bug)
+ : "=m" (*&fdiv_bug)
: "m" (*&x), "m" (*&y));
+
+ boot_cpu_data.fdiv_bug = fdiv_bug;
if (boot_cpu_data.fdiv_bug)
printk("Hmm, FPU with FDIV bug.\n");
}
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596..efae3b22a0f 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
If in doubt, say N.
config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
+ tristate "VIA C7 Enhanced PowerSaver"
select CPU_FREQ_TABLE
- depends on X86_32 && EXPERIMENTAL
+ depends on X86_32
help
This adds the CPUFreq driver for VIA C7 processors.
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ff2fff56f0a..dd097b83583 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,12 +200,10 @@ static void drv_read(struct drv_cmd *cmd)
static void drv_write(struct drv_cmd *cmd)
{
cpumask_t saved_mask = current->cpus_allowed;
- cpumask_of_cpu_ptr_declare(cpu_mask);
unsigned int i;
for_each_cpu_mask_nr(i, cmd->mask) {
- cpumask_of_cpu_ptr_next(cpu_mask, i);
- set_cpus_allowed_ptr(current, cpu_mask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
do_drv_write(cmd);
}
@@ -269,12 +267,11 @@ static unsigned int get_measured_perf(unsigned int cpu)
} aperf_cur, mperf_cur;
cpumask_t saved_mask;
- cpumask_of_cpu_ptr(cpu_mask, cpu);
unsigned int perf_percent;
unsigned int retval;
saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpu_mask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (get_cpu() != cpu) {
/* We were not able to run on requested processor */
put_cpu();
@@ -340,7 +337,6 @@ static unsigned int get_measured_perf(unsigned int cpu)
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
- cpumask_of_cpu_ptr(cpu_mask, cpu);
struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
unsigned int freq;
unsigned int cached_freq;
@@ -353,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
}
cached_freq = data->freq_table[data->acpi_data->state].frequency;
- freq = extract_freq(get_cur_val(cpu_mask), data);
+ freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
if (freq != cached_freq) {
/*
* The dreaded BIOS frequency change behind our back.
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f56..e4a4bf870e9 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
* It is important that the frequencies
* are listed in ascending order here!
*/
-struct s_elan_multiplier elan_multiplier[] = {
+static struct s_elan_multiplier elan_multiplier[] = {
{1000, 0x02, 0x18},
{2000, 0x02, 0x10},
{4000, 0x02, 0x08},
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 53c7b693697..4e7271999a7 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid)
return 800 + (fid * 100);
}
-
/* Return a frequency in KHz, given an input fid */
static u32 find_khz_freq_from_fid(u32 fid)
{
@@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
return data[pstate].frequency;
}
-
/* Return the vco fid for an input fid
*
* Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +164,6 @@ static void fidvid_msr_init(void)
wrmsr(MSR_FIDVID_CTL, lo, hi);
}
-
/* write the new fid value along with the other control fields to the msr */
static int write_new_fid(struct powernow_k8_data *data, u32 fid)
{
@@ -479,12 +476,11 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
static int check_supported_cpu(unsigned int cpu)
{
cpumask_t oldmask;
- cpumask_of_cpu_ptr(cpu_mask, cpu);
u32 eax, ebx, ecx, edx;
unsigned int rc = 0;
oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpu_mask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -741,44 +737,63 @@ static int find_psb_table(struct powernow_k8_data *data)
#ifdef CONFIG_X86_POWERNOW_K8_ACPI
static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
{
- if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
+ if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE))
return;
- data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
- data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
- data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
- data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
- data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
- data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
+ data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK;
+ data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK;
+ data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
+ data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
+ data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK);
+ data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK;
+}
+
+
+static struct acpi_processor_performance *acpi_perf_data;
+static int preregister_valid;
+
+static int powernow_k8_cpu_preinit_acpi(void)
+{
+ acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
+ if (!acpi_perf_data)
+ return -ENODEV;
+
+ if (acpi_processor_preregister_performance(acpi_perf_data))
+ return -ENODEV;
+ else
+ preregister_valid = 1;
+ return 0;
}
static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
{
struct cpufreq_frequency_table *powernow_table;
int ret_val;
+ int cpu = 0;
- if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
+ data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+ if (acpi_processor_register_performance(data->acpi_data, data->cpu)) {
dprintk("register performance failed: bad ACPI data\n");
return -EIO;
}
/* verify the data contained in the ACPI structures */
- if (data->acpi_data.state_count <= 1) {
+ if (data->acpi_data->state_count <= 1) {
dprintk("No ACPI P-States\n");
goto err_out;
}
- if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
- (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+ if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+ (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
dprintk("Invalid control/status registers (%x - %x)\n",
- data->acpi_data.control_register.space_id,
- data->acpi_data.status_register.space_id);
+ data->acpi_data->control_register.space_id,
+ data->acpi_data->status_register.space_id);
goto err_out;
}
/* fill in data->powernow_table */
powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
- * (data->acpi_data.state_count + 1)), GFP_KERNEL);
+ * (data->acpi_data->state_count + 1)), GFP_KERNEL);
if (!powernow_table) {
dprintk("powernow_table memory alloc failure\n");
goto err_out;
@@ -791,12 +806,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
if (ret_val)
goto err_out_mem;
- powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
- powernow_table[data->acpi_data.state_count].index = 0;
+ powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END;
+ powernow_table[data->acpi_data->state_count].index = 0;
data->powernow_table = powernow_table;
/* fill in data */
- data->numps = data->acpi_data.state_count;
+ data->numps = data->acpi_data->state_count;
if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu)
print_basics(data);
powernow_k8_acpi_pst_values(data, 0);
@@ -804,16 +819,31 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
/* notify BIOS that we exist */
acpi_processor_notify_smm(THIS_MODULE);
+ /* determine affinity, from ACPI if available */
+ if (preregister_valid) {
+ if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) ||
+ (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY))
+ data->starting_core_affinity = data->acpi_data->shared_cpu_map;
+ else
+ data->starting_core_affinity = cpumask_of_cpu(data->cpu);
+ } else {
+ /* best guess from family if not */
+ if (cpu_family == CPU_HW_PSTATE)
+ data->starting_core_affinity = cpumask_of_cpu(data->cpu);
+ else
+ data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu);
+ }
+
return 0;
err_out_mem:
kfree(powernow_table);
err_out:
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+ acpi_processor_unregister_performance(data->acpi_data, data->cpu);
/* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
- data->acpi_data.state_count = 0;
+ data->acpi_data->state_count = 0;
return -ENODEV;
}
@@ -825,10 +855,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
- for (i = 0; i < data->acpi_data.state_count; i++) {
+ for (i = 0; i < data->acpi_data->state_count; i++) {
u32 index;
- index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
+ index = data->acpi_data->states[i].control & HW_PSTATE_MASK;
if (index > data->max_hw_pstate) {
printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
@@ -844,7 +874,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
powernow_table[i].index = index;
- powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000;
+ powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000;
}
return 0;
}
@@ -853,16 +883,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
{
int i;
int cntlofreq = 0;
- for (i = 0; i < data->acpi_data.state_count; i++) {
+ for (i = 0; i < data->acpi_data->state_count; i++) {
u32 fid;
u32 vid;
if (data->exttype) {
- fid = data->acpi_data.states[i].status & EXT_FID_MASK;
- vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK;
+ fid = data->acpi_data->states[i].status & EXT_FID_MASK;
+ vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK;
} else {
- fid = data->acpi_data.states[i].control & FID_MASK;
- vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
+ fid = data->acpi_data->states[i].control & FID_MASK;
+ vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK;
}
dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
@@ -903,10 +933,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
cntlofreq = i;
}
- if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
+ if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) {
printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
powernow_table[i].frequency,
- (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
+ (unsigned int) (data->acpi_data->states[i].core_frequency * 1000));
powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
continue;
}
@@ -916,11 +946,12 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
{
- if (data->acpi_data.state_count)
- acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+ if (data->acpi_data->state_count)
+ acpi_processor_unregister_performance(data->acpi_data, data->cpu);
}
#else
+static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; }
static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
@@ -1017,7 +1048,6 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
{
cpumask_t oldmask;
- cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
u32 checkfid;
u32 checkvid;
@@ -1032,7 +1062,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpu_mask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
if (smp_processor_id() != pol->cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1106,8 +1136,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
{
struct powernow_k8_data *data;
- cpumask_t oldmask;
- cpumask_of_cpu_ptr_declare(newmask);
+ cpumask_t oldmask = CPU_MASK_ALL;
int rc;
if (!cpu_online(pol->cpu))
@@ -1159,8 +1188,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
- cpumask_of_cpu_ptr_next(newmask, pol->cpu);
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
if (smp_processor_id() != pol->cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1181,10 +1209,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
/* run on any CPU again */
set_cpus_allowed_ptr(current, &oldmask);
- if (cpu_family == CPU_HW_PSTATE)
- pol->cpus = *newmask;
- else
- pol->cpus = per_cpu(cpu_core_map, pol->cpu);
+ pol->cpus = data->starting_core_affinity;
data->available_cores = &(pol->cpus);
/* Take a crude guess here.
@@ -1248,7 +1273,6 @@ static unsigned int powernowk8_get (unsigned int cpu)
{
struct powernow_k8_data *data;
cpumask_t oldmask = current->cpus_allowed;
- cpumask_of_cpu_ptr(newmask, cpu);
unsigned int khz = 0;
unsigned int first;
@@ -1258,7 +1282,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
if (!data)
return -EINVAL;
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu) {
printk(KERN_ERR PFX
"limiting to CPU %d failed in powernowk8_get\n", cpu);
@@ -1308,6 +1332,7 @@ static int __cpuinit powernowk8_init(void)
}
if (supported_cpus == num_online_cpus()) {
+ powernow_k8_cpu_preinit_acpi();
printk(KERN_INFO PFX "Found %d %s "
"processors (%d cpu cores) (" VERSION ")\n",
num_online_nodes(),
@@ -1324,6 +1349,10 @@ static void __exit powernowk8_exit(void)
dprintk("exit\n");
cpufreq_unregister_driver(&cpufreq_amd64_driver);
+
+#ifdef CONFIG_X86_POWERNOW_K8_ACPI
+ free_percpu(acpi_perf_data);
+#endif
}
MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index ab48cfed4d9..a62612cd4be 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -33,12 +33,13 @@ struct powernow_k8_data {
#ifdef CONFIG_X86_POWERNOW_K8_ACPI
/* the acpi table needs to be kept. it's only available if ACPI was
* used to determine valid frequency/vid/fid states */
- struct acpi_processor_performance acpi_data;
+ struct acpi_processor_performance *acpi_data;
#endif
/* we need to keep track of associated cores, but let cpufreq
* handle hotplug events - so just point at cpufreq pol->cpus
* structure */
cpumask_t *available_cores;
+ cpumask_t starting_core_affinity;
};
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index ca2ac13b7af..15e13c01cc3 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -324,10 +324,9 @@ static unsigned int get_cur_freq(unsigned int cpu)
unsigned l, h;
unsigned clock_freq;
cpumask_t saved_mask;
- cpumask_of_cpu_ptr(new_mask, cpu);
saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, new_mask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu)
return 0;
@@ -585,15 +584,12 @@ static int centrino_target (struct cpufreq_policy *policy,
* Best effort undo..
*/
- if (!cpus_empty(*covered_cpus)) {
- cpumask_of_cpu_ptr_declare(new_mask);
-
+ if (!cpus_empty(*covered_cpus))
for_each_cpu_mask_nr(j, *covered_cpus) {
- cpumask_of_cpu_ptr_next(new_mask, j);
- set_cpus_allowed_ptr(current, new_mask);
+ set_cpus_allowed_ptr(current,
+ &cpumask_of_cpu(j));
wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
}
- }
tmp = freqs.new;
freqs.new = freqs.old;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2f3728dc24f..191f7263c61 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,8 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
static unsigned int speedstep_get(unsigned int cpu)
{
- cpumask_of_cpu_ptr(newmask, cpu);
- return _speedstep_get(newmask);
+ return _speedstep_get(&cpumask_of_cpu(cpu));
}
/**
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 650d40f7912..6b0a10b002f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -516,7 +516,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
unsigned long j;
int retval;
cpumask_t oldmask;
- cpumask_of_cpu_ptr(newmask, cpu);
if (num_cache_leaves == 0)
return -ENOENT;
@@ -527,7 +526,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
return -ENOMEM;
oldmask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, newmask);
+ retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (retval)
goto out;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index de7439f82b9..05cc22dbd4f 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -478,7 +478,13 @@ static int setup_p4_watchdog(unsigned nmi_hz)
perfctr_msr = MSR_P4_IQ_PERFCTR1;
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR1;
- cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+
+ /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
+ if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
+ cccr_val = P4_CCCR_OVF_PMI0;
+ else
+ cccr_val = P4_CCCR_OVF_PMI1;
+ cccr_val |= P4_CCCR_ESCR_SELECT(4);
}
evntsel = P4_ESCR_EVENT_SELECT(0x3F)
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13..5cab48ee61a 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
* directory. If I have PAE, I just need to duplicate one entry in
* page directory.
*/
- cr4 = read_cr4();
+ cr4 = read_cr4_safe();
if (cr4 & X86_CR4_PAE) {
efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- cr4 = read_cr4();
+ cr4 = read_cr4_safe();
if (cr4 & X86_CR4_PAE) {
swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd21..eaff0bbb144 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -99,3 +99,4 @@ int is_uv_system(void)
{
return uv_system_type != UV_NONE;
}
+EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 2cfcbded888..2d7e307c777 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -222,7 +222,7 @@ static __init void map_low_mmrs(void)
enum map_type {map_wb, map_uc};
-static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
{
unsigned long bytes, paddr;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1b318e903bf..9bfc4d72fb2 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -88,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441ca..a7010c3a377 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP
1:
#endif /* CONFIG_SMP */
jmp *(initial_code)
-.align 4
-ENTRY(initial_code)
- .long i386_start_kernel
/*
* We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +598,11 @@ ignore_int:
#endif
iret
+.section .cpuinit.data,"wa"
+.align 4
+ENTRY(initial_code)
+ .long i386_start_kernel
+
.section .text
/*
* Real beginning of normal "text" segment
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad2b15a1334..59fd3b6b130 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -359,6 +359,7 @@ static int hpet_clocksource_register(void)
int __init hpet_enable(void)
{
unsigned long id;
+ int i;
if (!is_hpet_capable())
return 0;
@@ -369,6 +370,29 @@ int __init hpet_enable(void)
* Read the period and check for a sane value:
*/
hpet_period = hpet_readl(HPET_PERIOD);
+
+ /*
+ * AMD SB700 based systems with spread spectrum enabled use a
+ * SMM based HPET emulation to provide proper frequency
+ * setting. The SMM code is initialized with the first HPET
+ * register access and takes some time to complete. During
+ * this time the config register reads 0xffffffff. We check
+ * for max. 1000 loops whether the config register reads a non
+ * 0xffffffff value to make sure that HPET is up and running
+ * before we go further. A counting loop is safe, as the HPET
+ * access takes thousands of CPU cycles. On non SB700 based
+ * machines this check is only done once and has no side
+ * effects.
+ */
+ for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+ if (i == 1000) {
+ printk(KERN_WARNING
+ "HPET config register value = 0xFFFFFFFF. "
+ "Disabling HPET\n");
+ goto out_nohpet;
+ }
+ }
+
if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
goto out_nohpet;
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index de9aa0e3a9c..09cddb57bec 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -57,7 +57,7 @@ atomic_t irq_mis_count;
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+DEFINE_SPINLOCK(vector_lock);
int timer_through_8259 __initdata;
@@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq)
return vector;
}
-void setup_vector_irq(int cpu)
-{
-}
-
static struct irq_chip ioapic_chip;
#define IOAPIC_AUTO -1
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 8269434d170..61a83b70c18 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -101,7 +101,7 @@ int timer_through_8259 __initdata;
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -697,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+void lock_vector_lock(void)
+{
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ spin_unlock(&vector_lock);
+}
+
static int __assign_irq_vector(int irq, cpumask_t mask)
{
/*
@@ -802,7 +815,7 @@ static void __clear_irq_vector(int irq)
cpus_clear(cfg->domain);
}
-static void __setup_vector_irq(int cpu)
+void __setup_vector_irq(int cpu)
{
/* Initialize vector_irq on a new cpu */
/* This function must be called with vector_lock held */
@@ -825,14 +838,6 @@ static void __setup_vector_irq(int cpu)
}
}
-void setup_vector_irq(int cpu)
-{
- spin_lock(&vector_lock);
- __setup_vector_irq(smp_processor_id());
- spin_unlock(&vector_lock);
-}
-
-
static struct irq_chip ioapic_chip;
static void ioapic_register_intr(int irq, unsigned long trigger)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 3fee2aa50f3..b68e21f06f4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
if (reload) {
#ifdef CONFIG_SMP
- cpumask_of_cpu_ptr_declare(mask);
-
preempt_disable();
load_LDT(pc);
- cpumask_of_cpu_ptr_next(mask, smp_processor_id());
- if (!cpus_equal(current->mm->cpu_vm_mask, *mask))
+ if (!cpus_equal(current->mm->cpu_vm_mask,
+ cpumask_of_cpu(smp_processor_id())))
smp_call_function(flush_ldt, current->mm, 1);
preempt_enable();
#else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55a..0732adba05c 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
#include <asm/cpufeature.h>
#include <asm/desc.h>
#include <asm/system.h>
+#include <asm/cacheflush.h>
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
- * as needed. The pages for KEXEC_CONTROL_CODE_SIZE
+ * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
* have been allocated, but the segments have yet
* been copied into the kernel.
*
@@ -85,10 +87,12 @@ static void load_segments(void)
* reboot code buffer to allow us to avoid allocations
* later.
*
- * Currently nothing.
+ * Make control page executable.
*/
int machine_kexec_prepare(struct kimage *image)
{
+ if (nx_enabled)
+ set_pages_x(image->control_code_page, 1);
return 0;
}
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
+ if (nx_enabled)
+ set_pages_nx(image->control_code_page, 1);
}
/*
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
+ int save_ftrace_enabled;
+ asmlinkage unsigned long
+ (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ save_processor_state();
+#endif
- tracer_disable();
+ save_ftrace_enabled = __ftrace_enabled_save();
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /* We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to disable_IO_APIC() in
+ * one form or other. kexec jump path also need
+ * one.
+ */
+ disable_IO_APIC();
+#endif
+ }
+
control_page = page_address(image->control_code_page);
- memcpy(control_page, relocate_kernel, PAGE_SIZE);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ relocate_kernel_ptr = control_page;
page_list[PA_CONTROL_PAGE] = __pa(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
page_list[PA_PGD] = __pa(kexec_pgd);
page_list[VA_PGD] = (unsigned long)kexec_pgd;
#ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
page_list[PA_PTE_1] = __pa(kexec_pte1);
page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
/* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
set_idt(phys_to_virt(0),0);
/* now call it */
- relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
- image->start, cpu_has_pae);
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start, cpu_has_pae,
+ image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
}
void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a..c43caa3a91f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f48..3b599518c32 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
#include <linux/module.h>
#include <asm/geode.h>
+#define MFGPT_DEFAULT_IRQ 7
+
static struct mfgpt_timer_t {
unsigned int avail:1;
} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
}
EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
-int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
+int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
{
- u32 val, dummy;
- int offset;
+ u32 zsel, lpc, dummy;
+ int shift;
if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
return -EIO;
- if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+ /*
+ * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
+ * is using the same CMP of the timer's Siamese twin, the IRQ is set to
+ * 2, and we mustn't use nor change it.
+ * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
+ * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
+ * with *irq==0 is safe. Currently there _are_ no 2 drivers.
+ */
+ rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
+ shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
+ if (((zsel >> shift) & 0xF) == 2)
return -EIO;
- rdmsr(MSR_PIC_ZSEL_LOW, val, dummy);
+ /* Choose IRQ: if none supplied, keep IRQ already set or use default */
+ if (!*irq)
+ *irq = (zsel >> shift) & 0xF;
+ if (!*irq)
+ *irq = MFGPT_DEFAULT_IRQ;
- offset = (timer % 4) * 4;
-
- val &= ~((0xF << offset) | (0xF << (offset + 16)));
+ /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
+ if (*irq < 1 || *irq == 2 || *irq > 15)
+ return -EIO;
+ rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
+ if (lpc & (1 << *irq))
+ return -EIO;
+ /* All chosen and checked - go for it */
+ if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+ return -EIO;
if (enable) {
- val |= (irq & 0x0F) << (offset);
- val |= (irq & 0x0F) << (offset + 16);
+ zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
+ wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
}
- wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
return 0;
}
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
static u16 mfgpt_event_clock;
-static int irq = 7;
+static int irq;
static int __init mfgpt_setup(char *str)
{
get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
mfgpt_event_clock = timer;
/* Set up the IRQ on the MFGPT side */
- if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) {
+ if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
return -EIO;
}
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
&mfgpt_clockevent);
printk(KERN_INFO
- "mfgpt-timer: registering the MFGPT timer as a clock event.\n");
+ "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
+ timer, irq);
clockevents_register_device(&mfgpt_clockevent);
return 0;
err:
- geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq);
+ geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
printk(KERN_ERR
"mfgpt-timer: Unable to set up the MFGPT clock source\n");
return -EIO;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 6994c751590..652fa5c38eb 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -388,7 +388,6 @@ static int do_microcode_update (void)
void *new_mc = NULL;
int cpu;
cpumask_t old;
- cpumask_of_cpu_ptr_declare(newmask);
old = current->cpus_allowed;
@@ -405,8 +404,7 @@ static int do_microcode_update (void)
if (!uci->valid)
continue;
- cpumask_of_cpu_ptr_next(newmask, cpu);
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
error = get_maching_microcode(new_mc, cpu);
if (error < 0)
goto out;
@@ -576,7 +574,6 @@ static int apply_microcode_check_cpu(int cpu)
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
cpumask_t old;
- cpumask_of_cpu_ptr(newmask, cpu);
unsigned int val[2];
int err = 0;
@@ -585,7 +582,7 @@ static int apply_microcode_check_cpu(int cpu)
return 0;
old = current->cpus_allowed;
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
/* Check if the microcode we have in memory matches the CPU */
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -623,12 +620,11 @@ static int apply_microcode_check_cpu(int cpu)
static void microcode_init_cpu(int cpu, int resume)
{
cpumask_t old;
- cpumask_of_cpu_ptr(newmask, cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
old = current->cpus_allowed;
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
mutex_lock(&microcode_mutex);
collect_cpu_info(cpu);
if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
@@ -661,13 +657,10 @@ static ssize_t reload_store(struct sys_device *dev,
if (end == buf)
return -EINVAL;
if (val == 1) {
- cpumask_t old;
- cpumask_of_cpu_ptr(newmask, cpu);
-
- old = current->cpus_allowed;
+ cpumask_t old = current->cpus_allowed;
get_online_cpus();
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
mutex_lock(&microcode_mutex);
if (uci->valid)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b36..efc2f361fe8 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
{}
};
-void __init check_enable_amd_mmconf_dmi(void)
+void __cpuinit check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 6ae005ccaed..b3fb430725c 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -49,7 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
{
int apicid;
char *bootup_cpu = "";
@@ -83,7 +83,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
if (x86_quirks->mpc_oem_bus_info)
x86_quirks->mpc_oem_bus_info(m, str);
else
- printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -154,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
{
- printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -163,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
{
- printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -235,7 +235,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
- printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
+ apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -484,7 +484,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
}
-static void construct_ioapic_table(int mpc_default_type)
+static void __init construct_ioapic_table(int mpc_default_type)
{
struct mpc_config_ioapic ioapic;
struct mpc_config_bus bus;
@@ -529,7 +529,7 @@ static void construct_ioapic_table(int mpc_default_type)
construct_default_ioirq_mptable(mpc_default_type);
}
#else
-static inline void construct_ioapic_table(int mpc_default_type) { }
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
#endif
static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -695,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
- printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
+ apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+ bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9fd80955244..e4393808688 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -131,7 +131,7 @@ static int msr_open(struct inode *inode, struct file *file)
ret = -EIO; /* MSR not supported */
out:
unlock_kernel();
- return 0;
+ return ret;
}
/*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ac6d51222e7..abb78a2cc4a 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
+static void report_broken_nmi(int cpu, int *prev_nmi_count)
+{
+ printk(KERN_CONT "\n");
+
+ printk(KERN_WARNING
+ "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+ cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
+
+ printk(KERN_WARNING
+ "Please report this to bugzilla.kernel.org,\n");
+ printk(KERN_WARNING
+ "and attach the output of the 'dmesg' command.\n");
+
+ per_cpu(wd_enabled, cpu) = 0;
+ atomic_dec(&nmi_active);
+}
+
int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
for_each_online_cpu(cpu) {
if (!per_cpu(wd_enabled, cpu))
continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
- printk(KERN_WARNING "WARNING: CPU#%d: NMI "
- "appears to be stuck (%d->%d)!\n",
- cpu,
- prev_nmi_count[cpu],
- get_nmi_count(cpu));
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
- }
+ if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
+ report_broken_nmi(cpu, prev_nmi_count);
}
endflag = 1;
if (!atomic_read(&nmi_active)) {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4..218d783ed7a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+
#include <asm/iommu.h>
#include <asm/calgary.h>
#include <asm/tce.h>
@@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
}
}
-static int calgary_nontranslate_map_sg(struct device* dev,
- struct scatterlist *sg, int nelems, int direction)
-{
- struct scatterlist *s;
- int i;
-
- for_each_sg(sg, s, nelems, i) {
- struct page *p = sg_page(s);
-
- BUG_ON(!p);
- s->dma_address = virt_to_bus(sg_virt(s));
- s->dma_length = s->length;
- }
- return nelems;
-}
-
static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
@@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
unsigned long entry;
int i;
- if (!translation_enabled(tbl))
- return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
for_each_sg(sg, s, nelems, i) {
BUG_ON(!sg_page(s));
@@ -477,7 +459,6 @@ error:
static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
size_t size, int direction)
{
- dma_addr_t dma_handle = bad_dma_address;
void *vaddr = phys_to_virt(paddr);
unsigned long uaddr;
unsigned int npages;
@@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
uaddr = (unsigned long)vaddr;
npages = num_dma_pages(uaddr, size);
- if (translation_enabled(tbl))
- dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
- else
- dma_handle = virt_to_bus(vaddr);
-
- return dma_handle;
+ return iommu_alloc(dev, tbl, vaddr, npages, direction);
}
static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
struct iommu_table *tbl = find_iommu_table(dev);
unsigned int npages;
- if (!translation_enabled(tbl))
- return;
-
npages = num_dma_pages(dma_handle, size);
iommu_free(tbl, dma_handle, npages);
}
@@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
goto error;
memset(ret, 0, size);
- if (translation_enabled(tbl)) {
- /* set up tces to cover the allocated range */
- mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
- if (mapping == bad_dma_address)
- goto free;
-
- *dma_handle = mapping;
- } else /* non translated slot */
- *dma_handle = virt_to_bus(ret);
-
+ /* set up tces to cover the allocated range */
+ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+ if (mapping == bad_dma_address)
+ goto free;
+ *dma_handle = mapping;
return ret;
-
free:
free_pages((unsigned long)ret, get_order(size));
ret = NULL;
@@ -544,7 +511,7 @@ error:
return ret;
}
-static const struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_mapping_ops calgary_dma_ops = {
.alloc_coherent = calgary_alloc_coherent,
.map_single = calgary_map_single,
.unmap_single = calgary_unmap_single,
@@ -1241,6 +1208,16 @@ static int __init calgary_init(void)
goto error;
} while (1);
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ struct iommu_table *tbl;
+
+ tbl = find_iommu_table(&dev->dev);
+
+ if (translation_enabled(tbl))
+ dev->dev.archdata.dma_ops = &calgary_dma_ops;
+ }
+
return ret;
error:
@@ -1262,6 +1239,7 @@ error:
calgary_disable_translation(dev);
calgary_free_bus(dev);
pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+ dev->dev.archdata.dma_ops = NULL;
} while (1);
return ret;
@@ -1372,7 +1350,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
* Function for kdump case. Get the tce tables from first kernel
* by reading the contents of the base adress register of calgary iommu
*/
-static void get_tce_space_from_tar()
+static void __init get_tce_space_from_tar(void)
{
int bus;
void __iomem *target;
@@ -1503,6 +1481,10 @@ void __init detect_calgary(void)
printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
"CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
debugging ? "enabled" : "disabled");
+
+ /* swiotlb for devices that aren't behind the Calgary. */
+ if (max_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
}
return;
@@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void)
{
int ret;
- if (no_iommu || swiotlb)
+ if (no_iommu || (swiotlb && !calgary_detected))
return -ENODEV;
if (!calgary_detected)
@@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void)
if (ret) {
printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
"falling back to no_iommu\n", ret);
- if (max_pfn > MAX_DMA32_PFN)
- printk(KERN_ERR "WARNING more than 4GB of memory, "
- "32bit PCI may malfunction.\n");
return ret;
}
force_iommu = 1;
bad_dma_address = 0x0;
- dma_ops = &calgary_dma_ops;
+ /* dma_ops is set to swiotlb or nommu */
+ if (!dma_ops)
+ dma_ops = &nommu_dma_ops;
return 0;
}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551b..87d4d6964ec 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
static int forbid_dac __read_mostly;
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -123,6 +123,14 @@ void __init pci_iommu_alloc(void)
pci_swiotlb_init();
}
+
+unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
+{
+ unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
+
+ return size >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(iommu_num_pages);
#endif
/*
@@ -192,126 +200,10 @@ static __init int iommu_setup(char *p)
}
early_param("iommu", iommu_setup);
-#ifdef CONFIG_X86_32
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void __iomem *mem_base = NULL;
- int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
- goto out;
- if (dev->dma_mem)
- goto out;
-
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
- mem_base = ioremap(bus_addr, size);
- if (!mem_base)
- goto out;
-
- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dev->dma_mem)
- goto out;
- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dev->dma_mem->bitmap)
- goto free1_out;
-
- dev->dma_mem->virt_base = mem_base;
- dev->dma_mem->device_base = device_addr;
- dev->dma_mem->size = pages;
- dev->dma_mem->flags = flags;
-
- if (flags & DMA_MEMORY_MAP)
- return DMA_MEMORY_MAP;
-
- return DMA_MEMORY_IO;
-
- free1_out:
- kfree(dev->dma_mem);
- out:
- if (mem_base)
- iounmap(mem_base);
- return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
- kfree(mem->bitmap);
- kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- int pos, err;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
-
- pages >>= PAGE_SHIFT;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
- dma_addr_t *dma_handle, void **ret)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
-
- if (mem) {
- int page = bitmap_find_free_region(mem->bitmap, mem->size,
- order);
- if (page >= 0) {
- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
- *ret = mem->virt_base + (page << PAGE_SHIFT);
- memset(*ret, 0, size);
- }
- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
- *ret = NULL;
- }
- return (mem != NULL);
-}
-
-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
- if (mem && vaddr >= mem->virt_base && vaddr <
- (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
- bitmap_release_region(mem->bitmap, page, order);
- return 1;
- }
- return 0;
-}
-#else
-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
-#define dma_release_coherent(dev, order, vaddr) (0)
-#endif /* CONFIG_X86_32 */
-
int dma_supported(struct device *dev, u64 mask)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +211,8 @@ int dma_supported(struct device *dev, u64 mask)
}
#endif
- if (dma_ops->dma_supported)
- return dma_ops->dma_supported(dev, mask);
+ if (ops->dma_supported)
+ return ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
@@ -367,6 +259,7 @@ void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
void *memory = NULL;
struct page *page;
unsigned long dma_mask = 0;
@@ -376,7 +269,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
- if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
return memory;
if (!dev) {
@@ -435,8 +328,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* Let low level make its own zone decisions */
gfp &= ~(GFP_DMA32|GFP_DMA);
- if (dma_ops->alloc_coherent)
- return dma_ops->alloc_coherent(dev, size,
+ if (ops->alloc_coherent)
+ return ops->alloc_coherent(dev, size,
dma_handle, gfp);
return NULL;
}
@@ -448,14 +341,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
}
- if (dma_ops->alloc_coherent) {
+ if (ops->alloc_coherent) {
free_pages((unsigned long)memory, get_order(size));
gfp &= ~(GFP_DMA|GFP_DMA32);
- return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+ return ops->alloc_coherent(dev, size, dma_handle, gfp);
}
- if (dma_ops->map_simple) {
- *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
+ if (ops->map_simple) {
+ *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
@@ -477,12 +370,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
+ struct dma_mapping_ops *ops = get_dma_ops(dev);
+
int order = get_order(size);
WARN_ON(irqs_disabled()); /* for portability */
- if (dma_release_coherent(dev, order, vaddr))
+ if (dma_release_from_coherent(dev, order, vaddr))
return;
- if (dma_ops->unmap_single)
- dma_ops->unmap_single(dev, bus, size, 0);
+ if (ops->unmap_single)
+ ops->unmap_single(dev, bus, size, 0);
free_pages((unsigned long)vaddr, order);
}
EXPORT_SYMBOL(dma_free_coherent);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d..49285f8fd4d 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -67,9 +67,6 @@ static u32 gart_unmapped_entry;
(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
-#define to_pages(addr, size) \
- (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
-
#define EMERGENCY_PAGES 32 /* = 128KB */
#ifdef CONFIG_AGP
@@ -241,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir)
{
- unsigned long npages = to_pages(phys_mem, size);
+ unsigned long npages = iommu_num_pages(phys_mem, size);
unsigned long iommu_page = alloc_iommu(dev, npages);
int i;
@@ -304,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
return;
iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
- npages = to_pages(dma_addr, size);
+ npages = iommu_num_pages(dma_addr, size);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
CLEAR_LEAK(iommu_page + i);
@@ -387,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
}
addr = phys_addr;
- pages = to_pages(s->offset, s->length);
+ pages = iommu_num_pages(s->offset, s->length);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
SET_LEAK(iommu_page);
@@ -470,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
seg_size += s->length;
need = nextneed;
- pages += to_pages(s->offset, s->length);
+ pages += iommu_num_pages(s->offset, s->length);
ps = s;
}
if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -692,8 +689,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
extern int agp_amd64_init(void);
-static const struct dma_mapping_ops gart_dma_ops = {
- .mapping_error = NULL,
+static struct dma_mapping_ops gart_dma_ops = {
.map_single = gart_map_single,
.map_simple = gart_map_simple,
.unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff..3f91f71cdc3 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
-#ifdef CONFIG_X86_32
- return 0;
-#else
- return (dma_addr == bad_dma_address);
-#endif
-}
-
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
.map_single = nommu_map_single,
.map_sg = nommu_map_sg,
- .mapping_error = nommu_mapping_error,
.is_phys = 1,
};
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c2..c4ce0332759 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
}
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc_coherent = swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 53bc653ed5c..3b7a1ddcc0b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -95,7 +95,6 @@ static inline void play_dead(void)
{
/* This must be done before dead CPU ack */
cpu_exit_clear();
- wbinvd();
mb();
/* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD;
@@ -104,8 +103,8 @@ static inline void play_dead(void)
* With physical CPU hotplug, we should halt the cpu
*/
local_irq_disable();
- while (1)
- halt();
+ /* mask all interrupts, flush any and all caches, and halt */
+ wbinvd_halt();
}
#else
static inline void play_dead(void)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3fb62a7d9a1..71553b664e2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,14 +93,13 @@ DECLARE_PER_CPU(int, cpu_state);
static inline void play_dead(void)
{
idle_task_exit();
- wbinvd();
mb();
/* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD;
local_irq_disable();
- while (1)
- halt();
+ /* mask all interrupts, flush any and all caches, and halt */
+ wbinvd_halt();
}
#else
static inline void play_dead(void)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 06a9f643817..724adfc63cb 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -414,25 +414,20 @@ void native_machine_shutdown(void)
/* The boot cpu is always logical cpu 0 */
int reboot_cpu_id = 0;
- cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
#ifdef CONFIG_X86_32
/* See if there has been given a command line override */
if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
- cpu_online(reboot_cpu)) {
+ cpu_online(reboot_cpu))
reboot_cpu_id = reboot_cpu;
- cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
- }
#endif
/* Make certain the cpu I'm about to reboot on is online */
- if (!cpu_online(reboot_cpu_id)) {
+ if (!cpu_online(reboot_cpu_id))
reboot_cpu_id = smp_processor_id();
- cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
- }
/* Make certain I only run on the appropriate processor */
- set_cpus_allowed_ptr(current, newmask);
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470..6f50664b2ba 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define PAE_PGD_ATTR (_PAGE_PRESENT)
+/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define ESP DATA(0x0)
+#define CR0 DATA(0x4)
+#define CR3 DATA(0x8)
+#define CR4 DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE DATA(0x10)
+#define CP_PA_PGD DATA(0x14)
+#define CP_PA_SWAP_PAGE DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
+
.text
.align PAGE_SIZE
.globl relocate_kernel
relocate_kernel:
- movl 8(%esp), %ebp /* list of pages */
+ /* Save the CPU context, used for jumping back */
+
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushf
+
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %esp, ESP(%edi)
+ movl %cr0, %eax
+ movl %eax, CR0(%edi)
+ movl %cr3, %eax
+ movl %eax, CR3(%edi)
+ movl %cr4, %eax
+ movl %eax, CR4(%edi)
#ifdef CONFIG_X86_PAE
/* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
- movl 4(%esp), %ebx /* page_list */
- movl 8(%esp), %ebp /* list of pages */
- movl 12(%esp), %edx /* start address */
- movl 16(%esp), %ecx /* cpu_has_pae */
+ movl 20+4(%esp), %ebx /* page_list */
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl 20+12(%esp), %edx /* start address */
+ movl 20+16(%esp), %ecx /* cpu_has_pae */
+ movl 20+20(%esp), %esi /* preserve_context */
/* zero out flags, and disable interrupts */
pushl $0
popfl
+ /* save some information for jumping back */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %edi, CP_VA_CONTROL_PAGE(%edi)
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, CP_PA_PGD(%edi)
+ movl PTR(PA_SWAP_PAGE)(%ebp), %eax
+ movl %eax, CP_PA_SWAP_PAGE(%edi)
+ movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
/* get physical address of control page now */
/* this is impossible after page table switch */
movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
xorl %eax, %eax
movl %eax, %cr3
+ movl CP_PA_SWAP_PAGE(%edi), %eax
+ pushl %eax
+ pushl %ebx
+ call swap_pages
+ addl $8, %esp
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB, it's handy, and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %esp alone */
+
+ testl %esi, %esi
+ jnz 1f
+ xorl %edi, %edi
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %ebp, %ebp
+ ret
+1:
+ popl %edx
+ movl CP_PA_SWAP_PAGE(%edi), %esp
+ addl $PAGE_SIZE, %esp
+2:
+ call *%edx
+
+ /* get the re-entry point of the peer system */
+ movl 0(%esp), %ebp
+ call 1f
+1:
+ popl %ebx
+ subl $(1b - relocate_kernel), %ebx
+ movl CP_VA_CONTROL_PAGE(%ebx), %edi
+ lea PAGE_SIZE(%ebx), %esp
+ movl CP_PA_SWAP_PAGE(%ebx), %eax
+ movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+ pushl %eax
+ pushl %edx
+ call swap_pages
+ addl $8, %esp
+ movl CP_PA_PGD(%ebx), %eax
+ movl %eax, %cr3
+ movl %cr0, %eax
+ orl $(1<<31), %eax
+ movl %eax, %cr0
+ lea PAGE_SIZE(%edi), %esp
+ movl %edi, %eax
+ addl $(virtual_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+virtual_mapped:
+ movl CR4(%edi), %eax
+ movl %eax, %cr4
+ movl CR3(%edi), %eax
+ movl %eax, %cr3
+ movl CR0(%edi), %eax
+ movl %eax, %cr0
+ movl ESP(%edi), %esp
+ movl %ebp, %eax
+
+ popf
+ popl %ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
/* Do the copies */
- movl %ebx, %ecx
+swap_pages:
+ movl 8(%esp), %edx
+ movl 4(%esp), %ecx
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl %ecx, %ebx
jmp 1f
0: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
+ movl %edi, %eax
+ movl %esi, %ebp
+
+ movl %edx, %edi
movl $1024, %ecx
rep ; movsl
- jmp 0b
-
-3:
- /* To be certain of avoiding problems with self-modifying code
- * I need to execute a serializing instruction here.
- * So I flush the TLB, it's handy, and not processor dependent.
- */
- xorl %eax, %eax
- movl %eax, %cr3
+ movl %ebp, %edi
+ movl %eax, %esi
+ movl $1024, %ecx
+ rep ; movsl
- /* set all of the registers to known values */
- /* leave %esp alone */
+ movl %eax, %edi
+ movl %edx, %esi
+ movl $1024, %ecx
+ rep ; movsl
- xorl %eax, %eax
- xorl %ebx, %ebx
- xorl %ecx, %ecx
- xorl %edx, %edx
- xorl %esi, %esi
- xorl %edi, %edi
- xorl %ebp, %ebp
+ lea PAGE_SIZE(%ebp), %esi
+ jmp 0b
+3:
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b520dae02bf..a4656adab53 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -445,7 +445,7 @@ static void __init reserve_early_setup_data(void)
* @size: Size of the crashkernel memory to reserve.
* Returns the base address on success, and -1ULL on failure.
*/
-unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
{
const unsigned long long alignment = 16<<20; /* 16M */
unsigned long long start = 0LL;
@@ -604,6 +604,14 @@ void __init setup_arch(char **cmdline_p)
early_cpu_init();
early_ioremap_init();
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+ /*
+ * Must be before kernel pagetables are setup
+ * or fixmap area is touched.
+ */
+ vmi_init();
+#endif
+
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
edid_info = boot_params.edid_info;
@@ -788,10 +796,6 @@ void __init setup_arch(char **cmdline_p)
initmem_init(0, max_pfn);
-#ifdef CONFIG_X86_64
- dma32_reserve_bootmem();
-#endif
-
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
@@ -806,20 +810,21 @@ void __init setup_arch(char **cmdline_p)
#endif
reserve_crashkernel();
+#ifdef CONFIG_X86_64
+ /*
+ * dma32_reserve_bootmem() allocates bootmem which may conflict
+ * with the crashkernel command line, so do that after
+ * reserve_crashkernel()
+ */
+ dma32_reserve_bootmem();
+#endif
+
reserve_ibft_region();
#ifdef CONFIG_KVM_CLOCK
kvmclock_init();
#endif
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
- /*
- * Must be after max_low_pfn is determined, and before kernel
- * pagetables are setup.
- */
- vmi_init();
-#endif
-
paravirt_pagetable_setup_start(swapper_pg_dir);
paging_init();
paravirt_pagetable_setup_done(swapper_pg_dir);
@@ -856,12 +861,6 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
- if (def_to_bigsmp)
- printk(KERN_WARNING "More than 8 CPUs detected and "
- "CONFIG_X86_PC cannot handle it.\nUse "
- "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
kvm_guest_init();
e820_reserve_resources();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index f7745f94c00..76e305e064f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
#endif
}
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
-
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
-{
- int i;
-
- /* alloc_bootmem zeroes memory */
- cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
- for (i = 0; i < nr_cpu_ids; i++)
- cpu_set(i, cpumask_of_cpu_map[i]);
-}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
-#endif
-
#ifdef CONFIG_X86_32
/*
* Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -197,9 +179,6 @@ void __init setup_per_cpu_areas(void)
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
-
- /* Setup cpumask_of_cpu map */
- setup_cpumask_of_cpu();
}
#endif
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index b45ef8ddd65..ca316b5b742 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -104,7 +104,16 @@ static inline int restore_i387(struct _fpstate __user *buf)
clts();
task_thread_info(current)->status |= TS_USEDFPU;
}
- return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
+ err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
+ if (unlikely(err)) {
+ /*
+ * Encountered an error while doing the restore from the
+ * user buffer, clear the fpu state.
+ */
+ clear_fpu(tsk);
+ clear_used_math();
+ }
+ return err;
}
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 332512767f4..e139e617f42 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused)
* for which cpus receive the IPI. Holding this
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
+ *
+ * We need to hold vector_lock so there the set of online cpus
+ * does not change while we are assigning vectors to cpus. Holding
+ * this lock ensures we don't half assign or remove an irq from a cpu.
*/
ipi_call_lock_irq();
-#ifdef CONFIG_X86_IO_APIC
- setup_vector_irq(smp_processor_id());
-#endif
+ lock_vector_lock();
+ __setup_vector_irq(smp_processor_id());
cpu_set(smp_processor_id(), cpu_online_map);
+ unlock_vector_lock();
ipi_call_unlock_irq();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
@@ -752,6 +756,14 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
}
#ifdef CONFIG_X86_64
+
+/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
+static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
+{
+ if (!after_bootmem)
+ free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
+}
+
/*
* Allocate node local memory for the AP pda.
*
@@ -780,8 +792,7 @@ int __cpuinit get_local_pda(int cpu)
if (oldpda) {
memcpy(newpda, oldpda, size);
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, size);
+ free_bootmem_pda(oldpda);
}
newpda->in_bootmem = 0;
@@ -1044,6 +1055,34 @@ static __init void disable_smp(void)
static int __init smp_sanity_check(unsigned max_cpus)
{
preempt_disable();
+
+#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+ if (def_to_bigsmp && nr_cpu_ids > 8) {
+ unsigned int cpu;
+ unsigned nr;
+
+ printk(KERN_WARNING
+ "More than 8 CPUs detected - skipping them.\n"
+ "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
+
+ nr = 0;
+ for_each_present_cpu(cpu) {
+ if (nr >= 8)
+ cpu_clear(cpu, cpu_present_map);
+ nr++;
+ }
+
+ nr = 0;
+ for_each_possible_cpu(cpu) {
+ if (nr >= 8)
+ cpu_clear(cpu, cpu_possible_map);
+ nr++;
+ }
+
+ nr_cpu_ids = 8;
+ }
+#endif
+
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
"by the BIOS.\n", hard_smp_processor_id());
@@ -1336,7 +1375,9 @@ int __cpu_disable(void)
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
+ lock_vector_lock();
remove_cpu_from_maps(cpu);
+ unlock_vector_lock();
fixup_irqs(cpu_online_map);
return 0;
}
@@ -1370,17 +1411,3 @@ void __cpu_die(unsigned int cpu)
BUG();
}
#endif
-
-/*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
-static int __init parse_maxcpus(char *arg)
-{
- extern unsigned int maxcpus;
-
- if (arg)
- maxcpus = simple_strtoul(arg, NULL, 0);
- return 0;
-}
-early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca..397e309839d 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
DEFINE_PER_CPU(unsigned long, this_cpu_off);
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-/* Initialize the CPU's GDT. This is either the boot CPU doing itself
- (still using the master per-cpu area), or a CPU doing it for a
- secondary which will soon come up. */
+/*
+ * Initialize the CPU's GDT. This is either the boot CPU doing itself
+ * (still using the master per-cpu area), or a CPU doing it for a
+ * secondary which will soon come up.
+ */
__cpuinit void init_gdt(int cpu)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+ struct desc_struct gdt;
- pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
- __per_cpu_offset[cpu], 0xFFFFF,
+ pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
0x2 | DESCTYPE_S, 0x8);
+ gdt.s = 1;
- gdt[GDT_ENTRY_PERCPU].s = 1;
+ write_gdt_entry(get_cpu_gdt_table(cpu),
+ GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 3f18d73f420..513caaca711 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -1131,7 +1131,14 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
- restore_fpu_checking(&me->thread.xstate->fxsave);
+ /*
+ * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+ */
+ if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
+ stts();
+ force_sig(SIGSEGV, me);
+ return;
+ }
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c055390..46af7167673 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *pm, u64 *hpet)
{
u64 t1, t2;
int i;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 41e01b145c4..594ef47f0a6 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -184,8 +184,6 @@ static int __init visws_get_smp_config(unsigned int early)
return 1;
}
-extern unsigned int __cpuinitdata maxcpus;
-
/*
* The Visual Workstation is Intel MP compliant in the hardware
* sense, but it doesn't have a BIOS(-configuration table).
@@ -244,8 +242,8 @@ static int __init visws_find_smp_config(unsigned int reserve)
ncpus = CO_CPU_MAX;
}
- if (ncpus > maxcpus)
- ncpus = maxcpus;
+ if (ncpus > setup_max_cpus)
+ ncpus = setup_max_cpus;
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 0a1b1a9d922..6ca515d6db5 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
#include <asm/timer.h>
#include <asm/vmi_time.h>
#include <asm/kmap_types.h>
+#include <asm/setup.h>
/* Convenient for calling VMI functions indirectly in the ROM */
typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
{
/* We must establish the lowmem mapping for MMU ops to work */
if (vmi_ops.set_linear_mapping)
- vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
+ vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
}
/*
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d..af5bdad8460 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -209,3 +209,11 @@ SECTIONS
DWARF_DEBUG
}
+
+#ifdef CONFIG_KEXEC
+/* Link time checks */
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3..ce3251ce550 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b0e4ddca6c1..0bfe2bd305e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
account_shadowed(kvm, gfn);
}
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int need_tlb_flush = 0;
+
+ while ((spte = rmap_next(kvm, rmapp, NULL))) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ need_tlb_flush = 1;
+ }
+ return need_tlb_flush;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+{
+ int i;
+ int retval = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+ retval |= handler(kvm,
+ &memslot->lpage_info[
+ gfn_offset /
+ KVM_PAGES_PER_HPAGE].rmap_pde);
+ }
+ }
+
+ return retval;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = 1;
+ clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
int r;
int largepage = 0;
pfn_t pfn;
+ unsigned long mmu_seq;
down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
largepage = 1;
}
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
@@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
PT32E_ROOT_LEVEL);
@@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
@@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
int r;
int largepage = 0;
gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
@@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
return 1;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
vcpu->arch.update_pte.largepage = 1;
}
+ vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
@@ -1814,6 +1914,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
@@ -1870,6 +1971,12 @@ void kvm_enable_tdp(void)
}
EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+void kvm_disable_tdp(void)
+{
+ tdp_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4d918220bae..f72ac1fa35f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
return;
+ if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
+ return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int largepage = 0;
+ unsigned long mmu_seq;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
largepage = 1;
}
}
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ /* implicit mb(), we'll read before PT lock is unlocked */
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
@@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu, mmu_seq))
+ goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
largepage, &write_pt, pfn);
@@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
spin_unlock(&vcpu->kvm->mmu_lock);
return write_pt;
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return 0;
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b756e876dce..e2ee264740c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -453,7 +453,8 @@ static __init int svm_hardware_setup(void)
if (npt_enabled) {
printk(KERN_INFO "kvm: Nested Paging enabled\n");
kvm_enable_tdp();
- }
+ } else
+ kvm_disable_tdp();
return 0;
@@ -1007,10 +1008,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
struct kvm *kvm = svm->vcpu.kvm;
u64 fault_address;
u32 error_code;
+ bool event_injection = false;
if (!irqchip_in_kernel(kvm) &&
- is_external_interrupt(exit_int_info))
+ is_external_interrupt(exit_int_info)) {
+ event_injection = true;
push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+ }
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
@@ -1024,6 +1028,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
(u32)fault_address, (u32)(fault_address >> 32),
handler);
+ if (event_injection)
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0cac6370171..2a69773e3b2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2298,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2 >> 32), handler);
+ if (vect_info & VECTORING_INFO_VALID_MASK)
+ kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
@@ -3116,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return ERR_PTR(-ENOMEM);
allocate_vpid(vmx);
- if (id == 0 && vm_need_ept()) {
- kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
- VMX_EPT_WRITABLE_MASK |
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
- kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
- VMX_EPT_FAKE_DIRTY_MASK, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
- kvm_enable_tdp();
- }
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
@@ -3303,8 +3296,17 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
- if (cpu_has_vmx_ept())
+ if (vm_need_ept()) {
bypass_guest_pf = 0;
+ kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
+ VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+ kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
+ VMX_EPT_FAKE_DIRTY_MASK, 0ull,
+ VMX_EPT_EXECUTABLE_MASK);
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
if (bypass_guest_pf)
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9f1cdb011cf..0d682fc6aeb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_SYNC_MMU:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1495,6 +1496,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
goto out;
down_write(&kvm->slots_lock);
+ spin_lock(&kvm->mmu_lock);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1506,6 +1508,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
break;
kvm->arch.naliases = n;
+ spin_unlock(&kvm->mmu_lock);
kvm_mmu_zap_all(kvm);
up_write(&kvm->slots_lock);
@@ -3184,6 +3187,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
kvm_desct->base |= seg_desc->base2 << 24;
kvm_desct->limit = seg_desc->limit0;
kvm_desct->limit |= seg_desc->limit << 16;
+ if (seg_desc->g) {
+ kvm_desct->limit <<= 12;
+ kvm_desct->limit |= 0xfff;
+ }
kvm_desct->selector = selector;
kvm_desct->type = seg_desc->type;
kvm_desct->present = seg_desc->p;
@@ -3223,6 +3230,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
+ gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3232,13 +3240,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
return 1;
}
- return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
+ gpa += index * 8;
+ return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
}
/* allowed just for 8 bytes segments */
static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct desc_struct *seg_desc)
{
+ gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector >> 3;
@@ -3246,7 +3257,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
if (dtable.limit < index * 8 + 7)
return 1;
- return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+ gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
+ gpa += index * 8;
+ return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
}
static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3258,55 +3271,7 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
base_addr |= (seg_desc->base1 << 16);
base_addr |= (seg_desc->base2 << 24);
- return base_addr;
-}
-
-static int load_tss_segment32(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc,
- struct tss_segment_32 *tss)
-{
- u32 base_addr;
-
- base_addr = get_tss_base_addr(vcpu, seg_desc);
-
- return kvm_read_guest(vcpu->kvm, base_addr, tss,
- sizeof(struct tss_segment_32));
-}
-
-static int save_tss_segment32(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc,
- struct tss_segment_32 *tss)
-{
- u32 base_addr;
-
- base_addr = get_tss_base_addr(vcpu, seg_desc);
-
- return kvm_write_guest(vcpu->kvm, base_addr, tss,
- sizeof(struct tss_segment_32));
-}
-
-static int load_tss_segment16(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc,
- struct tss_segment_16 *tss)
-{
- u32 base_addr;
-
- base_addr = get_tss_base_addr(vcpu, seg_desc);
-
- return kvm_read_guest(vcpu->kvm, base_addr, tss,
- sizeof(struct tss_segment_16));
-}
-
-static int save_tss_segment16(struct kvm_vcpu *vcpu,
- struct desc_struct *seg_desc,
- struct tss_segment_16 *tss)
-{
- u32 base_addr;
-
- base_addr = get_tss_base_addr(vcpu, seg_desc);
-
- return kvm_write_guest(vcpu->kvm, base_addr, tss,
- sizeof(struct tss_segment_16));
+ return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
}
static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -3466,20 +3431,26 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
}
static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
- struct desc_struct *cseg_desc,
+ u32 old_tss_base,
struct desc_struct *nseg_desc)
{
struct tss_segment_16 tss_segment_16;
int ret = 0;
- if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16))
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
goto out;
save_state_to_tss16(vcpu, &tss_segment_16);
- save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
- if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16))
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
+ sizeof tss_segment_16))
+ goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_16, sizeof tss_segment_16))
goto out;
+
if (load_state_from_tss16(vcpu, &tss_segment_16))
goto out;
@@ -3489,20 +3460,26 @@ out:
}
static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
- struct desc_struct *cseg_desc,
+ u32 old_tss_base,
struct desc_struct *nseg_desc)
{
struct tss_segment_32 tss_segment_32;
int ret = 0;
- if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32))
+ if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
goto out;
save_state_to_tss32(vcpu, &tss_segment_32);
- save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
- if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32))
+ if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
+ sizeof tss_segment_32))
goto out;
+
+ if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
+ &tss_segment_32, sizeof tss_segment_32))
+ goto out;
+
if (load_state_from_tss32(vcpu, &tss_segment_32))
goto out;
@@ -3517,16 +3494,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
struct desc_struct cseg_desc;
struct desc_struct nseg_desc;
int ret = 0;
+ u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
+ u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
- kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+ old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
+ /* FIXME: Handle errors. Failure to read either TSS or their
+ * descriptors should generate a pagefault.
+ */
if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
goto out;
- if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
+ if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
goto out;
-
if (reason != TASK_SWITCH_IRET) {
int cpl;
@@ -3544,8 +3525,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
cseg_desc.type &= ~(1 << 1); //clear the B flag
- save_guest_segment_descriptor(vcpu, tr_seg.selector,
- &cseg_desc);
+ save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
}
if (reason == TASK_SWITCH_IRET) {
@@ -3557,10 +3537,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
kvm_x86_ops->cache_regs(vcpu);
if (nseg_desc.type & 8)
- ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
+ ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
&nseg_desc);
else
- ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
+ ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
&nseg_desc);
if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3995,16 +3975,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
+ unsigned long userspace_addr;
+
down_write(&current->mm->mmap_sem);
- memslot->userspace_addr = do_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS,
- 0);
+ userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ 0);
up_write(&current->mm->mmap_sem);
- if (IS_ERR((void *)memslot->userspace_addr))
- return PTR_ERR((void *)memslot->userspace_addr);
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
+
+ /* set userspace_addr atomically for kvm_hva_to_rmapp */
+ spin_lock(&kvm->mmu_lock);
+ memslot->userspace_addr = userspace_addr;
+ spin_unlock(&kvm->mmu_lock);
} else {
if (!old.user_alloc && old.rmap) {
int ret;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 0313a5eec41..d9249a882aa 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1014,6 +1014,9 @@ __init void lguest_init(void)
init_pg_tables_start = __pa(pg0);
init_pg_tables_end = __pa(pg0);
+ /* As described in head_32.S, we map the first 128M of memory. */
+ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
+
/* Load the %fs segment register (the per-cpu segment register) with
* the normal data segment to get through booting. */
asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dfdf428975c..f118c110af3 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -52,7 +52,7 @@
jnz 100b
102:
.section .fixup,"ax"
-103: addl %r8d,%edx /* ecx is zerorest also */
+103: addl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 40e0e309d27..cb0c112386f 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -32,7 +32,7 @@
jnz 100b
102:
.section .fixup,"ax"
-103: addl %r8d,%edx /* ecx is zerorest also */
+103: addl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
@@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache)
jmp 60f
50: movl %ecx,%edx
60: sfence
- movl %r8d,%ecx
jmp copy_user_handle_tail
.previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7..dfb932dcf13 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o
+ pat.o pgtable.o gup.o
obj-$(CONFIG_X86_32) += pgtable_32.o
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 00000000000..007bb06c750
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+ return *ptep;
+#else
+ /*
+ * With get_user_pages_fast, we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atoimcally,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a pte will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees l iff pte_high
+ * sees h. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have got rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+ * we're safe.
+ *
+ * gup_get_pte should not be used or copied outside gup.c without being
+ * very careful -- it does not atomically load the pte or anything that
+ * is likely to be useful for you.
+ */
+ pte_t pte;
+
+retry:
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ if (unlikely(pte.pte_low != ptep->pte_low))
+ goto retry;
+
+ return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t *ptep;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+
+ ptep = pte_offset_map(&pmd, addr);
+ do {
+ pte_t pte = gup_get_pte(ptep);
+ struct page *page;
+
+ if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ pte_unmap(ptep);
+ return 0;
+ }
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ get_page(page);
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(ptep - 1);
+
+ return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+ VM_BUG_ON(page != compound_head(page));
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t pte = *(pte_t *)&pmd;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (unlikely(pmd_large(pmd))) {
+ if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t pte = *(pte_t *)&pud;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ goto slow_irqon;
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_disable();
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f670..a87ea0e4b3d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-int direct_gbpages __meminitdata
+int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
= 1
#endif
@@ -86,46 +86,13 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-void show_mem(void)
-{
- long i, total = 0, reserved = 0;
- long shared = 0, cached = 0;
- struct page *page;
- pg_data_t *pgdat;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /*
- * This loop can take a while with 256 GB and
- * 4k pages so defer the NMI watchdog:
- */
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
-
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
-
- page = pfn_to_page(pgdat->node_start_pfn + i);
- total++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- }
- printk(KERN_INFO "%lu pages of RAM\n", total);
- printk(KERN_INFO "%lu reserved pages\n", reserved);
- printk(KERN_INFO "%lu pages shared\n", shared);
- printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
int after_bootmem;
-static __init void *spp_getpage(void)
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
{
void *ptr;
@@ -351,6 +318,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
{
unsigned long pages = 0;
unsigned long last_map_addr = end;
+ unsigned long start = address;
int i = pmd_index(address);
@@ -371,6 +339,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (!pmd_large(*pmd))
last_map_addr = phys_pte_update(pmd, address,
end);
+ /* Count entries we're using from level2_ident_pgt */
+ if (start == 0)
+ pages++;
continue;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 016f335bbee..6ba6f889c79 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
- retval = reserve_memtype(phys_addr, phys_addr + size,
+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
pr_debug("Warning: reserve_memtype returned %d\n", retval);
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e..d4aa503caaa 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -221,8 +221,7 @@ static int pageattr_test(void)
failed += print_split(&sc);
if (failed) {
- printk(KERN_ERR "NOT PASSED. Please report.\n");
- WARN_ON(1);
+ WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
return -EINVAL;
} else {
if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf05..f5f5154ea11 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -55,13 +55,19 @@ static void split_page_count(int level)
int arch_report_meminfo(char *page)
{
- int n = sprintf(page, "DirectMap4k: %8lu\n"
- "DirectMap2M: %8lu\n",
- direct_pages_count[PG_LEVEL_4K],
- direct_pages_count[PG_LEVEL_2M]);
+ int n = sprintf(page, "DirectMap4k: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ n += sprintf(page + n, "DirectMap2M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+ n += sprintf(page + n, "DirectMap4M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
#ifdef CONFIG_X86_64
- n += sprintf(page + n, "DirectMap1G: %8lu\n",
- direct_pages_count[PG_LEVEL_1G]);
+ if (direct_gbpages)
+ n += sprintf(page + n, "DirectMap1G: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_1G] << 20);
#endif
return n;
}
@@ -592,10 +598,9 @@ repeat:
if (!pte_val(old_pte)) {
if (!primary)
return 0;
- printk(KERN_WARNING "CPA: called for zero pte. "
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", address,
cpa->vaddr);
- WARN_ON(1);
return -EINVAL;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef..d50302774fe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
unsigned long addr;
int i;
+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+ return;
+
pud = pud_offset(pgd, 0);
for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c57..cab0abbd1eb 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-void show_mem(void)
-{
- int total = 0, reserved = 0;
- int shared = 0, cached = 0;
- int highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
- unsigned long flags;
-
- printk(KERN_INFO "Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
- touch_nmi_watchdog();
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- }
- printk(KERN_INFO "%d pages of RAM\n", total);
- printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
- printk(KERN_INFO "%d reserved pages\n", reserved);
- printk(KERN_INFO "%d pages shared\n", shared);
- printk(KERN_INFO "%d pages swap cached\n", cached);
-
- printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
- printk(KERN_INFO "%lu pages writeback\n",
- global_page_state(NR_WRITEBACK));
- printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
- printk(KERN_INFO "%lu pages slab\n",
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE));
- printk(KERN_INFO "%lu pages pagetables\n",
- global_page_state(NR_PAGETABLE));
-}
-
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301..16ae70fc57e 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
* start of the node, and that the current "end" address is after
* the previous one.
*/
-static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
{
/*
* Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
if (memory_chunk->start_pfn >= max_pfn) {
printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
memory_chunk->start_pfn, memory_chunk->end_pfn);
- return;
+ return -1;
}
if (memory_chunk->nid != nid)
- return;
+ return -1;
if (!node_has_online_mem(nid))
node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
if (node_end_pfn[nid] < memory_chunk->end_pfn)
node_end_pfn[nid] = memory_chunk->end_pfn;
+
+ return 0;
}
int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
printk(KERN_DEBUG
"chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
- node_read_chunk(chunk->nid, chunk);
+ if (node_read_chunk(chunk->nid, chunk))
+ continue;
+
e820_register_active_regions(chunk->nid, chunk->start_pfn,
min(chunk->end_pfn, max_pfn));
}
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e641545d479..cacba61ffba 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
#include <linux/oprofile.h>
#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/nmi.h>
#include <asm/msr.h>
-#include <asm/ptrace.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
-#include <asm/nmi.h>
+
#include "op_x86_model.h"
#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
static inline void setup_num_counters(void)
{
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2){
+ if (smp_num_siblings == 2) {
num_counters = NUM_COUNTERS_HT2;
num_controls = NUM_CONTROLS_HT2;
}
@@ -86,7 +87,7 @@ struct p4_event_binding {
#define CTR_FLAME_2 (1 << 6)
#define CTR_IQ_5 (1 << 7)
-static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
+static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
{ CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
{ CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
{ CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
{ CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
};
-#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT
+#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
/* p4 event codes in libop/op_event.h are indices into this table. */
static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
+
{ /* BRANCH_RETIRED */
- 0x05, 0x06,
+ 0x05, 0x06,
{ {CTR_IQ_4, MSR_P4_CRU_ESCR2},
{CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
-
+
{ /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
+ 0x04, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
-
+
{ /* TC_DELIVER_MODE */
0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
+ { { CTR_MS_0, MSR_P4_TC_ESCR0},
{ CTR_MS_2, MSR_P4_TC_ESCR1} }
},
-
+
{ /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
+ 0x00, 0x03,
{ { CTR_BPU_0, MSR_P4_BPU_ESCR0},
{ CTR_BPU_2, MSR_P4_BPU_ESCR1} }
},
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
+ 0x02, 0x04,
{ { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
{ CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
},
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
+ 0x07, 0x0c,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ CTR_BPU_2, MSR_P4_BSU_ESCR1} }
},
{ /* IOQ_ALLOCATION */
- 0x06, 0x03,
+ 0x06, 0x03,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ 0, 0 } }
},
{ /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
+ 0x06, 0x1a,
{ { CTR_BPU_2, MSR_P4_FSB_ESCR1},
{ 0, 0 } }
},
{ /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
+ 0x06, 0x17,
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
{ /* BSQ_ALLOCATION */
- 0x07, 0x05,
+ 0x07, 0x05,
{ { CTR_BPU_0, MSR_P4_BSU_ESCR0},
{ 0, 0 } }
},
{ /* BSQ_ACTIVE_ENTRIES */
0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
+ { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
{ 0, 0 } }
},
{ /* X87_ASSIST */
- 0x05, 0x03,
+ 0x05, 0x03,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_SP_UOP */
- 0x01, 0x08,
+ 0x01, 0x08,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* PACKED_DP_UOP */
- 0x01, 0x0c,
+ 0x01, 0x0c,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* SCALAR_SP_UOP */
- 0x01, 0x0a,
+ 0x01, 0x0a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* 64BIT_MMX_UOP */
- 0x01, 0x02,
+ 0x01, 0x02,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
+ 0x01, 0x1a,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
{ /* X87_FP_UOP */
- 0x01, 0x04,
+ 0x01, 0x04,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
+ 0x01, 0x2e,
{ { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
{ CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
},
-
+
{ /* MACHINE_CLEAR */
- 0x05, 0x02,
+ 0x05, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR2},
{ CTR_IQ_5, MSR_P4_CRU_ESCR3} }
},
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ { CTR_BPU_0, MSR_P4_FSB_ESCR0},
{ CTR_BPU_2, MSR_P4_FSB_ESCR1} }
},
-
+
{ /* TC_MS_XFER */
- 0x00, 0x05,
+ 0x00, 0x05,
{ { CTR_MS_0, MSR_P4_MS_ESCR0},
{ CTR_MS_2, MSR_P4_MS_ESCR1} }
},
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
},
{ /* INSTR_RETIRED */
- 0x04, 0x02,
+ 0x04, 0x02,
{ { CTR_IQ_4, MSR_P4_CRU_ESCR0},
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
{ CTR_IQ_5, MSR_P4_CRU_ESCR1} }
},
- { /* UOP_TYPE */
- 0x02, 0x02,
+ { /* UOP_TYPE */
+ 0x02, 0x02,
{ { CTR_IQ_4, MSR_P4_RAT_ESCR0},
{ CTR_IQ_5, MSR_P4_RAT_ESCR1} }
},
{ /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
+ 0x02, 0x05,
{ { CTR_MS_0, MSR_P4_TBPU_ESCR0},
{ CTR_MS_2, MSR_P4_TBPU_ESCR1} }
},
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
-#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0)
+#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
+#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
#define CCCR_RESERVED_BITS 0x38030FFF
#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
-#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0)
+#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
+#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0)
-#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0)
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
+#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
#ifdef CONFIG_SMP
int cpu = smp_processor_id();
return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
-#endif
+#endif
return 0;
}
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
static void p4_fill_in_addresses(struct op_msrs * const msrs)
{
- unsigned int i;
+ unsigned int i;
unsigned int addr, cccraddr, stag;
setup_num_counters();
stag = get_stagger();
/* initialize some registers */
- for (i = 0; i < num_counters; ++i) {
+ for (i = 0; i < num_counters; ++i)
msrs->counters[i].addr = 0;
- }
- for (i = 0; i < num_controls; ++i) {
+ for (i = 0; i < num_controls; ++i)
msrs->controls[i].addr = 0;
- }
-
+
/* the counter & cccr registers we pay attention to */
for (i = 0; i < num_counters; ++i) {
addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)){
+ if (reserve_perfctr_nmi(addr)) {
msrs->counters[i].addr = addr;
msrs->controls[i].addr = cccraddr;
}
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
-
+
for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
+ addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
if (reserve_evntsel_nmi(addr))
msrs->controls[i].addr = addr;
}
/* there are 2 remaining non-contiguously located ESCRs */
- if (num_counters == NUM_COUNTERS_NON_HT) {
+ if (num_counters == NUM_COUNTERS_NON_HT) {
/* standard non-HT CPUs handle both remaining ESCRs*/
if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
unsigned int stag;
stag = get_stagger();
-
+
/* convert from counter *number* to counter *bit* */
counter_bit = 1 << VIRT_CTR(stag, ctr);
-
+
/* find our event binding structure. */
if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx out of range\n",
counter_config[ctr].event);
return;
}
-
+
ev = &(p4_events[counter_config[ctr].event - 1]);
-
+
for (i = 0; i < maxbind; i++) {
if (ev->bindings[i].virt_counter & counter_bit) {
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
}
ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
+ ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
ESCR_WRITE(escr, high, ev, i);
-
+
/* modify CCCR */
CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
CCCR_CLEAR(cccr);
CCCR_SET_REQUIRED_BITS(cccr);
CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0) {
+ if (stag == 0)
CCCR_SET_PMI_OVF_0(cccr);
- } else {
+ else
CCCR_SET_PMI_OVF_1(cccr);
- }
CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
return;
}
}
- printk(KERN_ERR
+ printk(KERN_ERR
"oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
counter_config[ctr].event, stag, ctr);
}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
stag = get_stagger();
rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (! MISC_PMC_ENABLED_P(low)) {
+ if (!MISC_PMC_ENABLED_P(low)) {
printk(KERN_ERR "oprofile: P4 PMC not available\n");
return;
}
/* clear the cccrs we will use */
for (i = 0 ; i < num_counters ; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
/* clear all escrs (including those outside our concern) */
for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
wrmsr(msrs->controls[i].addr, 0, 0);
}
/* setup all counters */
for (i = 0 ; i < num_counters ; ++i) {
- if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) {
+ if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
pmc_setup_one_p4_counter(i);
CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
stag = get_stagger();
for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
+
+ if (!reset_value[i])
continue;
- /*
+ /*
* there is some eccentricity in the hardware which
* requires that we perform 2 extra corrections:
*
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
*
* - write the counter back twice to ensure it gets
* updated properly.
- *
+ *
* the former seems to be related to extra NMIs happening
* during the current NMI; the latter is reported as errata
* N15 in intel doc 249199-029, pentium 4 specification
* update, though their suggested work-around does not
* appear to solve the problem.
*/
-
+
real = VIRT_CTR(stag, i);
CCCR_READ(low, high, real);
- CTR_READ(ctr, high, real);
+ CTR_READ(ctr, high, real);
if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
oprofile_add_sample(regs, i);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
CCCR_CLEAR_OVF(low);
CCCR_WRITE(low, high, real);
- CTR_WRITE(reset_value[i], real);
+ CTR_WRITE(reset_value[i], real);
}
}
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < num_counters ; ++i) {
- if (CTR_IS_RESERVED(msrs,i))
+ if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(msrs->counters[i].addr);
}
- /* some of the control registers are specially reserved in
+ /*
+ * some of the control registers are specially reserved in
* conjunction with the counter registers (hence the starting offset).
* This saves a few bits.
*/
for (i = num_counters ; i < num_controls ; ++i) {
- if (CTRL_IS_RESERVED(msrs,i))
+ if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(msrs->controls[i].addr);
}
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a33634..4bdaa590375 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
pci_read_config_byte(d, reg++, &busno);
pci_read_config_byte(d, reg++, &suba);
pci_read_config_byte(d, reg++, &subb);
- DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
+ dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
+ suba, subb);
if (busno)
pci_scan_bus_with_sysdata(busno); /* Bus A */
if (suba < subb)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a09505806b8..5807d1bc73f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -128,10 +128,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
pr = pci_find_parent_resource(dev, r);
if (!r->start || !pr ||
request_resource(pr, r) < 0) {
- printk(KERN_ERR "PCI: Cannot allocate "
- "resource region %d "
- "of bridge %s\n",
- idx, pci_name(dev));
+ dev_err(&dev->dev, "BAR %d: can't "
+ "allocate resource\n", idx);
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
@@ -166,15 +164,15 @@ static void __init pcibios_allocate_resources(int pass)
else
disabled = !(command & PCI_COMMAND_MEMORY);
if (pass == disabled) {
- DBG("PCI: Resource %08lx-%08lx "
- "(f=%lx, d=%d, p=%d)\n",
- r->start, r->end, r->flags, disabled, pass);
+ dev_dbg(&dev->dev, "resource %#08llx-%#08llx "
+ "(f=%lx, d=%d, p=%d)\n",
+ (unsigned long long) r->start,
+ (unsigned long long) r->end,
+ r->flags, disabled, pass);
pr = pci_find_parent_resource(dev, r);
if (!pr || request_resource(pr, r) < 0) {
- printk(KERN_ERR "PCI: Cannot allocate "
- "resource region %d "
- "of device %s\n",
- idx, pci_name(dev));
+ dev_err(&dev->dev, "BAR %d: can't "
+ "allocate resource\n", idx);
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;
@@ -187,8 +185,7 @@ static void __init pcibios_allocate_resources(int pass)
/* Turn the ROM off, leave the resource region,
* but keep it unregistered. */
u32 reg;
- DBG("PCI: Switching off ROM of %s\n",
- pci_name(dev));
+ dev_dbg(&dev->dev, "disabling ROM\n");
r->flags &= ~IORESOURCE_ROM_ENABLE;
pci_read_config_dword(dev,
dev->rom_base_reg, &reg);
@@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev)
lat = pcibios_max_latency;
else
return;
- printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n",
- pci_name(dev), lat);
+ dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 6a06a2eb059..fec0123b33a 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
return read_config_nybble(router, 0x74, pirq-1);
@@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
{
WARN_ON_ONCE(pirq >= 9);
if (pirq > 8) {
- printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
return 0;
}
write_config_nybble(router, 0x74, pirq-1, irq);
@@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
irq = 0;
if (pirq <= 4)
irq = read_config_nybble(router, 0x56, pirq - 1);
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
return irq;
}
static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
{
- printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
- dev->vendor, dev->device, pirq, irq);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
if (pirq <= 4)
write_config_nybble(router, 0x56, pirq - 1, irq);
return 1;
@@ -730,7 +732,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
switch (device) {
case PCI_DEVICE_ID_AL_M1533:
case PCI_DEVICE_ID_AL_M1563:
- printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
r->name = "ALI";
r->get = pirq_ali_get;
r->set = pirq_ali_set;
@@ -840,11 +841,9 @@ static void __init pirq_find_router(struct irq_router *r)
h->probe(r, pirq_router_dev, pirq_router_dev->device))
break;
}
- printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
- pirq_router.name,
- pirq_router_dev->vendor,
- pirq_router_dev->device,
- pci_name(pirq_router_dev));
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
/* The device remains referenced for the kernel lifetime */
}
@@ -877,7 +876,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
/* Find IRQ pin */
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin) {
- DBG(KERN_DEBUG " -> no interrupt pin\n");
+ dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
pin = pin - 1;
@@ -887,20 +886,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
if (!pirq_table)
return 0;
- DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
info = pirq_get_info(dev);
if (!info) {
- DBG(" -> not found in routing table\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
+ 'A' + pin);
return 0;
}
pirq = info->irq[pin].link;
mask = info->irq[pin].bitmap;
if (!pirq) {
- DBG(" -> not routed\n" KERN_DEBUG);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
return 0;
}
- DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask,
- pirq_table->exclusive_irqs);
+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
+ 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -930,10 +929,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
if (pci_probe & PCI_USE_PIRQ_MASK)
newirq = 0;
else
- printk("\n" KERN_WARNING
- "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n"
- KERN_DEBUG, newirq,
- pci_name(dev));
+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
+ "%#x; try pci=usepirqmask\n", newirq, mask);
}
if (!newirq && assign) {
for (i = 0; i < 16; i++) {
@@ -944,39 +941,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
newirq = i;
}
}
- DBG(" -> newirq=%d", newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
irq = pirq & 0xf;
- DBG(" -> hardcoded IRQ %d\n", irq);
- msg = "Hardcoded";
+ msg = "hardcoded";
} else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
- DBG(" -> got IRQ %d\n", irq);
- msg = "Found";
+ msg = "found";
eisa_set_level_irq(irq);
} else if (newirq && r->set &&
(dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
- DBG(" -> assigning IRQ %d", newirq);
if (r->set(pirq_router_dev, dev, pirq, newirq)) {
eisa_set_level_irq(newirq);
- DBG(" ... OK\n");
- msg = "Assigned";
+ msg = "assigned";
irq = newirq;
}
}
if (!irq) {
- DBG(" ... failed\n");
if (newirq && mask == (1 << newirq)) {
- msg = "Guessed";
+ msg = "guessed";
irq = newirq;
- } else
+ } else {
+ dev_dbg(&dev->dev, "can't route interrupt\n");
return 0;
+ }
}
- printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq,
- pci_name(dev));
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
/* Update IRQ for all devices with the same pirq value */
while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -996,17 +989,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
(!(pci_probe & PCI_USE_PIRQ_MASK) || \
((1 << dev2->irq) & mask))) {
#ifndef CONFIG_PCI_MSI
- printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
- pci_name(dev2), dev2->irq, irq);
+ dev_info(&dev2->dev, "IRQ routing conflict: "
+ "have IRQ %d, want IRQ %d\n",
+ dev2->irq, irq);
#endif
continue;
}
dev2->irq = irq;
pirq_penalty[irq]++;
if (dev != dev2)
- printk(KERN_INFO
- "PCI: Sharing IRQ %d with %s\n",
- irq, pci_name(dev2));
+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
+ irq, pci_name(dev2));
}
}
return 1;
@@ -1025,8 +1018,7 @@ static void __init pcibios_fixup_irqs(void)
* already in use.
*/
if (dev->irq >= 16) {
- DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n",
- pci_name(dev), dev->irq);
+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
dev->irq = 0;
}
/*
@@ -1070,12 +1062,12 @@ static void __init pcibios_fixup_irqs(void)
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge), 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
+ pci_name(bridge),
+ 'A' + pin, irq);
}
if (irq >= 0) {
- printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
}
}
@@ -1231,25 +1223,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn), pin);
if (irq >= 0)
- printk(KERN_WARNING
- "PCI: using PPB %s[%c] to get irq %d\n",
- pci_name(bridge),
- 'A' + pin, irq);
+ dev_warn(&dev->dev, "using bridge %s "
+ "INT %c to get IRQ %d\n",
+ pci_name(bridge), 'A' + pin,
+ irq);
dev = bridge;
}
dev = temp_dev;
if (irq >= 0) {
- printk(KERN_INFO
- "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
- pci_name(dev), 'A' + pin, irq);
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
+ "INT %c -> IRQ %d\n", 'A' + pin, irq);
dev->irq = irq;
return 0;
} else
- msg = " Probably buggy MP table.";
+ msg = "; probably buggy MP table";
} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
msg = "";
else
- msg = " Please try using pci=biosirq.";
+ msg = "; please try using pci=biosirq";
/*
* With IDE legacy devices the IRQ lookup failure is not
@@ -1259,9 +1250,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
!(dev->class & 0x5))
return 0;
- printk(KERN_WARNING
- "PCI: No IRQ known for interrupt pin %c of device %s.%s\n",
- 'A' + pin, pci_name(dev), msg);
+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
+ 'A' + pin, msg);
}
return 0;
}
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 23faaa890ff..2bd5c53f638 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -365,7 +365,7 @@ static void __init pci_mmcfg_reject_broken(int early)
return;
reject:
- printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+ printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
pci_mmcfg_arch_free();
kfree(pci_mmcfg_config);
pci_mmcfg_config = NULL;
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index f4b16dc11da..1177845d318 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
u8 busno, suba, subb;
int quad = BUS2QUAD(d->bus->number);
- printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
+ dev_info(&d->dev, "searching for i450NX host bridges\n");
reg = 0xd0;
for(pxb=0; pxb<2; pxb++) {
pci_read_config_byte(d, reg++, &busno);
pci_read_config_byte(d, reg++, &suba);
pci_read_config_byte(d, reg++, &subb);
- DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
+ dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
+ pxb, busno, suba, subb);
if (busno) {
/* Bus A */
pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7dc5d5cf50a..d3e083dea72 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -45,7 +45,7 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
- ctxt->cr4 = read_cr4();
+ ctxt->cr4 = read_cr4_safe();
}
/* Needed by apm.c */
@@ -98,7 +98,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
- write_cr4(ctxt->cr4);
+ /* cr4 was introduced in the Pentium CPU */
+ if (ctxt->cr4)
+ write_cr4(ctxt->cr4);
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
write_cr0(ctxt->cr0);
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b95aa6cfe3c..4fc7e872c85 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -28,9 +28,9 @@ ENTRY(swsusp_arch_suspend)
ret
ENTRY(restore_image)
- movl resume_pg_dir, %ecx
- subl $__PAGE_OFFSET, %ecx
- movl %ecx, %cr3
+ movl resume_pg_dir, %eax
+ subl $__PAGE_OFFSET, %eax
+ movl %eax, %cr3
movl restore_pblist, %edx
.p2align 4,,7
@@ -52,17 +52,21 @@ copy_loop:
done:
/* go back to the original page tables */
- movl $swapper_pg_dir, %ecx
- subl $__PAGE_OFFSET, %ecx
- movl %ecx, %cr3
+ movl $swapper_pg_dir, %eax
+ subl $__PAGE_OFFSET, %eax
+ movl %eax, %cr3
/* Flush TLB, including "global" things (vmalloc) */
- movl mmu_cr4_features, %eax
- movl %eax, %edx
+ movl mmu_cr4_features, %ecx
+ jecxz 1f # cr4 Pentium and higher, skip if zero
+ movl %ecx, %edx
andl $~(1<<7), %edx; # PGE
movl %edx, %cr4; # turn off PGE
- movl %cr3, %ecx; # flush TLB
- movl %ecx, %cr3
- movl %eax, %cr4; # turn PGE back on
+1:
+ movl %cr3, %eax; # flush TLB
+ movl %eax, %cr3
+ jecxz 1f # cr4 Pentium and higher, skip if zero
+ movl %ecx, %cr4; # turn PGE back on
+1:
movl saved_context_esp, %esp
movl saved_context_ebp, %ebp