Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 10
-rw-r--r--  arch/x86/Kconfig.cpu | 14
-rw-r--r--  arch/x86/boot/memory.c | 9
-rw-r--r--  arch/x86/boot/vesa.h | 9
-rw-r--r--  arch/x86/boot/video-vesa.c | 2
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 3
-rw-r--r--  arch/x86/kernel/alternative.c | 12
-rw-r--r--  arch/x86/kernel/aperture_64.c | 3
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 4
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 8
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 8
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 39
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 42
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 12
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 7
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 7
-rw-r--r--  arch/x86/kernel/e820_32.c | 26
-rw-r--r--  arch/x86/kernel/e820_64.c | 27
-rw-r--r--  arch/x86/kernel/entry_64.S | 6
-rw-r--r--  arch/x86/kernel/head_32.S | 4
-rw-r--r--  arch/x86/kernel/head_64.S | 22
-rw-r--r--  arch/x86/kernel/hpet.c | 13
-rw-r--r--  arch/x86/kernel/i387.c | 10
-rw-r--r--  arch/x86/kernel/init_task.c | 1
-rw-r--r--  arch/x86/kernel/io_delay.c | 8
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 1
-rw-r--r--  arch/x86/kernel/pci-dma_64.c | 5
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 10
-rw-r--r--  arch/x86/kernel/process_32.c | 49
-rw-r--r--  arch/x86/kernel/process_64.c | 55
-rw-r--r--  arch/x86/kernel/ptrace.c | 204
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 24
-rw-r--r--  arch/x86/kernel/setup64.c | 14
-rw-r--r--  arch/x86/kernel/setup_32.c | 9
-rw-r--r--  arch/x86/kernel/setup_64.c | 4
-rw-r--r--  arch/x86/kernel/signal_32.c | 4
-rw-r--r--  arch/x86/kernel/signal_64.c | 40
-rw-r--r--  arch/x86/kernel/smpboot_64.c | 2
-rw-r--r--  arch/x86/kernel/stacktrace.c | 4
-rw-r--r--  arch/x86/kernel/step.c | 13
-rw-r--r--  arch/x86/kernel/tls.c | 8
-rw-r--r--  arch/x86/kernel/tsc_32.c | 7
-rw-r--r--  arch/x86/kernel/tsc_64.c | 4
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 49
-rw-r--r--  arch/x86/kvm/lapic.c | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 56
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 20
-rw-r--r--  arch/x86/kvm/svm.c | 26
-rw-r--r--  arch/x86/kvm/vmx.c | 21
-rw-r--r--  arch/x86/kvm/x86.c | 114
-rw-r--r--  arch/x86/lguest/boot.c | 159
-rw-r--r--  arch/x86/lguest/i386_head.S | 15
-rw-r--r--  arch/x86/mach-rdc321x/gpio.c | 199
-rw-r--r--  arch/x86/mach-rdc321x/platform.c | 2
-rw-r--r--  arch/x86/mach-visws/traps.c | 5
-rw-r--r--  arch/x86/mm/discontig_32.c | 1
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/mm/highmem_32.c | 6
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 2
-rw-r--r--  arch/x86/mm/init_64.c | 13
-rw-r--r--  arch/x86/mm/ioremap.c | 16
-rw-r--r--  arch/x86/mm/numa_64.c | 11
-rw-r--r--  arch/x86/mm/pageattr.c | 105
-rw-r--r--  arch/x86/mm/pgtable_32.c | 18
-rw-r--r--  arch/x86/pci/acpi.c | 17
-rw-r--r--  arch/x86/pci/irq.c | 4
-rw-r--r--  arch/x86/pci/pcbios.c | 10
-rw-r--r--  arch/x86/vdso/Makefile | 4
-rw-r--r--  arch/x86/xen/enlighten.c | 75
-rw-r--r--  arch/x86/xen/mmu.c | 7
-rw-r--r--  arch/x86/xen/mmu.h | 7
-rw-r--r--  arch/x86/xen/setup.c | 3
-rw-r--r--  arch/x86/xen/xen-asm.S | 9
77 files changed, 1063 insertions, 697 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3be2305709b..6c70fed0f9a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,8 @@ config X86
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_KPROBES
- select HAVE_KVM
+ select HAVE_KRETPROBES
+ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
config GENERIC_LOCKBREAK
@@ -65,9 +66,6 @@ config MMU
config ZONE_DMA
def_bool y
-config QUICKLIST
- def_bool X86_32
-
config SBUS
bool
@@ -1054,7 +1052,7 @@ config SECCOMP
config CC_STACKPROTECTOR
bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
- depends on X86_64 && EXPERIMENTAL
+ depends on X86_64 && EXPERIMENTAL && BROKEN
help
This option turns on the -fstack-protector GCC feature. This
feature puts, at the beginning of critical functions, a canary
@@ -1261,7 +1259,7 @@ menuconfig APM
machines with more than one CPU.
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
+ and more information, read <file:Documentation/power/pm.txt> and the
Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e09a6b73a1a..9304bfba7d4 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -377,6 +377,19 @@ config X86_OOSTORE
def_bool y
depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
+#
+# P6_NOPs are a relatively minor optimization that require a family >=
+# 6 processor, except that it is broken on certain VIA chips.
+# Furthermore, AMD chips prefer a totally different sequence of NOPs
+# (which work on all CPUs). As a result, disallow these if we're
+# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
+# x86-64 capable chips); the list of processors in the right-hand clause
+# are the cores that benefit from this optimization.
+#
+config X86_P6_NOP
+ def_bool y
+ depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4)
+
config X86_TSC
def_bool y
depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
@@ -390,6 +403,7 @@ config X86_CMOV
config X86_MINIMUM_CPU_FAMILY
int
default "64" if X86_64
+ default "6" if X86_32 && X86_P6_NOP
default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK)
default "3"
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 378353956b5..e77d89f9e8a 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -37,6 +37,12 @@ static int detect_memory_e820(void)
"=m" (*desc)
: "D" (desc), "d" (SMAP), "a" (0xe820));
+ /* BIOSes which terminate the chain with CF = 1 as opposed
+ to %ebx = 0 don't always report the SMAP signature on
+ the final, failing, probe. */
+ if (err)
+ break;
+
/* Some BIOSes stop returning SMAP in the middle of
the search loop. We don't know exactly how the BIOS
screwed up the map at that point, we might have a
@@ -47,9 +53,6 @@ static int detect_memory_e820(void)
break;
}
- if (err)
- break;
-
count++;
desc++;
} while (next && count < E820MAX);
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
index ff5b73cd406..468e444622c 100644
--- a/arch/x86/boot/vesa.h
+++ b/arch/x86/boot/vesa.h
@@ -26,17 +26,10 @@ struct vesa_general_info {
far_ptr video_mode_ptr; /* 14 */
u16 total_memory; /* 18 */
- u16 oem_software_rev; /* 20 */
- far_ptr oem_vendor_name_ptr; /* 22 */
- far_ptr oem_product_name_ptr; /* 26 */
- far_ptr oem_product_rev_ptr; /* 30 */
-
- u8 reserved[222]; /* 34 */
- u8 oem_data[256]; /* 256 */
+ u8 reserved[236]; /* 20 */
} __attribute__ ((packed));
#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
-#define VBE2_MAGIC ('V' + ('B' << 8) + ('E' << 16) + ('2' << 24))
struct vesa_mode_info {
u16 mode_attr; /* 0 */
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 662dd2f1306..419b5c27337 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -37,8 +37,6 @@ static int vesa_probe(void)
video_vesa.modes = GET_HEAP(struct mode_info, 0);
- vginfo.signature = VBE2_MAGIC;
-
ax = 0x4f00;
di = (size_t)&vginfo;
asm(INT10
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 1c0503bdfb1..5e7771a3ba2 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -500,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->ss = __USER32_DS;
set_fs(USER_DS);
- regs->flags &= ~X86_EFLAGS_TF;
+ regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
@@ -600,7 +600,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->ss = __USER32_DS;
set_fs(USER_DS);
- regs->flags &= ~X86_EFLAGS_TF;
+ regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 680b7300a48..2cdc9de9371 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,7 +72,8 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return
#define PREFIX "ACPI: "
int acpi_noirq; /* skip ACPI IRQ initialization */
-int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
+int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
+EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_ht __initdata = 1; /* enable HT */
int acpi_lapic;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45d79ea890a..5fed98ca0e1 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -65,7 +65,8 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
get them easily into strings. */
asm("\t.section .rodata, \"a\"\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8);
+ GENERIC_NOP7 GENERIC_NOP8
+ "\t.previous");
extern const unsigned char intelnops[];
static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -83,7 +84,8 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
#ifdef K8_NOP1
asm("\t.section .rodata, \"a\"\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8);
+ K8_NOP7 K8_NOP8
+ "\t.previous");
extern const unsigned char k8nops[];
static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -101,7 +103,8 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
#ifdef K7_NOP1
asm("\t.section .rodata, \"a\"\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8);
+ K7_NOP7 K7_NOP8
+ "\t.previous");
extern const unsigned char k7nops[];
static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -119,7 +122,8 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
#ifdef P6_NOP1
asm("\t.section .rodata, \"a\"\np6nops: "
P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8);
+ P6_NOP7 P6_NOP8
+ "\t.previous");
extern const unsigned char p6nops[];
static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
NULL,
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 608152a2a05..00df126169b 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -18,6 +18,7 @@
#include <linux/pci.h>
#include <linux/bitops.h>
#include <linux/ioport.h>
+#include <linux/suspend.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/gart.h>
@@ -76,6 +77,8 @@ static u32 __init allocate_aperture(void)
printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, __pa(p));
insert_aperture_resource((u32)__pa(p), aper_size);
+ register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
+ (u32)__pa(p+aper_size) >> PAGE_SHIFT);
return (u32)__pa(p);
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index a33d5301799..8ea040124f7 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -128,13 +128,11 @@ void foo(void)
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
#endif
-#ifdef CONFIG_LGUEST_GUEST
+#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
-#endif
-#ifdef CONFIG_LGUEST
BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 027e5c003b1..170d2f5523b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -143,14 +143,6 @@ static void __init check_config(void)
#endif
/*
- * If we configured ourselves for a TSC, we'd better have one!
- */
-#ifdef CONFIG_X86_TSC
- if (!cpu_has_tsc)
- panic("Kernel compiled for Pentium+, requires TSC feature!");
-#endif
-
-/*
* If we were told we had a good local APIC, check for buggy Pentia,
* i.e. all B steppings and the C2 stepping of P54C when using their
* integrated APIC (see 11AP erratum in "Pentium Processor
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f86a3c4a266..a38aafaefc2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -504,7 +504,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Clear all flags overriden by options */
for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] ^= cleared_cpu_caps[i];
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
/* Init Machine Check Exception if available. */
mcheck_init(c);
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
index 39f8cb18296..c2f930d8664 100644
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@ -55,7 +55,6 @@ static int eps_set_state(struct eps_cpu_data *centaur,
{
struct cpufreq_freqs freqs;
u32 lo, hi;
- u8 current_multiplier, current_voltage;
int err = 0;
int i;
@@ -95,6 +94,10 @@ postchange:
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
+#ifdef DEBUG
+ {
+ u8 current_multiplier, current_voltage;
+
/* Print voltage and multiplier */
rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
current_voltage = lo & 0xff;
@@ -103,7 +106,8 @@ postchange:
current_multiplier = (lo >> 8) & 0xff;
printk(KERN_INFO "eps: Current multiplier = %d\n",
current_multiplier);
-
+ }
+#endif
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
return err;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index f2b5a621d27..8a85c93bd62 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -63,7 +63,7 @@ static struct cpufreq_frequency_table speedstep_freqs[] = {
*/
static int speedstep_smi_ownership (void)
{
- u32 command, result, magic;
+ u32 command, result, magic, dummy;
u32 function = GET_SPEEDSTEP_OWNER;
unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
@@ -73,8 +73,11 @@ static int speedstep_smi_ownership (void)
dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port);
__asm__ __volatile__(
+ "push %%ebp\n"
"out %%al, (%%dx)\n"
- : "=D" (result)
+ "pop %%ebp\n"
+ : "=D" (result), "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
+ "=S" (dummy)
: "a" (command), "b" (function), "c" (0), "d" (smi_port),
"D" (0), "S" (magic)
: "memory"
@@ -96,7 +99,7 @@ static int speedstep_smi_ownership (void)
*/
static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
{
- u32 command, result = 0, edi, high_mhz, low_mhz;
+ u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
u32 state=0;
u32 function = GET_SPEEDSTEP_FREQS;
@@ -109,10 +112,12 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port);
- __asm__ __volatile__("movl $0, %%edi\n"
+ __asm__ __volatile__(
+ "push %%ebp\n"
"out %%al, (%%dx)\n"
- : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi)
- : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+ "pop %%ebp"
+ : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi), "=S" (dummy)
+ : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
);
dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz);
@@ -135,16 +140,18 @@ static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high)
static int speedstep_get_state (void)
{
u32 function=GET_SPEEDSTEP_STATE;
- u32 result, state, edi, command;
+ u32 result, state, edi, command, dummy;
command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port);
- __asm__ __volatile__("movl $0, %%edi\n"
+ __asm__ __volatile__(
+ "push %%ebp\n"
"out %%al, (%%dx)\n"
- : "=a" (result), "=b" (state), "=D" (edi)
- : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0)
+ "pop %%ebp\n"
+ : "=a" (result), "=b" (state), "=D" (edi), "=c" (dummy), "=d" (dummy), "=S" (dummy)
+ : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0), "D" (0)
);
dprintk("state is %x, result is %x\n", state, result);
@@ -160,7 +167,7 @@ static int speedstep_get_state (void)
*/
static void speedstep_set_state (unsigned int state)
{
- unsigned int result = 0, command, new_state;
+ unsigned int result = 0, command, new_state, dummy;
unsigned long flags;
unsigned int function=SET_SPEEDSTEP_STATE;
unsigned int retry = 0;
@@ -182,10 +189,12 @@ static void speedstep_set_state (unsigned int state)
}
retry++;
__asm__ __volatile__(
- "movl $0, %%edi\n"
+ "push %%ebp\n"
"out %%al, (%%dx)\n"
- : "=b" (new_state), "=D" (result)
- : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0)
+ "pop %%ebp"
+ : "=b" (new_state), "=D" (result), "=c" (dummy), "=a" (dummy),
+ "=d" (dummy), "=S" (dummy)
+ : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0), "D" (0)
);
} while ((new_state != state) && (retry <= SMI_TRIES));
@@ -195,7 +204,7 @@ static void speedstep_set_state (unsigned int state)
if (new_state == state) {
dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result);
} else {
- printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result);
+ printk(KERN_ERR "cpufreq: change to state %u failed with new_state %u and result %u\n", state, new_state, result);
}
return;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 103d61a59b1..3e18db4cefe 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -176,12 +176,13 @@ static inline void k8_enable_fixed_iorrs(void)
}
/**
- * Checks and updates an fixed-range MTRR if it differs from the value it
- * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information
- * \param msr MSR address of the MTTR which should be checked and updated
- * \param changed pointer which indicates whether the MTRR needed to be changed
- * \param msrwords pointer to the MSR values which the MSR should have
+ * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
+ * @msr: MSR address of the MTTR which should be checked and updated
+ * @changed: pointer which indicates whether the MTRR needed to be changed
+ * @msrwords: pointer to the MSR values which the MSR should have
+ *
+ * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
+ * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
*/
static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
{
@@ -199,12 +200,15 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
}
}
+/**
+ * generic_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ * @replace_reg: mtrr index to be replaced; set to invalid value if none.
+ *
+ * Returns: The index of the region on success, else negative on error.
+ */
int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
-/* [SUMMARY] Get a free MTRR.
- <base> The starting (base) address of the region.
- <size> The size (in bytes) of the region.
- [RETURNS] The index of the region on success, else -1 on error.
-*/
{
int i, max;
mtrr_type ltype;
@@ -249,8 +253,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
}
/**
- * Checks and updates the fixed-range MTRRs if they differ from the saved set
- * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set
+ * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
*/
static int set_fixed_ranges(mtrr_type * frs)
{
@@ -294,13 +298,13 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
static u32 deftype_lo, deftype_hi;
+/**
+ * set_mtrr_state - Set the MTRR state for this CPU.
+ *
+ * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * RETURNS: 0 if no changes made, else a mask indicating what was changed.
+ */
static unsigned long set_mtrr_state(void)
-/* [SUMMARY] Set the MTRR state for this CPU.
- <state> The MTRR state information to read.
- <ctxt> Some relevant CPU context.
- [NOTE] The CPU must already be in a safe state for MTRR changes.
- [RETURNS] 0 if no changes made, else a mask indication what was changed.
-*/
{
unsigned int i;
unsigned long change_mask = 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b6e136f23d3..a6450b3ae75 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -43,6 +43,7 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
+#include <asm/kvm_para.h>
#include "mtrr.h"
u32 num_var_ranges = 0;
@@ -649,6 +650,7 @@ static __init int amd_special_default_mtrr(void)
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
*
* Some buggy BIOSes don't setup the MTRRs properly for systems with certain
* memory configurations. This routine checks that the highest MTRR matches
@@ -688,8 +690,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* kvm/qemu doesn't have mtrr set right, don't trim them all */
if (!highest_pfn) {
- printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
- WARN_ON(1);
+ if (!kvm_para_available()) {
+ printk(KERN_WARNING
+ "WARNING: strange, CPU MTRRs all blank?\n");
+ WARN_ON(1);
+ }
return 0;
}
@@ -706,7 +711,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
trim_size = end_pfn;
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
- add_memory_region(trim_start, trim_size, E820_RESERVED);
+ update_memory_range(trim_start, trim_size, E820_RAM,
+ E820_RESERVED);
update_e820();
return 1;
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9b838324b81..b943e10ad81 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -652,9 +652,6 @@ static void probe_nmi_watchdog(void)
wd_ops = &p6_wd_ops;
break;
case 15:
- if (boot_cpu_data.x86_model > 0x4)
- return;
-
wd_ops = &p4_wd_ops;
break;
default:
@@ -670,8 +667,10 @@ int lapic_watchdog_init(unsigned nmi_hz)
{
if (!wd_ops) {
probe_nmi_watchdog();
- if (!wd_ops)
+ if (!wd_ops) {
+ printk(KERN_INFO "NMI watchdog: CPU not supported\n");
return -1;
+ }
if (!wd_ops->reserve()) {
printk(KERN_ERR
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 200fb3f9ebf..e8b422c1c51 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -76,13 +76,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
/* All Transmeta CPUs have a constant TSC */
set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
- /* If we can run i686 user-space code, call us an i686 */
-#define USER686 ((1 << X86_FEATURE_TSC)|\
- (1 << X86_FEATURE_CX8)|\
- (1 << X86_FEATURE_CMOV))
- if (c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686)
- c->x86 = 6;
-
#ifdef CONFIG_SYSCTL
/* randomize_va_space slows us down enormously;
it probably triggers retranslation of x86->native bytecode */
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 4e16ef4a265..80444c5c9b1 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -749,6 +749,32 @@ static int __init parse_memmap(char *arg)
return 0;
}
early_param("memmap", parse_memmap);
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ int i;
+
+ BUG_ON(old_type == new_type);
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start && ei->size <= size) {
+ ei->type = new_type;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ add_memory_region(final_start, final_end - final_start,
+ new_type);
+ }
+}
void __init update_e820(void)
{
u8 nr_map;
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 9f65b4cc323..9be69712601 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -744,6 +744,33 @@ void __init finish_e820_parsing(void)
}
}
+void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+ unsigned new_type)
+{
+ int i;
+
+ BUG_ON(old_type == new_type);
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 final_start, final_end;
+ if (ei->type != old_type)
+ continue;
+ /* totally covered? */
+ if (ei->addr >= start && ei->size <= size) {
+ ei->type = new_type;
+ continue;
+ }
+ /* partially covered */
+ final_start = max(start, ei->addr);
+ final_end = min(start + size, ei->addr + ei->size);
+ if (final_start >= final_end)
+ continue;
+ add_memory_region(final_start, final_end - final_start,
+ new_type);
+ }
+}
+
void __init update_e820(void)
{
u8 nr_map;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2ad9a1bc6a7..c20c9e7e08d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -453,6 +453,7 @@ ENTRY(stub_execve)
CFI_REGISTER rip, r11
SAVE_REST
FIXUP_TOP_OF_STACK %r11
+ movq %rsp, %rcx
call sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
@@ -1036,15 +1037,16 @@ ENDPROC(child_rip)
* rdi: name, rsi: argv, rdx: envp
*
* We want to fallback into:
- * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
*
* do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
*/
ENTRY(kernel_execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
SAVE_ALL
+ movq %rsp,%rcx
call sys_execve
movq %rax, RAX(%rsp)
RESTORE_REST
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 25eb98540a4..74d87ea85b5 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -606,7 +606,7 @@ ENTRY(_stext)
.section ".bss.page_aligned","wa"
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
-ENTRY(swapper_pg_pmd)
+swapper_pg_pmd:
.fill 1024*KPMDS,4,0
#else
ENTRY(swapper_pg_dir)
@@ -657,7 +657,7 @@ int_msg:
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
fault_msg:
- .ascii \
+ .asciz \
/* fault info: */ "BUG: Int %d: CR2 %p\n" \
/* pusha regs: */ " EDI %p ESI %p EBP %p ESP %p\n" \
" EBX %p EDX %p ECX %p EAX %p\n" \
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index eb415043a92..a007454133a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -379,18 +379,24 @@ NEXT_PAGE(level2_ident_pgt)
/* Since I easily can, map the first 1G.
* Don't set NX because code runs from these pages.
*/
- PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
NEXT_PAGE(level2_kernel_pgt)
- /* 40MB kernel mapping. The kernel code cannot be bigger than that.
- When you change this change KERNEL_TEXT_SIZE in page.h too. */
- /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
- PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, KERNEL_TEXT_SIZE/PMD_SIZE)
- /* Module mapping starts here */
- .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0
+ /*
+ * 128 MB kernel mapping. We spend a full page on this pagetable
+ * anyway.
+ *
+ * The kernel code+data+bss must not be bigger than that.
+ *
+ * (NOTE: at +128MB starts the module area, see MODULES_VADDR.
+ * If you want to increase this then increase MODULES_VADDR
+ * too.)
+ */
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL,
+ KERNEL_IMAGE_SIZE/PMD_SIZE)
NEXT_PAGE(level2_spare_pgt)
- .fill 512,8,0
+ .fill 512, 8, 0
#undef PMDS
#undef NEXT_PAGE
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 429d084e014..36652ea1a26 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -133,13 +133,16 @@ static void hpet_reserve_platform_timers(unsigned long id)
#ifdef CONFIG_HPET_EMULATE_RTC
hpet_reserve_timer(&hd, 1);
#endif
+
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
- for (i = 2; i < nrtimers; timer++, i++)
- hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
- Tn_INT_ROUTE_CNF_SHIFT;
+ for (i = 2; i < nrtimers; timer++, i++)
+ hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
+ Tn_INT_ROUTE_CNF_SHIFT;
+
hpet_alloc(&hd);
+
}
#else
static void hpet_reserve_platform_timers(unsigned long id) { }
@@ -368,8 +371,8 @@ static int hpet_clocksource_register(void)
return 0;
}
-/*
- * Try to setup the HPET timer
+/**
+ * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
int __init hpet_enable(void)
{
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 763dfc40723..d2e39e69aaf 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -132,7 +132,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr)
return -ENODEV;
- unlazy_fpu(target);
+ init_fpu(target);
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->thread.i387.fxsave, 0, -1);
@@ -147,7 +147,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr)
return -ENODEV;
- unlazy_fpu(target);
+ init_fpu(target);
set_stopped_child_used_math(target);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -261,7 +261,7 @@ static void convert_from_fxsr(struct user_i387_ia32_struct *env,
}
#else
env->fip = fxsave->fip;
- env->fcs = fxsave->fcs;
+ env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
env->foo = fxsave->foo;
env->fos = fxsave->fos;
#endif
@@ -307,7 +307,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
if (!HAVE_HWFP)
return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
- unlazy_fpu(target);
+ init_fpu(target);
if (!cpu_has_fxsr)
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -332,7 +332,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
- unlazy_fpu(target);
+ init_fpu(target);
set_stopped_child_used_math(target);
if (!cpu_has_fxsr)
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 5b3ce793436..3d01e47777d 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,6 +15,7 @@ static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
/*
* Initial thread structure.
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index c706a306155..5921e5f0a64 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -78,6 +78,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
},
{
.callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B8")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
.ident = "HP Pavilion tx1000",
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 236d2f8f7dd..576a03db451 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -233,6 +233,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
+ VMCOREINFO_SYMBOL(phys_base);
VMCOREINFO_SYMBOL(init_level4_pgt);
#ifdef CONFIG_NUMA
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 027fc067b39..b402c0f3f19 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -30,6 +30,7 @@
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/module.h>
#include <asm/geode.h>
static struct mfgpt_timer_t {
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index a82473d192a..375cb2bc45b 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -53,11 +53,6 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
int node;
node = dev_to_node(dev);
- if (node == -1)
- node = numa_node_id();
-
- if (node < first_node(node_online_map))
- node = first_node(node_online_map);
page = alloc_pages_node(node, gfp, order);
return page ? page_address(page) : NULL;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index faf3229f8fb..700e4647dd3 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -615,8 +615,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
nommu:
/* Should not happen anymore */
- printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
- KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
+ printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ KERN_WARNING "falling back to iommu=soft.\n");
return -1;
}
@@ -692,9 +692,9 @@ void __init gart_iommu_init(void)
!gart_iommu_aperture ||
(no_agp && init_k8_gatt(&info) < 0)) {
if (end_pfn > MAX_DMA32_PFN) {
- printk(KERN_ERR "WARNING more than 4GB of memory "
- "but GART IOMMU not available.\n"
- KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+ printk(KERN_WARNING "More than 4GB of memory "
+ "but GART IOMMU not available.\n"
+ KERN_WARNING "falling back to iommu=soft.\n");
}
return;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a7d50a547dc..43930e73f65 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -82,7 +82,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
{
@@ -190,9 +189,6 @@ void cpu_idle(void)
while (!need_resched()) {
void (*idle)(void);
- if (__get_cpu_var(cpu_idle_state))
- __get_cpu_var(cpu_idle_state) = 0;
-
check_pgt_cache();
rmb();
idle = pm_idle;
@@ -220,40 +216,19 @@ static void do_nothing(void *unused)
{
}
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
void cpu_idle_wait(void)
{
- unsigned int cpu, this_cpu = get_cpu();
- cpumask_t map, tmp = current->cpus_allowed;
-
- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
- put_cpu();
-
- cpus_clear(map);
- for_each_online_cpu(cpu) {
- per_cpu(cpu_idle_state, cpu) = 1;
- cpu_set(cpu, map);
- }
-
- __get_cpu_var(cpu_idle_state) = 0;
-
- wmb();
- do {
- ssleep(1);
- for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
- cpu_clear(cpu, map);
- }
- cpus_and(map, map, cpu_online_map);
- /*
- * We waited 1 sec, if a CPU still did not call idle
- * it may be because it is in idle and not waking up
- * because it has nothing to do.
- * Give all the remaining CPUS a kick.
- */
- smp_call_function_mask(map, do_nothing, NULL, 0);
- } while (!cpus_empty(map));
-
- set_cpus_allowed(current, tmp);
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -603,11 +578,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
}
#endif
+#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b0cc8f0136d..46c4c546b49 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(boot_option_idle_override);
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -173,9 +172,6 @@ void cpu_idle(void)
while (!need_resched()) {
void (*idle)(void);
- if (__get_cpu_var(cpu_idle_state))
- __get_cpu_var(cpu_idle_state) = 0;
-
rmb();
idle = pm_idle;
if (!idle)
@@ -207,40 +203,19 @@ static void do_nothing(void *unused)
{
}
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
void cpu_idle_wait(void)
{
- unsigned int cpu, this_cpu = get_cpu();
- cpumask_t map, tmp = current->cpus_allowed;
-
- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
- put_cpu();
-
- cpus_clear(map);
- for_each_online_cpu(cpu) {
- per_cpu(cpu_idle_state, cpu) = 1;
- cpu_set(cpu, map);
- }
-
- __get_cpu_var(cpu_idle_state) = 0;
-
- wmb();
- do {
- ssleep(1);
- for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
- cpu_clear(cpu, map);
- }
- cpus_and(map, map, cpu_online_map);
- /*
- * We waited 1 sec, if a CPU still did not call idle
- * it may be because it is in idle and not waking up
- * because it has nothing to do.
- * Give all the remaining CPUS a kick.
- */
- smp_call_function_mask(map, do_nothing, 0, 0);
- } while (!cpus_empty(map));
-
- set_cpus_allowed(current, tmp);
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -604,11 +579,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
+#endif
}
/*
@@ -730,16 +707,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
- char __user * __user *envp, struct pt_regs regs)
+ char __user * __user *envp, struct pt_regs *regs)
{
long error;
char * filename;
filename = getname(name);
error = PTR_ERR(filename);
- if (IS_ERR(filename))
+ if (IS_ERR(filename))
return error;
- error = do_execve(filename, argv, envp, &regs);
+ error = do_execve(filename, argv, envp, regs);
putname(filename);
return error;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 702c33efea8..eb92ccbb350 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -323,6 +323,16 @@ static int putreg(struct task_struct *child,
return set_flags(child, value);
#ifdef CONFIG_X86_64
+ /*
+ * Orig_ax is really just a flag with small positive and
+ * negative values, so make sure to always sign-extend it
+ * from 32 bits so that it works correctly regardless of
+ * whether we come from a 32-bit environment or not.
+ */
+ case offsetof(struct user_regs_struct, orig_ax):
+ value = (long) (s32) value;
+ break;
+
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_OF(child))
return -EIO;
@@ -544,6 +554,8 @@ static int ptrace_set_debugreg(struct task_struct *child,
return 0;
}
+#ifdef X86_BTS
+
static int ptrace_bts_get_size(struct task_struct *child)
{
if (!child->thread.ds_area_msr)
@@ -588,21 +600,6 @@ static int ptrace_bts_read_record(struct task_struct *child,
return sizeof(ret);
}
-static int ptrace_bts_write_record(struct task_struct *child,
- const struct bts_struct *in)
-{
- int retval;
-
- if (!child->thread.ds_area_msr)
- return -ENXIO;
-
- retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
- if (retval)
- return retval;
-
- return sizeof(*in);
-}
-
static int ptrace_bts_clear(struct task_struct *child)
{
if (!child->thread.ds_area_msr)
@@ -645,75 +642,6 @@ static int ptrace_bts_drain(struct task_struct *child,
return end;
}
-static int ptrace_bts_realloc(struct task_struct *child,
- int size, int reduce_size)
-{
- unsigned long rlim, vm;
- int ret, old_size;
-
- if (size < 0)
- return -EINVAL;
-
- old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
- if (old_size < 0)
- return old_size;
-
- ret = ds_free((void **)&child->thread.ds_area_msr);
- if (ret < 0)
- goto out;
-
- size >>= PAGE_SHIFT;
- old_size >>= PAGE_SHIFT;
-
- current->mm->total_vm -= old_size;
- current->mm->locked_vm -= old_size;
-
- if (size == 0)
- goto out;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->total_vm;
- if (size <= 0)
- goto out;
- }
-
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->locked_vm;
- if (size <= 0)
- goto out;
- }
-
- ret = ds_allocate((void **)&child->thread.ds_area_msr,
- size << PAGE_SHIFT);
- if (ret < 0)
- goto out;
-
- current->mm->total_vm += size;
- current->mm->locked_vm += size;
-
-out:
- if (child->thread.ds_area_msr)
- set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
- else
- clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
- return ret;
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
@@ -816,6 +744,91 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
+
+static int ptrace_bts_write_record(struct task_struct *child,
+ const struct bts_struct *in)
+{
+ int retval;
+
+ if (!child->thread.ds_area_msr)
+ return -ENXIO;
+
+ retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
+ if (retval)
+ return retval;
+
+ return sizeof(*in);
+}
+
+static int ptrace_bts_realloc(struct task_struct *child,
+ int size, int reduce_size)
+{
+ unsigned long rlim, vm;
+ int ret, old_size;
+
+ if (size < 0)
+ return -EINVAL;
+
+ old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
+ if (old_size < 0)
+ return old_size;
+
+ ret = ds_free((void **)&child->thread.ds_area_msr);
+ if (ret < 0)
+ goto out;
+
+ size >>= PAGE_SHIFT;
+ old_size >>= PAGE_SHIFT;
+
+ current->mm->total_vm -= old_size;
+ current->mm->locked_vm -= old_size;
+
+ if (size == 0)
+ goto out;
+
+ rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->total_vm + size;
+ if (rlim < vm) {
+ ret = -ENOMEM;
+
+ if (!reduce_size)
+ goto out;
+
+ size = rlim - current->mm->total_vm;
+ if (size <= 0)
+ goto out;
+ }
+
+ rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->locked_vm + size;
+ if (rlim < vm) {
+ ret = -ENOMEM;
+
+ if (!reduce_size)
+ goto out;
+
+ size = rlim - current->mm->locked_vm;
+ if (size <= 0)
+ goto out;
+ }
+
+ ret = ds_allocate((void **)&child->thread.ds_area_msr,
+ size << PAGE_SHIFT);
+ if (ret < 0)
+ goto out;
+
+ current->mm->total_vm += size;
+ current->mm->locked_vm += size;
+
+out:
+ if (child->thread.ds_area_msr)
+ set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+ else
+ clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
+
+ return ret;
+}
+
void ptrace_bts_take_timestamp(struct task_struct *tsk,
enum bts_qualifier qualifier)
{
@@ -826,6 +839,7 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
ptrace_bts_write_record(tsk, &rec);
}
+#endif /* X86_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -839,7 +853,9 @@ void ptrace_disable(struct task_struct *child)
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
if (child->thread.ds_area_msr) {
+#ifdef X86_BTS
ptrace_bts_realloc(child, 0, 0);
+#endif
child->thread.debugctlmsr &= ~ds_debugctl_mask();
if (!child->thread.debugctlmsr)
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
@@ -961,6 +977,10 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
#endif
+ /*
+ * These bits need more cooking - not enabled yet:
+ */
+#ifdef X86_BTS
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
(child, data, (struct ptrace_bts_config __user *)addr);
@@ -988,6 +1008,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
ret = ptrace_bts_drain
(child, data, (struct bts_struct __user *) addr);
break;
+#endif
default:
ret = ptrace_request(child, request, addr, data);
@@ -1035,10 +1056,17 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
R32(esi, si);
R32(ebp, bp);
R32(eax, ax);
- R32(orig_eax, orig_ax);
R32(eip, ip);
R32(esp, sp);
+ case offsetof(struct user32, regs.orig_eax):
+ /*
+ * Sign-extend the value so that orig_eax = -1
+ * causes (long)orig_ax < 0 tests to fire correctly.
+ */
+ regs->orig_ax = (long) (s32) value;
+ break;
+
case offsetof(struct user32, regs.eflags):
return set_flags(child, value);
@@ -1160,7 +1188,7 @@ static int genregs32_set(struct task_struct *target,
if (kbuf) {
const compat_ulong_t *k = kbuf;
while (count > 0 && !ret) {
- ret = putreg(target, pos, *k++);
+ ret = putreg32(target, pos, *k++);
count -= sizeof(*k);
pos += sizeof(*k);
}
@@ -1171,7 +1199,7 @@ static int genregs32_set(struct task_struct *target,
ret = __get_user(word, u++);
if (ret)
break;
- ret = putreg(target, pos, word);
+ ret = putreg32(target, pos, word);
count -= sizeof(*u);
pos += sizeof(*u);
}
@@ -1226,12 +1254,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
case PTRACE_SETOPTIONS:
case PTRACE_SET_THREAD_AREA:
case PTRACE_GET_THREAD_AREA:
+#ifdef X86_BTS
case PTRACE_BTS_CONFIG:
case PTRACE_BTS_STATUS:
case PTRACE_BTS_SIZE:
case PTRACE_BTS_GET:
case PTRACE_BTS_CLEAR:
case PTRACE_BTS_DRAIN:
+#endif
return sys_ptrace(request, pid, addr, data);
default:
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index c47208fc593..d89a648fe71 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -363,6 +363,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051,
nvidia_force_enable_hpet);
/* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
+ nvidia_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
nvidia_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 7fd6ac43e4a..484c4a80d38 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -152,6 +152,24 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0WF810"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
@@ -326,6 +344,10 @@ static inline void kb_wait(void)
}
}
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
static void native_machine_emergency_restart(void)
{
int i;
@@ -337,6 +359,8 @@ static void native_machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
+ mach_reboot_fixups(); /* for board specific fixups */
+
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 309366f8f60..e24c4567709 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -142,14 +142,16 @@ void __init setup_per_cpu_areas(void)
printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
for_each_cpu_mask (i, cpu_possible_map) {
char *ptr;
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ ptr = alloc_bootmem_pages(size);
+#else
+ int node = early_cpu_to_node(i);
- if (!NODE_DATA(early_cpu_to_node(i))) {
- printk("cpu with no node %d, num_online_nodes %d\n",
- i, num_online_nodes());
+ if (!node_online(node) || !NODE_DATA(node))
ptr = alloc_bootmem_pages(size);
- } else {
- ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size);
- }
+ else
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
if (!ptr)
panic("Cannot allocate cpu data for CPU %d\n", i);
cpu_pda(i)->data_offset = ptr - __per_cpu_start;
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index a1d7071a51c..2b3e5d45176 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -406,8 +406,6 @@ static unsigned long __init setup_memory(void)
*/
min_low_pfn = PFN_UP(init_pg_tables_end);
- find_max_pfn();
-
max_low_pfn = find_max_low_pfn();
#ifdef CONFIG_HIGHMEM
@@ -764,12 +762,13 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled)
efi_init();
- max_low_pfn = setup_memory();
-
/* update e820 for memory not covered by WB MTRRs */
+ find_max_pfn();
mtrr_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
- max_low_pfn = setup_memory();
+ find_max_pfn();
+
+ max_low_pfn = setup_memory();
#ifdef CONFIG_VMI
/*
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 6fd804f0782..f4f7ecfb898 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -801,7 +801,7 @@ static void __cpuinit srat_detect_node(void)
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE)
+ if (node == NUMA_NO_NODE || !node_online(node))
node = first_node(node_online_map);
numa_set_node(cpu, node);
@@ -1021,7 +1021,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Clear all flags overriden by options */
for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] ^= cleared_cpu_caps[i];
+ c->x86_capability[i] &= ~cleared_cpu_caps[i];
#ifdef CONFIG_X86_MCE
mcheck_init(c);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index caee1f002fe..0157a6f0f41 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -407,7 +407,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
* The tracer may want to single-step inside the
* handler too.
*/
- regs->flags &= ~TF_MASK;
+ regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
@@ -500,7 +500,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
* The tracer may want to single-step inside the
* handler too.
*/
- regs->flags &= ~TF_MASK;
+ regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 7347bb14e30..1c83e5124c6 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -295,7 +295,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
see include/asm-x86_64/uaccess.h for details. */
set_fs(USER_DS);
- regs->flags &= ~X86_EFLAGS_TF;
+ regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
if (test_thread_flag(TIF_SINGLESTEP))
ptrace_notify(SIGTRAP);
#ifdef DEBUG_SIG
@@ -311,6 +311,35 @@ give_sigsegv:
}
/*
+ * Return -1L or the syscall number that @regs is executing.
+ */
+static long current_syscall(struct pt_regs *regs)
+{
+ /*
+ * We always sign-extend a -1 value being set here,
+ * so this is always either -1L or a syscall number.
+ */
+ return regs->orig_ax;
+}
+
+/*
+ * Return a value that is -EFOO if the system call in @regs->orig_ax
+ * returned an error. This only works for @regs from @current.
+ */
+static long current_syscall_ret(struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ return (int) regs->ax;
+#endif
+ return regs->ax;
+}
+
+/*
* OK, we're invoking a handler
*/
@@ -327,9 +356,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
#endif
/* Are we from a system call? */
- if ((long)regs->orig_ax >= 0) {
+ if (current_syscall(regs) >= 0) {
/* If so, check system call restarting.. */
- switch (regs->ax) {
+ switch (current_syscall_ret(regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -426,10 +455,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if ((long)regs->orig_ax >= 0) {
+ if (current_syscall(regs) >= 0) {
/* Restart the system call - no handlers present */
- long res = regs->ax;
- switch (res) {
+ switch (current_syscall_ret(regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index d53bd6fcb42..0880f2c388a 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -554,10 +554,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
int timeout;
unsigned long start_rip;
struct create_idle c_idle = {
- .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle),
.cpu = cpu,
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
+ INIT_WORK(&c_idle.work, do_fork_idle);
/* allocate memory for gdts of secondary cpus. Hotplug is considered */
if (!cpu_gdt_descr[cpu].address &&
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 02f0f61f5b1..c28c342c162 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -25,6 +25,8 @@ static int save_stack_stack(void *data, char *name)
static void save_stack_address(void *data, unsigned long addr, int reliable)
{
struct stack_trace *trace = data;
+ if (!reliable)
+ return;
if (trace->skip > 0) {
trace->skip--;
return;
@@ -37,6 +39,8 @@ static void
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
struct stack_trace *trace = (struct stack_trace *)data;
+ if (!reliable)
+ return;
if (in_sched_functions(addr))
return;
if (trace->skip > 0) {
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 2ef1a5f8d67..071ff479823 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -140,6 +140,9 @@ static int enable_single_step(struct task_struct *child)
*/
static void write_debugctlmsr(struct task_struct *child, unsigned long val)
{
+ if (child->thread.debugctlmsr == val)
+ return;
+
child->thread.debugctlmsr = val;
if (child != current)
@@ -165,11 +168,11 @@ static void enable_step(struct task_struct *child, bool block)
write_debugctlmsr(child,
child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
} else {
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+ write_debugctlmsr(child,
+ child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
}
}
@@ -189,7 +192,7 @@ void user_disable_single_step(struct task_struct *child)
* Make sure block stepping (BTF) is disabled.
*/
write_debugctlmsr(child,
- child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR);
+ child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
if (!child->thread.debugctlmsr)
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6dfd4e76661..ab6bf375a30 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -91,7 +91,9 @@ int do_set_thread_area(struct task_struct *p, int idx,
asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
{
- return do_set_thread_area(current, -1, u_info, 1);
+ int ret = do_set_thread_area(current, -1, u_info, 1);
+ asmlinkage_protect(1, ret, u_info);
+ return ret;
}
@@ -139,7 +141,9 @@ int do_get_thread_area(struct task_struct *p, int idx,
asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
{
- return do_get_thread_area(current, -1, u_info);
+ int ret = do_get_thread_area(current, -1, u_info);
+ asmlinkage_protect(1, ret, u_info);
+ return ret;
}
int regset_tls_active(struct task_struct *target,
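
asmlinkage_protect(), used in both hunks above, exists to keep the stack slots of asmlinkage arguments alive until the assembly syscall exit path is done with them. A hedged, hand-rolled approximation of the idea, not the kernel's actual macro:

asmlinkage int my_sys_get_thread_area(struct user_desc __user *u_info)
{
	int ret = do_get_thread_area(current, -1, u_info);

	/* pretend to read u_info so the compiler cannot reuse its stack slot */
	asm volatile ("" : : "m" (u_info));
	return ret;
}
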
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 43517e324be..c2241e04ea5 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -28,7 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
static int __init tsc_setup(char *str)
{
printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC.\n");
+ "cannot disable TSC completely.\n");
+ mark_tsc_unstable("user disabled TSC");
return 1;
}
#else
@@ -255,9 +256,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
tsc_khz = cpu_khz;
- preempt_disable();
- set_cyc2ns_scale(cpu_khz, smp_processor_id());
- preempt_enable();
+ set_cyc2ns_scale(cpu_khz, freq->cpu);
/*
* TSC based sched_clock turns
* to junk w/ cpufreq
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 947554ddabb..d3bebaaad84 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -148,9 +148,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
mark_tsc_unstable("cpufreq changes");
}
- preempt_disable();
- set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
- preempt_enable();
+ set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
return 0;
}
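
Both cpufreq-notifier hunks above pass freq->cpu straight to set_cyc2ns_scale() instead of disabling preemption around smp_processor_id(): the scale belongs to the CPU whose frequency changed, not to whichever CPU happens to run the notifier. For reference, a hedged sketch of the cycles-to-nanoseconds conversion this scale drives (the shift value is assumed to match the tsc code of this era):

#define CYC2NS_SCALE_FACTOR 10	/* assumed: 2^10 fixed-point fraction */

static inline unsigned long long cycles_2_ns(unsigned long long cyc,
					     unsigned long cyc2ns_scale)
{
	/* cyc2ns_scale ~= (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz */
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
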
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3f824277458..edff4c98548 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,11 +44,6 @@
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
#define __syscall_clobber "r11","cx","memory"
-#define __pa_vsymbol(x) \
- ({unsigned long v; \
- extern char __vsyscall_0; \
- asm("" : "=r" (v) : "0" (x)); \
- ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
/*
* vsyscall_gtod_data contains data that is :
@@ -102,7 +97,7 @@ static __always_inline void do_get_tz(struct timezone * tz)
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
int ret;
- asm volatile("vsysc2: syscall"
+ asm volatile("syscall"
: "=a" (ret)
: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
: __syscall_clobber );
@@ -112,7 +107,7 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
static __always_inline long time_syscall(long *t)
{
long secs;
- asm volatile("vsysc1: syscall"
+ asm volatile("syscall"
: "=a" (secs)
: "0" (__NR_time),"D" (t) : __syscall_clobber);
return secs;
@@ -228,42 +223,11 @@ long __vsyscall(3) venosys_1(void)
#ifdef CONFIG_SYSCTL
-#define SYSCALL 0x050f
-#define NOP2 0x9090
-
-/*
- * NOP out syscall in vsyscall page when not needed.
- */
-static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int
+vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- extern u16 vsysc1, vsysc2;
- u16 __iomem *map1;
- u16 __iomem *map2;
- int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
- if (!write)
- return ret;
- /* gcc has some trouble with __va(__pa()), so just do it this
- way. */
- map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
- if (!map1)
- return -ENOMEM;
- map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
- if (!map2) {
- ret = -ENOMEM;
- goto out;
- }
- if (!vsyscall_gtod_data.sysctl_enabled) {
- writew(SYSCALL, map1);
- writew(SYSCALL, map2);
- } else {
- writew(NOP2, map1);
- writew(NOP2, map2);
- }
- iounmap(map2);
-out:
- iounmap(map1);
- return ret;
+ return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
}
static ctl_table kernel_table2[] = {
@@ -279,7 +243,6 @@ static ctl_table kernel_root_table2[] = {
.child = kernel_table2 },
{}
};
-
#endif
/* Assume __initcall executes before all user space. Hopefully kmod
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2cbee9479ce..68a6b151193 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -647,6 +647,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
APIC_BUS_CYCLE_NS * apic->timer.divide_count;
atomic_set(&apic->timer.pending, 0);
+
+ if (!apic->timer.period)
+ return;
+
hrtimer_start(&apic->timer.dev,
ktime_add_ns(now, apic->timer.period),
HRTIMER_MODE_ABS);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8efdcdbebb0..e55af12e11b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -222,8 +222,7 @@ static int is_io_pte(unsigned long pte)
static int is_rmap_pte(u64 pte)
{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
+ return is_shadow_present_pte(pte);
}
static gfn_t pse36_gfn_delta(u32 gpte)
@@ -681,8 +680,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned level,
int metaphysical,
unsigned access,
- u64 *parent_pte,
- bool *new_page)
+ u64 *parent_pte)
{
union kvm_mmu_page_role role;
unsigned index;
@@ -722,8 +720,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.prefetch_page(vcpu, sp);
if (!metaphysical)
rmap_write_protect(vcpu->kvm, gfn);
- if (new_page)
- *new_page = 1;
return sp;
}
@@ -876,11 +872,18 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
+ struct page *page;
+
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
if (gpa == UNMAPPED_GVA)
return NULL;
- return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ return page;
}
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
@@ -889,14 +892,25 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
int *ptwrite, gfn_t gfn, struct page *page)
{
u64 spte;
- int was_rmapped = is_rmap_pte(*shadow_pte);
+ int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
+ hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
+ if (is_rmap_pte(*shadow_pte)) {
+ if (host_pfn != page_to_pfn(page)) {
+ pgprintk("hfn old %lx new %lx\n",
+ host_pfn, page_to_pfn(page));
+ rmap_remove(vcpu->kvm, shadow_pte);
+ }
+ else
+ was_rmapped = 1;
+ }
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -999,8 +1013,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
>> PAGE_SHIFT;
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
v, level - 1,
- 1, ACC_ALL, &table[index],
- NULL);
+ 1, ACC_ALL, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
kvm_release_page_clean(page);
@@ -1020,15 +1033,18 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
struct page *page;
+ down_read(&vcpu->kvm->slots_lock);
+
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
r = __nonpaging_map(vcpu, v, write, gfn, page);
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
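
This hunk (and the rest of the KVM conversions below) narrows mmap_sem to the gfn_to_page() call and introduces kvm->slots_lock as the outer lock protecting the memslot layout. A hedged sketch of the resulting ordering, outermost first, illustrative rather than a verbatim copy of any single function:

	down_read(&kvm->slots_lock);		/* memslots may not change */

	down_read(&current->mm->mmap_sem);	/* only around gfn_to_page() */
	page = gfn_to_page(kvm, gfn);
	up_read(&current->mm->mmap_sem);

	spin_lock(&kvm->mmu_lock);		/* mutate shadow page tables */
	/* ... install or update sptes ... */
	spin_unlock(&kvm->mmu_lock);

	up_read(&kvm->slots_lock);
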
@@ -1090,7 +1106,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
+ PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
@@ -1111,7 +1127,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = 0;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
- ACC_ALL, NULL, NULL);
+ ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
@@ -1172,7 +1188,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
+ pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
mmu_free_roots(vcpu);
}
@@ -1362,6 +1378,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn_t gfn;
int r;
u64 gpte = 0;
+ struct page *page;
if (bytes != 4 && bytes != 8)
return;
@@ -1389,8 +1406,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!is_present_pte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
+
vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
+ vcpu->arch.update_pte.page = page;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1496,9 +1518,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 03ba8608fe0..ecc0856268c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -91,7 +91,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
+ down_read(&current->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
+ up_read(&current->mm->mmap_sem);
+
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -140,7 +143,7 @@ walk:
}
#endif
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
+ (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
pt_access = ACC_ALL;
@@ -297,7 +300,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
u64 shadow_pte;
int metaphysical;
gfn_t table_gfn;
- bool new_page = 0;
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (level == PT_PAGE_TABLE_LEVEL)
@@ -319,8 +321,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, access,
- shadow_ent, &new_page);
- if (new_page && !metaphysical) {
+ shadow_ent);
+ if (!metaphysical) {
int r;
pt_element_t curr_pte;
r = kvm_read_guest_atomic(vcpu->kvm,
@@ -378,7 +380,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Look up the shadow pte for the faulting address.
*/
@@ -392,11 +394,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
+ down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, walker.gfn);
+ up_read(&current->mm->mmap_sem);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
@@ -413,14 +417,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
if (shadow_pte && is_io_pte(*shadow_pte)) {
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return write_pt;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de755cb1431..1a582f1090e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -792,6 +792,10 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vcpu->arch.cr0 = cr0;
cr0 |= X86_CR0_PG | X86_CR0_WP;
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ if (!vcpu->fpu_active) {
+ svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
+ cr0 |= X86_CR0_TS;
+ }
svm->vmcb->save.cr0 = cr0;
}
@@ -1096,6 +1100,24 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_SYSENTER_ESP:
*data = svm->vmcb->save.sysenter_esp;
break;
+ /* Nobody will change the following 5 values in the VMCB so
+ we can safely return them on rdmsr. They will always be 0
+ until LBRV is implemented. */
+ case MSR_IA32_DEBUGCTLMSR:
+ *data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ *data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ *data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ *data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ *data = svm->vmcb->save.last_excp_to;
+ break;
default:
return kvm_get_msr_common(vcpu, ecx, data);
}
@@ -1156,6 +1178,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
case MSR_IA32_SYSENTER_ESP:
svm->vmcb->save.sysenter_esp = data;
break;
+ case MSR_IA32_DEBUGCTLMSR:
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __FUNCTION__, data);
+ break;
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ad36447e696..8e1462880d1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -349,8 +349,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
static void reload_tss(void)
{
-#ifndef CONFIG_X86_64
-
/*
* VT restores TR but not its size. Useless.
*/
@@ -361,7 +359,6 @@ static void reload_tss(void)
descs = (void *)gdt.base;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
-#endif
}
static void load_transition_efer(struct vcpu_vmx *vmx)
@@ -638,6 +635,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs;
+ vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
@@ -1435,7 +1433,7 @@ static int init_rmode_tss(struct kvm *kvm)
int ret = 0;
int r;
- down_read(&current->mm->mmap_sem);
+ down_read(&kvm->slots_lock);
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -1458,7 +1456,7 @@ static int init_rmode_tss(struct kvm *kvm)
ret = 1;
out:
- up_read(&current->mm->mmap_sem);
+ up_read(&kvm->slots_lock);
return ret;
}
@@ -1477,7 +1475,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- down_write(&current->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1487,9 +1485,12 @@ static int alloc_apic_access_page(struct kvm *kvm)
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
+
+ down_read(&current->mm->mmap_sem);
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
+ up_read(&current->mm->mmap_sem);
out:
- up_write(&current->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
@@ -1602,9 +1603,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
- if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
- return -ENOMEM;
return 0;
}
@@ -2534,6 +2532,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
put_cpu();
if (err)
goto free_vmcs;
+ if (vm_need_virtualize_apic_accesses(kvm))
+ if (alloc_apic_access_page(kvm) != 0)
+ goto free_vmcs;
return &vmx->vcpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf530814868..6b01552bd1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,9 @@
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+
struct kvm_x86_ops *kvm_x86_ops;
struct kvm_stats_debugfs_item debugfs_entries[] = {
@@ -181,7 +184,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -198,7 +201,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return ret;
}
@@ -212,13 +215,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return changed;
}
@@ -356,7 +359,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -372,7 +375,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
@@ -484,6 +487,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
__FUNCTION__, data);
break;
+ case MSR_IA32_MCG_CTL:
+ pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
+ __FUNCTION__, data);
+ break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
case 0x200 ... 0x2ff: /* MTRRs */
@@ -526,6 +533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
case MSR_IA32_MC0_MISC:
case MSR_IA32_MC0_MISC+4:
case MSR_IA32_MC0_MISC+8:
@@ -727,6 +735,24 @@ long kvm_arch_dev_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_GET_SUPPORTED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
default:
r = -EINVAL;
}
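
With the hunk above (and the matching removal from the VM ioctl handler further down), KVM_GET_SUPPORTED_CPUID becomes a system ioctl issued on the /dev/kvm file descriptor rather than on a VM fd. A hedged userspace sketch; kvm_fd is assumed to be an open /dev/kvm descriptor and the entry count is arbitrary:

	struct kvm_cpuid2 *cpuid;
	int nent = 64;		/* arbitrary upper bound on returned entries */

	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
	cpuid->nent = nent;
	if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0)
		perror("KVM_GET_SUPPORTED_CPUID");
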
@@ -974,8 +1000,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
put_cpu();
}
-static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
- struct kvm_cpuid2 *cpuid,
+static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
struct kvm_cpuid_entry2 *cpuid_entries;
@@ -1207,12 +1232,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
- down_write(&current->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
- up_write(&current->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
}
@@ -1261,7 +1286,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
< alias->target_phys_addr)
goto out;
- down_write(&current->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
p = &kvm->arch.aliases[alias->slot];
p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1275,7 +1300,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
kvm_mmu_zap_all(kvm);
- up_write(&current->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return 0;
@@ -1351,7 +1376,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
- down_write(&current->mm->mmap_sem);
+ down_write(&kvm->slots_lock);
r = kvm_get_dirty_log(kvm, log, &is_dirty);
if (r)
@@ -1367,7 +1392,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
}
r = 0;
out:
- up_write(&current->mm->mmap_sem);
+ up_write(&kvm->slots_lock);
return r;
}
@@ -1487,24 +1512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
- case KVM_GET_SUPPORTED_CPUID: {
- struct kvm_cpuid2 __user *cpuid_arg = argp;
- struct kvm_cpuid2 cpuid;
-
- r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
- goto out;
- r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
- cpuid_arg->entries);
- if (r)
- goto out;
-
- r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
- goto out;
- r = 0;
- break;
- }
default:
;
}
@@ -1563,7 +1570,7 @@ int emulator_read_std(unsigned long addr,
void *data = val;
int r = X86EMUL_CONTINUE;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
@@ -1585,7 +1592,7 @@ int emulator_read_std(unsigned long addr,
addr += tocopy;
}
out:
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1604,9 +1611,9 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1644,14 +1651,14 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0) {
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 0;
}
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1663,9 +1670,9 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_io_device *mmio_dev;
gpa_t gpa;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1742,7 +1749,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA ||
@@ -1753,13 +1760,17 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
val = *(u64 *)new;
+
+ down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
kunmap_atomic(kaddr, KM_USER0);
kvm_release_page_dirty(page);
emul_write:
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
}
#endif
@@ -2152,10 +2163,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2478,8 +2489,9 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- vcpu->arch.apic->vapic_page = page;
up_read(&current->mm->mmap_sem);
+
+ vcpu->arch.apic->vapic_page = page;
}
static void vapic_exit(struct kvm_vcpu *vcpu)
@@ -2861,8 +2873,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->decache_cr4_guest_bits(vcpu);
mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
- vcpu->arch.cr0 = sregs->cr0;
kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
@@ -2952,9 +2964,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
vcpu_load(vcpu);
- down_read(&current->mm->mmap_sem);
+ down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
- up_read(&current->mm->mmap_sem);
+ up_read(&vcpu->kvm->slots_lock);
tr->physical_address = gpa;
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
@@ -3227,11 +3239,13 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
+ down_write(&current->mm->mmap_sem);
memslot->userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS,
0);
+ up_write(&current->mm->mmap_sem);
if (IS_ERR((void *)memslot->userspace_addr))
return PTR_ERR((void *)memslot->userspace_addr);
@@ -3239,8 +3253,10 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
if (!old.user_alloc && old.rmap) {
int ret;
+ down_write(&current->mm->mmap_sem);
ret = do_munmap(current->mm, old.userspace_addr,
old.npages * PAGE_SIZE);
+ up_write(&current->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5afdde4895d..3335b4595ef 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -10,21 +10,19 @@
* (such as the example in Documentation/lguest/lguest.c) is called the
* Launcher.
*
- * Secondly, we only run specially modified Guests, not normal kernels. When
- * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
- * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
- * how to be a Guest. This means that you can use the same kernel you boot
- * normally (ie. as a Host) as a Guest.
+ * Secondly, we only run specially modified Guests, not normal kernels: setting
+ * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
+ * how to be a Guest at boot time. This means that you can use the same kernel
+ * you boot normally (ie. as a Host) as a Guest.
*
* These Guests know that they cannot do privileged operations, such as disable
* interrupts, and that they have to ask the Host to do such things explicitly.
* This file consists of all the replacements for such low-level native
* hardware operations: these special Guest versions call the Host.
*
- * So how does the kernel know it's a Guest? The Guest starts at a special
- * entry point marked with a magic string, which sets up a few things then
- * calls here. We replace the native functions various "paravirt" structures
- * with our Guest versions, then boot like normal. :*/
+ * So how does the kernel know it's a Guest? We'll see that later, but let's
+ * just say that we end up here where we replace the native functions in the various
+ * "paravirt" structures with our Guest versions, then boot like normal. :*/
/*
* Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
@@ -57,6 +55,7 @@
#include <linux/lguest_launcher.h>
#include <linux/virtio_console.h>
#include <linux/pm.h>
+#include <asm/lguest.h>
#include <asm/paravirt.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -75,15 +74,6 @@
* behaving in simplified but equivalent ways. In particular, the Guest is the
* same kernel as the Host (or at least, built from the same source code). :*/
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
-extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
-extern void lguest_iret(void);
-
struct lguest_data lguest_data = {
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
.noirq_start = (u32)lguest_noirq_start,
@@ -92,7 +82,6 @@ struct lguest_data lguest_data = {
.blocked_interrupts = { 1 }, /* Block timer interrupts */
.syscall_vec = SYSCALL_VECTOR,
};
-static cycle_t clock_base;
/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
* ring buffer of stored hypercalls which the Host will run though next time we
@@ -143,7 +132,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
* lguest_leave_lazy_mode().
*
* So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing. */
+ * future processing: */
static void lazy_hcall(unsigned long call,
unsigned long arg1,
unsigned long arg2,
@@ -156,7 +145,7 @@ static void lazy_hcall(unsigned long call,
}
/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue a hypercall to flush any stored calls. */
+ * issue the do-nothing hypercall to flush any stored calls. */
static void lguest_leave_lazy_mode(void)
{
paravirt_leave_lazy(paravirt_get_lazy_mode());
@@ -173,7 +162,7 @@ static void lguest_leave_lazy_mode(void)
*
* So instead we keep an "irq_enabled" field inside our "struct lguest_data",
* which the Guest can update with a single instruction. The Host knows to
- * check there when it wants to deliver an interrupt.
+ * check there before it tries to deliver an interrupt.
*/
/* save_flags() is expected to return the processor state (ie. "flags"). The
@@ -205,10 +194,15 @@ static void irq_enable(void)
/*M:003 Note that we don't check for outstanding interrupts when we re-enable
* them (or when we unmask an interrupt). This seems to work for the moment,
* since interrupts are rare and we'll just get the interrupt on the next timer
- * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way
+ * tick, but now that we can run with CONFIG_NO_HZ, we should revisit this. One way
* would be to put the "irq_enabled" field in a page by itself, and have the
* Host write-protect it when an interrupt comes in when irqs are disabled.
- * There will then be a page fault as soon as interrupts are re-enabled. :*/
+ * There will then be a page fault as soon as interrupts are re-enabled.
+ *
+ * A better method is to implement soft interrupt disable generally for x86:
+ * instead of disabling interrupts, we set a flag. If an interrupt does come
+ * in, we then disable them for real. This is uncommon, so we could simply use
+ * a hypercall for interrupt control and not worry about efficiency. :*/
/*G:034
* The Interrupt Descriptor Table (IDT).
@@ -221,6 +215,10 @@ static void irq_enable(void)
static void lguest_write_idt_entry(gate_desc *dt,
int entrynum, const gate_desc *g)
{
+ /* The gate_desc structure is 8 bytes long: we hand it to the Host in
+ * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors
+ * around like this; typesafety wasn't a big concern in Linux's early
+ * years. */
u32 *desc = (u32 *)g;
/* Keep the local copy up to date. */
native_write_idt_entry(dt, entrynum, g);
@@ -252,7 +250,8 @@ static void lguest_load_idt(const struct desc_ptr *desc)
*
* This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
* hypercall and use that repeatedly to load a new IDT. I don't think it
- * really matters, but wouldn't it be nice if they were the same?
+ * really matters, but wouldn't it be nice if they were the same? Wouldn't
+ * it be even better if you were the one to send the patch to fix it?
*/
static void lguest_load_gdt(const struct desc_ptr *desc)
{
@@ -307,9 +306,9 @@ static void lguest_load_tr_desc(void)
/* The "cpuid" instruction is a way of querying both the CPU identity
* (manufacturer, model, etc) and its features. It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you
- * might imagine, after a decade and a half this treatment, it is now a giant
- * ball of hair. Its entry in the current Intel manual runs to 28 pages.
+ * Pentium in 1993 and keeps getting extended by Intel, AMD and others.
+ * As you might imagine, after a decade and a half this treatment, it is now a
+ * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
*
* This instruction even has its own Wikipedia entry. The Wikipedia entry
* has been translated into 4 languages. I am not making this up!
@@ -335,8 +334,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
case 1: /* Basic feature request. */
/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
*cx &= 0x00002201;
- /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
- *dx &= 0x07808101;
+ /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
+ *dx &= 0x07808111;
/* The Host can do a nice optimization if it knows that the
* kernel mappings (addresses above 0xC0000000 or whatever
* PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -603,19 +602,25 @@ static unsigned long lguest_get_wallclock(void)
return lguest_data.time.tv_sec;
}
+/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us
+ * what speed it runs at, or 0 if it's unusable as a reliable clock source.
+ * This matches what we want here: if we return 0 from this function, the x86
+ * TSC clock will give up and not register itself. */
+static unsigned long lguest_cpu_khz(void)
+{
+ return lguest_data.tsc_khz;
+}
+
+/* If we can't use the TSC, the kernel falls back to our lower-priority
+ * "lguest_clock", where we read the time value given to us by the Host. */
static cycle_t lguest_clock_read(void)
{
unsigned long sec, nsec;
- /* If the Host tells the TSC speed, we can trust that. */
- if (lguest_data.tsc_khz)
- return native_read_tsc();
-
- /* If we can't use the TSC, we read the time value written by the Host.
- * Since it's in two parts (seconds and nanoseconds), we risk reading
- * it just as it's changing from 99 & 0.999999999 to 100 and 0, and
- * getting 99 and 0. As Linux tends to come apart under the stress of
- * time travel, we must be careful: */
+ /* Since the time is in two parts (seconds and nanoseconds), we risk
+ * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
+ * and getting 99 and 0. As Linux tends to come apart under the stress
+ * of time travel, we must be careful: */
do {
/* First we read the seconds part. */
sec = lguest_data.time.tv_sec;
@@ -630,14 +635,14 @@ static cycle_t lguest_clock_read(void)
/* Now if the seconds part has changed, try again. */
} while (unlikely(lguest_data.time.tv_sec != sec));
- /* Our non-TSC clock is in real nanoseconds. */
+ /* Our lguest clock is in real nanoseconds. */
return sec*1000000000ULL + nsec;
}
-/* This is what we tell the kernel is our clocksource. */
+/* This is the fallback clocksource: lower priority than the TSC clocksource. */
static struct clocksource lguest_clock = {
.name = "lguest",
- .rating = 400,
+ .rating = 200,
.read = lguest_clock_read,
.mask = CLOCKSOURCE_MASK(64),
.mult = 1 << 22,
@@ -645,24 +650,22 @@ static struct clocksource lguest_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-/* The "scheduler clock" is just our real clock, adjusted to start at zero */
-static unsigned long long lguest_sched_clock(void)
-{
- return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
-}
-
/* We also need a "struct clock_event_device": Linux asks us to set it to go
* off some time in the future. Actually, James Morris figured all this out, I
* just applied the patch. */
static int lguest_clockevent_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
+ /* FIXME: I don't think this can ever happen, but James tells me he had
+ * to put this code in. Maybe we should remove it now. Anyone? */
if (delta < LG_CLOCK_MIN_DELTA) {
if (printk_ratelimit())
printk(KERN_DEBUG "%s: small delta %lu ns\n",
__FUNCTION__, delta);
return -ETIME;
}
+
+ /* Please wake us this far in the future. */
hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
return 0;
}
@@ -720,19 +723,8 @@ static void lguest_time_init(void)
/* Set up the timer interrupt (0) to go to our simple timer routine */
set_irq_handler(0, lguest_time_irq);
- /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
- * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
- * Either way, the "rating" is set so high that it's always chosen over
- * any other clocksource. */
- if (lguest_data.tsc_khz)
- lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
- lguest_clock.shift);
- clock_base = lguest_clock_read();
clocksource_register(&lguest_clock);
- /* Now we've set up our clock, we can use it as the scheduler clock */
- pv_time_ops.sched_clock = lguest_sched_clock;
-
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
lguest_clockevent.cpumask = cpumask_of_cpu(0);
@@ -758,7 +750,7 @@ static void lguest_time_init(void)
* will not tolerate us trying to use that), the stack pointer, and the number
* of pages in the stack. */
static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+ struct thread_struct *thread)
{
lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0,
THREAD_SIZE/PAGE_SIZE);
@@ -806,9 +798,8 @@ static void lguest_safe_halt(void)
hcall(LHCALL_HALT, 0, 0, 0);
}
-/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a
- * message out when we're crashing as well as elegant termination like powering
- * off.
+/* The SHUTDOWN hypercall takes a string to describe what's happening, and
+ * an argument which says whether this is to restart (reboot) the Guest or not.
*
* Note that the Host always prefers that the Guest speak in physical addresses
* rather than virtual addresses, so we use __pa() here. */
@@ -836,8 +827,9 @@ static struct notifier_block paniced = {
/* Setting up memory is fairly easy. */
static __init char *lguest_memory_setup(void)
{
- /* We do this here and not earlier because lockcheck barfs if we do it
- * before start_kernel() */
+ /* We do this here and not earlier because lockcheck used to barf if we
+ * did it before start_kernel(). I think we fixed that, so it'd be
+ * nice to move it back to lguest_init. Patch welcome... */
atomic_notifier_chain_register(&panic_notifier_list, &paniced);
/* The Linux bootloader header contains an "e820" memory map: the
@@ -870,12 +862,19 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
return len;
}
+/* Rebooting also tells the Host we're finished, but the RESTART flag tells the
+ * Launcher to reboot us. */
+static void lguest_restart(char *reason)
+{
+ hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
+}
+
/*G:050
* Patching (Powerfully Placating Performance Pedants)
*
- * We have already seen that pv_ops structures let us replace simple
- * native instructions with calls to the appropriate back end all throughout
- * the kernel. This allows the same kernel to run as a Guest and as a native
+ * We have already seen that pv_ops structures let us replace simple native
+ * instructions with calls to the appropriate back end all throughout the
+ * kernel. This allows the same kernel to run as a Guest and as a native
* kernel, but it's slow because of all the indirect branches.
*
* Remember that David Wheeler quote about "Any problem in computer science can
@@ -928,14 +927,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
return insn_len;
}
-static void lguest_restart(char *reason)
-{
- hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0);
-}
-
-/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
- * structures in the kernel provide points for (almost) every routine we have
- * to override to avoid privileged instructions. */
+/*G:030 Once we get to lguest_init(), we know we're a Guest. The various
+ * pv_ops structures in the kernel provide points for (almost) every routine we
+ * have to override to avoid privileged instructions. */
__init void lguest_init(void)
{
/* We're under lguest, paravirt is enabled, and we're running at
@@ -1003,6 +997,7 @@ __init void lguest_init(void)
/* time operations */
pv_time_ops.get_wallclock = lguest_get_wallclock;
pv_time_ops.time_init = lguest_time_init;
+ pv_time_ops.get_cpu_khz = lguest_cpu_khz;
/* Now is a good time to look at the implementations of these functions
* before returning to the rest of lguest_init(). */
@@ -1022,9 +1017,9 @@ __init void lguest_init(void)
* the normal data segment to get through booting. */
asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
- /* The Host uses the top of the Guest's virtual address space for the
- * Host<->Guest Switcher, and it tells us how big that is in
- * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
+ /* The Host<->Guest Switcher lives at the top of our address space, and
+ * the Host told us how big it is when we made the LGUEST_INIT hypercall:
+ * it put the answer in lguest_data.reserve_mem */
reserve_top_address(lguest_data.reserve_mem);
/* If we don't initialize the lock dependency checker now, it crashes
@@ -1046,6 +1041,7 @@ __init void lguest_init(void)
/* Math is always hard! */
new_cpu_data.hard_math = 1;
+ /* We don't have features. We have puppies! Puppies! */
#ifdef CONFIG_X86_MCE
mce_disabled = 1;
#endif
@@ -1063,10 +1059,11 @@ __init void lguest_init(void)
virtio_cons_early_init(early_put_chars);
/* Last of all, we set the power management poweroff hook to point to
- * the Guest routine to power off. */
+ * the Guest routine to power off, and the reboot hook to our restart
+ * routine. */
pm_power_off = lguest_power_off;
-
machine_ops.restart = lguest_restart;
+
/* Now we're set up, call start_kernel() in init/main.c and we proceed
* to boot as normal. It never returns. */
start_kernel();
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 95b6fbcded6..5c7cef34c9e 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -5,13 +5,20 @@
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
-/*G:020 This is where we begin: head.S notes that the boot header's platform
- * type field is "1" (lguest), so calls us here.
+/*G:020 Our story starts with the kernel booting into startup_32 in
+ * arch/x86/kernel/head_32.S. It expects a boot header, which is created by
+ * the bootloader (the Launcher in our case).
+ *
+ * The startup_32 function does very little: it clears the uninitialized global
+ * C variables which we expect to be zero (ie. BSS) and then copies the boot
+ * header and kernel command line somewhere safe. Finally it checks the
+ * 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen:
+ * if it's set to '1' (lguest's assigned number), then it calls us here.
*
* WARNING: be very careful here! We're running at addresses equal to physical
* addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
- * data.
+ * data without remembering to subtract __PAGE_OFFSET!
*
* The .section line puts this code in .init.text so it will be discarded after
* boot. */
@@ -24,7 +31,7 @@ ENTRY(lguest_entry)
int $LGUEST_TRAP_ENTRY
/* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
- * instruction uses %esi implicitly as the source for the copy we'
+ * instruction uses %esi implicitly as the source for the copy we're
* about to do. */
movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c
index 031269163bd..247f33d3a40 100644
--- a/arch/x86/mach-rdc321x/gpio.c
+++ b/arch/x86/mach-rdc321x/gpio.c
@@ -1,91 +1,194 @@
/*
- * Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org>
- * RDC321x architecture specific GPIO support
+ * GPIO support for RDC SoC R3210/R8610
+ *
+ * Copyright (C) 2007, Florian Fainelli <florian@openwrt.org>
+ * Copyright (C) 2008, Volker Weiss <dev@tintuc.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
*/
-#include <linux/autoconf.h>
-#include <linux/init.h>
+
+#include <linux/spinlock.h>
#include <linux/io.h>
#include <linux/types.h>
#include <linux/module.h>
-#include <linux/delay.h>
+#include <asm/gpio.h>
#include <asm/mach-rdc321x/rdc321x_defs.h>
-static inline int rdc_gpio_is_valid(unsigned gpio)
+
+/* spin lock to protect our private copy of GPIO data register plus
+ the access to PCI conf registers. */
+static DEFINE_SPINLOCK(gpio_lock);
+
+/* copy of GPIO data registers */
+static u32 gpio_data_reg1;
+static u32 gpio_data_reg2;
+
+static u32 gpio_request_data[2];
+
+
+static inline void rdc321x_conf_write(unsigned addr, u32 value)
{
- return (gpio <= RDC_MAX_GPIO);
+ outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
+ outl(value, RDC3210_CFGREG_DATA);
}
-static unsigned int rdc_gpio_read(unsigned gpio)
+static inline void rdc321x_conf_or(unsigned addr, u32 value)
{
- unsigned int val;
-
- val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48));
- outl(val, RDC3210_CFGREG_ADDR);
- udelay(10);
- val = inl(RDC3210_CFGREG_DATA);
- val |= (0x1 << (gpio & 0x1F));
- outl(val, RDC3210_CFGREG_DATA);
- udelay(10);
- val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C));
- outl(val, RDC3210_CFGREG_ADDR);
- udelay(10);
- val = inl(RDC3210_CFGREG_DATA);
-
- return val;
+ outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
+ value |= inl(RDC3210_CFGREG_DATA);
+ outl(value, RDC3210_CFGREG_DATA);
}
-static void rdc_gpio_write(unsigned int val)
+static inline u32 rdc321x_conf_read(unsigned addr)
{
- if (val) {
- outl(val, RDC3210_CFGREG_DATA);
- udelay(10);
- }
+ outl((1 << 31) | (7 << 11) | addr, RDC3210_CFGREG_ADDR);
+
+ return inl(RDC3210_CFGREG_DATA);
}
-int rdc_gpio_get_value(unsigned gpio)
+/* configure pin as GPIO */
+static void rdc321x_configure_gpio(unsigned gpio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&gpio_lock, flags);
+ rdc321x_conf_or(gpio < 32
+ ? RDC321X_GPIO_CTRL_REG1 : RDC321X_GPIO_CTRL_REG2,
+ 1 << (gpio & 0x1f));
+ spin_unlock_irqrestore(&gpio_lock, flags);
+}
+
+/* initially set up the 2 copies of the gpio data registers.
+ This function must be called by the platform setup code. */
+void __init rdc321x_gpio_setup()
+{
+ /* this might not be what others (BIOS, bootloader, etc.)
+ wrote to these registers before, but it's a good guess. Still
+ better than just using 0xffffffff. */
+
+ gpio_data_reg1 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG1);
+ gpio_data_reg2 = rdc321x_conf_read(RDC321X_GPIO_DATA_REG2);
+}
+
+/* determine if the gpio number is valid */
+static inline int rdc321x_is_gpio(unsigned gpio)
+{
+ return gpio <= RDC321X_MAX_GPIO;
+}
+
+/* request GPIO */
+int rdc_gpio_request(unsigned gpio, const char *label)
{
- if (rdc_gpio_is_valid(gpio))
- return (int)rdc_gpio_read(gpio);
- else
+ unsigned long flags;
+
+ if (!rdc321x_is_gpio(gpio))
return -EINVAL;
+
+ spin_lock_irqsave(&gpio_lock, flags);
+ if (gpio_request_data[(gpio & 0x20) ? 1 : 0] & (1 << (gpio & 0x1f)))
+ goto inuse;
+ gpio_request_data[(gpio & 0x20) ? 1 : 0] |= (1 << (gpio & 0x1f));
+ spin_unlock_irqrestore(&gpio_lock, flags);
+
+ return 0;
+inuse:
+ spin_unlock_irqrestore(&gpio_lock, flags);
+ return -EINVAL;
}
-EXPORT_SYMBOL(rdc_gpio_get_value);
+EXPORT_SYMBOL(rdc_gpio_request);
-void rdc_gpio_set_value(unsigned gpio, int value)
+/* release previously-claimed GPIO */
+void rdc_gpio_free(unsigned gpio)
{
- unsigned int val;
+ unsigned long flags;
- if (!rdc_gpio_is_valid(gpio))
+ if (!rdc321x_is_gpio(gpio))
return;
- val = rdc_gpio_read(gpio);
+ spin_lock_irqsave(&gpio_lock, flags);
+ gpio_request_data[(gpio & 0x20) ? 1 : 0] &= ~(1 << (gpio & 0x1f));
+ spin_unlock_irqrestore(&gpio_lock, flags);
+}
+EXPORT_SYMBOL(rdc_gpio_free);
+
+/* read GPIO pin */
+int rdc_gpio_get_value(unsigned gpio)
+{
+ u32 reg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&gpio_lock, flags);
+ reg = rdc321x_conf_read(gpio < 32
+ ? RDC321X_GPIO_DATA_REG1 : RDC321X_GPIO_DATA_REG2);
+ spin_unlock_irqrestore(&gpio_lock, flags);
- if (value)
- val &= ~(0x1 << (gpio & 0x1F));
- else
- val |= (0x1 << (gpio & 0x1F));
+ return (1 << (gpio & 0x1f)) & reg ? 1 : 0;
+}
+EXPORT_SYMBOL(rdc_gpio_get_value);
- rdc_gpio_write(val);
+/* set GPIO pin to value */
+void rdc_gpio_set_value(unsigned gpio, int value)
+{
+ unsigned long flags;
+ u32 reg;
+
+ reg = 1 << (gpio & 0x1f);
+ if (gpio < 32) {
+ spin_lock_irqsave(&gpio_lock, flags);
+ if (value)
+ gpio_data_reg1 |= reg;
+ else
+ gpio_data_reg1 &= ~reg;
+ rdc321x_conf_write(RDC321X_GPIO_DATA_REG1, gpio_data_reg1);
+ spin_unlock_irqrestore(&gpio_lock, flags);
+ } else {
+ spin_lock_irqsave(&gpio_lock, flags);
+ if (value)
+ gpio_data_reg2 |= reg;
+ else
+ gpio_data_reg2 &= ~reg;
+ rdc321x_conf_write(RDC321X_GPIO_DATA_REG2, gpio_data_reg2);
+ spin_unlock_irqrestore(&gpio_lock, flags);
+ }
}
EXPORT_SYMBOL(rdc_gpio_set_value);
+/* configure GPIO pin as input */
int rdc_gpio_direction_input(unsigned gpio)
{
+ if (!rdc321x_is_gpio(gpio))
+ return -EINVAL;
+
+ rdc321x_configure_gpio(gpio);
+
return 0;
}
EXPORT_SYMBOL(rdc_gpio_direction_input);
+/* configure GPIO pin as output and set value */
int rdc_gpio_direction_output(unsigned gpio, int value)
{
+ if (!rdc321x_is_gpio(gpio))
+ return -EINVAL;
+
+ gpio_set_value(gpio, value);
+ rdc321x_configure_gpio(gpio);
+
return 0;
}
EXPORT_SYMBOL(rdc_gpio_direction_output);
-
-
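
The rewritten driver above exposes request/free plus direction and value accessors, all serialized by gpio_lock. A hedged usage example; the pin number and label are made up for illustration:

#define MY_LED_GPIO	15		/* hypothetical board-specific pin */

static int __init my_board_led_init(void)
{
	int err = rdc_gpio_request(MY_LED_GPIO, "led");

	if (err)
		return err;
	return rdc_gpio_direction_output(MY_LED_GPIO, 1);	/* drive high */
}
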
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
index dda6024a586..a037041817c 100644
--- a/arch/x86/mach-rdc321x/platform.c
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -62,6 +62,8 @@ static struct platform_device *rdc321x_devs[] = {
static int __init rdc_board_setup(void)
{
+ rdc321x_gpio_setup();
+
return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs));
}
diff --git a/arch/x86/mach-visws/traps.c b/arch/x86/mach-visws/traps.c
index 843b67acf43..bfac6ba10f8 100644
--- a/arch/x86/mach-visws/traps.c
+++ b/arch/x86/mach-visws/traps.c
@@ -46,8 +46,9 @@ static __init void cobalt_init(void)
*/
set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
setup_local_APIC();
- printk(KERN_INFO "Local APIC Version %#lx, ID %#lx\n",
- apic_read(APIC_LVR), apic_read(APIC_ID));
+ printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
+ (unsigned int)apic_read(APIC_LVR),
+ (unsigned int)apic_read(APIC_ID));
set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index c394ca0720b..8e25e06ff73 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -324,7 +324,6 @@ unsigned long __init setup_memory(void)
* this space and use it to adjust the boundary between ZONE_NORMAL
* and ZONE_HIGHMEM.
*/
- find_max_pfn();
get_memcfg_numa();
kva_pages = calculate_numa_remap_pages();
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fdc667422df..ec08d838985 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -91,12 +91,10 @@ static int is_prefetch(struct pt_regs *regs, unsigned long addr,
int prefetch = 0;
unsigned char *max_instr;
-#ifdef CONFIG_X86_32
- if (!(__supported_pte_mask & _PAGE_NX))
- return 0;
-#endif
-
- /* If it was a exec fault on NX page, ignore */
+ /*
+ * If it was an exec (instruction fetch) fault on an NX page, then
+ * do not ignore the fault:
+ */
if (error_code & PF_INSTR)
return 0;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 3d936f23270..9cf33d3ee5b 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -73,15 +73,15 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
- /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
-
- debug_kmap_atomic_prot(type);
+ /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
+ debug_kmap_atomic_prot(type);
+
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 4fbafb4bc2f..0b3d567e686 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -178,7 +178,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
- WARN_ON(!PageCompound(page));
+ WARN_ON(!PageHead(page));
return page;
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bb652f5a93f..a02a14f0f32 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,8 +172,9 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
}
/*
- * The head.S code sets up the kernel high mapping from:
- * __START_KERNEL_map to __START_KERNEL_map + KERNEL_TEXT_SIZE
+ * The head.S code sets up the kernel high mapping:
+ *
+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
*
* phys_addr holds the negative offset to the kernel, which is added
* to the compile time generated pmds. This results in invalid pmds up
@@ -515,14 +516,6 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- /* temporary debugging - double check it's true: */
- {
- int i;
-
- for (i = 0; i < 1024; i++)
- WARN_ON_ONCE(empty_zero_page[i]);
- }
-
reservedpages = 0;
/* this will put all low memory onto the freelists */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 882328efc3d..794895c6dcc 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -106,7 +106,7 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
-static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
+static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
enum ioremap_mode mode)
{
unsigned long pfn, offset, last_addr, vaddr;
@@ -134,12 +134,14 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
return NULL;
}
- WARN_ON_ONCE(page_is_ram(pfn));
-
switch (mode) {
case IOR_MODE_UNCACHED:
default:
- prot = PAGE_KERNEL_NOCACHE;
+ /*
+ * FIXME: we will use UC MINUS for now, as video fb drivers
+ * depend on it. Upcoming ioremap_wc() will fix this behavior.
+ */
+ prot = PAGE_KERNEL_UC_MINUS;
break;
case IOR_MODE_CACHED:
prot = PAGE_KERNEL;
@@ -162,7 +164,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
- remove_vm_area((void *)(vaddr & PAGE_MASK));
+ free_vm_area(area);
return NULL;
}
@@ -195,13 +197,13 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
*
* Must be freed with iounmap.
*/
-void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
}
EXPORT_SYMBOL(ioremap_nocache);
-void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap(phys_addr, size, IOR_MODE_CACHED);
}
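
The prototype change above (unsigned long to resource_size_t) matters on 32-bit PAE kernels, where an MMIO resource can sit above 4GB and would otherwise be silently truncated. A hedged example; the address and register meaning are invented:

	resource_size_t bar = 0x100000000ULL;	/* hypothetical 36-bit MMIO base */
	void __iomem *regs = ioremap_nocache(bar, 0x1000);

	if (regs) {
		printk(KERN_INFO "chip id %08x\n", readl(regs));
		iounmap(regs);
	}
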
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 59898fb0a4a..16b82ad34b9 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -221,8 +221,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
if (nodedata_phys < start || nodedata_phys >= end)
- free_bootmem((unsigned long)node_data[nodeid],
- pgdat_size);
+ free_bootmem(nodedata_phys, pgdat_size);
node_data[nodeid] = NULL;
return;
}
@@ -622,13 +621,17 @@ void __init init_cpu_to_node(void)
int i;
for (i = 0; i < NR_CPUS; i++) {
+ int node;
u16 apicid = x86_cpu_to_apicid_init[i];
if (apicid == BAD_APICID)
continue;
- if (apicid_to_node[apicid] == NUMA_NO_NODE)
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE)
continue;
- numa_set_node(i, apicid_to_node[apicid]);
+ if (!node_online(node))
+ continue;
+ numa_set_node(i, node);
}
}
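Two fixes here: free_bootmem() takes the physical address that was handed out (nodedata_phys), not the node_data virtual pointer, and init_cpu_to_node() must not bind a CPU to a node that never came online. A minimal sketch of the latter guard, reusing the names from the hunk (apicid_to_node and numa_set_node are the arch helpers used above; the wrapper is hypothetical):

#include <linux/init.h>
#include <linux/types.h>
#include <linux/nodemask.h>

/* Sketch of the guard added above: only bind a CPU to a node that is both
 * known and online; otherwise leave the default node assignment alone. */
static void __init bind_cpu_to_node(int cpu, u16 apicid)
{
	int node = apicid_to_node[apicid];

	if (node == NUMA_NO_NODE || !node_online(node))
		return;
	numa_set_node(cpu, node);
}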
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e2a74ea11a5..7b79f6be4e7 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -44,6 +44,12 @@ static inline unsigned long highmap_end_pfn(void)
#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -355,45 +361,48 @@ out_unlock:
static LIST_HEAD(page_pool);
static unsigned long pool_size, pool_pages, pool_low;
-static unsigned long pool_used, pool_failed, pool_refill;
+static unsigned long pool_used, pool_failed;
-static void cpa_fill_pool(void)
+static void cpa_fill_pool(struct page **ret)
{
- struct page *p;
gfp_t gfp = GFP_KERNEL;
+ unsigned long flags;
+ struct page *p;
- /* Do not allocate from interrupt context */
- if (in_irq() || irqs_disabled())
- return;
/*
- * Check unlocked. I does not matter when we have one more
- * page in the pool. The bit lock avoids recursive pool
- * allocations:
+ * Avoid recursion (on debug-pagealloc) and also signal
+ * our priority to get to these pagetables:
*/
- if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+ if (current->flags & PF_MEMALLOC)
return;
+ current->flags |= PF_MEMALLOC;
-#ifdef CONFIG_DEBUG_PAGEALLOC
/*
- * We could do:
- * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
- * but this fails on !PREEMPT kernels
+ * Allocate atomically from atomic contexts:
*/
- gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
-#endif
+ if (in_atomic() || irqs_disabled() || debug_pagealloc)
+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
- while (pool_pages < pool_size) {
+ while (pool_pages < pool_size || (ret && !*ret)) {
p = alloc_pages(gfp, 0);
if (!p) {
pool_failed++;
break;
}
- spin_lock_irq(&pgd_lock);
+ /*
+ * If the call site needs a page right now, provide it:
+ */
+ if (ret && !*ret) {
+ *ret = p;
+ continue;
+ }
+ spin_lock_irqsave(&pgd_lock, flags);
list_add(&p->lru, &page_pool);
pool_pages++;
- spin_unlock_irq(&pgd_lock);
+ spin_unlock_irqrestore(&pgd_lock, flags);
}
- clear_bit_unlock(0, &pool_refill);
+
+ current->flags &= ~PF_MEMALLOC;
}
#define SHIFT_MB (20 - PAGE_SHIFT)
@@ -414,11 +423,15 @@ void __init cpa_init(void)
* GiB. Shift MiB to Gib and multiply the result by
* POOL_PAGES_PER_GB:
*/
- gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
- pool_size = POOL_PAGES_PER_GB * gb;
+ if (debug_pagealloc) {
+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+ pool_size = POOL_PAGES_PER_GB * gb;
+ } else {
+ pool_size = 1;
+ }
pool_low = pool_size;
- cpa_fill_pool();
+ cpa_fill_pool(NULL);
printk(KERN_DEBUG
"CPA: page pool initialized %lu of %lu pages preallocated\n",
pool_pages, pool_size);
@@ -440,16 +453,20 @@ static int split_large_page(pte_t *kpte, unsigned long address)
spin_lock_irqsave(&pgd_lock, flags);
if (list_empty(&page_pool)) {
spin_unlock_irqrestore(&pgd_lock, flags);
- return -ENOMEM;
+ base = NULL;
+ cpa_fill_pool(&base);
+ if (!base)
+ return -ENOMEM;
+ spin_lock_irqsave(&pgd_lock, flags);
+ } else {
+ base = list_first_entry(&page_pool, struct page, lru);
+ list_del(&base->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
}
- base = list_first_entry(&page_pool, struct page, lru);
- list_del(&base->lru);
- pool_pages--;
-
- if (pool_pages < pool_low)
- pool_low = pool_pages;
-
/*
* Check for races, another CPU might have split this page
* up for us already:
@@ -734,7 +751,8 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
cpa_flush_all(cache);
out:
- cpa_fill_pool();
+ cpa_fill_pool(NULL);
+
return ret;
}
@@ -753,7 +771,7 @@ static inline int change_page_attr_clear(unsigned long addr, int numpages,
int set_memory_uc(unsigned long addr, int numpages)
{
return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_PCD | _PAGE_PWT));
+ __pgprot(_PAGE_PCD));
}
EXPORT_SYMBOL(set_memory_uc);
@@ -897,9 +915,26 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
* Try to refill the page pool here. We can do this only after
* the tlb flush.
*/
- cpa_fill_pool();
+ cpa_fill_pool(NULL);
}
-#endif
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+ unsigned int level;
+ pte_t *pte;
+
+ if (PageHighMem(page))
+ return false;
+
+ pte = lookup_address((unsigned long)page_address(page), &level);
+ return (pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
/*
* The testcases use internal knowledge of the implementation that shouldn't
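The rewritten cpa_fill_pool() leans on two established patterns: setting PF_MEMALLOC on current both breaks allocation recursion under CONFIG_DEBUG_PAGEALLOC and lets the allocator dip into reserves, and GFP_ATOMIC is used whenever the caller may not sleep. A condensed sketch of that guard shape, assuming a stripped-down helper (the real function also refills the page_pool list under pgd_lock):

#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/gfp.h>

/* Sketch of the PF_MEMALLOC guard used by cpa_fill_pool() above: if this
 * task is already allocating on our behalf, bail out instead of recursing. */
static struct page *alloc_cpa_page(void)
{
	gfp_t gfp = GFP_KERNEL;
	struct page *page;

	if (current->flags & PF_MEMALLOC)
		return NULL;			/* recursion: already inside */
	current->flags |= PF_MEMALLOC;		/* mark, and gain access to reserves */

	if (in_atomic() || irqs_disabled())
		gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;

	page = alloc_pages(gfp, 0);

	current->flags &= ~PF_MEMALLOC;
	return page;
}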
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 73aba712520..2f9e9afcb9f 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -342,12 +342,16 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- mm->pgd = pgd; /* so that alloc_pd can use it */
+ /* so that alloc_pd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- quicklist_free(0, pgd_dtor, pgd);
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
pgd = NULL;
}
@@ -357,12 +361,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
- quicklist_free(0, pgd_dtor, pgd);
-}
-
-void check_pgt_cache(void)
-{
- quicklist_trim(0, pgd_dtor, 25, 16);
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
}
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
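With the quicklist layer gone from the PGD path, a pgd is now a plain zeroed page plus the file's existing pgd_ctor()/pgd_dtor() pair, and the error path must undo both. A condensed sketch of that pairing, assuming the ctor/dtor from this file (the helper names are hypothetical):

#include <linux/gfp.h>
#include <asm/pgtable.h>

/* Sketch (mirrors pgd_alloc()/pgd_free() above): one zeroed page with an
 * explicit ctor/dtor pair instead of a quicklist entry. */
static pgd_t *alloc_pgd_page(void)
{
	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

	if (pgd)
		pgd_ctor(pgd);		/* install kernel mappings, add to pgd_list */
	return pgd;
}

static void free_pgd_page(pgd_t *pgd)
{
	pgd_dtor(pgd);			/* remove from pgd_list */
	free_page((unsigned long)pgd);
}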
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0234f2831bf..378136fb504 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -219,8 +219,21 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
if (pxm >= 0)
sd->node = pxm_to_node(pxm);
#endif
+ /*
+	 * The desired pci bus may already have been scanned. In that case
+	 * there is no need to scan it again for the given domain and bus number.
+ */
+ bus = pci_find_bus(domain, busnum);
+ if (bus) {
+ /*
+	 * If the desired bus already exists, the content of bus->sysdata
+	 * will be replaced by sd.
+ */
+ memcpy(bus->sysdata, sd, sizeof(*sd));
+ kfree(sd);
+ } else
+ bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
- bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
if (!bus)
kfree(sd);
@@ -228,7 +241,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
if (bus != NULL) {
if (pxm >= 0) {
printk("bus %d -> pxm %d -> node %d\n",
- busnum, pxm, sd->node);
+ busnum, pxm, pxm_to_node(pxm));
}
}
#endif
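The new branch avoids registering a second struct pci_bus when an earlier scan already created the root bus for this domain/bus number; the freshly built sysdata is copied into the existing bus instead. A hedged sketch of that look-before-scan pattern (hypothetical helper; pci_root_ops and struct pci_sysdata are the ones used in this file):

#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Sketch of the look-before-scan pattern added above.  Ownership of 'sd'
 * either moves into a newly scanned bus or is dropped here. */
static struct pci_bus *get_or_scan_root_bus(int domain, int busnum,
					    struct pci_sysdata *sd)
{
	struct pci_bus *bus = pci_find_bus(domain, busnum);

	if (bus) {
		memcpy(bus->sysdata, sd, sizeof(*sd));	/* refresh existing bus */
		kfree(sd);
	} else {
		bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
		if (!bus)
			kfree(sd);
	}
	return bus;
}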
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index ed07ce6c171..a8715861877 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -583,6 +583,10 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH9_4:
case PCI_DEVICE_ID_INTEL_ICH9_5:
case PCI_DEVICE_ID_INTEL_TOLAPAI_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_1:
+ case PCI_DEVICE_ID_INTEL_ICH10_2:
+ case PCI_DEVICE_ID_INTEL_ICH10_3:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 10ac8c316c4..2f7109ac4c1 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -198,6 +198,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
"b" (bx),
"D" ((long)reg),
"S" (&pci_indirect));
+ /*
+	 * Zero-extend the result beyond 8 bits; do not trust the
+	 * BIOS to have done it:
+ */
+ *value &= 0xff;
break;
case 2:
__asm__("lcall *(%%esi); cld\n\t"
@@ -210,6 +215,11 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
"b" (bx),
"D" ((long)reg),
"S" (&pci_indirect));
+ /*
+	 * Zero-extend the result beyond 16 bits; do not trust the
+	 * BIOS to have done it:
+ */
+ *value &= 0xffff;
break;
case 4:
__asm__("lcall *(%%esi); cld\n\t"
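The PCI BIOS returns the config value in %ecx, but for byte and word reads some BIOSes leave stale upper bits in the register; masking afterwards makes the result well-defined regardless of the BIOS. A minimal sketch of the masking by access width (hypothetical helper, mirroring the two hunks above):

#include <linux/types.h>

/* Zero-extend a config value returned by the PCI BIOS to the requested
 * access width, instead of trusting the BIOS to have done it. */
static u32 mask_to_width(u32 raw, int len)
{
	switch (len) {
	case 1:
		return raw & 0xff;	/* byte read: keep bits 7..0 */
	case 2:
		return raw & 0xffff;	/* word read: keep bits 15..0 */
	default:
		return raw;		/* dword read: already full width */
	}
}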
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index f385a4b4a48..0a8f4742ef5 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -50,7 +50,9 @@ obj-$(VDSO64-y) += vdso-syms.lds
sed-vdsosym := -e 's/^00*/0/' \
-e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p'
quiet_cmd_vdsosym = VDSOSYM $@
- cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
+define cmd_vdsosym
+ $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@
+endef
$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
$(call if_changed,vdsosym)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49e5358f481..27ee26aedf9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -95,7 +95,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement = 0;
+static int have_vcpu_info_placement = 1;
static void __init xen_vcpu_setup(int cpu)
{
@@ -103,6 +103,7 @@ static void __init xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
+ BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
if (!have_vcpu_info_placement)
@@ -153,6 +154,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
if (*ax == 1)
maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
(1 << X86_FEATURE_ACPI) | /* disable ACPI */
+ (1 << X86_FEATURE_SEP) | /* disable SEP */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -665,10 +667,10 @@ static void xen_release_pt_init(u32 pfn)
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
-static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
- op.cmd = level;
+ op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
@@ -685,7 +687,8 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- pin_pagetable_pfn(level, pfn);
+ if (level == PT_PTE)
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
} else
/* make sure there are no stray mappings of
this page */
@@ -695,27 +698,39 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
{
- xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE);
+ xen_alloc_ptpage(mm, pfn, PT_PTE);
}
static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
{
- xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE);
+ xen_alloc_ptpage(mm, pfn, PT_PMD);
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_pt(u32 pfn)
+static void xen_release_ptpage(u32 pfn, unsigned level)
{
struct page *page = pfn_to_page(pfn);
if (PagePinned(page)) {
if (!PageHighMem(page)) {
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ if (level == PT_PTE)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
+ ClearPagePinned(page);
}
}
+static void xen_release_pt(u32 pfn)
+{
+ xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pd(u32 pfn)
+{
+ xen_release_ptpage(pfn, PT_PMD);
+}
+
#ifdef CONFIG_HIGHPTE
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
@@ -804,33 +819,43 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
PFN_DOWN(__pa(xen_start_info->pt_base)));
}
-static __init void xen_pagetable_setup_done(pgd_t *base)
+static __init void setup_shared_info(void)
{
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- pv_mmu_ops.alloc_pt = xen_alloc_pt;
- pv_mmu_ops.alloc_pd = xen_alloc_pd;
- pv_mmu_ops.release_pt = xen_release_pt;
- pv_mmu_ops.release_pd = xen_release_pt;
- pv_mmu_ops.set_pte = xen_set_pte;
-
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
/*
* Create a mapping for the shared info page.
* Should be set_fixmap(), but shared_info is a machine
* address with no corresponding pseudo-phys address.
*/
- set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ set_pte_mfn(addr,
PFN_DOWN(xen_start_info->shared_info),
PAGE_KERNEL);
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
+ HYPERVISOR_shared_info = (struct shared_info *)addr;
} else
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
+#ifndef CONFIG_SMP
+ /* In UP this is as good a place as any to set up shared info */
+ xen_setup_vcpu_info_placement();
+#endif
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pt = xen_alloc_pt;
+ pv_mmu_ops.alloc_pd = xen_alloc_pd;
+ pv_mmu_ops.release_pt = xen_release_pt;
+ pv_mmu_ops.release_pd = xen_release_pd;
+ pv_mmu_ops.set_pte = xen_set_pte;
+
+ setup_shared_info();
+
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
{
@@ -1181,15 +1206,9 @@ asmlinkage void __init xen_start_kernel(void)
x86_write_percpu(xen_cr3, __pa(pgd));
x86_write_percpu(xen_current_cr3, __pa(pgd));
-#ifdef CONFIG_SMP
/* Don't do the full vcpu_info placement stuff until we have a
- possible map. */
+ possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-#else
- /* May as well do it now, since there's no good time to call
- it later on UP. */
- xen_setup_vcpu_info_placement();
-#endif
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
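pin_pagetable_pfn() now receives the MMUEXT command directly, and only L1 (PTE) pages are pinned one at a time; page directories keep their read-only protection and are picked up by the later whole-pagetable pin. A hedged sketch of issuing one such pin hypercall, written to sit next to pin_pagetable_pfn() in this file so the existing includes cover it (the helper name is hypothetical):

/* Hypothetical helper: ask the hypervisor to pin one L1 pagetable frame,
 * given its pseudo-physical frame number (converted to a machine frame). */
static void pin_l1_frame(unsigned long pfn)
{
	struct mmuext_op op = {
		.cmd      = MMUEXT_PIN_L1_TABLE,
		.arg1.mfn = pfn_to_mfn(pfn),
	};

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}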
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0144395448a..2a054ef2a3d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -310,13 +310,6 @@ pgd_t xen_make_pgd(unsigned long pgd)
}
#endif /* CONFIG_X86_PAE */
-enum pt_level {
- PT_PGD,
- PT_PUD,
- PT_PMD,
- PT_PTE
-};
-
/*
(Yet another) pagetable walker. This one is intended for pinning a
pagetable. This means that it walks a pagetable and calls the
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index c9ff27f3ac3..b5e189b1519 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -3,6 +3,13 @@
#include <linux/linkage.h>
#include <asm/page.h>
+enum pt_level {
+ PT_PGD,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
+
/*
* Page-directory addresses above 4GB do not fit into architectural %cr3.
* When accessing %cr3, or equivalent field in vcpu_guest_context, guests
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 3bad4773a2f..2341492bf7a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,7 +38,8 @@ char * __init xen_memory_setup(void)
unsigned long max_pfn = xen_start_info->nr_pages;
e820.nr_map = 0;
- add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+ add_memory_region(0, LOWMEMSIZE(), E820_RAM);
+ add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
return "Xen";
}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 1a43b60c0c6..6b7190449d0 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -33,12 +33,17 @@
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
- /* Clear mask and test pending */
- andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
+
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)