-rw-r--r--  arch/x86/ia32/ia32entry.S | 1
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 11
-rw-r--r--  arch/x86/kernel/cpu/amd_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common_64.c | 7
-rw-r--r--  arch/x86/kernel/entry_64.S | 106
-rw-r--r--  arch/x86/kernel/head64.c | 11
-rw-r--r--  arch/x86/kernel/head_64.S | 1
-rw-r--r--  arch/x86/kernel/irq_32.c | 7
-rw-r--r--  arch/x86/kernel/paravirt.c | 28
-rw-r--r--  arch/x86/kernel/process_64.c | 56
-rw-r--r--  arch/x86/kernel/setup.c | 3
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/mm/init_32.c | 2
-rw-r--r--  arch/x86/vdso/Makefile | 2
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 19
-rw-r--r--  arch/x86/vdso/vdso32.S | 13
-rw-r--r--  arch/x86/xen/Kconfig | 9
-rw-r--r--  arch/x86/xen/Makefile | 2
-rw-r--r--  arch/x86/xen/enlighten.c | 688
-rw-r--r--  arch/x86/xen/mmu.c | 316
-rw-r--r--  arch/x86/xen/mmu.h | 29
-rw-r--r--  arch/x86/xen/multicalls.c | 1
-rw-r--r--  arch/x86/xen/setup.c | 79
-rw-r--r--  arch/x86/xen/smp.c | 306
-rw-r--r--  arch/x86/xen/suspend.c | 5
-rw-r--r--  arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S) | 0
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 271
-rw-r--r--  arch/x86/xen/xen-head.S | 28
-rw-r--r--  arch/x86/xen/xen-ops.h | 21
-rw-r--r--  drivers/net/xen-netfront.c | 19
-rw-r--r--  drivers/xen/events.c | 27
-rw-r--r--  drivers/xen/manage.c | 6
-rw-r--r--  include/asm-x86/paravirt.h | 69
-rw-r--r--  include/asm-x86/percpu.h | 26
-rw-r--r--  include/asm-x86/pgtable.h | 18
-rw-r--r--  include/asm-x86/pgtable_32.h | 15
-rw-r--r--  include/asm-x86/pgtable_64.h | 2
-rw-r--r--  include/asm-x86/setup.h | 1
-rw-r--r--  include/asm-x86/smp.h | 2
-rw-r--r--  include/asm-x86/spinlock.h | 118
-rw-r--r--  include/asm-x86/spinlock_types.h | 2
-rw-r--r--  include/asm-x86/vdso.h | 8
-rw-r--r--  include/asm-x86/xen/events.h | 1
-rw-r--r--  include/asm-x86/xen/hypercall.h | 263
-rw-r--r--  include/asm-x86/xen/interface.h | 139
-rw-r--r--  include/asm-x86/xen/interface_32.h | 97
-rw-r--r--  include/asm-x86/xen/interface_64.h | 159
-rw-r--r--  include/asm-x86/xen/page.h | 6
-rw-r--r--  include/xen/events.h | 7
-rw-r--r--  include/xen/hvc-console.h | 7
-rw-r--r--  include/xen/interface/callback.h | 6
-rw-r--r--  include/xen/xen-ops.h | 3
53 files changed, 2409 insertions, 621 deletions
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e..0ae1e77eae5 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -321,6 +321,7 @@ ENTRY(ia32_syscall)
/*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
/*
* No need to follow this irqs on/off section: the syscall
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb5..058c5594f49 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_tsc.o = -pg
CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt.o = -pg
endif
#
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2..aa89387006f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
#include <asm/ia32.h>
#include <asm/bootparam.h>
+#include <xen/interface/xen.h>
+
#define __NO_STUBS 1
#undef __SYSCALL
#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+
+ BLANK();
+ DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
return 0;
}
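
The new OFFSET() entries make the layout of Xen's struct vcpu_info visible to assembly; the 64-bit Xen stubs added later in this patch (xen-asm_64.S) test and set the upcall mask/pending bytes through these offsets. Roughly how the kbuild asm-offsets machinery turns them into constants; the macro bodies and the trimmed struct below are illustrative, not copied from the kernel:

        #include <stddef.h>

        struct vcpu_info {
                unsigned char evtchn_upcall_pending;
                unsigned char evtchn_upcall_mask;
                /* ... rest of the Xen interface struct omitted ... */
        };

        /* Emit "->SYM value" markers into the generated assembly; a later
         * sed pass turns them into "#define SYM value" in asm-offsets.h. */
        #define DEFINE(sym, val) \
                asm volatile("\n->" #sym " %0 " #val : : "i" (val))
        #define OFFSET(sym, str, mem) \
                DEFINE(sym, offsetof(struct str, mem))

        int main(void)
        {
                OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
                OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
                return 0;
        }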
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d..d1692b2a41f 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
if (c->x86_power & (1<<8))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb4..736f50fa433 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -16,6 +16,7 @@
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/io.h>
+#include <asm/linkage.h>
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
@@ -316,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86_phys_bits = eax & 0xff;
}
- /* Assume all 64-bit CPUs support 32-bit syscall */
- set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
cpu_devs[c->x86_vendor]->c_early_init)
cpu_devs[c->x86_vendor]->c_early_init(c);
@@ -517,8 +515,7 @@ void pda_init(int cpu)
}
char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+ DEBUG_STKSZ] __page_aligned_bss;
extern asmlinkage void ignore_sysret(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c34..80d5663db3b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1189,6 +1189,7 @@ END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1199,7 @@ KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_nmi, 0, 0
@@ -1211,6 +1213,7 @@ KPROBE_END(nmi)
KPROBE_ENTRY(int3)
INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun)
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_double_fault
jmp paranoid_exit1
CFI_ENDPROC
@@ -1253,6 +1257,7 @@ END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_stack_segment
jmp paranoid_exit1
CFI_ENDPROC
@@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug)
/* runs on exception stack */
ENTRY(machine_check)
INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_machine_check
@@ -1312,3 +1318,103 @@ KPROBE_ENTRY(ignore_sysret)
sysret
CFI_ENDPROC
ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
+ CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ see the correct pointer to the pt_regs */
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+ CFI_ENDPROC
+ CFI_DEFAULT_STACK
+11: incl %gs:pda_irqcount
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ cmovzq %gs:pda_irqstackptr,%rsp
+ pushq %rbp # backlink for old unwinder
+ call xen_evtchn_do_upcall
+ popq %rsp
+ CFI_DEF_CFA_REGISTER rsp
+ decl %gs:pda_irqcount
+ jmp error_exit
+ CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+ framesz = (RIP-0x30) /* workaround buggy gas */
+ _frame framesz
+ CFI_REL_OFFSET rcx, 0
+ CFI_REL_OFFSET r11, 8
+ movw %ds,%cx
+ cmpw %cx,0x10(%rsp)
+ CFI_REMEMBER_STATE
+ jne 1f
+ movw %es,%cx
+ cmpw %cx,0x18(%rsp)
+ jne 1f
+ movw %fs,%cx
+ cmpw %cx,0x20(%rsp)
+ jne 1f
+ movw %gs,%cx
+ cmpw %cx,0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %r11
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %rcx
+ CFI_ADJUST_CFA_OFFSET 8
+ jmp general_protection
+ CFI_RESTORE_STATE
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ jmp error_exit
+ CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
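
The failsafe-callback comment above boils down to one comparison loop that is easier to see outside of assembly. A hypothetical C rendering of just that decision (the real code works on the selector values Xen pushed on the stack at 0x10..0x28(%rsp)):

        #include <stdint.h>

        enum failsafe_cat { CAT_BAD_SEGMENT = 1, CAT_BAD_IRET = 2 };

        /* saved[] = %ds,%es,%fs,%gs as Xen saved them on entry,
         * live[]  = the same registers as they read right now.   */
        static enum failsafe_cat classify_failsafe(const uint16_t saved[4],
                                                   const uint16_t live[4])
        {
                int i;

                for (i = 0; i < 4; i++)
                        if (saved[i] != live[i])
                                return CAT_BAD_SEGMENT; /* category 1: Xen zeroed a
                                                           segment, nothing to fix up */
                return CAT_BAD_IRET;                    /* category 2: the IRET itself
                                                           faulted, kill the task */
        }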
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c9781982914..1b318e903bf 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
#endif
+void __init x86_64_init_pda(void)
+{
+ _cpu_pda = __cpu_pda;
+ cpu_pda(0) = &_boot_cpu_pda;
+ pda_init(0);
+}
+
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
early_printk("Kernel alive\n");
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
+ x86_64_init_pda();
early_printk("Kernel really alive\n");
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217c..db3280afe88 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+#include "../../x86/xen/xen-head.S"
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f1247..1cf8c1fcc08 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static void call_on_stack(void *func, void *stack)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c1..3edfd7af22a 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
.pv_irq_ops = pv_irq_ops,
.pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
+ .pv_lock_ops = pv_lock_ops,
};
return *((void **)&tmpl + type);
}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
return __get_cpu_var(paravirt_lazy_mode);
}
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+ pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+ pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+ pv_lock_ops.spin_lock = __byte_spin_lock;
+ pv_lock_ops.spin_trylock = __byte_spin_trylock;
+ pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
+
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -373,6 +386,9 @@ struct pv_mmu_ops pv_mmu_ops = {
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
+#else
+ .pagetable_setup_start = paravirt_nop,
+ .pagetable_setup_done = paravirt_nop,
#endif
.read_cr2 = native_read_cr2,
@@ -446,6 +462,18 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_fixmap = native_set_fixmap,
};
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+ .spin_is_locked = __ticket_spin_is_locked,
+ .spin_is_contended = __ticket_spin_is_contended,
+
+ .spin_lock = __ticket_spin_lock,
+ .spin_trylock = __ticket_spin_trylock,
+ .spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL_GPL(pv_lock_ops);
+
EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL (pv_cpu_ops);
EXPORT_SYMBOL (pv_mmu_ops);
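
The paravirt_use_bytelocks() hook above lets a hypervisor guest swap the default fair ticket locks for much simpler byte locks, which tend to behave better when the vCPU that is next in line may not actually be running. The core of a byte lock is a single test-and-set byte; a minimal userspace sketch of that idea (not the kernel's __byte_spin_* implementation):

        #include <stdatomic.h>

        struct byte_spinlock {
                atomic_uchar lock;      /* 0 = free, 1 = held */
        };

        static void byte_spin_lock(struct byte_spinlock *l)
        {
                /* xchg-style acquire; spin with plain loads while busy */
                while (atomic_exchange_explicit(&l->lock, 1, memory_order_acquire))
                        while (atomic_load_explicit(&l->lock, memory_order_relaxed))
                                ;
        }

        static int byte_spin_trylock(struct byte_spinlock *l)
        {
                return !atomic_exchange_explicit(&l->lock, 1, memory_order_acquire);
        }

        static void byte_spin_unlock(struct byte_spinlock *l)
        {
                atomic_store_explicit(&l->lock, 0, memory_order_release);
        }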
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9..e8a8e1b9981 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch FS and GS.
+ *
+ * Segment register != 0 always requires a reload. Also
+ * reload when it has changed. When prev process used 64bit
+ * base always reload to avoid an information leak.
*/
- {
- /* segment register != 0 always requires a reload.
- also reload when it has changed.
- when prev process used 64bit base always reload
- to avoid an information leak. */
- if (unlikely(fsindex | next->fsindex | prev->fs)) {
- loadsegment(fs, next->fsindex);
- /* check if the user used a selector != 0
- * if yes clear 64bit base, since overloaded base
- * is always mapped to the Null selector
- */
- if (fsindex)
+ if (unlikely(fsindex | next->fsindex | prev->fs)) {
+ loadsegment(fs, next->fsindex);
+ /*
+ * Check if the user used a selector != 0; if yes
+ * clear 64bit base, since overloaded base is always
+ * mapped to the Null selector
+ */
+ if (fsindex)
prev->fs = 0;
- }
- /* when next process has a 64bit base use it */
- if (next->fs)
- wrmsrl(MSR_FS_BASE, next->fs);
- prev->fsindex = fsindex;
-
- if (unlikely(gsindex | next->gsindex | prev->gs)) {
- load_gs_index(next->gsindex);
- if (gsindex)
+ }
+ /* when next process has a 64bit base use it */
+ if (next->fs)
+ wrmsrl(MSR_FS_BASE, next->fs);
+ prev->fsindex = fsindex;
+
+ if (unlikely(gsindex | next->gsindex | prev->gs)) {
+ load_gs_index(next->gsindex);
+ if (gsindex)
prev->gs = 0;
- }
- if (next->gs)
- wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
- prev->gsindex = gsindex;
}
+ if (next->gs)
+ wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+ prev->gsindex = gsindex;
/* Must be after DS reload */
unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
write_pda(pcurrent, next_p);
write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
write_pda(stack_canary, next_p->stack_canary);
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81..c9010f82141 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -824,7 +824,10 @@ void __init setup_arch(char **cmdline_p)
vmi_init();
#endif
+ paravirt_pagetable_setup_start(swapper_pg_dir);
paging_init();
+ paravirt_pagetable_setup_done(swapper_pg_dir);
+ paravirt_post_allocator_init();
#ifdef CONFIG_X86_64
map_vsyscall();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e..1deb3b624a7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
*
* Must be called after the _cpu_pda pointer table is initialized.
*/
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
{
struct x8664_pda *oldpda, *newpda;
unsigned long size = sizeof(struct x8664_pda);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e6..7113acd8ac4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -868,8 +868,6 @@ void __init paging_init(void)
*/
sparse_init();
zone_sizes_init();
-
- paravirt_post_allocator_init();
}
/*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21..4d6ef0a336d 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
# Build multiple 32-bit vDSO images to choose from at boot time.
#
obj-$(VDSO32-y) += vdso32-syms.lds
-vdso32.so-$(CONFIG_X86_32) += int80
+vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(VDSO32-y) += sysenter
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a51..513f330c583 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
}
}
-/*
- * These symbols are defined by vdso32.S to mark the bounds
- * of the ELF DSO images included therein.
- */
-extern const char vdso32_default_start, vdso32_default_end;
-extern const char vdso32_sysenter_start, vdso32_sysenter_end;
static struct page *vdso32_pages[1];
#ifdef CONFIG_X86_64
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
/* May not be __init: called during resume */
void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
#else /* CONFIG_X86_32 */
#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
+#define vdso32_syscall() (0)
void enable_sep_cpu(void)
{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
gate_vma_init();
#endif
- if (!vdso32_sysenter()) {
- vsyscall = &vdso32_default_start;
- vsyscall_len = &vdso32_default_end - &vdso32_default_start;
- } else {
+ if (vdso32_syscall()) {
+ vsyscall = &vdso32_syscall_start;
+ vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+ } else if (vdso32_sysenter()){
vsyscall = &vdso32_sysenter_start;
vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+ } else {
+ vsyscall = &vdso32_int80_start;
+ vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
}
memcpy(syscall_page, vsyscall, vsyscall_len);
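
The sysenter_setup() change above turns a two-way choice into a three-way one: prefer the SYSCALL-based image when the CPU advertises X86_FEATURE_SYSCALL32 (set for AMD in amd_64.c earlier in this patch), fall back to SYSENTER, and use the int $0x80 image only as a last resort. Reduced to its decision logic, with the image names taken from the vdso32 Makefile:

        static const char *pick_vdso32_image(int has_syscall32, int has_sysenter)
        {
                if (has_syscall32)
                        return "vdso32-syscall.so";     /* compat-mode SYSCALL */
                if (has_sysenter)
                        return "vdso32-sysenter.so";    /* SYSENTER */
                return "vdso32-int80.so";               /* always works */
        }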
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab8..2ce5f82c333 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
__INITDATA
- .globl vdso32_default_start, vdso32_default_end
-vdso32_default_start:
-#ifdef CONFIG_X86_32
+ .globl vdso32_int80_start, vdso32_int80_end
+vdso32_int80_start:
.incbin "arch/x86/vdso/vdso32-int80.so"
-#else
+vdso32_int80_end:
+
+ .globl vdso32_syscall_start, vdso32_syscall_end
+vdso32_syscall_start:
+#ifdef CONFIG_COMPAT
.incbin "arch/x86/vdso/vdso32-syscall.so"
#endif
-vdso32_default_end:
+vdso32_syscall_end:
.globl vdso32_sysenter_start, vdso32_sysenter_end
vdso32_sysenter_start:
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc9958087..20b49729bed 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
- depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+ depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,11 @@ config XEN
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
- default 8
+ default 8 if X86_32
+ default 32 if X86_64
depends on XEN
help
The pseudo-physical to machine address array is sized
according to the maximum possible memory size of a Xen
domain. This array uses 1 page per gigabyte, so there's no
- need to be too stingy here.
\ No newline at end of file
+ need to be too stingy here.
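
The "1 page per gigabyte" figure in the help text can be checked against the p2m layout used later in this patch (the p2m_top arrays in arch/x86/xen/mmu.c). A rough sizing, assuming 4 KB pages and 8-byte entries on 64-bit:

        1 GB                  = 262,144 frames of 4 KB
        frames per leaf page  = 4096 / 8 = 512
        top-level pointers    = 262,144 / 512 = 512
        top-level array size  = 512 * 8 bytes = 4096 bytes = one page per GB

So the statically sized p2m_top[] and p2m_top_mfn[] arrays for the new 64-bit default of 32 GB cost roughly 32 pages (128 KB) each.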
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d164913..59c1e539aed 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
obj-y := enlighten.o setup.o multicalls.o mmu.o \
- time.o xen-asm.o grant-table.o suspend.o
+ time.o xen-asm_$(BITS).o grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef5..3da6acb7eaf 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvc-console.h>
#include <asm/paravirt.h>
#include <asm/page.h>
@@ -40,12 +41,12 @@
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/msr-index.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/pgalloc.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
* Note about cr3 (pagetable base) values:
*
* xen_cr3 contains the current logical cr3 value; it contains the
@@ -363,14 +376,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
- xen_mc_batch();
-
- load_TLS_descriptor(t, cpu, 0);
- load_TLS_descriptor(t, cpu, 1);
- load_TLS_descriptor(t, cpu, 2);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone,
* it means we're in a context switch, and %gs has just been
@@ -379,10 +384,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* Either way, it has been saved, and the new value will get
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
+ *
+ * On x86_64, this hack is not used for %gs, because gs points
+ * to KERNEL_GS_BASE (and uses it for PDA references), so we
+ * must not zero %gs on x86_64
+ *
+ * For x86_64, we need to zero %fs, otherwise we may get an
+ * exception between the new %fs descriptor being loaded and
+ * %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
loadsegment(gs, 0);
+#else
+ loadsegment(fs, 0);
+#endif
+ }
+
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+ BUG();
}
+#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
@@ -400,23 +434,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
preempt_enable();
}
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
- u8 type, dpl;
-
- type = (high >> 8) & 0x1f;
- dpl = (high >> 13) & 3;
-
- if (type != 0xf && type != 0xe)
+ if (val->type != 0xf && val->type != 0xe)
return 0;
info->vector = vector;
- info->address = (high & 0xffff0000) | (low & 0x0000ffff);
- info->cs = low >> 16;
- info->flags = dpl;
+ info->address = gate_offset(*val);
+ info->cs = gate_segment(*val);
+ info->flags = val->dpl;
/* interrupt gates clear IF */
- if (type == 0xe)
+ if (val->type == 0xe)
info->flags |= 4;
return 1;
@@ -443,11 +472,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
if (p >= start && (p + 8) <= end) {
struct trap_info info[2];
- u32 *desc = (u32 *)g;
info[1].address = 0;
- if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+ if (cvt_gate_to_trap(entrynum, g, &info[0]))
if (HYPERVISOR_set_trap_table(info))
BUG();
}
@@ -460,13 +488,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
{
unsigned in, out, count;
- count = (desc->size+1) / 8;
+ count = (desc->size+1) / sizeof(gate_desc);
BUG_ON(count > 256);
for (in = out = 0; in < count; in++) {
- const u32 *entry = (u32 *)(desc->address + in * 8);
+ gate_desc *entry = (gate_desc*)(desc->address) + in;
- if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ if (cvt_gate_to_trap(in, entry, &traps[out]))
out++;
}
traps[out].address = 0;
@@ -695,33 +723,89 @@ static void set_current_cr3(void *v)
x86_write_percpu(xen_current_cr3, (unsigned long)v);
}
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
struct mmuext_op *op;
struct multicall_space mcs;
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ unsigned long mfn;
- BUG_ON(preemptible());
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
- mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
+ WARN_ON(mfn == 0 && kernel);
- /* Update while interrupts are disabled, so its atomic with
- respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
+ op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
op->arg1.mfn = mfn;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- /* Update xen_update_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
+ if (kernel) {
+ x86_write_percpu(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ x86_write_percpu(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+ int ret;
+
+ ret = 0;
+
+ switch(msr) {
+#ifdef CONFIG_X86_64
+ unsigned which;
+ u64 base;
+
+ case MSR_FS_BASE: which = SEGBASE_FS; goto set;
+ case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
+ case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
+
+ set:
+ base = ((u64)high << 32) | low;
+ if (HYPERVISOR_set_segment_base(which, base) != 0)
+ ret = -EFAULT;
+ break;
+#endif
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
+
+ return ret;
+}
+
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -778,6 +862,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
/* This should never happen until we're OK to use struct page */
static void xen_release_ptpage(u32 pfn, unsigned level)
{
@@ -803,6 +929,18 @@ static void xen_release_pmd(u32 pfn)
xen_release_ptpage(pfn, PT_PMD);
}
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(u32 pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
#ifdef CONFIG_HIGHPTE
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
@@ -841,68 +979,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
static __init void xen_pagetable_setup_start(pgd_t *base)
{
- pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
- int i;
-
- /* special set_pte for pagetable initialization */
- pv_mmu_ops.set_pte = xen_set_pte_init;
-
- init_mm.pgd = base;
- /*
- * copy top-level of Xen-supplied pagetable into place. This
- * is a stand-in while we copy the pmd pages.
- */
- memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
-
- make_lowmem_page_readonly(pmd);
-
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
-
- /* make sure zero_page is mapped RO so we can use it in pagetables */
- make_lowmem_page_readonly(empty_zero_page);
- make_lowmem_page_readonly(base);
- /*
- * Switch to new pagetable. This is done before
- * pagetable_init has done anything so that the new pages
- * added to the table can be prepared properly for Xen.
- */
- xen_write_cr3(__pa(base));
-
- /* Unpin initial Xen pagetable */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
- PFN_DOWN(__pa(xen_start_info->pt_base)));
}
void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
- /*
- * Create a mapping for the shared info page.
- * Should be set_fixmap(), but shared_info is a machine
- * address with no corresponding pseudo-phys address.
- */
- set_pte_mfn(addr,
- PFN_DOWN(xen_start_info->shared_info),
- PAGE_KERNEL);
-
- HYPERVISOR_shared_info = (struct shared_info *)addr;
+ set_fixmap(FIX_PARAVIRT_BOOTMAP,
+ xen_start_info->shared_info);
+
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
} else
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1003,32 @@ void xen_setup_shared_info(void)
static __init void xen_pagetable_setup_done(pgd_t *base)
{
- /* This will work as long as patching hasn't happened yet
- (which it hasn't) */
- pv_mmu_ops.alloc_pte = xen_alloc_pte;
- pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
- pv_mmu_ops.release_pte = xen_release_pte;
- pv_mmu_ops.release_pmd = xen_release_pmd;
- pv_mmu_ops.set_pte = xen_set_pte;
-
xen_setup_shared_info();
-
- /* Actually pin the pagetable down, but we can't set PG_pinned
- yet because the page structures don't exist yet. */
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
}
static __init void xen_post_allocator_init(void)
{
+ pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_mark_init_mm_pinned();
}
@@ -950,6 +1042,7 @@ void xen_setup_vcpu_info_placement(void)
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
+#ifdef CONFIG_X86_32
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
@@ -959,6 +1052,7 @@ void xen_setup_vcpu_info_placement(void)
pv_irq_ops.irq_enable = xen_irq_enable_direct;
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
+#endif
}
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -979,10 +1073,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
goto patch_site
switch (type) {
+#ifdef CONFIG_X86_32
SITE(pv_irq_ops, irq_enable);
SITE(pv_irq_ops, irq_disable);
SITE(pv_irq_ops, save_fl);
SITE(pv_irq_ops, restore_fl);
+#endif /* CONFIG_X86_32 */
#undef SITE
patch_site:
@@ -1025,8 +1121,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
#ifdef CONFIG_X86_F00F_BUG
case FIX_F00F_IDT:
#endif
+#ifdef CONFIG_X86_32
case FIX_WP_TEST:
case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
case FIX_APIC_BASE: /* maps dummy local APIC */
#endif
@@ -1039,6 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
}
__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
}
static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1196,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
+ .write_msr = xen_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+ .usergs_sysret32 = xen_sysret32,
+ .usergs_sysret64 = xen_sysret64,
+#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = xen_load_gs_index,
+#endif
.store_gdt = native_store_gdt,
.store_idt = native_store_idt,
@@ -1109,14 +1228,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
+ /* Xen takes care of %gs when switching to usermode for us */
+ .swapgs = paravirt_nop,
+
.lazy_mode = {
.enter = paravirt_enter_lazy_cpu,
.leave = xen_leave_lazy,
},
};
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+ int i;
+
+ /* Create identity vector->irq map */
+ for(i = 0; i < NR_VECTORS; i++) {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[i] = i;
+ }
+#endif /* CONFIG_X86_64 */
+
+ xen_init_IRQ();
+}
+
static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = xen_init_IRQ,
+ .init_IRQ = __xen_init_IRQ,
.save_fl = xen_save_fl,
.restore_fl = xen_restore_fl,
.irq_disable = xen_irq_disable,
@@ -1124,7 +1263,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
+ .adjust_exception_frame = xen_adjust_exception_frame,
#endif
};
@@ -1157,8 +1296,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
@@ -1170,7 +1309,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
- .set_pte = NULL, /* see xen_pagetable_setup_* */
+#ifdef CONFIG_X86_64
+ .set_pte = xen_set_pte,
+#else
+ .set_pte = xen_set_pte_init,
+#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
@@ -1184,15 +1327,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
+#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
- .set_pud = xen_set_pud_hyper,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
+#if PAGETABLE_LEVELS == 4
+ .pud_val = xen_pud_val,
+ .make_pud = xen_make_pud,
+ .set_pgd = xen_set_pgd_hyper,
+
+ .alloc_pud = xen_alloc_pte_init,
+ .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
+
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap,
@@ -1205,21 +1359,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.set_fixmap = xen_set_fixmap,
};
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
- .smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
- .smp_cpus_done = xen_smp_cpus_done,
-
- .smp_send_stop = xen_smp_send_stop,
- .smp_send_reschedule = xen_smp_send_reschedule,
-
- .send_call_func_ipi = xen_smp_send_call_function_ipi,
- .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-#endif /* CONFIG_SMP */
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1264,6 +1403,7 @@ static const struct machine_ops __initdata xen_machine_ops = {
static void __init xen_reserve_top(void)
{
+#ifdef CONFIG_X86_32
unsigned long top = HYPERVISOR_VIRT_START;
struct xen_platform_parameters pp;
@@ -1271,7 +1411,247 @@ static void __init xen_reserve_top(void)
top = pp.virt_start;
reserve_top_address(-top + 2 * PAGE_SIZE);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= PTE_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+#ifdef CONFIG_X86_64
+static void walk(pgd_t *pgd, unsigned long addr)
+{
+ unsigned l4idx = pgd_index(addr);
+ unsigned l3idx = pud_index(addr);
+ unsigned l2idx = pmd_index(addr);
+ unsigned l1idx = pte_index(addr);
+ pgd_t l4;
+ pud_t l3;
+ pmd_t l2;
+ pte_t l1;
+
+ xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+ pgd, addr, l4idx, l3idx, l2idx, l1idx);
+
+ l4 = pgd[l4idx];
+ xen_raw_printk(" l4: %016lx\n", l4.pgd);
+ xen_raw_printk(" %016lx\n", pgd_val(l4));
+
+ l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+ xen_raw_printk(" l3: %016lx\n", l3.pud);
+ xen_raw_printk(" %016lx\n", pud_val(l3));
+
+ l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+ xen_raw_printk(" l2: %016lx\n", l2.pmd);
+ xen_raw_printk(" %016lx\n", pmd_val(l2));
+
+ l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+ xen_raw_printk(" l1: %016lx\n", l1.pte);
+ xen_raw_printk(" %016lx\n", pte_val(l1));
+}
+#endif
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
+ addr, pfn, get_phys_to_machine(pfn),
+ pgprot_val(prot), pte.pte);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+ BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ ident_pte = 0;
+ pfn = 0;
+ for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ }
+
+ /* Install mappings */
+ for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for(i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the inital kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pud_t *l3;
+ pmd_t *l2;
+
+ /* Zap identity mapping */
+ init_level4_pgt[0] = __pgd(0);
+
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ convert_pfn_mfn(init_level4_pgt);
+ convert_pfn_mfn(level3_ident_pgt);
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+ memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ /* Set up identity map */
+ xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /* Switch over */
+ pgd = init_level4_pgt;
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(pgd));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ reserve_early(__pa(xen_start_info->pt_base),
+ __pa(xen_start_info->pt_base +
+ xen_start_info->nr_pt_frames * PAGE_SIZE),
+ "XEN PAGETABLES");
+
+ return pgd;
+}
+#else /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pmd_t *kernel_pmd;
+
+ init_pg_tables_start = __pa(pgd);
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+ max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+ memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ xen_write_cr3(__pa(swapper_pg_dir));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+ return swapper_pg_dir;
}
+#endif /* CONFIG_X86_64 */
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
@@ -1301,53 +1681,56 @@ asmlinkage void __init xen_start_kernel(void)
machine_ops = xen_machine_ops;
-#ifdef CONFIG_SMP
- smp_ops = xen_smp_ops;
+#ifdef CONFIG_X86_64
+ /* Disable until direct per-cpu data access. */
+ have_vcpu_info_placement = 0;
+ x86_64_init_pda();
#endif
+ xen_smp_init();
+
/* Get mfn list */
if (!xen_feature(XENFEAT_auto_translated_physmap))
xen_build_dynamic_phys_to_machine();
pgd = (pgd_t *)xen_start_info->pt_base;
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
-
- init_mm.pgd = pgd; /* use the Xen pagetables to start */
-
- /* keep using Xen gdt for now; no urgent need to change it */
-
- x86_write_percpu(xen_cr3, __pa(pgd));
- x86_write_percpu(xen_current_cr3, __pa(pgd));
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!is_initial_xendomain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
/* Don't do the full vcpu_info placement stuff until we have a
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+
+ init_mm.pgd = pgd;
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!is_initial_xendomain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
-
/* set the limit of our address space */
xen_reserve_top();
+#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
new_cpu_data.hard_math = 1;
new_cpu_data.x86_capability[0] = cpuid_edx(1);
+#endif
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
if (!is_initial_xendomain()) {
add_preferred_console("xenboot", 0, NULL);
@@ -1355,6 +1738,21 @@ asmlinkage void __init xen_start_kernel(void)
add_preferred_console("hvc", 0, NULL);
}
+ xen_raw_console_write("about to get started...\n");
+
+#if 0
+ xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+ &boot_params, __pa_symbol(&boot_params),
+ __va(__pa_symbol(&boot_params)));
+
+ walk(pgd, &boot_params);
+ walk(pgd, __va(__pa(&boot_params)));
+#endif
+
/* Start the world */
+#ifdef CONFIG_X86_32
i386_start_kernel();
+#else
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
}
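
The cvt_gate_to_trap() rewrite above replaces hand decoding of two raw u32 words with the gate_offset()/gate_segment() accessors, which also cope with the 16-byte 64-bit gate layout. What the old open-coded version was extracting, shown for the 32-bit (8-byte) gate only, with illustrative names:

        #include <stdint.h>

        struct idt_gate32 {
                uint32_t lo;    /* [15:0] offset low,    [31:16] code segment */
                uint32_t hi;    /* [15:8] type/DPL/P,    [31:16] offset high  */
        };

        static uint32_t gate32_offset(struct idt_gate32 g)
        {
                return (g.hi & 0xffff0000) | (g.lo & 0x0000ffff);
        }

        static uint16_t gate32_segment(struct idt_gate32 g)
        {
                return g.lo >> 16;
        }

        static unsigned gate32_type(struct idt_gate32 g)
        {
                return (g.hi >> 8) & 0x1f;      /* 0xe interrupt gate, 0xf trap gate */
        }

        static unsigned gate32_dpl(struct idt_gate32 g)
        {
                return (g.hi >> 13) & 3;
        }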
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa..a44d56e38bd 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,8 +44,10 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
+#include <asm/linkage.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -56,26 +58,29 @@
#include "multicalls.h"
#include "mmu.h"
+/*
+ * Just beyond the highest usermode address. STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+
#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
- __attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
/* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
- __attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
- __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
-static unsigned long p2m_top_mfn_list[
- PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
- __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+ __page_aligned_bss;
static inline unsigned p2m_top_index(unsigned long pfn)
{
@@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
p2m_top[topidx][idx] = mfn;
}
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
+ unsigned long address = (unsigned long)vaddr;
unsigned int level;
pte_t *pte = lookup_address(address, &level);
unsigned offset = address & ~PAGE_MASK;
BUG_ON(pte == NULL);
- return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+ return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
void make_lowmem_page_readonly(void *vaddr)
@@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
xen_mc_batch();
- u.ptr = virt_to_machine(ptr).maddr;
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pmd_val_ma(val);
extend_mmu_update(&u);
@@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- BUG();
- return;
- }
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- BUG();
- return;
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- BUG();
- return;
- }
- pte = pte_offset_kernel(pmd, vaddr);
- /* <mfn,flags> stored as-is, to permit clearing entries */
- xen_set_pte(pte, mfn_pte(mfn, flags));
-
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
+ set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
xen_mc_batch();
- u.ptr = virt_to_machine(ptr).maddr;
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pud_val_ma(val);
extend_mmu_update(&u);
@@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val)
void xen_set_pte(pte_t *ptep, pte_t pte)
{
+#ifdef CONFIG_X86_PAE
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
+#else
+ *ptep = pte;
+#endif
}
+#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((u64 *)ptep, pte_val_ma(pte));
+ set_64bit((u64 *)ptep, native_pte_val(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp)
{
set_pmd(pmdp, __pmd(0));
}
+#endif /* CONFIG_X86_PAE */
pmd_t xen_make_pmd(pmdval_t pmd)
{
@@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd)
return native_make_pmd(pmd);
}
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+ return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+ pud = pte_pfn_to_mfn(pud);
+
+ return native_make_pud(pud);
+}
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+ pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+ unsigned offset = pgd - pgd_page;
+ pgd_t *user_ptr = NULL;
+
+ if (offset < pgd_index(USER_LIMIT)) {
+ struct page *page = virt_to_page(pgd_page);
+ user_ptr = (pgd_t *)page->private;
+ if (user_ptr)
+ user_ptr += offset;
+ }
+
+ return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptr).maddr;
+ u.val = pgd_val_ma(val);
+ extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure. This implies:
+ * 1. The only existing pagetable is the kernel's
+ * 2. It is always pinned
+ * 3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+ preempt_disable();
+
+ xen_mc_batch();
+
+ __xen_set_pgd_hyper(ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+ pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!page_pinned(ptr)) {
+ *ptr = val;
+ if (user_ptr) {
+ WARN_ON(page_pinned(user_ptr));
+ *user_ptr = val;
+ }
+ return;
+ }
+
+ /* If it's pinned, then we can at least batch the kernel and
+ user updates together. */
+ xen_mc_batch();
+
+ __xen_set_pgd_hyper(ptr, val);
+ if (user_ptr)
+ __xen_set_pgd_hyper(user_ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
+
/*
- (Yet another) pagetable walker. This one is intended for pinning a
- pagetable. This means that it walks a pagetable and calls the
- callback function on each page it finds making up the page table,
- at every level. It walks the entire pagetable, but it only bothers
- pinning pte pages which are below pte_limit. In the normal case
- this will be TASK_SIZE, but at boot we need to pin up to
- FIXADDR_TOP. But the important bit is that we don't pin beyond
- there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker. This one is intended for pinning a
+ * pagetable. This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level. It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit. In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
- pgd_t *pgd = pgd_base;
int flush = 0;
- unsigned long addr = 0;
- unsigned long pgd_next;
+ unsigned hole_low, hole_high;
+ unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+ unsigned pgdidx, pudidx, pmdidx;
- BUG_ON(limit > FIXADDR_TOP);
+ /* The limit is the last byte to be touched */
+ limit--;
+ BUG_ON(limit >= FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
- for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ /*
+ * 64-bit has a great big hole in the middle of the address
+ * space, which contains the Xen mappings.  On 32-bit these
+ * end up making a zero-sized hole, so the check is a no-op.
+ */
+ hole_low = pgd_index(USER_LIMIT);
+ hole_high = pgd_index(PAGE_OFFSET);
+
+ pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+ pudidx_limit = pud_index(limit);
+#else
+ pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+ pmdidx_limit = pmd_index(limit);
+#else
+ pmdidx_limit = 0;
+#endif
+
+ flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+ for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
- unsigned long pud_limit, pud_next;
- pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
+ if (pgdidx >= hole_low && pgdidx < hole_high)
+ continue;
- if (!pgd_val(*pgd))
+ if (!pgd_val(pgd[pgdidx]))
continue;
- pud = pud_offset(pgd, 0);
+ pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), PT_PUD);
- for (; addr != pud_limit; pud++, addr = pud_next) {
+ for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
- unsigned long pmd_limit;
- pud_next = pud_addr_end(addr, pud_limit);
-
- if (pud_next < limit)
- pmd_limit = pud_next;
- else
- pmd_limit = limit;
+ if (pgdidx == pgdidx_limit &&
+ pudidx > pudidx_limit)
+ goto out;
- if (pud_none(*pud))
+ if (pud_none(pud[pudidx]))
continue;
- pmd = pmd_offset(pud, 0);
+ pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), PT_PMD);
- for (; addr != pmd_limit; pmd++) {
- addr += (PAGE_SIZE * PTRS_PER_PTE);
- if ((pmd_limit-1) < (addr-1)) {
- addr = pmd_limit;
- break;
- }
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+ struct page *pte;
+
+ if (pgdidx == pgdidx_limit &&
+ pudidx == pudidx_limit &&
+ pmdidx > pmdidx_limit)
+ goto out;
- if (pmd_none(*pmd))
+ if (pmd_none(pmd[pmdidx]))
continue;
- flush |= (*func)(pmd_page(*pmd), PT_PTE);
+ pte = pmd_page(pmd[pmdidx]);
+ flush |= (*func)(pte, PT_PTE);
}
}
}
-
- flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
return flush;
}
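
In diff form the hole handling is easy to misread; the predicate below is a hedged restatement of the skip condition used in the loop above (in_xen_hole is an illustrative helper, not added by the patch):

/* A pgd slot is skipped when its index falls inside
 * [pgd_index(USER_LIMIT), pgd_index(PAGE_OFFSET)); on 32-bit the two
 * indices coincide, so nothing is skipped. */
static inline int in_xen_hole(unsigned pgdidx)
{
        return pgdidx >= pgd_index(USER_LIMIT) &&
               pgdidx < pgd_index(PAGE_OFFSET);
}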
@@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd)
{
xen_mc_batch();
- if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+ if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+ if (user_pgd) {
+ pin_page(virt_to_page(user_pgd), PT_PGD);
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ }
+ }
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is pinnable */
+ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
@@ -656,9 +770,11 @@ void xen_mm_pin_all(void)
spin_unlock_irqrestore(&pgd_lock, flags);
}
-/* The init_mm pagetable is really pinned as soon as its created, but
- that's before we have page structures to store the bits. So do all
- the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as it's created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now.
+ */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
@@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd)
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- pgd_walk(pgd, unpin_page, TASK_SIZE);
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd) {
+ xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ unpin_page(virt_to_page(user_pgd), PT_PGD);
+ }
+ }
+#endif
+
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is unpinned */
+ unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
+ pgd_walk(pgd, unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
@@ -727,7 +859,6 @@ void xen_mm_unpin_all(void)
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
BUG_ON(!PagePinned(page));
- printk("unpinning pinned %p\n", page_address(page));
xen_pgd_unpin((pgd_t *)page_address(page));
ClearPageSavePinned(page);
}
@@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
+ struct mm_struct *active_mm;
+
+#ifdef CONFIG_X86_64
+ active_mm = read_pda(active_mm);
+#else
+ active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
- if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+ if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8b..0f59bd03f9e 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
PT_PTE
};
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
@@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
+
+#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+#endif /* CONFIG_X86_PAE */
+
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
+
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed..9efd1c6c977 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@ void xen_mc_flush(void)
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
+ dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde..b6acc3a0af4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -83,30 +83,72 @@ static void xen_idle(void)
/*
* Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
- extern const char vdso32_default_start;
- u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
+#ifdef CONFIG_X86_32
+ u32 *mask;
+ mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+ mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
+ *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
}
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
{
- int cpu = smp_processor_id();
- extern void xen_sysenter_target(void);
- /* Mask events on entry, even though they get enabled immediately */
- static struct callback_register sysenter = {
- .type = CALLBACKTYPE_sysenter,
- .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+ struct callback_register callback = {
+ .type = type,
+ .address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
- if (!boot_cpu_has(X86_FEATURE_SEP) ||
- HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
- clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+ return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
+{
+ extern void xen_sysenter_target(void);
+ int ret;
+ unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+ sysenter_feature = X86_FEATURE_SEP;
+#else
+ sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+ if (!boot_cpu_has(sysenter_feature))
+ return;
+
+ ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+ if (ret != 0)
+ setup_clear_cpu_cap(sysenter_feature);
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+ int ret;
+ extern void xen_syscall_target(void);
+ extern void xen_syscall32_target(void);
+
+ ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+ if (ret != 0) {
+ printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+ /* Pretty fatal; 64-bit userspace has no other
+ mechanism for syscalls. */
}
+
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+ ret = register_callback(CALLBACKTYPE_syscall32,
+ xen_syscall32_target);
+ if (ret != 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+ }
+#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
- HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
- __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+ if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+ BUG();
xen_enable_sysenter();
+ xen_enable_syscall();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
pm_idle = xen_idle;
-#ifdef CONFIG_SMP
- /* fill cpus_possible with all available cpus */
- xen_fill_possible_map();
-#endif
-
paravirt_disable_iospace();
fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7..e693812ac59 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -15,6 +15,7 @@
* This does not handle HOTPLUG_CPU yet.
*/
#include <linux/sched.h>
+#include <linux/kernel_stat.h>
#include <linux/err.h>
#include <linux/smp.h>
@@ -35,6 +36,8 @@
#include "xen-ops.h"
#include "mmu.h"
+static void __cpuinit xen_init_lock_cpu(int cpu);
+
cpumask_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq);
@@ -66,13 +69,22 @@ static __cpuinit void cpu_bringup_and_idle(void)
int cpu = smp_processor_id();
cpu_init();
+ preempt_disable();
+
xen_enable_sysenter();
+ xen_enable_syscall();
- preempt_disable();
- per_cpu(cpu_state, cpu) = CPU_ONLINE;
+ cpu = smp_processor_id();
+ smp_store_cpu_info(cpu);
+ cpu_data(cpu).x86_max_cores = 1;
+ set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
+ cpu_set(cpu, cpu_online_map);
+ x86_write_percpu(cpu_state, CPU_ONLINE);
+ wmb();
+
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -141,56 +153,39 @@ static int xen_smp_intr_init(unsigned int cpu)
return rc;
}
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
{
int i, rc;
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
+ if (rc >= 0) {
+ num_processors++;
cpu_set(i, cpu_possible_map);
+ }
}
}
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
{
- int cpu;
-
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- /*
- * cpu_core_map lives in a per cpu area that is cleared
- * when the per cpu array is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
- */
- }
+ make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
xen_setup_vcpu_info_placement();
}
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- /*
- * cpu_core_ map will be zeroed when the per
- * cpu area is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
- */
- }
+ xen_init_lock_cpu(0);
smp_store_cpu_info(0);
+ cpu_data(0).x86_max_cores = 1;
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -225,7 +220,7 @@ static __cpuinit int
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
- struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+ struct desc_struct *gdt;
if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
return 0;
@@ -234,12 +229,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
+ gdt = get_cpu_gdt_table(cpu);
+
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
- ctxt->user_regs.fs = __KERNEL_PERCPU;
- ctxt->user_regs.gs = 0;
ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
+ ctxt->user_regs.fs = __KERNEL_PERCPU;
+#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -249,11 +247,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->ldt_ents = 0;
- BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
- make_lowmem_page_readonly(gdt->gdt);
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+ make_lowmem_page_readonly(gdt);
- ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
- ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+ ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+ ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +259,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
+#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_cs = __KERNEL_CS;
+#endif
+ ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
@@ -287,10 +287,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
return rc;
#endif
+#ifdef CONFIG_X86_64
+ /* Allocate node local memory for AP pdas */
+ WARN_ON(cpu == 0);
+ if (cpu > 0) {
+ rc = get_local_pda(cpu);
+ if (rc)
+ return rc;
+ }
+#endif
+
+#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
+#else
+ cpu_pda(cpu)->pcurrent = idle;
+ clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
xen_setup_timer(cpu);
+ xen_init_lock_cpu(cpu);
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,20 +324,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
if (rc)
return rc;
- smp_store_cpu_info(cpu);
- set_cpu_sibling_map(cpu);
- /* This must be done before setting cpu_online_map */
- wmb();
-
- cpu_set(cpu, cpu_online_map);
-
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
+ while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ barrier();
+ }
+
return 0;
}
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
@@ -335,12 +351,12 @@ static void stop_self(void *v)
BUG();
}
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0);
}
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
@@ -355,7 +371,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
xen_send_IPI_one(cpu, vector);
}
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
{
int cpu;
@@ -370,7 +386,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
}
}
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
@@ -379,7 +395,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
irq_exit();
return IRQ_HANDLED;
@@ -389,8 +409,196 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
irq_exit();
return IRQ_HANDLED;
}
+
+struct xen_spinlock {
+ unsigned char lock; /* 0 -> free; 1 -> locked */
+ unsigned short spinners; /* count of waiting cpus */
+};
+
+static int xen_spin_is_locked(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ return xl->lock != 0;
+}
+
+static int xen_spin_is_contended(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ /* Not strictly true; this is only the count of contended
+ lock-takers entering the slow path. */
+ return xl->spinners != 0;
+}
+
+static int xen_spin_trylock(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %b0,%1"
+ : "+q" (old), "+m" (xl->lock) : : "memory");
+
+ return old == 0;
+}
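+
The xchgb above is just an atomic exchange on the lock byte; a hedged C-level restatement using a GCC builtin (not part of the patch) may make the semantics clearer:

/* Sketch: the lock is taken iff the previous value of xl->lock was 0. */
static int xen_spin_trylock_sketch(struct xen_spinlock *xl)
{
        return __sync_lock_test_and_set(&xl->lock, 1) == 0;
}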
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
+
+static inline void spinning_lock(struct xen_spinlock *xl)
+{
+ __get_cpu_var(lock_spinners) = xl;
+ wmb(); /* set lock of interest before count */
+ asm(LOCK_PREFIX " incw %0"
+ : "+m" (xl->spinners) : : "memory");
+}
+
+static inline void unspinning_lock(struct xen_spinlock *xl)
+{
+ asm(LOCK_PREFIX " decw %0"
+ : "+m" (xl->spinners) : : "memory");
+ wmb(); /* decrement count before clearing lock */
+ __get_cpu_var(lock_spinners) = NULL;
+}
+
+static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ int irq = __get_cpu_var(lock_kicker_irq);
+ int ret;
+
+ /* If kicker interrupts not initialized yet, just spin */
+ if (irq == -1)
+ return 0;
+
+ /* announce we're spinning */
+ spinning_lock(xl);
+
+ /* clear pending */
+ xen_clear_irq_pending(irq);
+
+ /* check again to make sure it didn't become free while
+ we weren't looking */
+ ret = xen_spin_trylock(lock);
+ if (ret)
+ goto out;
+
+ /* block until irq becomes pending */
+ xen_poll_irq(irq);
+ kstat_this_cpu.irqs[irq]++;
+
+out:
+ unspinning_lock(xl);
+ return ret;
+}
+
+static void xen_spin_lock(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+ int timeout;
+ u8 oldval;
+
+ do {
+ timeout = 1 << 10;
+
+ asm("1: xchgb %1,%0\n"
+ " testb %1,%1\n"
+ " jz 3f\n"
+ "2: rep;nop\n"
+ " cmpb $0,%0\n"
+ " je 1b\n"
+ " dec %2\n"
+ " jnz 2b\n"
+ "3:\n"
+ : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
+ : "1" (1)
+ : "memory");
+
+ } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
+}
+
+static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ /* XXX should mix up next cpu selection */
+ if (per_cpu(lock_spinners, cpu) == xl) {
+ xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+ break;
+ }
+ }
+}
+
+static void xen_spin_unlock(struct raw_spinlock *lock)
+{
+ struct xen_spinlock *xl = (struct xen_spinlock *)lock;
+
+ smp_wmb(); /* make sure no writes get moved after unlock */
+ xl->lock = 0; /* release lock */
+
+ /* make sure unlock happens before kick */
+ barrier();
+
+ if (unlikely(xl->spinners))
+ xen_spin_unlock_slow(xl);
+}
+
+static __cpuinit void xen_init_lock_cpu(int cpu)
+{
+ int irq;
+ const char *name;
+
+ name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+ cpu,
+ xen_reschedule_interrupt,
+ IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
+ name,
+ NULL);
+
+ if (irq >= 0) {
+ disable_irq(irq); /* make sure it's never delivered */
+ per_cpu(lock_kicker_irq, cpu) = irq;
+ }
+
+ printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+static void __init xen_init_spinlocks(void)
+{
+ pv_lock_ops.spin_is_locked = xen_spin_is_locked;
+ pv_lock_ops.spin_is_contended = xen_spin_is_contended;
+ pv_lock_ops.spin_lock = xen_spin_lock;
+ pv_lock_ops.spin_trylock = xen_spin_trylock;
+ pv_lock_ops.spin_unlock = xen_spin_unlock;
+}
+
+static const struct smp_ops xen_smp_ops __initdata = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .cpu_up = xen_cpu_up,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .smp_send_stop = xen_smp_send_stop,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+
+ .send_call_func_ipi = xen_smp_send_call_function_ipi,
+ .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+ smp_ops = xen_smp_ops;
+ xen_fill_possible_map();
+ xen_init_spinlocks();
+}
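+
The casts from struct raw_spinlock to struct xen_spinlock throughout this file assume the Xen lock overlays the generic lock word (raw_spinlock_t becomes a named struct holding a single unsigned int in the spinlock_types.h hunk below). A compile-time assertion along these lines would record that assumption; it is a hedged sketch, not something the patch adds:

/* Sketch: 1-byte lock + 2-byte spinner count must fit in the generic word. */
static inline void xen_spinlock_layout_check(void)
{
        BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(raw_spinlock_t));
}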
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d..2a234db5949 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
xen_cpu_initialized_map = cpu_online_map;
#endif
xen_vcpu_restore();
- xen_timer_resume();
}
}
+void xen_arch_resume(void)
+{
+ /* nothing */
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41d..2497a30f41d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 00000000000..4038cbfe333
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,271 @@
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with a single 'and' operation.  If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ jz 1f
+
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+/*
+ In principle the caller should be passing us a value returned
+ from xen_save_fl_direct, but for robustness' sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+ ret
+#endif
+
+ENTRY(xen_adjust_exception_frame)
+ mov 8+0(%rsp),%rcx
+ mov 8+8(%rsp),%r11
+ ret $16
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+ Xen64 iret frame:
+
+ ss
+ rsp
+ rflags
+ cs
+ rip <-- standard iret frame
+
+ flags
+
+ rcx }
+ r11 }<-- pushed by hypercall page
+rsp -> rax }
+ */
+ENTRY(xen_iret)
+ pushq $0
+1: jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+
+/*
+ sysexit is not used for 64-bit processes, so it's
+ only ever used to return to 32-bit compat userspace.
+ */
+ENTRY(xen_sysexit)
+ pushq $__USER32_DS
+ pushq %rcx
+ pushq $X86_EFLAGS_IF
+ pushq $__USER32_CS
+ pushq %rdx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+ /* We're already on the usermode stack at this point, but still
+ with the kernel gs, so we can easily switch back */
+ movq %rsp, %gs:pda_oldrsp
+ movq %gs:pda_kernelstack,%rsp
+
+ pushq $__USER_DS
+ pushq %gs:pda_oldrsp
+ pushq %r11
+ pushq $__USER_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+ /* We're already on the usermode stack at this point, but still
+ with the kernel gs, so we can easily switch back */
+ movq %rsp, %gs:pda_oldrsp
+ movq %gs:pda_kernelstack, %rsp
+
+ pushq $__USER32_DS
+ pushq %gs:pda_oldrsp
+ pushq %r11
+ pushq $__USER32_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+ Xen handles syscall callbacks much like ordinary exceptions,
+ which means we have:
+ - kernel gs
+ - kernel rsp
+ - an iret-like stack frame on the stack (including rcx and r11):
+ ss
+ rsp
+ rflags
+ cs
+ rip
+ r11
+ rsp-> rcx
+
+ In all the entrypoints, we undo all that to make it look
+ like a CPU-generated syscall/sysenter and jump to the normal
+ entrypoint.
+ */
+
+.macro undo_xen_syscall
+ mov 0*8(%rsp),%rcx
+ mov 1*8(%rsp),%r11
+ mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+ undo_xen_syscall
+ jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+ undo_xen_syscall
+ jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+ undo_xen_syscall
+ jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+ lea 16(%rsp), %rsp /* strip %rcx,%r11 */
+ mov $-ENOSYS, %rax
+ pushq $VGCF_in_syscall
+ jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0..63d49a523ed 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
#include <linux/elfnote.h>
#include <linux/init.h>
+
#include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
- movl %esi,xen_start_info
cld
- movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+ mov %esi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%esp
+#else
+ mov %rsi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
jmp xen_start_kernel
__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
- .skip 0x1000
+ .skip PAGE_SIZE_asm
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+#ifdef CONFIG_X86_32
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
- ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c..dd3c23152a2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@ char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
@@ -37,7 +38,6 @@ void __init xen_time_init(void);
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
@@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
-void __init xen_fill_possible_map(void);
-
void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
/* Declare an asm function, along with symbols needed to make it
@@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index ef671d1a3bf..902bbe78821 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -92,7 +92,7 @@ struct netfront_info {
*/
union skb_entry {
struct sk_buff *skb;
- unsigned link;
+ unsigned long link;
} tx_skbs[NET_TX_RING_SIZE];
grant_ref_t gref_tx_head;
grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@ struct netfront_rx_info {
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
};
+static void skb_entry_set_link(union skb_entry *list, unsigned short id)
+{
+ list->link = id;
+}
+
+static int skb_entry_is_link(const union skb_entry *list)
+{
+ BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
+ return ((unsigned long)list->skb < PAGE_OFFSET);
+}
+
/*
* Access macros for acquiring freeing slots in tx_skbs[].
*/
@@ -132,7 +143,7 @@ struct netfront_rx_info {
static void add_id_to_freelist(unsigned *head, union skb_entry *list,
unsigned short id)
{
- list[id].link = *head;
+ skb_entry_set_link(&list[id], *head);
*head = id;
}
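
The matching consumer of this freelist is not visible in the hunk; the helper below is a hedged sketch of the counterpart lookup (the driver has an equivalent helper, but treat this as illustration only):

/* Sketch: pop the next free tx slot index off the head of the freelist. */
static unsigned short get_id_from_freelist(unsigned *head,
                                           union skb_entry *list)
{
        unsigned int id = *head;

        *head = list[id].link;
        return id;
}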
@@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np)
for (i = 0; i < NET_TX_RING_SIZE; i++) {
/* Skip over entries which are actually freelist references */
- if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
+ if (skb_entry_is_link(&np->tx_skbs[i]))
continue;
skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev
/* Initialise tx_skbs as a free chain containing every entry. */
np->tx_skb_freelist = 0;
for (i = 0; i < NET_TX_RING_SIZE; i++) {
- np->tx_skbs[i].link = i+1;
+ skb_entry_set_link(&np->tx_skbs[i], i+1);
np->grant_tx_ref[i] = GRANT_INVALID_REF;
}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 332dd63750a..0e0c28574af 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -734,6 +734,33 @@ static void restore_cpu_ipis(unsigned int cpu)
}
}
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+}
+
+/* Poll waiting for an irq to become pending. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ struct sched_poll poll;
+
+ poll.nr_ports = 1;
+ poll.timeout = 0;
+ poll.ports = &evtchn;
+
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+}
+
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 5b546e365f0..2bb268e4ac5 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -68,6 +68,7 @@ static int xen_suspend(void *data)
if (!*cancelled) {
xen_irq_resume();
xen_console_resume();
+ xen_timer_resume();
}
return 0;
@@ -107,9 +108,10 @@ static void do_suspend(void)
goto out;
}
- if (!cancelled)
+ if (!cancelled) {
+ xen_arch_resume();
xenbus_resume();
- else
+ } else
xenbus_suspend_cancel();
device_resume();
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index ef5e8ec6a6a..b2aba8fdaae 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -326,6 +326,15 @@ struct pv_mmu_ops {
unsigned long phys, pgprot_t flags);
};
+struct raw_spinlock;
+struct pv_lock_ops {
+ int (*spin_is_locked)(struct raw_spinlock *lock);
+ int (*spin_is_contended)(struct raw_spinlock *lock);
+ void (*spin_lock)(struct raw_spinlock *lock);
+ int (*spin_trylock)(struct raw_spinlock *lock);
+ void (*spin_unlock)(struct raw_spinlock *lock);
+};
+
/* This contains all the paravirt structures: we get a convenient
* number for each function using the offset which we use to indicate
* what to patch. */
@@ -336,6 +345,7 @@ struct paravirt_patch_template {
struct pv_irq_ops pv_irq_ops;
struct pv_apic_ops pv_apic_ops;
struct pv_mmu_ops pv_mmu_ops;
+ struct pv_lock_ops pv_lock_ops;
};
extern struct pv_info pv_info;
@@ -345,6 +355,7 @@ extern struct pv_cpu_ops pv_cpu_ops;
extern struct pv_irq_ops pv_irq_ops;
extern struct pv_apic_ops pv_apic_ops;
extern struct pv_mmu_ops pv_mmu_ops;
+extern struct pv_lock_ops pv_lock_ops;
#define PARAVIRT_PATCH(x) \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
@@ -1374,6 +1385,37 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
void _paravirt_nop(void);
#define paravirt_nop ((void *)_paravirt_nop)
+void paravirt_use_bytelocks(void);
+
+#ifdef CONFIG_SMP
+
+static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
+}
+
+static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
+}
+
+static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
+{
+ return PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
+}
+
+static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
+{
+ return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
+}
+
+static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+{
+ return PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
+}
+
+#endif
+
/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
u8 *instr; /* original instructions */
@@ -1396,8 +1438,8 @@ extern struct paravirt_patch_site __parainstructions[],
* caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
#define PV_RESTORE_REGS "popq %%rdi;"
-#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
-#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
#define PV_FLAGS_ARG "D"
#endif
@@ -1458,6 +1500,7 @@ static inline unsigned long __raw_local_irq_save(void)
return f;
}
+
/* Make sure as little as possible of this mess escapes. */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
@@ -1489,8 +1532,26 @@ static inline unsigned long __raw_local_irq_save(void)
#ifdef CONFIG_X86_64
-#define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
-#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+#define PV_SAVE_REGS \
+ push %rax; \
+ push %rcx; \
+ push %rdx; \
+ push %rsi; \
+ push %rdi; \
+ push %r8; \
+ push %r9; \
+ push %r10; \
+ push %r11
+#define PV_RESTORE_REGS \
+ pop %r11; \
+ pop %r10; \
+ pop %r9; \
+ pop %r8; \
+ pop %rdi; \
+ pop %rsi; \
+ pop %rdx; \
+ pop %rcx; \
+ pop %rax
#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index 912a3a17b9d..4e91ee1e37a 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -22,6 +22,32 @@
DECLARE_PER_CPU(struct x8664_pda, pda);
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment. x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime.  A single instruction would be atomic with respect to
+ * both preemption and interrupts; to fully match that we would have
+ * to disable interrupts here.  However, because these macros can be
+ * used from within interrupt-disable/enable paths, we can't actually
+ * disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp; \
+ preempt_disable(); \
+ __tmp = __get_cpu_var(var); \
+ preempt_enable(); \
+ __tmp; \
+ })
+
+#define x86_write_percpu(var, val) \
+ do { \
+ preempt_disable(); \
+ __get_cpu_var(var) = (val); \
+ preempt_enable(); \
+ } while (0)
+
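The Xen SMP code earlier in this series already leans on these accessors; as a minimal, hedged restatement of one such use (the call itself appears in cpu_bringup_and_idle() above):

/* Sketch: publish this CPU's state, as cpu_bringup_and_idle() does. */
static void xen_mark_cpu_online_sketch(void)
{
        x86_write_percpu(cpu_state, CPU_ONLINE);
}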
#else /* CONFIG_X86_64 */
#ifdef __ASSEMBLY__
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 49cbd76b954..96aa76e691d 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -302,6 +302,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -333,6 +341,16 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ native_pagetable_setup_done(base);
+}
#endif /* CONFIG_PARAVIRT */
#endif /* __ASSEMBLY__ */
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
index ec871c420d7..0611abf96a5 100644
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -171,21 +171,6 @@ do { \
*/
#define update_mmu_cache(vma, address, pte) do { } while (0)
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
-
-#ifndef CONFIG_PARAVIRT
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
- native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
- native_pagetable_setup_done(base);
-}
-#endif /* !CONFIG_PARAVIRT */
-
#endif /* !__ASSEMBLY__ */
/*
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
index fa7208b483c..805d3128bfc 100644
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -16,6 +16,8 @@
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
index 90ab2225e71..659492624e7 100644
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -76,6 +76,7 @@ extern unsigned long init_pg_tables_start;
extern unsigned long init_pg_tables_end;
#else
+void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index c2784b3e0b7..3c877f74f27 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -25,6 +25,8 @@ extern cpumask_t cpu_callin_map;
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);
+extern int __cpuinit get_local_pda(int cpu);
+
extern int smp_num_siblings;
extern unsigned int num_processors;
extern cpumask_t cpu_initialized;
diff --git a/include/asm-x86/spinlock.h b/include/asm-x86/spinlock.h
index 21e89bf92f1..4f9a9861799 100644
--- a/include/asm-x86/spinlock.h
+++ b/include/asm-x86/spinlock.h
@@ -6,7 +6,7 @@
#include <asm/page.h>
#include <asm/processor.h>
#include <linux/compiler.h>
-
+#include <asm/paravirt.h>
/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
*
@@ -54,21 +54,21 @@
* much between them in performance though, especially as locks are out of line.
*/
#if (NR_CPUS < 256)
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> 8) & 0xff) != (tmp & 0xff));
}
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> 8) & 0xff) - (tmp & 0xff)) > 1;
}
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
short inc = 0x0100;
@@ -87,9 +87,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
: "memory", "cc");
}
-#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
-
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp;
short new;
@@ -110,7 +108,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
return tmp;
}
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
: "+m" (lock->slock)
@@ -118,21 +116,21 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
: "memory", "cc");
}
#else
-static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> 16) & 0xffff) != (tmp & 0xffff));
}
-static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
{
int tmp = ACCESS_ONCE(lock->slock);
return (((tmp >> 16) & 0xffff) - (tmp & 0xffff)) > 1;
}
-static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
@@ -153,9 +151,7 @@ static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
: "memory", "cc");
}
-#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
-
-static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
{
int tmp;
int new;
@@ -177,7 +173,7 @@ static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
return tmp;
}
-static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
{
asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
: "+m" (lock->slock)
@@ -186,6 +182,98 @@ static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
}
#endif
+#define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Define a virtualization-friendly old-style lock-byte spinlock, for use in
+ * pv_lock_ops if desired.
+ *
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure. It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+struct __byte_spinlock {
+ s8 lock;
+ s8 spinners;
+};
+
+static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ return bl->spinners != 0;
+}
+
+static inline void __byte_spin_lock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ s8 val = 1;
+
+ asm("1: xchgb %1, %0\n"
+ " test %1,%1\n"
+ " jz 3f\n"
+ " " LOCK_PREFIX "incb %2\n"
+ "2: rep;nop\n"
+ " cmpb $1, %0\n"
+ " je 2b\n"
+ " " LOCK_PREFIX "decb %2\n"
+ " jmp 1b\n"
+ "3:"
+ : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
+}
+
+static inline int __byte_spin_trylock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ u8 old = 1;
+
+ asm("xchgb %1,%0"
+ : "+m" (bl->lock), "+q" (old) : : "memory");
+
+ return old == 0;
+}
+
+static inline void __byte_spin_unlock(raw_spinlock_t *lock)
+{
+ struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
+ smp_wmb();
+ bl->lock = 0;
+}
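+
These byte-lock helpers only take effect if a backend installs them via paravirt_use_bytelocks(), declared in the paravirt.h hunk earlier; its body lives in paravirt.c and is not part of this section, so the following is a hedged sketch of the expected wiring rather than the patch's own code:

/* Sketch: point the paravirt lock ops at the byte-lock implementation. */
void __init paravirt_use_bytelocks(void)
{
#ifdef CONFIG_SMP
        pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
        pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
        pv_lock_ops.spin_lock = __byte_spin_lock;
        pv_lock_ops.spin_trylock = __byte_spin_trylock;
        pv_lock_ops.spin_unlock = __byte_spin_unlock;
#endif
}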
+#else /* !CONFIG_PARAVIRT */
+static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_locked(lock);
+}
+
+static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
+{
+ return __ticket_spin_is_contended(lock);
+}
+
+static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
+{
+ __ticket_spin_lock(lock);
+}
+
+static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
+{
+ return __ticket_spin_trylock(lock);
+}
+
+static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
+{
+ __ticket_spin_unlock(lock);
+}
+#endif /* CONFIG_PARAVIRT */
+
static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
{
while (__raw_spin_is_locked(lock))
diff --git a/include/asm-x86/spinlock_types.h b/include/asm-x86/spinlock_types.h
index 9029cf78cf5..06c071c9eee 100644
--- a/include/asm-x86/spinlock_types.h
+++ b/include/asm-x86/spinlock_types.h
@@ -5,7 +5,7 @@
# error "please don't include this file directly"
#endif
-typedef struct {
+typedef struct raw_spinlock {
unsigned int slock;
} raw_spinlock_t;
diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h
index 86e085e003d..8e18fb80f5e 100644
--- a/include/asm-x86/vdso.h
+++ b/include/asm-x86/vdso.h
@@ -36,4 +36,12 @@ extern const char VDSO32_PRELINK[];
extern void __user __kernel_sigreturn;
extern void __user __kernel_rt_sigreturn;
+/*
+ * These symbols are defined by vdso32.S to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vdso32_int80_start, vdso32_int80_end;
+extern const char vdso32_syscall_start, vdso32_syscall_end;
+extern const char vdso32_sysenter_start, vdso32_sysenter_end;
+
#endif /* asm-x86/vdso.h */
diff --git a/include/asm-x86/xen/events.h b/include/asm-x86/xen/events.h
index f8d57ea1f05..8ded7472002 100644
--- a/include/asm-x86/xen/events.h
+++ b/include/asm-x86/xen/events.h
@@ -5,6 +5,7 @@ enum ipi_vector {
XEN_RESCHEDULE_VECTOR,
XEN_CALL_FUNCTION_VECTOR,
XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ XEN_SPIN_UNLOCK_VECTOR,
XEN_NR_IPIS,
};
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
index 2a4f9b41d68..91cb7fd5c12 100644
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -40,83 +40,157 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ * The two architectures put their arguments in different sets of
+ * registers.
+ *
+ * - Work around asm syntax quirks
+ * It isn't possible to specify one of the rNN registers in a
+ * constraint, so we use explicit register variables to get the
+ * args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ * Even unused parameters can be clobbered by the hypervisor, so we
+ * need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ * This is the tricky part. Because x86_32 has such a constrained
+ * register set, gcc versions below 4.3 have trouble generating
+ * code when all the arg registers and memory are trashed by the
+ * asm. There are syntactically simpler ways of achieving the
+ * semantics below, but they cause the compiler to crash.
+ *
+ * The only combination I found which works is:
+ * - assign the __argX variables first
+ * - list all actually used parameters as "+r" (__argX)
+ * - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as a macro language.  Sorry.  (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
extern struct { char _entry[32]; } hypercall_page[];
+#define __HYPERCALL "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x) \
+ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG "eax"
+#define __HYPERCALL_ARG1REG "ebx"
+#define __HYPERCALL_ARG2REG "ecx"
+#define __HYPERCALL_ARG3REG "edx"
+#define __HYPERCALL_ARG4REG "esi"
+#define __HYPERCALL_ARG5REG "edi"
+#else
+#define __HYPERCALL_RETREG "rax"
+#define __HYPERCALL_ARG1REG "rdi"
+#define __HYPERCALL_ARG2REG "rsi"
+#define __HYPERCALL_ARG3REG "rdx"
+#define __HYPERCALL_ARG4REG "r10"
+#define __HYPERCALL_ARG5REG "r8"
+#endif
+
+#define __HYPERCALL_DECLS \
+ register unsigned long __res asm(__HYPERCALL_RETREG); \
+ register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1) \
+ __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2) \
+ __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3) \
+ __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
+ __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
+ __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5 "memory"
+#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
#define _hypercall0(type, name) \
({ \
- long __res; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res) \
- : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_0ARG(); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_0PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER0); \
(type)__res; \
})
#define _hypercall1(type, name, a1) \
({ \
- long __res, __ign1; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1) \
- : "1" ((long)(a1)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_1ARG(a1); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_1PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER1); \
(type)__res; \
})
#define _hypercall2(type, name, a1, a2) \
({ \
- long __res, __ign1, __ign2; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_2ARG(a1, a2); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_2PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER2); \
(type)__res; \
})
#define _hypercall3(type, name, a1, a2, a3) \
({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_3ARG(a1, a2, a3); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_3PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER3); \
(type)__res; \
})
#define _hypercall4(type, name, a1, a2, a3, a4) \
({ \
- long __res, __ign1, __ign2, __ign3, __ign4; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_4ARG(a1, a2, a3, a4); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_4PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER4); \
(type)__res; \
})
#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
({ \
- long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)), \
- "5" ((long)(a5)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_5PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER5); \
(type)__res; \
})
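The register/argument/clobber macros above let every hypercall wrapper in this header stay a one-liner. A minimal sketch of a new wrapper built on them; "example_op" is a hypothetical hypercall name used purely for illustration (a real one would need a matching __HYPERVISOR_example_op number in the Xen interface):

static inline int
HYPERVISOR_example_op(unsigned long arg1, unsigned long arg2)
{
	/* arg1/arg2 are pinned to the per-arch argument registers by
	 * __HYPERCALL_DECLS/__HYPERCALL_2ARG inside _hypercall2 */
	return _hypercall2(int, example_op, arg1, arg2);
}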
@@ -152,6 +226,7 @@ HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
return _hypercall2(int, stack_switch, ss, esp);
}
+#ifdef CONFIG_X86_32
static inline int
HYPERVISOR_set_callbacks(unsigned long event_selector,
unsigned long event_address,
@@ -162,6 +237,17 @@ HYPERVISOR_set_callbacks(unsigned long event_selector,
event_selector, event_address,
failsafe_selector, failsafe_address);
}
+#else /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+ unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address,
+ syscall_address);
+}
+#endif /* CONFIG_X86_{32,64} */
static inline int
HYPERVISOR_callback_op(int cmd, void *arg)
@@ -223,12 +309,12 @@ static inline int
HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
unsigned long flags)
{
- unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
- pte_hi = new_val.pte_high;
-#endif
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte_low, pte_hi, flags);
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall3(int, update_va_mapping, va,
+ new_val.pte, flags);
+ else
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte, new_val.pte >> 32, flags);
}
static inline int
@@ -281,12 +367,13 @@ static inline int
HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
unsigned long flags, domid_t domid)
{
- unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
- pte_hi = new_val.pte_high;
-#endif
- return _hypercall5(int, update_va_mapping_otherdomain, va,
- new_val.pte_low, pte_hi, flags, domid);
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+ else
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte, new_val.pte >> 32,
+ flags, domid);
}
static inline int
@@ -301,6 +388,14 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
static inline int
HYPERVISOR_suspend(unsigned long srec)
{
@@ -327,14 +422,14 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
{
mcl->op = __HYPERVISOR_update_va_mapping;
mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = new_val.pte_high;
-#else
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = 0;
-#endif
- mcl->args[3] = flags;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ }
}
static inline void
@@ -354,15 +449,16 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
{
mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = new_val.pte_high;
-#else
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = 0;
-#endif
- mcl->args[3] = flags;
- mcl->args[4] = domid;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ mcl->args[3] = domid;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ mcl->args[4] = domid;
+ }
}
static inline void
@@ -370,10 +466,15 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
{
mcl->op = __HYPERVISOR_update_descriptor;
- mcl->args[0] = maddr;
- mcl->args[1] = maddr >> 32;
- mcl->args[2] = desc.a;
- mcl->args[3] = desc.b;
+ if (sizeof(maddr) == sizeof(long)) {
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+ } else {
+ mcl->args[0] = maddr;
+ mcl->args[1] = maddr >> 32;
+ mcl->args[2] = desc.a;
+ mcl->args[3] = desc.b;
+ }
}
static inline void
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
index 6227000a1e8..9d810f2538a 100644
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -1,13 +1,13 @@
/******************************************************************************
* arch-x86_32.h
*
- * Guest OS interface to x86 32-bit Xen.
+ * Guest OS interface to x86 Xen.
*
* Copyright (c) 2004, K A Fraser
*/
-#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
-#define __XEN_PUBLIC_ARCH_X86_32_H__
+#ifndef __ASM_X86_XEN_INTERFACE_H
+#define __ASM_X86_XEN_INTERFACE_H
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
@@ -57,6 +57,17 @@ DEFINE_GUEST_HANDLE(long);
DEFINE_GUEST_HANDLE(void);
#endif
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
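machine_to_phys_mapping, defined just above, is the read-only machine-to-physical table that Xen maps at HYPERVISOR_VIRT_START. A hedged sketch of an MFN-to-PFN lookup through it; mfn_to_pfn_example is an illustrative helper, not part of this patch, and a real caller would also bound-check the MFN:

static inline unsigned long mfn_to_pfn_example(unsigned long mfn)
{
	/* index the M2P array exported by the hypervisor */
	return machine_to_phys_mapping[mfn];
}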
/*
* SEGMENT DESCRIPTOR TABLES
*/
@@ -71,58 +82,21 @@ DEFINE_GUEST_HANDLE(void);
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * These flat segments are in the Xen-private section of every GDT. Since these
- * are also present in the initial GDT, many OSes will be able to avoid
- * installing their own GDT.
- */
-#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
-#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
-#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
-#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
-#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
-#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
-
-#define FLAT_KERNEL_CS FLAT_RING1_CS
-#define FLAT_KERNEL_DS FLAT_RING1_DS
-#define FLAT_KERNEL_SS FLAT_RING1_SS
-#define FLAT_USER_CS FLAT_RING3_CS
-#define FLAT_USER_DS FLAT_RING3_DS
-#define FLAT_USER_SS FLAT_RING3_SS
-
-/* And the trap vector is... */
-#define TRAP_INSTR "int $0x82"
-
-/*
- * Virtual addresses beyond this are not modifiable by guest OSes. The
- * machine->physical mapping table starts at this address, read-only.
- */
-#ifdef CONFIG_X86_PAE
-#define __HYPERVISOR_VIRT_START 0xF5800000
-#else
-#define __HYPERVISOR_VIRT_START 0xFC000000
-#endif
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#endif
-
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
-
-/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
-
-#ifndef __ASSEMBLY__
-
-/*
* Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: No one may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
*/
#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
#define TI_GET_IF(_ti) ((_ti)->flags & 4)
#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
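A minimal usage sketch of the TI_* accessors; example_mark_user_trap is hypothetical and only shows how the DPL bits in trap_info.flags are meant to be set and read back:

static inline void example_mark_user_trap(struct trap_info *ti)
{
	TI_SET_DPL(ti, 3);		/* level 3: everyone may enter */
	BUG_ON(TI_GET_DPL(ti) != 3);	/* the DPL lives in the low two bits of flags */
}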
+#ifndef __ASSEMBLY__
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
@@ -131,32 +105,21 @@ struct trap_info {
};
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
-struct cpu_user_regs {
- uint32_t ebx;
- uint32_t ecx;
- uint32_t edx;
- uint32_t esi;
- uint32_t edi;
- uint32_t ebp;
- uint32_t eax;
- uint16_t error_code; /* private */
- uint16_t entry_vector; /* private */
- uint32_t eip;
- uint16_t cs;
- uint8_t saved_upcall_mask;
- uint8_t _pad0;
- uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
- uint32_t esp;
- uint16_t ss, _pad1;
- uint16_t es, _pad2;
- uint16_t ds, _pad3;
- uint16_t fs, _pad4;
- uint16_t gs, _pad5;
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /* Frame containing list of mfns containing list of mfns containing p2m. */
+ unsigned long pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
};
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+#endif /* !__ASSEMBLY__ */
-typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
+#ifndef __ASSEMBLY__
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -173,33 +136,29 @@ struct vcpu_guest_context {
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+#endif
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
-
-struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
-};
-
-struct arch_vcpu_info {
- unsigned long cr2;
- unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
-};
-
-struct xen_callback {
- unsigned long cs;
- unsigned long eip;
-};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLY__ */
/*
* Prefix forces emulation of some non-trapping instructions.
@@ -213,4 +172,4 @@ struct xen_callback {
#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
#endif
-#endif
+#endif /* __ASM_X86_XEN_INTERFACE_H */
diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h
new file mode 100644
index 00000000000..d8ac41d5db8
--- /dev/null
+++ b/include/asm-x86/xen/interface_32.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __ASM_X86_XEN_INTERFACE_32_H
+#define __ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip) \
+ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
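A hedged round-trip sketch of the packing described above; check_cr3_roundtrip is illustrative only:

static inline void check_cr3_roundtrip(unsigned long pfn)
{
	unsigned long cr3 = xen_pfn_to_cr3(pfn);	/* pfn bits 20-31 land in cr3 bits 0-11 */

	BUG_ON(xen_cr3_to_pfn(cr3) != pfn);		/* unpacking restores the original pfn */
}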
+
+#endif /* __ASM_X86_XEN_INTERFACE_32_H */
diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h
new file mode 100644
index 00000000000..842266ce96e
--- /dev/null
+++ b/include/asm-x86/xen/interface_64.h
@@ -0,0 +1,159 @@
+#ifndef __ASM_X86_XEN_INTERFACE_64_H
+#define __ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
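A hedged usage sketch tying these selectors to the HYPERVISOR_set_segment_base() wrapper added in hypercall.h above; xen_set_user_gs_base is a hypothetical helper, not part of this patch:

static inline int xen_set_user_gs_base(unsigned long base)
{
	/* ask Xen to install the user %gs base, since the guest kernel
	   runs deprivileged and cannot write the MSR directly */
	return HYPERVISOR_set_segment_base(SEGBASE_GS_USER, base);
}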
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to itself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
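For reference, a hedged expansion of what __DECL_REG(ax) produces under gcc: the 64-bit and 32-bit spellings name the same slot, with _eax overlaying its low half (the union name here is illustrative only).

union example_decl_reg_ax {
	uint64_t rax, eax;	/* one 64-bit value, two spellings */
	uint32_t _eax;		/* low 32 bits on little-endian x86 */
};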
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip) \
+ ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* __ASM_X86_XEN_INTERFACE_64_H */
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h
index 377c04591c1..05e678a8662 100644
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -148,13 +148,17 @@ static inline pte_t __pte_ma(pteval_t x)
}
#define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
#define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
#define __pmd_ma(x) ((pmd_t) { (x) } )
#define pgd_val_ma(x) ((x).pgd)
-xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+xmaddr_t arbitrary_virt_to_machine(void *address);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
diff --git a/include/xen/events.h b/include/xen/events.h
index 67c4436554a..4680ff3fbc9 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -44,4 +44,11 @@ extern void notify_remote_via_irq(int irq);
extern void xen_irq_resume(void);
+/* Clear an irq's pending state, in preparation for polling on it */
+void xen_clear_irq_pending(int irq);
+
+/* Poll waiting for an irq to become pending. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq);
+
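A hedged sketch of the clear-then-poll pattern these two helpers support; xen_wait_for_kick_example is illustrative only. Clearing the pending bit before re-checking the wakeup condition ensures a kick that arrives in between is not lost:

static void xen_wait_for_kick_example(int irq)
{
	xen_clear_irq_pending(irq);	/* arm for a fresh kick first */
	/* a real caller re-checks its wakeup condition here */
	xen_poll_irq(irq);		/* block until the irq becomes pending */
}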
#endif /* _XEN_EVENTS_H */
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
index 98b79bc404d..c3adde32669 100644
--- a/include/xen/hvc-console.h
+++ b/include/xen/hvc-console.h
@@ -5,11 +5,12 @@ extern struct console xenboot_console;
#ifdef CONFIG_HVC_XEN
void xen_console_resume(void);
+void xen_raw_console_write(const char *str);
+void xen_raw_printk(const char *fmt, ...);
#else
static inline void xen_console_resume(void) { }
+static inline void xen_raw_console_write(const char *str) { }
+static inline void xen_raw_printk(const char *fmt, ...) { }
#endif
-void xen_raw_console_write(const char *str);
-void xen_raw_printk(const char *fmt, ...);
-
#endif /* XEN_HVC_CONSOLE_H */
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
index 4aadcba31af..2ae3cd24326 100644
--- a/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@ -82,9 +82,9 @@
*/
#define CALLBACKOP_register 0
struct callback_register {
- uint16_t type;
- uint16_t flags;
- struct xen_callback address;
+ uint16_t type;
+ uint16_t flags;
+ xen_callback_t address;
};
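Since address is now the per-arch xen_callback_t, callers build it with the XEN_CALLBACK() helper from interface_32.h/interface_64.h. A hedged registration sketch; the callback type, handler symbol, and function name below are illustrative assumptions, not taken from this patch:

/* assumed asm entry point, shown only for illustration */
void xen_hypervisor_callback(void);

static void example_register_event_callback(void)
{
	struct callback_register event = {
		.type	 = CALLBACKTYPE_event,
		.flags	 = 0,
		/* the cs argument is ignored by the 64-bit XEN_CALLBACK() */
		.address = XEN_CALLBACK(__KERNEL_CS, xen_hypervisor_callback),
	};

	if (HYPERVISOR_callback_op(CALLBACKOP_register, &event))
		BUG();
}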
/*
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index a706d6a7896..883a21bba24 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -11,4 +11,7 @@ void xen_post_suspend(int suspend_cancelled);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+void xen_timer_resume(void);
+void xen_arch_resume(void);
+
#endif /* INCLUDE_XEN_OPS_H */