aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/enlighten.c233
-rw-r--r--arch/x86/xen/mmu.c145
-rw-r--r--arch/x86/xen/multicalls.c52
-rw-r--r--arch/x86/xen/multicalls.h5
-rw-r--r--arch/x86/xen/smp.c14
-rw-r--r--arch/x86/xen/time.c6
-rw-r--r--arch/x86/xen/xen-ops.h10
7 files changed, 324 insertions, 141 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 493a083f688..94c39aaf695 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,7 +25,6 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
-#include <linux/smp.h>
#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -52,11 +51,25 @@
EXPORT_SYMBOL_GPL(hypercall_page);
-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
@@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
info.mfn = virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup);
- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+ printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
cpu, vcpup, info.mfn, info.offset);
/* Check to see if the hypervisor will put the vcpu_info
@@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
static void __init xen_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
- paravirt_ops.name);
+ pv_info.name);
printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
}
@@ -249,29 +262,10 @@ static void xen_halt(void)
xen_safe_halt();
}
-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void xen_leave_lazy(void)
{
- BUG_ON(preemptible());
-
- switch (mode) {
- case PARAVIRT_LAZY_NONE:
- BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_MMU:
- case PARAVIRT_LAZY_CPU:
- BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
- break;
-
- case PARAVIRT_LAZY_FLUSH:
- /* flush if necessary, but don't change state */
- if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
- xen_mc_flush();
- return;
- }
-
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
- x86_write_percpu(xen_lazy_mode, mode);
}
static unsigned long xen_store_tr(void)
@@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
*/
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
loadsegment(gs, 0);
}
@@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
return x86_read_percpu(xen_cr3);
}
+static void set_current_cr3(void *v)
+{
+ x86_write_percpu(xen_current_cr3, (unsigned long)v);
+}
+
static void xen_write_cr3(unsigned long cr3)
{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+ unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
BUG_ON(preemptible());
- if (cr3 == x86_read_percpu(xen_cr3)) {
- /* just a simple tlb flush */
- xen_flush_tlb();
- return;
- }
+ mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
x86_write_percpu(xen_cr3, cr3);
+ op = mcs.args;
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = mfn;
- {
- struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
- op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = mfn;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ /* Update xen_update_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
- }
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
/* Early in boot, while setting up the initial pagetable, assume
@@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+ struct mmuext_op op;
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
/* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page);
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
- else
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+ } else
/* make sure there are no stray mappings of
this page */
kmap_flush_unused();
@@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn)
struct page *page = pfn_to_page(pfn);
if (PagePinned(page)) {
- if (!PageHighMem(page))
+ if (!PageHighMem(page)) {
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ }
}
}
@@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
/* special set_pte for pagetable initialization */
- paravirt_ops.set_pte = xen_set_pte_init;
+ pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
@@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
{
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
- paravirt_ops.alloc_pt = xen_alloc_pt;
- paravirt_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.alloc_pt = xen_alloc_pt;
+ pv_mmu_ops.set_pte = xen_set_pte;
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/*
@@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
{
- struct mmuext_op op;
+ unsigned level;
+
#ifdef CONFIG_X86_PAE
- op.cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L3_TABLE;
#else
- op.cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L2_TABLE;
#endif
- op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
+
+ pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
}
}
@@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
- paravirt_ops.save_fl = xen_save_fl_direct;
- paravirt_ops.restore_fl = xen_restore_fl_direct;
- paravirt_ops.irq_disable = xen_irq_disable_direct;
- paravirt_ops.irq_enable = xen_irq_enable_direct;
- paravirt_ops.read_cr2 = xen_read_cr2_direct;
- paravirt_ops.iret = xen_iret_direct;
+ pv_irq_ops.save_fl = xen_save_fl_direct;
+ pv_irq_ops.restore_fl = xen_restore_fl_direct;
+ pv_irq_ops.irq_disable = xen_irq_disable_direct;
+ pv_irq_ops.irq_enable = xen_irq_enable_direct;
+ pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+ pv_cpu_ops.iret = xen_iret_direct;
}
}
@@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
start = end = reloc = NULL;
-#define SITE(x) \
- case PARAVIRT_PATCH(x): \
+#define SITE(op, x) \
+ case PARAVIRT_PATCH(op.x): \
if (have_vcpu_info_placement) { \
start = (char *)xen_##x##_direct; \
end = xen_##x##_direct_end; \
@@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
goto patch_site
switch (type) {
- SITE(irq_enable);
- SITE(irq_disable);
- SITE(save_fl);
- SITE(restore_fl);
+ SITE(pv_irq_ops, irq_enable);
+ SITE(pv_irq_ops, irq_disable);
+ SITE(pv_irq_ops, save_fl);
+ SITE(pv_irq_ops, restore_fl);
#undef SITE
patch_site:
@@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret;
}
-static const struct paravirt_ops xen_paravirt_ops __initdata = {
+static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1,
.shared_kernel_pmd = 0,
.name = "Xen",
- .banner = xen_banner,
+};
+static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch,
+ .banner = xen_banner,
.memory_setup = xen_memory_setup,
.arch_setup = xen_arch_setup,
- .init_IRQ = xen_init_IRQ,
.post_allocator_init = xen_mark_init_mm_pinned,
+};
+static const struct pv_time_ops xen_time_ops __initdata = {
.time_init = xen_time_init,
+
.set_wallclock = xen_set_wallclock,
.get_wallclock = xen_get_wallclock,
.get_cpu_khz = xen_cpu_khz,
.sched_clock = xen_sched_clock,
+};
+static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg,
@@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
- .read_cr2 = xen_read_cr2,
- .write_cr2 = xen_write_cr2,
-
- .read_cr3 = xen_read_cr3,
- .write_cr3 = xen_write_cr3,
-
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
@@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_cpu,
+ .leave = xen_leave_lazy,
+ },
+};
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+ .init_IRQ = xen_init_IRQ,
+ .save_fl = xen_save_fl,
+ .restore_fl = xen_restore_fl,
+ .irq_disable = xen_irq_disable,
+ .irq_enable = xen_irq_enable,
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+};
+
+static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = xen_apic_write,
.apic_write_atomic = xen_apic_write,
@@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop,
#endif
+};
+
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+ .pagetable_setup_start = xen_pagetable_setup_start,
+ .pagetable_setup_done = xen_pagetable_setup_done,
+
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3,
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .pagetable_setup_start = xen_pagetable_setup_start,
- .pagetable_setup_done = xen_pagetable_setup_done,
-
.alloc_pt = xen_alloc_pt_init,
.release_pt = xen_release_pt,
.alloc_pd = paravirt_nop,
@@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap,
- .set_lazy_mode = xen_set_lazy_mode,
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy,
+ },
};
#ifdef CONFIG_SMP
@@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
};
+static void __init xen_reserve_top(void)
+{
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top + 2 * PAGE_SIZE);
+}
+
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
/* Install Xen paravirt ops */
- paravirt_ops = xen_paravirt_ops;
+ pv_info = xen_info;
+ pv_init_ops = xen_init_ops;
+ pv_time_ops = xen_time_ops;
+ pv_cpu_ops = xen_cpu_ops;
+ pv_irq_ops = xen_irq_ops;
+ pv_apic_ops = xen_apic_ops;
+ pv_mmu_ops = xen_mmu_ops;
+
machine_ops = xen_machine_ops;
#ifdef CONFIG_SMP
@@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
/* keep using Xen gdt for now; no urgent need to change it */
x86_write_percpu(xen_cr3, __pa(pgd));
+ x86_write_percpu(xen_current_cr3, __pa(pgd));
#ifdef CONFIG_SMP
/* Don't do the full vcpu_info placement stuff until we have a
@@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
xen_setup_vcpu_info_placement();
#endif
- paravirt_ops.kernel_rpl = 1;
+ pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
- paravirt_ops.kernel_rpl = 0;
+ pv_info.kernel_rpl = 0;
/* set the limit of our address space */
- reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+ xen_reserve_top();
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 874db0cd1d2..b2e32f9d007 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/bug.h>
-#include <linux/sched.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
if (mm == current->mm || mm == &init_mm) {
- if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs;
mcs = xen_mc_entry(0);
@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
}
#endif /* CONFIG_X86_PAE */
-
+enum pt_level {
+ PT_PGD,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes.
*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pud = pud_offset(pgd, 0);
if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pud), 0);
+ flush |= (*func)(virt_to_page(pud), PT_PUD);
for (; addr != pud_limit; pud++, addr = pud_next) {
pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(virt_to_page(pmd), 0);
+ flush |= (*func)(virt_to_page(pmd), PT_PMD);
for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
if (pmd_none(*pmd))
continue;
- flush |= (*func)(pmd_page(*pmd), 0);
+ flush |= (*func)(pmd_page(*pmd), PT_PTE);
}
}
}
- flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+ flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
return flush;
}
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+ spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ ptl = __pte_lockptr(page);
+ spin_lock(ptl);
+#endif
+
+ return ptl;
+}
+
+static void do_unlock(void *v)
+{
+ spinlock_t *ptl = v;
+ spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = level;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
{
unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0);
+ spinlock_t *ptl;
flush = 0;
+ ptl = NULL;
+ if (level == PT_PTE)
+ ptl = lock_pte(page);
+
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO),
- flags);
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (level == PT_PTE)
+ xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+ if (ptl) {
+ /* Queue a deferred unlock for when this batch
+ is completed. */
+ xen_mc_callback(do_unlock, ptl);
+ }
}
return flush;
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
- struct multicall_space mcs;
- struct mmuext_op *op;
+ unsigned level;
xen_mc_batch();
@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
- mcs = __xen_mc_entry(sizeof(*op));
- op = mcs.args;
-
#ifdef CONFIG_X86_PAE
- op->cmd = MMUEXT_PIN_L3_TABLE;
+ level = MMUEXT_PIN_L3_TABLE;
#else
- op->cmd = MMUEXT_PIN_L2_TABLE;
+ level = MMUEXT_PIN_L2_TABLE;
#endif
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_do_pin(level, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
/* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all
the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
}
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
{
unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page);
- struct multicall_space mcs = __xen_mc_entry(0);
+ spinlock_t *ptl = NULL;
+ struct multicall_space mcs;
+
+ if (level == PT_PTE) {
+ ptl = lock_pte(page);
+
+ xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+ }
+
+ mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL),
- flags);
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (ptl) {
+ /* unlock when batch completed */
+ xen_mc_callback(do_unlock, ptl);
+ }
}
return 0; /* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
/* Release a pagetables pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd)
{
- struct mmuext_op *op;
- struct multicall_space mcs;
-
xen_mc_batch();
- mcs = __xen_mc_entry(sizeof(*op));
-
- op = mcs.args;
- op->cmd = MMUEXT_UNPIN_TABLE;
- op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+ xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
pgd_walk(pgd, unpin_page, TASK_SIZE);
@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
leave_mm(smp_processor_id());
+
+ /* If this cpu still has a stale cr3 reference, then make sure
+ it has been flushed. */
+ if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+ load_cr3(swapper_pg_dir);
+ arch_flush_lazy_cpu_mode();
+ }
}
static void drop_mm_ref(struct mm_struct *mm)
{
+ cpumask_t mask;
+ unsigned cpu;
+
if (current->active_mm == mm) {
if (current->mm == mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
+ arch_flush_lazy_cpu_mode();
}
- if (!cpus_empty(mm->cpu_vm_mask))
- xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
- mm, 1);
+ /* Get the "official" set of cpus referring to our pagetable. */
+ mask = mm->cpu_vm_mask;
+
+ /* It's possible that a vcpu may have a stale reference to our
+ cr3, because its in lazy mode, and it hasn't yet flushed
+ its set of pending hypercalls yet. In this case, we can
+ look at its actual current cr3 value, and force it to flush
+ if needed. */
+ for_each_online_cpu(cpu) {
+ if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+ cpu_set(cpu, mask);
+ }
+
+ if (!cpus_empty(mask))
+ xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
}
#else
static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
/* pgd may not be pinned in the error exit path of execve */
if (PagePinned(virt_to_page(mm->pgd)))
xen_pgd_unpin(mm->pgd);
+
spin_unlock(&mm->page_table_lock);
}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463d..5e6f36f6d87 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
#include "multicalls.h"
+#define MC_DEBUG 1
+
#define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
struct mc_buffer {
struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+ struct multicall_entry debug[MC_BATCH];
+#endif
u64 args[MC_ARGS];
- unsigned mcidx, argidx;
+ struct callback {
+ void (*fn)(void *);
+ void *data;
+ } callbacks[MC_BATCH];
+ unsigned mcidx, argidx, cbidx;
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0;
unsigned long flags;
+ int i;
BUG_ON(preemptible());
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
local_irq_save(flags);
if (b->mcidx) {
- int i;
+#if MC_DEBUG
+ memcpy(b->debug, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
+
+#if MC_DEBUG
+ if (ret) {
+ printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+ ret, smp_processor_id());
+ for(i = 0; i < b->mcidx; i++) {
+ printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+ i+1, b->mcidx,
+ b->debug[i].op,
+ b->debug[i].args[0],
+ b->entries[i].result);
+ }
+ }
+#endif
+
b->mcidx = 0;
b->argidx = 0;
} else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
local_irq_restore(flags);
+ for(i = 0; i < b->cbidx; i++) {
+ struct callback *cb = &b->callbacks[i];
+
+ (*cb->fn)(cb->data);
+ }
+ b->cbidx = 0;
+
BUG_ON(ret);
}
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
return ret;
}
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+ struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct callback *cb;
+
+ if (b->cbidx == MC_BATCH)
+ xen_mc_flush();
+
+ cb = &b->callbacks[b->cbidx++];
+ cb->fn = fn;
+ cb->data = data;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156..8bae996d99a 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
- if ((xen_get_lazy_mode() & mode) == 0)
+ if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
}
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4fa33c27ccb..d53bf9d8a72 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait)
{
struct call_data_struct data;
- int cpus;
+ int cpus, cpu;
+ bool yield;
/* Holding any lock stops cpus from going down. */
spin_lock(&call_lock);
@@ -399,9 +400,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
/* Send a message to other CPUs and wait for them to respond */
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
- /* Make sure other vcpus get a chance to run.
- XXX too severe? Maybe we should check the other CPU's states? */
- HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ /* Make sure other vcpus get a chance to run if they need to. */
+ yield = false;
+ for_each_cpu_mask(cpu, mask)
+ if (xen_vcpu_stolen(cpu))
+ yield = true;
+
+ if (yield)
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
/* Wait for response */
while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead..d083ff5ef08 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
} while (get64(&state->state_entry_time) != state_time);
}
+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+ return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+}
+
static void setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07..b02a909bfd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+bool xen_vcpu_stolen(int vcpu);
-static inline unsigned xen_get_lazy_mode(void)
-{
- return x86_read_percpu(xen_lazy_mode);
-}
+void xen_mark_init_mm_pinned(void);
void __init xen_fill_possible_map(void);