From 86313c488a6848b7ec2ba04e74f25f79dd32a0b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: usermodehelper: Tidy up waiting Rather than using a tri-state integer for the wait flag in call_usermodehelper_exec, define a proper enum, and use that. I've preserved the integer values so that any callers I've missed should still work OK. Signed-off-by: Jeremy Fitzhardinge Cc: James Bottomley Cc: Randy Dunlap Cc: Christoph Hellwig Cc: Andi Kleen Cc: Paul Mackerras Cc: Johannes Berg Cc: Ralf Baechle Cc: Bjorn Helgaas Cc: Joel Becker Cc: Tony Luck Cc: Kay Sievers Cc: Srivatsa Vaddagiri Cc: Oleg Nesterov Cc: David Howells --- arch/i386/mach-voyager/voyager_thread.c | 2 +- arch/x86_64/kernel/mce.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e..f9d59533815 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d1599179..f3fb8174559 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -174,7 +174,7 @@ static void do_mce_trigger(void) if (events != atomic_read(&mce_logged) && trigger[0]) { /* Small race window, but should be harmless. */ atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); } } -- cgit v1.2.3 From 810bab448e563ffd1718d78e9a3756806b626acc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: use elfnote.h to generate vsyscall notes. Use existing elfnote.h to generate vsyscall notes, rather than doing it locally. Changes elfnote.h a bit to suit, since this is the first asm user, and it wasn't quite right. Signed-off-by: Jeremy Fitzhardinge Cc: "Eric W. Biederman" Cc: Roland McGrath Cc: Andrew Morton --- arch/i386/kernel/vsyscall-note.S | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5..52e0cbbac70 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,12 @@ * Here we can supply some information useful to userland. */ -#include #include +#include -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: - -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous - - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +ELFNOTE_END -- cgit v1.2.3 From fdb4c338c8d1d494e17c3422a3ea2129f6791596 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: paravirt: add an "mm" argument to alloc_pt It's useful to know which mm is allocating a pagetable. Xen uses this to determine whether the pagetable being added to is pinned or not. Signed-off-by: Jeremy Fitzhardinge --- arch/i386/kernel/vmi.c | 2 +- arch/i386/mm/init.c | 2 +- arch/i386/mm/pageattr.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index c12720d7cbc..234bd6ff518 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(u32 pfn) +static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7135946d366..f9b6a887854 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 2eb14a73be9..37992ffb163 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(page_to_pfn(base)); + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? prot : ref_prot)); -- cgit v1.2.3 From 6996d3b63fd9a64341bc80dad1b556fd3eb81272 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: paravirt: add a hook for once the allocator is ready Add a hook so that the paravirt backend knows when the allocator is ready. This is useful for the obvious reason that the allocator is available, but the other side-effect of having the bootmem allocator available is that each page now has an associated "struct page". Signed-off-by: Jeremy Fitzhardinge --- arch/i386/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2d61e65eeb5..74871d066c2 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ + paravirt_post_allocator_init(); + dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH -- cgit v1.2.3 From 53787013248f52af81d99f63454e5a5cf34d6f12 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: paravirt: unstatic leave_mm Make globally leave_mm visible, specifically so that Xen can use it to shoot-down lazy uses of cr3. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright --- arch/i386/kernel/smp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6299c080f6e..2d35d850202 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -22,6 +22,7 @@ #include #include +#include #include /* @@ -249,13 +250,13 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void leave_mm (unsigned long cpu) +void leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); -- cgit v1.2.3 From 724faa89ccd8fae65f3d41a47b0e1034cf07918b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: paravirt: unstatic smp_store_cpu_info Paravirt implementations need to store cpu info when bringing up cpus. Signed-off-by: Jeremy Fitzhardinge --- arch/i386/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0b2954534b8..26752931023 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __cpuinit smp_store_cpu_info(int id) +void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; -- cgit v1.2.3 From c70df74376c1e29a04e07e23dd3f4c384d6166dd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:03 -0700 Subject: paravirt: make siblingmap functions visible Paravirt implementations need to set the sibling map on new cpus. Signed-off-by: Jeremy Fitzhardinge --- arch/i386/kernel/smpboot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 26752931023..5910d3fac56 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -static inline void -set_cpu_sibling_map(int cpu) +void set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; @@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; -- cgit v1.2.3 From bdef40a6af64a0140a65df49bf504124d57094a9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: paravirt: export __supported_pte_mask __supported_pte_mask is needed when constructing pte values. Xen device drivers need to do this to make mappings of foreign pages (ie, pages granted to us by other domains). Signed-off-by: Jeremy Fitzhardinge --- arch/i386/mm/init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index f9b6a887854..6a68b1ae061 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -473,6 +473,7 @@ void zap_low_mappings (void) static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); /* * noexec = on|off -- cgit v1.2.3 From d572929cdd12a60732c3522f7cf011bfa29165cf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: paravirt: helper to disable all IO space In a virtual environment, device drivers such as legacy IDE will waste quite a lot of time probing for their devices which will never appear. This helper function allows a paravirt implementation to lay claim to the whole iomem and ioport space, thereby disabling all device drivers trying to claim IO resources. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Rusty Russell --- arch/i386/kernel/paravirt.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index faab09abca5..60e08b9b50a 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -228,6 +228,41 @@ static int __init print_banner(void) } core_initcall(print_banner); +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, -- cgit v1.2.3 From 688340ea34c61ad12473ccd837325b59aada9a93 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: Add a sched_clock paravirt_op The tsc-based get_scheduled_cycles interface is not a good match for Xen's runstate accounting, which reports everything in nanoseconds. This patch replaces this interface with a sched_clock interface, which matches both Xen and VMI's requirements. In order to do this, we: 1. replace get_scheduled_cycles with sched_clock 2. hoist cycles_2_ns into a common header 3. update vmi accordingly One thing to note: because sched_clock is implemented as a weak function in kernel/sched.c, we must define a real function in order to override this weak binding. This means the usual paravirt_ops technique of using an inline function won't work in this case. Signed-off-by: Jeremy Fitzhardinge Cc: Zachary Amsden Cc: Dan Hecht Cc: john stultz --- arch/i386/kernel/paravirt.c | 2 +- arch/i386/kernel/tsc.c | 23 +++++++++++++++-------- arch/i386/kernel/vmi.c | 2 +- arch/i386/kernel/vmiclock.c | 6 +++--- 4 files changed, 20 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 60e08b9b50a..53f07a8275e 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -302,7 +302,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index ea63a30ca3e..252f9010f28 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -84,7 +84,7 @@ static inline int check_tsc_unstable(void) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -93,15 +93,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - /* * Scheduler clock - returns current time in nanosec units. */ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long this_offset; @@ -118,12 +113,24 @@ unsigned long long sched_clock(void) return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ - get_scheduled_cycles(this_offset); + rdtscll(this_offset); /* return the value in ns */ return cycles_2_ns(this_offset); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif + unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index 234bd6ff518..72042bb7ec9 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index 26a37f8a876..f9b845f4e69 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -64,10 +64,10 @@ int vmi_set_wallclock(unsigned long now) return 0; } -/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) +/* paravirt_ops.sched_clock = vmi_sched_clock */ +unsigned long long vmi_sched_clock(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ -- cgit v1.2.3 From 24037a8b69dbf15bfed8fd42a2a2e442d7b0395b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: Add nosegneg capability to the vsyscall page notes Add the "nosegneg" fake capabilty to the vsyscall page notes. This is used by the runtime linker to select a glibc version which then disables negative-offset accesses to the thread-local segment via %gs. These accesses require emulation in Xen (because segments are truncated to protect the hypervisor address space) and avoiding them provides a measurable performance boost. Signed-off-by: Ian Pratt Signed-off-by: Christian Limpach Signed-off-by: Chris Wright Signed-off-by: Jeremy Fitzhardinge Acked-by: Zachary Amsden Cc: Roland McGrath Cc: Ulrich Drepper --- arch/i386/kernel/vsyscall-note.S | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch') diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index 52e0cbbac70..271f16a8ca0 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -12,3 +12,31 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +#ifdef CONFIG_XEN + +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + */ + +/* Bit used for the pseudo-hwcap for non-negative segments. We use + bit 1 to avoid bugs in some versions of glibc when bit 0 is + used; the choice is otherwise arbitrary. */ +#define VDSO_NOTE_NONEGSEG_BIT 1 + +ELFNOTE_START(GNU, 2, "a") + .long 1, 1< Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: xen: Core Xen implementation This patch is a rollup of all the core pieces of the Xen implementation, including: - booting and setup - pagetable setup - privileged instructions - segmentation - interrupt flags - upcalls - multicall batching BOOTING AND SETUP The vmlinux image is decorated with ELF notes which tell the Xen domain builder what the kernel's requirements are; the domain builder then constructs the address space accordingly and starts the kernel. Xen has its own entrypoint for the kernel (contained in an ELF note). The ELF notes are set up by xen-head.S, which is included into head.S. In principle it could be linked separately, but it seems to provoke lots of binutils bugs. Because the domain builder starts the kernel in a fairly sane state (32-bit protected mode, paging enabled, flat segments set up), there's not a lot of setup needed before starting the kernel proper. The main steps are: 1. Install the Xen paravirt_ops, which is simply a matter of a structure assignment. 2. Set init_mm to use the Xen-supplied pagetables (analogous to the head.S generated pagetables in a native boot). 3. Reserve address space for Xen, since it takes a chunk at the top of the address space for its own use. 4. Call start_kernel() PAGETABLE SETUP Once we hit the main kernel boot sequence, it will end up calling back via paravirt_ops to set up various pieces of Xen specific state. One of the critical things which requires a bit of extra care is the construction of the initial init_mm pagetable. Because Xen places tight constraints on pagetables (an active pagetable must always be valid, and must always be mapped read-only to the guest domain), we need to be careful when constructing the new pagetable to keep these constraints in mind. It turns out that the easiest way to do this is use the initial Xen-provided pagetable as a template, and then just insert new mappings for memory where a mapping doesn't already exist. This means that during pagetable setup, it uses a special version of xen_set_pte which ignores any attempt to remap a read-only page as read-write (since Xen will map its own initial pagetable as RO), but lets other changes to the ptes happen, so that things like NX are set properly. PRIVILEGED INSTRUCTIONS AND SEGMENTATION When the kernel runs under Xen, it runs in ring 1 rather than ring 0. This means that it is more privileged than user-mode in ring 3, but it still can't run privileged instructions directly. Non-performance critical instructions are dealt with by taking a privilege exception and trapping into the hypervisor and emulating the instruction, but more performance-critical instructions have their own specific paravirt_ops. In many cases we can avoid having to do any hypercalls for these instructions, or the Xen implementation is quite different from the normal native version. The privileged instructions fall into the broad classes of: Segmentation: setting up the GDT and the GDT entries, LDT, TLS and so on. Xen doesn't allow the GDT to be directly modified; all GDT updates are done via hypercalls where the new entries can be validated. This is important because Xen uses segment limits to prevent the guest kernel from damaging the hypervisor itself. Traps and exceptions: Xen uses a special format for trap entrypoints, so when the kernel wants to set an IDT entry, it needs to be converted to the form Xen expects. Xen sets int 0x80 up specially so that the trap goes straight from userspace into the guest kernel without going via the hypervisor. sysenter isn't supported. Kernel stack: The esp0 entry is extracted from the tss and provided to Xen. TLB operations: the various TLB calls are mapped into corresponding Xen hypercalls. Control registers: all the control registers are privileged. The most important is cr3, which points to the base of the current pagetable, and we handle it specially. Another instruction we treat specially is CPUID, even though its not privileged. We want to control what CPU features are visible to the rest of the kernel, and so CPUID ends up going into a paravirt_op. Xen implements this mainly to disable the ACPI and APIC subsystems. INTERRUPT FLAGS Xen maintains its own separate flag for masking events, which is contained within the per-cpu vcpu_info structure. Because the guest kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely ignored (and must be, because even if a guest domain disables interrupts for itself, it can't disable them overall). (A note on terminology: "events" and interrupts are effectively synonymous. However, rather than using an "enable flag", Xen uses a "mask flag", which blocks event delivery when it is non-zero.) There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which are implemented to manage the Xen event mask state. The only thing worth noting is that when events are unmasked, we need to explicitly see if there's a pending event and call into the hypervisor to make sure it gets delivered. UPCALLS Xen needs a couple of upcall (or callback) functions to be implemented by each guest. One is the event upcalls, which is how events (interrupts, effectively) are delivered to the guests. The other is the failsafe callback, which is used to report errors in either reloading a segment register, or caused by iret. These are implemented in i386/kernel/entry.S so they can jump into the normal iret_exc path when necessary. MULTICALL BATCHING Xen provides a multicall mechanism, which allows multiple hypercalls to be issued at once in order to mitigate the cost of trapping into the hypervisor. This is particularly useful for context switches, since the 4-5 hypercalls they would normally need (reload cr3, update TLS, maybe update LDT) can be reduced to one. This patch implements a generic batching mechanism for hypercalls, which gets used in many places in the Xen code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Ian Pratt Cc: Christian Limpach Cc: Adrian Bunk --- arch/i386/Makefile | 3 + arch/i386/kernel/entry.S | 71 ++++ arch/i386/kernel/head.S | 5 +- arch/i386/kernel/vmlinux.lds.S | 1 + arch/i386/xen/Makefile | 1 + arch/i386/xen/enlighten.c | 745 +++++++++++++++++++++++++++++++++++++++++ arch/i386/xen/features.c | 29 ++ arch/i386/xen/multicalls.c | 89 +++++ arch/i386/xen/multicalls.h | 45 +++ arch/i386/xen/setup.c | 97 ++++++ arch/i386/xen/xen-head.S | 36 ++ arch/i386/xen/xen-ops.h | 31 ++ 12 files changed, 1152 insertions(+), 1 deletion(-) create mode 100644 arch/i386/xen/Makefile create mode 100644 arch/i386/xen/enlighten.c create mode 100644 arch/i386/xen/features.c create mode 100644 arch/i386/xen/multicalls.c create mode 100644 arch/i386/xen/multicalls.h create mode 100644 arch/i386/xen/setup.c create mode 100644 arch/i386/xen/xen-head.S create mode 100644 arch/i386/xen/xen-ops.h (limited to 'arch') diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 181cc29a7c4..01f0ff0daaf 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3c3c220488c..ffb23654427 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1023,6 +1023,77 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 82714668d43..7c52b222207 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -510,7 +510,8 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 ENTRY(swapper_pg_pmd) @@ -538,6 +539,8 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" +#include "../xen/xen-head.S" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index aa87b06c7c8..00f1bc47d3a 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -88,6 +88,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile new file mode 100644 index 00000000000..60bc1cfb101 --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1 @@ +obj-y := enlighten.o setup.o features.o multicalls.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c new file mode 100644 index 00000000000..2d484f9320d --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,745 @@ +/* + * Core of Xen paravirt_ops implementation. + * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static void xen_vcpu_setup(int cpu) +{ + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ + (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + preempt_enable(); + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + + if (flags == 0) { + /* Unmask then check (avoid races). We're only protecting + against updates by this CPU, so there's no need for + anything stronger. */ + barrier(); + + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); + } else + preempt_enable_no_resched(); +} + +static void xen_irq_disable(void) +{ + struct vcpu_info *vcpu; + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Unmask then check (avoid races). We're only protecting + against updates by this CPU, so there's no need for + anything stronger. */ + barrier(); + + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + switch (mode) { + case PARAVIRT_LAZY_NONE: + BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_MMU: + case PARAVIRT_LAZY_CPU: + BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_FLUSH: + /* flush if necessary, but don't change state */ + if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) + xen_mc_flush(); + return; + } + + xen_mc_flush(); + x86_write_percpu(xen_lazy_mode, mode); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + unsigned long linear_addr = (unsigned long)addr; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + if (linear_addr) { + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + xmaddr_t maddr; + maddr = arbitrary_virt_to_machine((unsigned long)addr); + linear_addr = (unsigned long)maddr.maddr; + } + op->arg1.linear_addr = linear_addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long *frames; + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + int f; + struct multicall_space mcs; + + /* A GDT can be up to 64k in size, which corresponds to 8192 + 8-byte entries, or 16 4k pages.. */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + mcs = xen_mc_entry(sizeof(*frames) * pages); + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long lp = (unsigned long)&dt[entrynum]; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, + struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + + int cpu = smp_processor_id(); + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start = per_cpu(idt_desc, cpu).address; + unsigned long end = start + per_cpu(idt_desc, cpu).size + 1; + + xen_mc_flush(); + + write_dt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + int cpu = smp_processor_id(); + unsigned in, out, count; + + per_cpu(idt_desc, cpu) = *desc; + + count = (desc->size+1) / 8; + BUG_ON(count > 256); + + spin_lock(&lock); + for (in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + u32 low, u32 high) +{ + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + u64 desc = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + BUG(); + } + + } +} + +static void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +static void xen_flush_tlb(void) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_INVLPG_LOCAL; + op.arg1.linear_addr = addr & PAGE_MASK; + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +static unsigned long xen_read_cr2(void) +{ + return x86_read_percpu(xen_vcpu)->arch.cr2; +} + +static void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +static unsigned long xen_read_cr3(void) +{ + return x86_read_percpu(xen_cr3); +} + +static void xen_write_cr3(unsigned long cr3) +{ + if (cr3 == x86_read_percpu(xen_cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + x86_write_percpu(xen_cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + } +} + +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + /* XXX pfn isn't necessarily a lowmem page */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static void xen_alloc_pd(u32 pfn) +{ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static void xen_release_pd(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void xen_release_pt(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, + u32 start, u32 count) +{ + xen_alloc_pd(pfn); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + init_mm.pgd = base; + /* + * copy top-level of Xen-supplied pagetable into place. For + * !PAE we can use this as-is, but for PAE it is a stand-in + * while we copy the pmd pages. + */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + xen_alloc_pd(PFN_DOWN(__pa(pmd))); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure zero_page is mapped RO so we can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + /* + * Switch to new pagetable. This is done before + * pagetable_init has done anything so that the new pages + * added to the table can be prepared properly for Xen. + */ + xen_write_cr3(__pa(base)); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* + * Create a mapping for the shared info page. + * Should be set_fixmap(), but shared_info is a machine + * address with no corresponding pseudo-phys address. + */ +#if 0 + set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); +#endif + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + +#if 0 + xen_pgd_pin(base); +#endif + + xen_vcpu_setup(smp_processor_id()); +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + + .name = "Xen", + .banner = xen_banner, + + .patch = paravirt_patch_default, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = native_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void *)&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = NULL, /* never called */ + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = paravirt_nop, + .apic_write_atomic = paravirt_nop, + .apic_read = xen_apic_read, + .setup_boot_clock = paravirt_nop, + .setup_secondary_clock = paravirt_nop, + .startup_ipi_hook = paravirt_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .alloc_pt = xen_alloc_pt, + .alloc_pd = xen_alloc_pd, + .alloc_pd_clone = xen_alloc_pd_clone, + .release_pd = xen_release_pd, + .release_pt = xen_release_pt, + + .set_lazy_mode = xen_set_lazy_mode, +}; + +/* First C function to be called on Xen boot */ +asmlinkage void __init xen_start_kernel(void) +{ + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + /* keep using Xen gdt for now; no urgent need to change it */ + + x86_write_percpu(xen_cr3, __pa(pgd)); + xen_vcpu_setup(0); + + paravirt_ops.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + paravirt_ops.kernel_rpl = 0; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c new file mode 100644 index 00000000000..0707714e40d --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include +#include +#include +#include +#include + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j = 0; j < 32; j++) + xen_features[i * 32 + j] = !!(fi.submap & 1<, XenSource Inc, 2007 + */ +#include + +#include + +#include "multicalls.h" + +#define MC_BATCH 8 +#define MC_ARGS (MC_BATCH * 32 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); + +void xen_mc_flush(void) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + int ret = 0; + unsigned long flags; + + /* Disable interrupts in case someone comes in and queues + something in the middle */ + local_irq_save(flags); + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for (i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + put_cpu_var(mc_buffer); + local_irq_restore(flags); + + BUG_ON(ret); +} + +struct multicall_space __xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + xen_mc_flush(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + put_cpu_var(mc_buffer); + + return ret; +} diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h new file mode 100644 index 00000000000..e6f7530b156 --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,45 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +#include "xen-ops.h" + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space __xen_mc_entry(size_t args); + +DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); + +/* Call to start a batch of multiple __xen_mc_entry()s. Must be + paired with xen_mc_issue() */ +static inline void xen_mc_batch(void) +{ + /* need to disable interrupts until this entry is complete */ + local_irq_save(__get_cpu_var(xen_mc_irq_flags)); +} + +static inline struct multicall_space xen_mc_entry(size_t args) +{ + xen_mc_batch(); + return __xen_mc_entry(args); +} + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in a lazy mode */ +static inline void xen_mc_issue(unsigned mode) +{ + if ((xen_get_lazy_mode() & mode) == 0) + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ + local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); +} + +#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c new file mode 100644 index 00000000000..7da93ee612f --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,97 @@ +/* + * Machine specific setup for xen + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +static __initdata struct shared_info init_shared; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = &init_shared; + +unsigned long *phys_to_machine_mapping; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +void __init xen_arch_setup(void) +{ + struct physdev_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; +} diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S new file mode 100644 index 00000000000..2998d55a001 --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,36 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#ifdef CONFIG_XEN + +#include +#include +#include + +ENTRY(startup_xen) + movl %esi,xen_start_info + cld + movl $(init_thread_union+THREAD_SIZE),%esp + jmp xen_start_kernel + +.pushsection ".bss.page_aligned" + .align PAGE_SIZE_asm +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + +#endif /*CONFIG_XEN */ diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h new file mode 100644 index 00000000000..79648fe1ab7 --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,31 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include +#include + +DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(unsigned long, xen_cr3); + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +unsigned long xen_cpu_khz(void); +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); +cycle_t xen_clocksource_read(void); + +DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +static inline unsigned xen_get_lazy_mode(void) +{ + return x86_read_percpu(xen_lazy_mode); +} + + +#endif /* XEN_OPS_H */ -- cgit v1.2.3 From 3b827c1b3aadf3adb4c602d19863f2d24e7cbc18 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:04 -0700 Subject: xen: virtual mmu Xen pagetable handling, including the machinery to implement direct pagetables. Xen presents the real CPU's pagetables directly to guests, with no added shadowing or other layer of abstraction. Naturally this means the hypervisor must maintain close control over what the guest can put into the pagetable. When the guest modifies the pte/pmd/pgd, it must convert its domain-specific notion of a "physical" pfn into a global machine frame number (mfn) before inserting the entry into the pagetable. Xen will check to make sure the domain is allowed to create a mapping of the given mfn. Xen also requires that all mappings the guest has of its own active pagetable are read-only. This is relatively easy to implement in Linux because all pagetables share the same pte pages for kernel mappings, so updating the pte in one pagetable will implicitly update the mapping in all pagetables. Normally a pagetable becomes active when you point to it with cr3 (or the Xen equivalent), but when you do so, Xen must check the whole pagetable for correctness, which is clearly a performance problem. Xen solves this with pinning which keeps a pagetable effectively active even if its currently unused, which means that all the normal update rules are enforced. This means that it need not revalidate the pagetable when loading cr3. This patch has a first-cut implementation of pinning, but it is more fully implemented in a later patch. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright --- arch/i386/xen/Makefile | 2 +- arch/i386/xen/enlighten.c | 30 +++- arch/i386/xen/mmu.c | 420 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/xen/mmu.h | 47 ++++++ 4 files changed, 494 insertions(+), 5 deletions(-) create mode 100644 arch/i386/xen/mmu.c create mode 100644 arch/i386/xen/mmu.h (limited to 'arch') diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index 60bc1cfb101..803c1ee2b76 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1 +1 @@ -obj-y := enlighten.o setup.o features.o multicalls.o +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 2d484f9320d..c0b0aa7af14 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -39,6 +39,7 @@ #include #include "xen-ops.h" +#include "mmu.h" #include "multicalls.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -579,11 +580,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base) * Should be set_fixmap(), but shared_info is a machine * address with no corresponding pseudo-phys address. */ -#if 0 set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), PFN_DOWN(xen_start_info->shared_info), PAGE_KERNEL); -#endif HYPERVISOR_shared_info = (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -592,9 +591,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); -#if 0 xen_pgd_pin(base); -#endif xen_vcpu_setup(smp_processor_id()); } @@ -690,6 +687,31 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .release_pd = xen_release_pd, .release_pt = xen_release_pt, + .set_pte = xen_set_pte, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, +#endif /* PAE */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + .set_lazy_mode = xen_set_lazy_mode, }; diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c new file mode 100644 index 00000000000..de16cb5f55c --- /dev/null +++ b/arch/i386/xen/mmu.c @@ -0,0 +1,420 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include "mmu.h" + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr; + u.val = pte_val_ma(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} +#endif + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0) + xen_set_pte(ptep, pteval); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); /* make sure low gets written first */ + ptep->pte_high = 0; +} + +void xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#else /* !PAE */ +unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +unsigned long xen_pmd_val(pmd_t pmd) +{ + /* a BUG here is a lot easier to track down than a NULL eip */ + BUG(); + return 0; +} + +unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +pmd_t xen_make_pmd(unsigned long pmd) +{ + /* a BUG here is a lot easier to track down than a NULL eip */ + BUG(); + return __pmd(0); +} + +pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#endif /* CONFIG_X86_PAE */ + + + +static void pgd_walk_set_prot(void *pt, pgprot_t flags) +{ + unsigned long pfn = PFN_DOWN(__pa(pt)); + + if (HYPERVISOR_update_va_mapping((unsigned long)pt, + pfn_pte(pfn, flags), 0) < 0) + BUG(); +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g, u, m; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + pgd_walk_set_prot(pud, flags); + + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + pgd_walk_set_prot(pmd, flags); + + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + + /* This can get called before mem_map + is set up, so we assume nothing is + highmem at that point. */ + if (mem_map == NULL || + !PageHighMem(pmd_page(*pmd))) { + pte = pte_offset_kernel(pmd, 0); + pgd_walk_set_prot(pte, flags); + } + } + } + } + + if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(PFN_DOWN(__pa(pgd_base)), + flags), + UVMF_TLB_FLUSH) < 0) + BUG(); +} + + +/* This is called just after a mm has been duplicated from its parent, + but it has not been used yet. We need to make sure that its + pagetable is all read-only, and can be pinned. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct mmuext_op op; + + pgd_walk(pgd, PAGE_KERNEL_RO); + +#if defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +/* Release a pagetables pages back as normal RW */ +void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); + + pgd_walk(pgd, PAGE_KERNEL); +} + + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + xen_pgd_pin(next->pgd); +} + +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + xen_pgd_pin(mm->pgd); +} + +void xen_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + xen_pgd_unpin(mm->pgd); +} diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h new file mode 100644 index 00000000000..764eaaae0a2 --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,47 @@ +#ifndef _XEN_MMU_H + +#include +#include + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void xen_set_pte(pte_t *ptep, pte_t pteval); +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +void xen_exit_mmap(struct mm_struct *mm); + +void xen_pgd_pin(pgd_t *pgd); +void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +unsigned long long xen_pte_val(pte_t); +unsigned long long xen_pmd_val(pmd_t); +unsigned long long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long long); +pmd_t xen_make_pmd(unsigned long long); +pgd_t xen_make_pgd(unsigned long long); + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pud(pud_t *ptr, pud_t val); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); + + +#else +unsigned long xen_pte_val(pte_t); +unsigned long xen_pmd_val(pmd_t); +unsigned long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long); +pmd_t xen_make_pmd(unsigned long); +pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ -- cgit v1.2.3 From e46cdb66c8fc1c8d61cfae0f219ff47ac4b9d531 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: event channels Xen implements interrupts in terms of event channels. Each guest domain gets 1024 event channels which can be used for a variety of purposes, such as Xen timer events, inter-domain events, inter-processor events (IPI) or for real hardware IRQs. Within the kernel, we map the event channels to IRQs, and implement the whole interrupt handling using a Xen irq_chip. Rather than setting NR_IRQ to 1024 under PARAVIRT in order to accomodate Xen, we create a dynamic mapping between event channels and IRQs. Ideally, Linux will eventually move towards dynamically allocating per-irq structures, and we can use a 1:1 mapping between event channels and irqs. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Ingo Molnar Cc: Eric W. Biederman --- arch/i386/xen/Makefile | 3 +- arch/i386/xen/enlighten.c | 1 + arch/i386/xen/events.c | 511 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 514 insertions(+), 1 deletion(-) create mode 100644 arch/i386/xen/events.c (limited to 'arch') diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index 803c1ee2b76..7a78f27bfb1 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1 +1,2 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ + events.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index c0b0aa7af14..6417dfdccb4 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -607,6 +607,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, .cpuid = xen_cpuid, diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c new file mode 100644 index 00000000000..e7c5d00ab4f --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,511 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is recieved, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. Hardware interrupts. Not supported at present. + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +struct packed_irq +{ + unsigned short evtchn; + unsigned char index; + unsigned char type; +}; + +static struct packed_irq irq_info[NR_IRQS]; + +/* Binding types. */ +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN }; + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return (struct packed_irq) { evtchn, index, type }; +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq].evtchn; +} + +static inline unsigned int index_from_irq(int irq) +{ + return irq_info[irq].index; +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq].type; +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + irq_desc[irq].affinity = cpumask_of_cpu(cpu); +#endif + + __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ +#ifdef CONFIG_SMP + int i; + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].affinity = cpumask_of_cpu(0); +#endif + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} + +static void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +static int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, + const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for + * handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = get_cpu(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } + + put_cpu(); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. + */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} -- cgit v1.2.3 From 15c84731d647c34d1491793fa6be96f5de3432eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: time implementation Xen maintains a base clock which measures nanoseconds since system boot. This is provided to guests via a shared page which contains a base time in ns, a tsc timestamp at that point and tsc frequency parameters. Guests can compute the current time by reading the tsc and using it to extrapolate the current time from the basetime. The hypervisor makes sure that the frequency parameters are updated regularly, paricularly if the tsc changes rate or stops. This is implemented as a clocksource, so the interface to the rest of the kernel is a simple clocksource which simply returns the current time directly in nanoseconds. Xen also provides a simple timer mechanism, which allows a timeout to be set in the future. When that time arrives, a timer event is sent to the guest. There are two timer interfaces: - An old one which also delivers a stream of (unused) ticks at 100Hz, and on the same event, the actual timer events. The 100Hz ticks cause a lot of spurious wakeups, but are basically harmless. - The new timer interface doesn't have the 100Hz ticks, and can also fail if the specified time is in the past. This code presents the Xen timer as a clockevent driver, and uses the new interface by preference. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Ingo Molnar Cc: Thomas Gleixner --- arch/i386/xen/Makefile | 2 +- arch/i386/xen/enlighten.c | 6 + arch/i386/xen/time.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 arch/i386/xen/time.c (limited to 'arch') diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index 7a78f27bfb1..bf51cabed0d 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1,2 +1,2 @@ obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o + events.o time.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 6417dfdccb4..25eb3592f11 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -609,6 +609,12 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .arch_setup = xen_arch_setup, .init_IRQ = xen_init_IRQ, + .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + .get_cpu_khz = xen_cpu_khz, + .sched_clock = xen_clocksource_read, + .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c new file mode 100644 index 00000000000..b457980ff3c --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,407 @@ +/* + * Xen time implementation. + * + * This is implemented in terms of a clocksource driver which uses + * the hypervisor clock as a nanosecond timebase, and a clockevent + * driver which uses the hypervisor's timer mechanism. + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Xen may fire a timer up to this many ns early */ +#define TIMER_SLOP 100000 + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +unsigned long xen_cpu_khz(void) +{ + u64 cpu_khz = 1000000ULL << 32; + const struct vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz <<= -info->tsc_shift; + else + cpu_khz >>= info->tsc_shift; + + return cpu_khz; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static void get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + preempt_disable(); + + /* src is shared memory with the hypervisor, so we need to + make sure we get a consistent snapshot, even in the face of + being preempted. */ + src = &__get_cpu_var(xen_vcpu)->time; + dst = &__get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + preempt_enable(); +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + + get_time_values_from_xen(); + + ret = shadow->system_timestamp + get_nsec_offset(shadow); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static struct clocksource xen_clocksource __read_mostly = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<mode != CLOCK_EVT_MODE_ONESHOT); + + if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) + BUG(); + + /* We may have missed the deadline, but there's no real way of + knowing for sure. If the event was in the past, then we'll + get an immediate interrupt. */ + + return 0; +} + +static const struct clock_event_device xen_timerop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_timerop_set_mode, + .set_next_event = xen_timerop_set_next_event, +}; + + + +static void xen_vcpuop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + } +} + +static int xen_vcpuop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + struct vcpu_set_singleshot_timer single; + int ret; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + single.timeout_abs_ns = get_abs_timeout(delta); + single.flags = VCPU_SSHOTTMR_future; + + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + + BUG_ON(ret != 0 && ret != -ETIME); + + return ret; +} + +static const struct clock_event_device xen_vcpuop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_vcpuop_set_mode, + .set_next_event = xen_vcpuop_set_next_event, +}; + +static const struct clock_event_device *xen_clockevent = + &xen_timerop_clockevent; +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + irqreturn_t ret; + + ret = IRQ_NONE; + if (evt->event_handler) { + evt->event_handler(evt); + ret = IRQ_HANDLED; + } + + return ret; +} + +static void xen_setup_timer(int cpu) +{ + const char *name; + struct clock_event_device *evt; + int irq; + + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); + + name = kasprintf(GFP_KERNEL, "timer%d", cpu); + if (!name) + name = ""; + + irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, NULL); + + evt = &get_cpu_var(xen_clock_events); + memcpy(evt, xen_clockevent, sizeof(*evt)); + + evt->cpumask = cpumask_of_cpu(cpu); + evt->irq = irq; + clockevents_register_device(evt); + + put_cpu_var(xen_clock_events); +} + +__init void xen_time_init(void) +{ + int cpu = smp_processor_id(); + + get_time_values_from_xen(); + + clocksource_register(&xen_clocksource); + + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + /* Successfully turned off 100hz tick, so we have the + vcpuop-based timer interface */ + printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); + xen_clockevent = &xen_vcpuop_clockevent; + } + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; + + xen_setup_timer(cpu); +} -- cgit v1.2.3 From e738fca8d7dffec30eeee231c38f128eed56c8c8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: configuration Put config options for Xen after the core pieces are in place. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright --- arch/i386/Kconfig | 2 ++ arch/i386/xen/Kconfig | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100644 arch/i386/xen/Kconfig (limited to 'arch') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c7c9c2a15fa..7a11b905ef4 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -222,6 +222,8 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +source "arch/i386/xen/Kconfig" + config VMI bool "VMI Paravirt-ops support" depends on PARAVIRT diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig new file mode 100644 index 00000000000..7c5550058c1 --- /dev/null +++ b/arch/i386/xen/Kconfig @@ -0,0 +1,11 @@ +# +# This Kconfig describes xen options +# + +config XEN + bool "Enable support for Xen hypervisor" + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES) + help + This is the Linux Xen port. Enabling this will allow the + kernel to boot in a paravirtualized environment under the + Xen hypervisor. -- cgit v1.2.3 From f4f97b3ea90130520afb478cbc2918be2b6587b8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: Complete pagetable pinning Xen requires all active pagetables to be marked read-only. When the base of the pagetable is loaded into %cr3, the hypervisor validates the entire pagetable and only allows the load to proceed if it all checks out. This is pretty slow, so to mitigate this cost Xen has a notion of pinned pagetables. Pinned pagetables are pagetables which are considered to be active even if no processor's cr3 is pointing to is. This means that it must remain read-only and all updates are validated by the hypervisor. This makes context switches much cheaper, because the hypervisor doesn't need to revalidate the pagetable each time. This also adds a new paravirt hook which is called during setup once the zones and memory allocator have been initialized. When the init_mm pagetable is first built, the struct page array does not yet exist, and so there's nowhere to put he init_mm pagetable's PG_pinned flags. Once the zones are initialized and the struct page array exists, we can set the PG_pinned flags for those pages. This patch also adds the Xen support for pte pages allocated out of highmem (highpte) by implementing xen_kmap_atomic_pte. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Zach Amsden --- arch/i386/xen/enlighten.c | 87 ++++++++++++---- arch/i386/xen/mmu.c | 260 ++++++++++++++++++++++++++++++---------------- arch/i386/xen/mmu.h | 2 +- arch/i386/xen/xen-ops.h | 2 + 4 files changed, 242 insertions(+), 109 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 25eb3592f11..86e68e68011 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -21,6 +21,9 @@ #include #include #include +#include +#include +#include #include #include @@ -500,32 +503,59 @@ static void xen_write_cr3(unsigned long cr3) } } -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) { - /* XXX pfn isn't necessarily a lowmem page */ + BUG_ON(mem_map); /* should only be used early */ make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -static void xen_alloc_pd(u32 pfn) +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} + struct page *page = pfn_to_page(pfn); -static void xen_release_pd(u32 pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + if (!PageHighMem(page)) + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + else + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } } +/* This should never happen until we're OK to use struct page */ static void xen_release_pt(u32 pfn) { - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } } -static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, - u32 start, u32 count) +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { - xen_alloc_pd(pfn); + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -553,7 +583,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), PAGE_SIZE); - xen_alloc_pd(PFN_DOWN(__pa(pmd))); + make_lowmem_page_readonly(pmd); set_pgd(&base[i], __pgd(1 + __pa(pmd))); } else @@ -574,6 +604,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base) static __init void xen_pagetable_setup_done(pgd_t *base) { + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + paravirt_ops.alloc_pt = xen_alloc_pt; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* * Create a mapping for the shared info page. @@ -591,7 +625,19 @@ static __init void xen_pagetable_setup_done(pgd_t *base) HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); - xen_pgd_pin(base); + /* Actually pin the pagetable down, but we can't set PG_pinned + yet because the page structures don't exist yet. */ + { + struct mmuext_op op; +#ifdef CONFIG_X86_PAE + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L3_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); + } xen_vcpu_setup(smp_processor_id()); } @@ -608,6 +654,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, .init_IRQ = xen_init_IRQ, + .post_allocator_init = xen_mark_init_mm_pinned, .time_init = xen_time_init, .set_wallclock = xen_set_wallclock, @@ -688,11 +735,15 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .pagetable_setup_start = xen_pagetable_setup_start, .pagetable_setup_done = xen_pagetable_setup_done, - .alloc_pt = xen_alloc_pt, - .alloc_pd = xen_alloc_pd, - .alloc_pd_clone = xen_alloc_pd_clone, - .release_pd = xen_release_pd, + .alloc_pt = xen_alloc_pt_init, .release_pt = xen_release_pt, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pd = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif .set_pte = xen_set_pte, .set_pte_at = xen_set_pte_at, diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index de16cb5f55c..53501ce2d15 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c @@ -38,19 +38,22 @@ * * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include #include #include +#include #include -#include +#include #include #include +#include "multicalls.h" #include "mmu.h" xmaddr_t arbitrary_virt_to_machine(unsigned long address) @@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr) } -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - struct mmu_update u; - - u.ptr = virt_to_machine(ptep).maddr; - u.val = pte_val_ma(pte); - if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) - BUG(); -} - void xen_set_pmd(pmd_t *ptr, pmd_t val) { struct mmu_update u; @@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) BUG(); } -#ifdef CONFIG_X86_PAE -void xen_set_pud(pud_t *ptr, pud_t val) -{ - struct mmu_update u; - - u.ptr = virt_to_machine(ptr).maddr; - u.val = pud_val_ma(val); - if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) - BUG(); -} -#endif - /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. @@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, } #ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { set_64bit((u64 *)ptep, pte_val_ma(pte)); @@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd) return (pgd_t){ pgd }; } #else /* !PAE */ +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + unsigned long xen_pte_val(pte_t pte) { unsigned long ret = pte.pte_low; @@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte) return ret; } -unsigned long xen_pmd_val(pmd_t pmd) -{ - /* a BUG here is a lot easier to track down than a NULL eip */ - BUG(); - return 0; -} - unsigned long xen_pgd_val(pgd_t pgd) { unsigned long ret = pgd.pgd; @@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte) return (pte_t){ pte }; } -pmd_t xen_make_pmd(unsigned long pmd) -{ - /* a BUG here is a lot easier to track down than a NULL eip */ - BUG(); - return __pmd(0); -} - pgd_t xen_make_pgd(unsigned long pgd) { if (pgd & _PAGE_PRESENT) @@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd) -static void pgd_walk_set_prot(void *pt, pgprot_t flags) -{ - unsigned long pfn = PFN_DOWN(__pa(pt)); - - if (HYPERVISOR_update_va_mapping((unsigned long)pt, - pfn_pte(pfn, flags), 0) < 0) - BUG(); -} - -static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +/* + (Yet another) pagetable walker. This one is intended for pinning a + pagetable. This means that it walks a pagetable and calls the + callback function on each page it finds making up the page table, + at every level. It walks the entire pagetable, but it only bothers + pinning pte pages which are below pte_limit. In the normal case + this will be TASK_SIZE, but at boot we need to pin up to + FIXADDR_TOP. But the important bit is that we don't pin beyond + there, because then we start getting into Xen's ptes. +*/ +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), + unsigned long limit) { pgd_t *pgd = pgd_base; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int g, u, m; + int flush = 0; + unsigned long addr = 0; + unsigned long pgd_next; + + BUG_ON(limit > FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) - return; + return 0; + + for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + pud_t *pud; + unsigned long pud_limit, pud_next; - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { - if (pgd_none(*pgd)) + pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + + if (!pgd_val(*pgd)) continue; + pud = pud_offset(pgd, 0); if (PTRS_PER_PUD > 1) /* not folded */ - pgd_walk_set_prot(pud, flags); + flush |= (*func)(virt_to_page(pud), 0); + + for (; addr != pud_limit; pud++, addr = pud_next) { + pmd_t *pmd; + unsigned long pmd_limit; + + pud_next = pud_addr_end(addr, pud_limit); + + if (pud_next < limit) + pmd_limit = pud_next; + else + pmd_limit = limit; - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { if (pud_none(*pud)) continue; + pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ - pgd_walk_set_prot(pmd, flags); + flush |= (*func)(virt_to_page(pmd), 0); + + for (; addr != pmd_limit; pmd++) { + addr += (PAGE_SIZE * PTRS_PER_PTE); + if ((pmd_limit-1) < (addr-1)) { + addr = pmd_limit; + break; + } - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { if (pmd_none(*pmd)) continue; - /* This can get called before mem_map - is set up, so we assume nothing is - highmem at that point. */ - if (mem_map == NULL || - !PageHighMem(pmd_page(*pmd))) { - pte = pte_offset_kernel(pmd, 0); - pgd_walk_set_prot(pte, flags); - } + flush |= (*func)(pmd_page(*pmd), 0); } } } - if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, - pfn_pte(PFN_DOWN(__pa(pgd_base)), - flags), - UVMF_TLB_FLUSH) < 0) - BUG(); + flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + + return flush; } +static int pin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + flush = 0; + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + flags); + } + + return flush; +} -/* This is called just after a mm has been duplicated from its parent, - but it has not been used yet. We need to make sure that its - pagetable is all read-only, and can be pinned. */ +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - struct mmuext_op op; + struct multicall_space mcs; + struct mmuext_op *op; - pgd_walk(pgd, PAGE_KERNEL_RO); + xen_mc_batch(); -#if defined(CONFIG_X86_PAE) - op.cmd = MMUEXT_PIN_L3_TABLE; + if (pgd_walk(pgd, pin_page, TASK_SIZE)) + kmap_flush_unused(); + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + +#ifdef CONFIG_X86_PAE + op->cmd = MMUEXT_PIN_L3_TABLE; #else - op.cmd = MMUEXT_PIN_L2_TABLE; + op->cmd = MMUEXT_PIN_L2_TABLE; #endif - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) - BUG(); + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(0); } -/* Release a pagetables pages back as normal RW */ -void xen_pgd_unpin(pgd_t *pgd) +/* The init_mm pagetable is really pinned as soon as its created, but + that's before we have page structures to store the bits. So do all + the book-keeping now. */ +static __init int mark_pinned(struct page *page, unsigned flags) { - struct mmuext_op op; + SetPagePinned(page); + return 0; +} - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); +void __init xen_mark_init_mm_pinned(void) +{ + pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); +} - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) - BUG(); +static int unpin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); - pgd_walk(pgd, PAGE_KERNEL); + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + flags); + } + + return 0; /* never need to flush on unpin */ } +/* Release a pagetables pages back as normal RW */ +static void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + xen_mc_batch(); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_UNPIN_TABLE; + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + pgd_walk(pgd, unpin_page, TASK_SIZE); + + xen_mc_issue(0); +} void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { + spin_lock(&next->page_table_lock); xen_pgd_pin(next->pgd); + spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { + spin_lock(&mm->page_table_lock); xen_pgd_pin(mm->pgd); + spin_unlock(&mm->page_table_lock); } void xen_exit_mmap(struct mm_struct *mm) diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h index 764eaaae0a2..49776fe9f02 100644 --- a/arch/i386/xen/mmu.h +++ b/arch/i386/xen/mmu.h @@ -15,7 +15,7 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); -void xen_pgd_unpin(pgd_t *pgd); +//void xen_pgd_unpin(pgd_t *pgd); #ifdef CONFIG_X86_PAE unsigned long long xen_pte_val(pte_t); diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 79648fe1ab7..54d98b52085 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -20,6 +20,8 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); cycle_t xen_clocksource_read(void); +void xen_mark_init_mm_pinned(void); + DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); static inline unsigned xen_get_lazy_mode(void) -- cgit v1.2.3 From 9a4029fd3409eb224eb62c32d9792071382694ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: ignore RW mapping of RO pages in pagetable_init When setting up the initial pagetable, which includes mappings of all low physical memory, ignore a mapping which tries to set the RW bit on an RO pte. An RO pte indicates a page which is part of the current pagetable, and so it cannot be allowed to become RW. Once xen_pagetable_setup_done is called, set_pte reverts to its normal behaviour. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright Cc: ebiederm@xmission.com (Eric W. Biederman) --- arch/i386/xen/enlighten.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 86e68e68011..9550ae3b1fb 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -505,7 +505,7 @@ static void xen_write_cr3(unsigned long cr3) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) { BUG_ON(mem_map); /* should only be used early */ make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); @@ -557,10 +557,32 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} + static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + /* special set_pte for pagetable initialization */ + paravirt_ops.set_pte = xen_set_pte_init; + init_mm.pgd = base; /* * copy top-level of Xen-supplied pagetable into place. For @@ -607,6 +629,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* This will work as long as patching hasn't happened yet (which it hasn't) */ paravirt_ops.alloc_pt = xen_alloc_pt; + paravirt_ops.set_pte = xen_set_pte; if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* @@ -745,7 +768,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif - .set_pte = xen_set_pte, + .set_pte = NULL, /* see xen_pagetable_setup_* */ .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd, -- cgit v1.2.3 From f91a8b447b9af64f589f6e13fec7f09b5927563d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: Account for stolen time This patch accounts for the time stolen from our VCPUs. Stolen time is time where a vcpu is runnable and could be running, but all available physical CPUs are being used for something else. This accounting gets run on each timer interrupt, just as a way to get it run relatively often, and when interesting things are going on. Stolen time is not really used by much in the kernel; it is reported in /proc/stats, and that's about it. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright Cc: john stultz Cc: Rik van Riel --- arch/i386/xen/time.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 150 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index b457980ff3c..acbfd996946 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 +#define NS_PER_TICK (1000000000LL / HZ) /* These are perodically updated in shared_info, and then copied here. */ struct shadow_time_info { @@ -37,6 +39,139 @@ struct shadow_time_info { static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* snapshots of runstate info */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); + +/* unused ns of stolen and blocked time */ +static DEFINE_PER_CPU(u64, residual_stolen); +static DEFINE_PER_CPU(u64, residual_blocked); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + preempt_disable(); + + state = &__get_cpu_var(runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. + */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); + + preempt_enable(); +} + +static void setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + +static void do_stolen_accounting(void) +{ + struct vcpu_runstate_info state; + struct vcpu_runstate_info *snap; + s64 blocked, runnable, offline, stolen; + cputime_t ticks; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + snap = &__get_cpu_var(runstate_snapshot); + + /* work out how much time the VCPU has not been runn*ing* */ + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; + + *snap = state; + + /* Add the appropriate number of ticks of stolen time, + including any left-overs from last time. Passing NULL to + account_steal_time accounts the time as stolen. */ + stolen = runnable + offline + __get_cpu_var(residual_stolen); + + if (stolen < 0) + stolen = 0; + + ticks = 0; + while (stolen >= NS_PER_TICK) { + ticks++; + stolen -= NS_PER_TICK; + } + __get_cpu_var(residual_stolen) = stolen; + account_steal_time(NULL, ticks); + + /* Add the appropriate number of ticks of blocked time, + including any left-overs from last time. Passing idle to + account_steal_time accounts the time as idle/wait. */ + blocked += __get_cpu_var(residual_blocked); + + if (blocked < 0) + blocked = 0; + + ticks = 0; + while (blocked >= NS_PER_TICK) { + ticks++; + blocked -= NS_PER_TICK; + } + __get_cpu_var(residual_blocked) = blocked; + account_steal_time(idle_task(smp_processor_id()), ticks); +} + + + +/* Get the CPU speed from Xen */ unsigned long xen_cpu_khz(void) { u64 cpu_khz = 1000000ULL << 32; @@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void) * Reads a consistent set of time-base values from Xen, into a shadow data * area. */ -static void get_time_values_from_xen(void) +static unsigned get_time_values_from_xen(void) { struct vcpu_time_info *src; struct shadow_time_info *dst; - preempt_disable(); - /* src is shared memory with the hypervisor, so we need to make sure we get a consistent snapshot, even in the face of being preempted. */ @@ -79,7 +212,7 @@ static void get_time_values_from_xen(void) rmb(); /* test version after fetching data */ } while ((src->version & 1) | (dst->version ^ src->version)); - preempt_enable(); + return dst->version; } /* @@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 get_nsec_offset(struct shadow_time_info *shadow) { u64 now, delta; - rdtscll(now); + now = native_read_tsc(); delta = now - shadow->tsc_timestamp; return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); } @@ -132,10 +265,14 @@ cycle_t xen_clocksource_read(void) { struct shadow_time_info *shadow = &get_cpu_var(shadow_time); cycle_t ret; + unsigned version; - get_time_values_from_xen(); - - ret = shadow->system_timestamp + get_nsec_offset(shadow); + do { + version = get_time_values_from_xen(); + barrier(); + ret = shadow->system_timestamp + get_nsec_offset(shadow); + barrier(); + } while (version != __get_cpu_var(xen_vcpu)->time.version); put_cpu_var(shadow_time); @@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } + do_stolen_accounting(); + return ret; } @@ -378,6 +517,8 @@ static void xen_setup_timer(int cpu) evt->irq = irq; clockevents_register_device(evt); + setup_runstate_info(cpu); + put_cpu_var(xen_clock_events); } @@ -390,7 +531,7 @@ __init void xen_time_init(void) clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { - /* Successfully turned off 100hz tick, so we have the + /* Successfully turned off 100Hz tick, so we have the vcpuop-based timer interface */ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); xen_clockevent = &xen_vcpuop_clockevent; -- cgit v1.2.3 From ab55028886dd1dd54585f22bf19a00eb23869340 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:05 -0700 Subject: xen: Implement sched_clock Implement xen_sched_clock, which returns the number of ns the current vcpu has been actually in an unstolen state (ie, running or blocked, vs runnable-but-not-running, or offline) since boot. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright Cc: john stultz --- arch/i386/xen/enlighten.c | 2 +- arch/i386/xen/time.c | 27 ++++++++++++++++++++++++++- arch/i386/xen/xen-ops.h | 3 +-- 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 9550ae3b1fb..a9ba834295a 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -683,7 +683,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_cpu_khz = xen_cpu_khz, - .sched_clock = xen_clocksource_read, + .sched_clock = xen_sched_clock, .cpuid = xen_cpuid, diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index acbfd996946..2aab44bec2a 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -28,6 +28,8 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) +static cycle_t xen_clocksource_read(void); + /* These are perodically updated in shared_info, and then copied here. */ struct shadow_time_info { u64 tsc_timestamp; /* TSC at last update of time vals. */ @@ -169,6 +171,29 @@ static void do_stolen_accounting(void) account_steal_time(idle_task(smp_processor_id()), ticks); } +/* + * Xen sched_clock implementation. Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. + */ +unsigned long long xen_sched_clock(void) +{ + struct vcpu_runstate_info state; + cycle_t now = xen_clocksource_read(); + s64 offset; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + offset = now - state.state_entry_time; + if (offset < 0) + offset = 0; + + return state.time[RUNSTATE_blocked] + + state.time[RUNSTATE_running] + + offset; +} /* Get the CPU speed from Xen */ @@ -261,7 +286,7 @@ static u64 get_nsec_offset(struct shadow_time_info *shadow) return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); } -cycle_t xen_clocksource_read(void) +static cycle_t xen_clocksource_read(void) { struct shadow_time_info *shadow = &get_cpu_var(shadow_time); cycle_t ret; diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 54d98b52085..7667abd390e 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -2,7 +2,6 @@ #define XEN_OPS_H #include -#include DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -18,7 +17,7 @@ unsigned long xen_cpu_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); -cycle_t xen_clocksource_read(void); +unsigned long long xen_sched_clock(void); void xen_mark_init_mm_pinned(void); -- cgit v1.2.3 From f87e4cac4f4e940b328d3deb5b53e642e3881f43 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:06 -0700 Subject: xen: SMP guest support This is a fairly straightforward Xen implementation of smp_ops. Xen has its own IPI mechanisms, and has no dependency on any APIC-based IPI. The smp_ops hooks and the flush_tlb_others pv_op allow a Xen guest to avoid all APIC code in arch/i386 (the only apic operation is a single apic_read for the apic version number). One subtle point which needs to be addressed is unpinning pagetables when another cpu may have a lazy tlb reference to the pagetable. Xen will not allow an in-use pagetable to be unpinned, so we must find any other cpus with a reference to the pagetable and get them to shoot down their references. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Cc: Benjamin LaHaise Cc: Ingo Molnar Cc: Andi Kleen --- arch/i386/xen/Kconfig | 2 +- arch/i386/xen/Makefile | 2 + arch/i386/xen/enlighten.c | 115 ++++++++++--- arch/i386/xen/events.c | 80 ++++++++- arch/i386/xen/mmu.c | 69 ++++++-- arch/i386/xen/mmu.h | 13 ++ arch/i386/xen/setup.c | 5 + arch/i386/xen/smp.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/xen/time.c | 13 +- arch/i386/xen/xen-ops.h | 25 +++ 10 files changed, 682 insertions(+), 49 deletions(-) create mode 100644 arch/i386/xen/smp.c (limited to 'arch') diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig index 7c5550058c1..b7697ff2236 100644 --- a/arch/i386/xen/Kconfig +++ b/arch/i386/xen/Kconfig @@ -4,7 +4,7 @@ config XEN bool "Enable support for Xen hypervisor" - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || SMP || NEED_MULTIPLE_NODES) + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index bf51cabed0d..fd05f243a3f 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1,2 +1,4 @@ obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ events.o time.o + +obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index a9ba834295a..de62d66e089 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -56,7 +58,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); -static void xen_vcpu_setup(int cpu) +void xen_vcpu_setup(int cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } @@ -347,23 +349,14 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, } } -/* Load a new IDT into Xen. In principle this can be per-CPU, so we - hold a spinlock to protect the static traps[] array (static because - it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct Xgt_desc_struct *desc) +static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, + struct trap_info *traps) { - static DEFINE_SPINLOCK(lock); - static struct trap_info traps[257]; - - int cpu = smp_processor_id(); unsigned in, out, count; - per_cpu(idt_desc, cpu) = *desc; - count = (desc->size+1) / 8; BUG_ON(count > 256); - spin_lock(&lock); for (in = out = 0; in < count; in++) { const u32 *entry = (u32 *)(desc->address + in * 8); @@ -371,6 +364,31 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc) out++; } traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc); + + xen_convert_trap_info(desc, traps); + + put_cpu_var(idt_desc); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + int cpu = smp_processor_id(); + + per_cpu(idt_desc, cpu) = *desc; + + spin_lock(&lock); + + xen_convert_trap_info(desc, traps); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -428,6 +446,12 @@ static unsigned long xen_apic_read(unsigned long reg) { return 0; } + +static void xen_apic_write(unsigned long reg, unsigned long val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} #endif static void xen_flush_tlb(void) @@ -449,6 +473,40 @@ static void xen_flush_tlb_single(unsigned long addr) BUG(); } +static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va) +{ + struct mmuext_op op; + cpumask_t cpumask = *cpus; + + /* + * A couple of (to be removed) sanity checks: + * + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + + if (va == TLB_FLUSH_ALL) { + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + op.arg2.vcpumask = (void *)cpus; + } else { + op.cmd = MMUEXT_INVLPG_MULTI; + op.arg1.linear_addr = va; + op.arg2.vcpumask = (void *)cpus; + } + + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + static unsigned long xen_read_cr2(void) { return x86_read_percpu(xen_vcpu)->arch.cr2; @@ -460,18 +518,6 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4 & ~X86_CR4_TSD); } -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - static unsigned long xen_read_cr3(void) { return x86_read_percpu(xen_cr3); @@ -740,8 +786,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .io_delay = xen_io_delay, #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = paravirt_nop, - .apic_write_atomic = paravirt_nop, + .apic_write = xen_apic_write, + .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, @@ -751,6 +797,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, @@ -796,6 +843,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .set_lazy_mode = xen_set_lazy_mode, }; +#ifdef CONFIG_SMP +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + .smp_call_function_mask = xen_smp_call_function_mask, +}; +#endif /* CONFIG_SMP */ + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -808,6 +868,9 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ paravirt_ops = xen_paravirt_ops; +#ifdef CONFIG_SMP + smp_ops = xen_smp_ops; +#endif xen_setup_features(); diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c index e7c5d00ab4f..4103b8bf22f 100644 --- a/arch/i386/xen/events.c +++ b/arch/i386/xen/events.c @@ -47,6 +47,9 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; + /* Packed IRQ information: binding type, sub-type index, and event channel. */ struct packed_irq { @@ -58,7 +61,13 @@ struct packed_irq static struct packed_irq irq_info[NR_IRQS]; /* Binding types. */ -enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN }; +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; /* Convenient shorthand for packed representation of an unbound IRQ. */ #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) @@ -261,6 +270,45 @@ static int bind_evtchn_to_irq(unsigned int evtchn) return irq; } +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; + if (irq == -1) { + irq = find_unbound_irq(); + if (irq < 0) + goto out; + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "ipi"); + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + + static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; @@ -369,6 +417,28 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, } EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + void unbind_from_irqhandler(unsigned int irq, void *dev_id) { free_irq(irq, dev_id); @@ -376,6 +446,14 @@ void unbind_from_irqhandler(unsigned int irq, void *dev_id) } EXPORT_SYMBOL_GPL(unbind_from_irqhandler); +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index 53501ce2d15..bc49ef84620 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c @@ -391,8 +391,12 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) + if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + /* re-enable interrupts for kmap_flush_unused */ + xen_mc_issue(0); kmap_flush_unused(); + xen_mc_batch(); + } mcs = __xen_mc_entry(sizeof(*op)); op = mcs.args; @@ -474,27 +478,58 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -void xen_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. - */ - if (tsk->active_mm == mm) { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; - switch_mm(mm, &init_mm, tsk); + if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + leave_mm(smp_processor_id()); +} - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); } - task_unlock(tsk); + if (!cpus_empty(mm->cpu_vm_mask)) + xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, + mm, 1); +} +#else +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + drop_mm_ref(mm); + put_cpu(); xen_pgd_unpin(mm->pgd); } diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h index 49776fe9f02..c9ff27f3ac3 100644 --- a/arch/i386/xen/mmu.h +++ b/arch/i386/xen/mmu.h @@ -3,6 +3,19 @@ #include #include +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + + void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); void xen_set_pte(pte_t *ptep, pte_t pteval); diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c index 7da93ee612f..18a994d5a4c 100644 --- a/arch/i386/xen/setup.c +++ b/arch/i386/xen/setup.c @@ -94,4 +94,9 @@ void __init xen_arch_setup(void) COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); pm_idle = xen_idle; + +#ifdef CONFIG_SMP + /* fill cpus_possible with all available cpus */ + xen_fill_possible_map(); +#endif } diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c new file mode 100644 index 00000000000..a91587fbf5c --- /dev/null +++ b/arch/i386/xen/smp.c @@ -0,0 +1,407 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + * + * This does not handle HOTPLUG_CPU yet. + */ +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include "xen-ops.h" +#include "mmu.h" + +static cpumask_t cpu_initialized_map; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); + +static struct call_data_struct *call_data; + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) +{ + return IRQ_HANDLED; +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + int cpu = smp_processor_id(); + + cpu_init(); + + preempt_disable(); + per_cpu(cpu_state, cpu) = CPU_ONLINE; + + xen_setup_cpu_clockevents(); + + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + + wmb(); /* make sure everything is out */ + cpu_idle(); +} + +static int xen_smp_intr_init(unsigned int cpu) +{ + int rc; + const char *resched_name, *callfunc_name; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + resched_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(resched_irq, cpu) = rc; + + callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, + cpu, + xen_call_function_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = rc; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + return rc; +} + +void __init xen_fill_possible_map(void) +{ + int i, rc; + + for (i = 0; i < NR_CPUS; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_set(i, cpu_possible_map); + } +} + +void __init xen_smp_prepare_boot_cpu(void) +{ + int cpu; + + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + xen_vcpu_setup(0); + + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(&per_cpu__gdt_page); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } +} + +void __init xen_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + smp_store_cpu_info(0); + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) + BUG(); + + cpu_initialized_map = cpumask_of_cpu(0); + + /* Restrict the possible_map according to max_cpus. */ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + continue; + cpu_clear(cpu, cpu_possible_map); + } + + for_each_possible_cpu (cpu) { + struct task_struct *idle; + + if (cpu == 0) + continue; + + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + + cpu_set(cpu, cpu_present_map); + } + + //init_xenbus_allowed_cpumask(); +} + +static __cpuinit int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = 0; + ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt->gdt); + + ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); + ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.esp0; + + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +int __cpuinit xen_cpu_up(unsigned int cpu) +{ + struct task_struct *idle = idle_task(cpu); + int rc; + +#if 0 + rc = cpu_up_check(cpu); + if (rc) + return rc; +#endif + + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + xen_vcpu_setup(cpu); + irq_ctx_init(cpu); + xen_setup_timer(cpu); + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + if (num_online_cpus() == 1) + alternatives_smp_switch(1); + + rc = xen_smp_intr_init(cpu); + if (rc) + return rc; + + smp_store_cpu_info(cpu); + set_cpu_sibling_map(cpu); + /* This must be done before setting cpu_online_map */ + wmb(); + + cpu_set(cpu, cpu_online_map); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + BUG_ON(rc); + + return 0; +} + +void xen_smp_cpus_done(unsigned int max_cpus) +{ +} + +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + BUG(); +} + +void xen_smp_send_stop(void) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + xen_smp_call_function_mask(mask, stop_self, NULL, 0); +} + +void xen_smp_send_reschedule(int cpu) +{ + xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); +} + + +static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +{ + unsigned cpu; + + cpus_and(mask, mask, cpu_online_map); + + for_each_cpu_mask(cpu, mask) + xen_send_IPI_one(cpu, vector); +} + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + + if (wait) { + mb(); /* commit everything before setting finished */ + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait) +{ + struct call_data_struct data; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + cpu_clear(smp_processor_id(), mask); + + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* write everything before IPI */ + + /* Send a message to other CPUs and wait for them to respond */ + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run. + XXX too severe? Maybe we should check the other CPU's states? */ + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus || + (wait && atomic_read(&data.finished) != cpus)) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; +} diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index 2aab44bec2a..aeb04cf5dbf 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -519,7 +519,7 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) return ret; } -static void xen_setup_timer(int cpu) +void xen_setup_timer(int cpu) { const char *name; struct clock_event_device *evt; @@ -535,16 +535,20 @@ static void xen_setup_timer(int cpu) IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, name, NULL); - evt = &get_cpu_var(xen_clock_events); + evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); evt->cpumask = cpumask_of_cpu(cpu); evt->irq = irq; - clockevents_register_device(evt); setup_runstate_info(cpu); +} + +void xen_setup_cpu_clockevents(void) +{ + BUG_ON(preemptible()); - put_cpu_var(xen_clock_events); + clockevents_register_device(&__get_cpu_var(xen_clock_events)); } __init void xen_time_init(void) @@ -570,4 +574,5 @@ __init void xen_time_init(void) tsc_disable = 0; xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); } diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 7667abd390e..4069be8ba31 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -3,6 +3,12 @@ #include +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +void xen_copy_trap_info(struct trap_info *traps); + DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -13,6 +19,8 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); +void xen_setup_timer(int cpu); +void xen_setup_cpu_clockevents(void); unsigned long xen_cpu_khz(void); void __init xen_time_init(void); unsigned long xen_get_wallclock(void); @@ -28,5 +36,22 @@ static inline unsigned xen_get_lazy_mode(void) return x86_read_percpu(xen_lazy_mode); } +void __init xen_fill_possible_map(void); + +void xen_vcpu_setup(int cpu); +void xen_smp_prepare_boot_cpu(void); +void xen_smp_prepare_cpus(unsigned int max_cpus); +int xen_cpu_up(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait); +int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait); + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); #endif /* XEN_OPS_H */ -- cgit v1.2.3 From f120f13ea0dbb0b0d6675683d5f6faea71277e65 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:06 -0700 Subject: xen: Add support for preemption Add Xen support for preemption. This is mostly a cleanup of existing preempt_enable/disable calls, or just comments to explain the current usage. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright --- arch/i386/xen/Kconfig | 2 +- arch/i386/xen/enlighten.c | 80 ++++++++++++++++++++++++++++------------------ arch/i386/xen/mmu.c | 3 ++ arch/i386/xen/multicalls.c | 11 ++++--- arch/i386/xen/time.c | 22 ++++++++++--- 5 files changed, 76 insertions(+), 42 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig index b7697ff2236..9df99e1885a 100644 --- a/arch/i386/xen/Kconfig +++ b/arch/i386/xen/Kconfig @@ -4,7 +4,7 @@ config XEN bool "Enable support for Xen hypervisor" - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES) + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index de62d66e089..a1124b7f1d1 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -108,11 +109,10 @@ static unsigned long xen_save_fl(void) struct vcpu_info *vcpu; unsigned long flags; - preempt_disable(); vcpu = x86_read_percpu(xen_vcpu); + /* flag has opposite sense of mask */ flags = !vcpu->evtchn_upcall_mask; - preempt_enable(); /* convert to IF type flag -0 -> 0x00000000 @@ -125,32 +125,35 @@ static void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; - preempt_disable(); - /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); vcpu = x86_read_percpu(xen_vcpu); vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); - if (flags == 0) { - /* Unmask then check (avoid races). We're only protecting - against updates by this CPU, so there's no need for - anything stronger. */ - barrier(); + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) force_evtchn_callback(); - preempt_enable(); - } else - preempt_enable_no_resched(); + } } static void xen_irq_disable(void) { - struct vcpu_info *vcpu; + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 1; + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } @@ -158,18 +161,20 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ preempt_disable(); vcpu = x86_read_percpu(xen_vcpu); vcpu->evtchn_upcall_mask = 0; + preempt_enable_no_resched(); - /* Unmask then check (avoid races). We're only protecting - against updates by this CPU, so there's no need for - anything stronger. */ - barrier(); + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) force_evtchn_callback(); - preempt_enable(); } static void xen_safe_halt(void) @@ -189,6 +194,8 @@ static void xen_halt(void) static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) { + BUG_ON(preemptible()); + switch (mode) { case PARAVIRT_LAZY_NONE: BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); @@ -293,9 +300,13 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, xmaddr_t mach_lp = virt_to_machine(lp); u64 entry = (u64)high << 32 | low; + preempt_disable(); + xen_mc_flush(); if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) BUG(); + + preempt_enable(); } static int cvt_gate_to_trap(int vector, u32 low, u32 high, @@ -328,11 +339,13 @@ static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, u32 low, u32 high) { - - int cpu = smp_processor_id(); unsigned long p = (unsigned long)&dt[entrynum]; - unsigned long start = per_cpu(idt_desc, cpu).address; - unsigned long end = start + per_cpu(idt_desc, cpu).size + 1; + unsigned long start, end; + + preempt_disable(); + + start = __get_cpu_var(idt_desc).address; + end = start + __get_cpu_var(idt_desc).size + 1; xen_mc_flush(); @@ -347,6 +360,8 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, if (HYPERVISOR_set_trap_table(info)) BUG(); } + + preempt_enable(); } static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, @@ -368,11 +383,9 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, void xen_copy_trap_info(struct trap_info *traps) { - const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc); + const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); xen_convert_trap_info(desc, traps); - - put_cpu_var(idt_desc); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -382,12 +395,11 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; - int cpu = smp_processor_id(); - - per_cpu(idt_desc, cpu) = *desc; spin_lock(&lock); + __get_cpu_var(idt_desc) = *desc; + xen_convert_trap_info(desc, traps); xen_mc_flush(); @@ -402,6 +414,8 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc) static void xen_write_gdt_entry(struct desc_struct *dt, int entry, u32 low, u32 high) { + preempt_disable(); + switch ((high >> 8) & 0xff) { case DESCTYPE_LDT: case DESCTYPE_TSS: @@ -418,10 +432,12 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } } + + preempt_enable(); } static void xen_load_esp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); @@ -525,6 +541,8 @@ static unsigned long xen_read_cr3(void) static void xen_write_cr3(unsigned long cr3) { + BUG_ON(preemptible()); + if (cr3 == x86_read_percpu(xen_cr3)) { /* just a simple tlb flush */ xen_flush_tlb(); diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index bc49ef84620..f431cf14e64 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c @@ -38,6 +38,7 @@ * * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include @@ -531,5 +532,7 @@ void xen_exit_mmap(struct mm_struct *mm) drop_mm_ref(mm); put_cpu(); + spin_lock(&mm->page_table_lock); xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); } diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c index 869f9833f08..d4015a9ed46 100644 --- a/arch/i386/xen/multicalls.c +++ b/arch/i386/xen/multicalls.c @@ -20,6 +20,7 @@ * Jeremy Fitzhardinge , XenSource Inc, 2007 */ #include +#include #include @@ -39,10 +40,12 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); void xen_mc_flush(void) { - struct mc_buffer *b = &get_cpu_var(mc_buffer); + struct mc_buffer *b = &__get_cpu_var(mc_buffer); int ret = 0; unsigned long flags; + BUG_ON(preemptible()); + /* Disable interrupts in case someone comes in and queues something in the middle */ local_irq_save(flags); @@ -60,7 +63,6 @@ void xen_mc_flush(void) } else BUG_ON(b->argidx != 0); - put_cpu_var(mc_buffer); local_irq_restore(flags); BUG_ON(ret); @@ -68,10 +70,11 @@ void xen_mc_flush(void) struct multicall_space __xen_mc_entry(size_t args) { - struct mc_buffer *b = &get_cpu_var(mc_buffer); + struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct multicall_space ret; unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + BUG_ON(preemptible()); BUG_ON(argspace > MC_ARGS); if (b->mcidx == MC_BATCH || @@ -83,7 +86,5 @@ struct multicall_space __xen_mc_entry(size_t args) ret.args = &b->args[b->argidx]; b->argidx += argspace; - put_cpu_var(mc_buffer); - return ret; } diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index aeb04cf5dbf..51fdabf1fd4 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -88,7 +88,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) u64 state_time; struct vcpu_runstate_info *state; - preempt_disable(); + BUG_ON(preemptible()); state = &__get_cpu_var(runstate); @@ -103,8 +103,6 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) *res = *state; barrier(); } while (get64(&state->state_entry_time) != state_time); - - preempt_enable(); } static void setup_runstate_info(int cpu) @@ -179,9 +177,19 @@ static void do_stolen_accounting(void) unsigned long long xen_sched_clock(void) { struct vcpu_runstate_info state; - cycle_t now = xen_clocksource_read(); + cycle_t now; + u64 ret; s64 offset; + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + now = xen_clocksource_read(); + get_runstate_snapshot(&state); WARN_ON(state.state != RUNSTATE_running); @@ -190,9 +198,13 @@ unsigned long long xen_sched_clock(void) if (offset < 0) offset = 0; - return state.time[RUNSTATE_blocked] + + ret = state.time[RUNSTATE_blocked] + state.time[RUNSTATE_running] + offset; + + preempt_enable(); + + return ret; } -- cgit v1.2.3 From d66bf8fcf3fce058a1cd164a7c8ee6093fdf039c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:06 -0700 Subject: xen: lazy-mmu operations This patch uses the lazy-mmu hooks to batch mmu operations where possible. This is primarily useful for batching operations applied to active pagetables, which happens during mprotect, munmap, mremap and the like (mmap does not do bulk pagetable operations, so it isn't helped). Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright --- arch/i386/xen/enlighten.c | 48 +++++++++++++++++++++++++++--------------- arch/i386/xen/mmu.c | 52 ++++++++++++++++++++++++++++++++++------------ arch/i386/xen/multicalls.c | 4 ++-- 3 files changed, 72 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index a1124b7f1d1..031dc1dcf81 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -472,28 +472,38 @@ static void xen_apic_write(unsigned long reg, unsigned long val) static void xen_flush_tlb(void) { - struct mmuext_op op; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } static void xen_flush_tlb_single(unsigned long addr) { - struct mmuext_op op; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - op.cmd = MMUEXT_INVLPG_LOCAL; - op.arg1.linear_addr = addr & PAGE_MASK; - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, unsigned long va) { - struct mmuext_op op; + struct { + struct mmuext_op op; + cpumask_t mask; + } *args; cpumask_t cpumask = *cpus; + struct multicall_space mcs; /* * A couple of (to be removed) sanity checks: @@ -510,17 +520,21 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, if (cpus_empty(cpumask)) return; + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->mask = cpumask; + args->op.arg2.vcpumask = &args->mask; + if (va == TLB_FLUSH_ALL) { - op.cmd = MMUEXT_TLB_FLUSH_MULTI; - op.arg2.vcpumask = (void *)cpus; + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; } else { - op.cmd = MMUEXT_INVLPG_MULTI; - op.arg1.linear_addr = va; - op.arg2.vcpumask = (void *)cpus; + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; } - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } static unsigned long xen_read_cr2(void) diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index f431cf14e64..4ae038aa6c2 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c @@ -98,12 +98,20 @@ void make_lowmem_page_readwrite(void *vaddr) void xen_set_pmd(pmd_t *ptr, pmd_t val) { - struct mmu_update u; + struct multicall_space mcs; + struct mmu_update *u; - u.ptr = virt_to_machine(ptr).maddr; - u.val = pmd_val_ma(val); - if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) - BUG(); + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pmd_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } /* @@ -146,20 +154,38 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - if ((mm != current->mm && mm != &init_mm) || - HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0) - xen_set_pte(ptep, pteval); + if (mm == current->mm || mm == &init_mm) { + if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + struct multicall_space mcs; + mcs = xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + return; + } else + if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) + return; + } + xen_set_pte(ptep, pteval); } #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) { - struct mmu_update u; + struct multicall_space mcs; + struct mmu_update *u; - u.ptr = virt_to_machine(ptr).maddr; - u.val = pud_val_ma(val); - if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) - BUG(); + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pud_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } void xen_set_pte(pte_t *ptep, pte_t pte) diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c index d4015a9ed46..c837e8e463d 100644 --- a/arch/i386/xen/multicalls.c +++ b/arch/i386/xen/multicalls.c @@ -26,8 +26,8 @@ #include "multicalls.h" -#define MC_BATCH 8 -#define MC_ARGS (MC_BATCH * 32 / sizeof(u64)) +#define MC_BATCH 32 +#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; -- cgit v1.2.3 From 8b84ad942b534f8faeb34b68f0f7277ea375fed0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:06 -0700 Subject: xen: hack to prevent bad segment register reload The hypervisor saves and restores the segment registers as part of the state is saves while context switching. If, during a context switch, the next process doesn't use the TLS segments, it invalidates the GDT entry, causing the segment register reload to fault. This fault effectively doubles the cost of a context switch. This patch is a band-aid workaround which clears the usermode %gs after it has been saved for the previous process, but before it gets reloaded for the next, and it avoids having the hypervisor attempt to erroneously reload it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright --- arch/i386/xen/enlighten.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 031dc1dcf81..42756771b8e 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -291,6 +291,18 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 2); xen_mc_issue(PARAVIRT_LAZY_CPU); + + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone, + * it means we're in a context switch, and %gs has just been + * saved. This means we can zero it out to prevent faults on + * exit from the hypervisor if the next process has no %gs. + * Either way, it has been saved, and the new value will get + * loaded properly. This will go away as soon as Xen has been + * modified to not save/restore %gs for normal hypercalls. + */ + if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + loadsegment(gs, 0); } static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, -- cgit v1.2.3 From b536b4b9623084d86f2b1f19cb44a2d6d74f00bf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:06 -0700 Subject: xen: use the hvc console infrastructure for Xen console Implement a Xen back-end for hvc console. * * * Add early printk support via hvc console, enable using "earlyprintk=xen" on the kernel command line. From: Gerd Hoffmann Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Acked-by: Ingo Molnar Acked-by: Olof Johansson --- arch/i386/xen/events.c | 3 ++- arch/x86_64/kernel/early_printk.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c index 4103b8bf22f..8904acc20f8 100644 --- a/arch/i386/xen/events.c +++ b/arch/i386/xen/events.c @@ -244,7 +244,7 @@ static int find_unbound_irq(void) return irq; } -static int bind_evtchn_to_irq(unsigned int evtchn) +int bind_evtchn_to_irq(unsigned int evtchn) { int irq; @@ -269,6 +269,7 @@ static int bind_evtchn_to_irq(unsigned int evtchn) return irq; } +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 296d2b0c5d8..fd9aff3f389 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -6,6 +6,7 @@ #include #include #include +#include /* Simple VGA output */ @@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_HVC_XEN + } else if (!strncmp(buf, "xen", 3)) { + early_console = &xenboot_console; +#endif } if (keep_early) -- cgit v1.2.3 From fefa629abebe328cf6d07f99fe5796dbfc3e4981 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: machine operations Make the appropriate hypercalls to halt and reboot the virtual machine. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright --- arch/i386/xen/enlighten.c | 43 +++++++++++++++++++++++++++++++++++++++++++ arch/i386/xen/smp.c | 4 +--- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 42756771b8e..142e7489134 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,7 @@ #include #include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -900,6 +902,45 @@ static const struct smp_ops xen_smp_ops __initdata = { }; #endif /* CONFIG_SMP */ +static void xen_reboot(int reason) +{ +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + BUG(); +} + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_emergency_restart(void) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) +{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_halt, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -912,6 +953,8 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ paravirt_ops = xen_paravirt_ops; + machine_ops = xen_machine_ops; + #ifdef CONFIG_SMP smp_ops = xen_smp_ops; #endif diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c index a91587fbf5c..a620918f87e 100644 --- a/arch/i386/xen/smp.c +++ b/arch/i386/xen/smp.c @@ -311,9 +311,7 @@ static void stop_self(void *v) void xen_smp_send_stop(void) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - xen_smp_call_function_mask(mask, stop_self, NULL, 0); + smp_call_function(stop_self, NULL, 0, 0); } void xen_smp_send_reschedule(int cpu) -- cgit v1.2.3 From 3e2b8fbeec8f005672f2a2e862fb9c26a0bafedc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: handle external requests for shutdown, reboot and sysrq The guest domain can be asked to shutdown or reboot itself, or have a sysrq key injected, via xenbus. This patch adds a watcher for those events, and does the appropriate action. Signed-off-by: Jeremy Fitzhardinge Cc: Chris Wright --- arch/i386/xen/Makefile | 2 +- arch/i386/xen/manage.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 arch/i386/xen/manage.c (limited to 'arch') diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index fd05f243a3f..7bf2ce399a2 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o + events.o time.o manage.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c new file mode 100644 index 00000000000..aa7af9e6abc --- /dev/null +++ b/arch/i386/xen/manage.c @@ -0,0 +1,143 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ +#include +#include +#include +#include + +#include + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + * report a crash, not be instructed to crash! + * HALT is the same as POWEROFF, as far as we're concerned. The tools use + * the distinction when we return the reason code to them. + */ +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. */ +static int shutting_down = SHUTDOWN_INVALID; + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0 || + strcmp(str, "halt") == 0) + orderly_poweroff(false); + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else { + printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); -- cgit v1.2.3 From 60223a326fc8fa6e90e2c3fd28ae6de4a311d731 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: Place vcpu_info structure into per-cpu memory An experimental patch for Xen allows guests to place their vcpu_info structs anywhere. We try to use this to place the vcpu_info into the PDA, which allows direct access. If this works, then switch to using direct access operations for irq_enable, disable, save_fl and restore_fl. Signed-off-by: Jeremy Fitzhardinge Cc: Chris Wright Cc: Keir Fraser --- arch/i386/xen/enlighten.c | 152 ++++++++++++++++++++++++++++++++++++++++++++-- arch/i386/xen/setup.c | 8 --- arch/i386/xen/smp.c | 5 +- arch/i386/xen/xen-ops.h | 2 +- 4 files changed, 151 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 142e7489134..e33fa0990ed 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -61,9 +61,63 @@ DEFINE_PER_CPU(unsigned long, xen_cr3); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); -void xen_vcpu_setup(int cpu) +static /* __initdata */ struct shared_info dummy_shared_info; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; + +/* + * Flag to determine whether vcpu info placement is available on all + * VCPUs. We assume it is to start with, and then set it to zero on + * the first failure. This is because it can succeed on some VCPUs + * and not others, since it can involve hypervisor memory allocation, + * or because the guest failed to guarantee all the appropriate + * constraints on all VCPUs (ie buffer can't cross a page boundary). + * + * Note that any particular CPU may be using a placed vcpu structure, + * but we can only optimise if the all are. + * + * 0: not available, 1: available + */ +static int have_vcpu_info_placement = 1; + +static void __init xen_vcpu_setup(int cpu) { + struct vcpu_register_vcpu_info info; + int err; + struct vcpu_info *vcpup; + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + + if (!have_vcpu_info_placement) + return; /* already tested, not available */ + + vcpup = &per_cpu(xen_vcpu_info, cpu); + + info.mfn = virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + cpu, vcpup, info.mfn, info.offset); + + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. */ + per_cpu(xen_vcpu, cpu) = vcpup; + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", + cpu, vcpup); + } } static void __init xen_banner(void) @@ -123,6 +177,20 @@ static unsigned long xen_save_fl(void) return (-flags) & X86_EFLAGS_IF; } +static unsigned long xen_save_fl_direct(void) +{ + unsigned long flags; + + /* flag has opposite sense of mask */ + flags = !x86_read_percpu(xen_vcpu_info.evtchn_upcall_mask); + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + static void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; @@ -149,6 +217,25 @@ static void xen_restore_fl(unsigned long flags) } } +static void xen_restore_fl_direct(unsigned long flags) +{ + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* This is an atomic update, so no need to worry about + preemption. */ + x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, flags); + + /* If we get preempted here, then any pending event will be + handled anyway. */ + + if (flags == 0) { + barrier(); /* unmask then check (avoid races) */ + if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending))) + force_evtchn_callback(); + } +} + static void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to @@ -159,6 +246,12 @@ static void xen_irq_disable(void) preempt_enable_no_resched(); } +static void xen_irq_disable_direct(void) +{ + /* Atomic update, so preemption not a concern. */ + x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 1); +} + static void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -179,6 +272,19 @@ static void xen_irq_enable(void) force_evtchn_callback(); } +static void xen_irq_enable_direct(void) +{ + /* Atomic update, so preemption not a concern. */ + x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 0); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending))) + force_evtchn_callback(); +} + static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ @@ -551,11 +657,21 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, xen_mc_issue(PARAVIRT_LAZY_MMU); } +static void xen_write_cr2(unsigned long cr2) +{ + x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; +} + static unsigned long xen_read_cr2(void) { return x86_read_percpu(xen_vcpu)->arch.cr2; } +static unsigned long xen_read_cr2_direct(void) +{ + return x86_read_percpu(xen_vcpu_info.arch.cr2); +} + static void xen_write_cr4(unsigned long cr4) { /* never allow TSC to be disabled */ @@ -753,8 +869,27 @@ static __init void xen_pagetable_setup_done(pgd_t *base) if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) BUG(); } +} - xen_vcpu_setup(smp_processor_id()); +/* This is called once we have the cpu_possible_map */ +void __init xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + xen_vcpu_setup(cpu); + + /* xen_vcpu_setup managed to place the vcpu_info within the + percpu area for all cpus, so make use of it */ + if (have_vcpu_info_placement) { + printk(KERN_INFO "Xen: using vcpu_info placement\n"); + + paravirt_ops.save_fl = xen_save_fl_direct; + paravirt_ops.restore_fl = xen_restore_fl_direct; + paravirt_ops.irq_disable = xen_irq_disable_direct; + paravirt_ops.irq_enable = xen_irq_enable_direct; + paravirt_ops.read_cr2 = xen_read_cr2_direct; + } } static const struct paravirt_ops xen_paravirt_ops __initdata = { @@ -788,7 +923,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .write_cr0 = native_write_cr0, .read_cr2 = xen_read_cr2, - .write_cr2 = native_write_cr2, + .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, .write_cr3 = xen_write_cr3, @@ -974,7 +1109,16 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); - xen_vcpu_setup(0); + +#ifdef CONFIG_SMP + /* Don't do the full vcpu_info placement stuff until we have a + possible map. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +#else + /* May as well do it now, since there's no good time to call + it later on UP. */ + xen_setup_vcpu_info_placement(); +#endif paravirt_ops.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c index 18a994d5a4c..3f8684eba62 100644 --- a/arch/i386/xen/setup.c +++ b/arch/i386/xen/setup.c @@ -24,14 +24,6 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; -static __initdata struct shared_info init_shared; - -/* - * Point at some empty memory to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -struct shared_info *HYPERVISOR_shared_info = &init_shared; - unsigned long *phys_to_machine_mapping; EXPORT_SYMBOL(phys_to_machine_mapping); diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c index a620918f87e..557b8e24706 100644 --- a/arch/i386/xen/smp.c +++ b/arch/i386/xen/smp.c @@ -142,8 +142,6 @@ void __init xen_smp_prepare_boot_cpu(void) BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); - xen_vcpu_setup(0); - /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ make_lowmem_page_readwrite(&per_cpu__gdt_page); @@ -152,6 +150,8 @@ void __init xen_smp_prepare_boot_cpu(void) cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); } + + xen_setup_vcpu_info_placement(); } void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -262,7 +262,6 @@ int __cpuinit xen_cpu_up(unsigned int cpu) init_gdt(cpu); per_cpu(current_task, cpu) = idle; - xen_vcpu_setup(cpu); irq_ctx_init(cpu); xen_setup_timer(cpu); diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 4069be8ba31..5b56f7fecd1 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -38,7 +38,7 @@ static inline unsigned xen_get_lazy_mode(void) void __init xen_fill_possible_map(void); -void xen_vcpu_setup(int cpu); +void __init xen_setup_vcpu_info_placement(void); void xen_smp_prepare_boot_cpu(void); void xen_smp_prepare_cpus(unsigned int max_cpus); int xen_cpu_up(unsigned int cpu); -- cgit v1.2.3 From 6487673b8a858f99a5348e1078b3f5aec700f9e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: Attempt to patch inline versions of common operations This patchs adds the mechanism to allow us to patch inline versions of common operations. The implementations of the direct-access versions save_fl, restore_fl, irq_enable and irq_disable are now in assembler, and the same code is used for both out of line and inline uses. Signed-off-by: Jeremy Fitzhardinge Cc: Chris Wright Cc: Keir Fraser --- arch/i386/kernel/asm-offsets.c | 8 +++ arch/i386/xen/Makefile | 2 +- arch/i386/xen/enlighten.c | 107 +++++++++++++++++++------------------- arch/i386/xen/xen-asm.S | 114 +++++++++++++++++++++++++++++++++++++++++ arch/i386/xen/xen-ops.h | 13 +++++ 5 files changed, 190 insertions(+), 54 deletions(-) create mode 100644 arch/i386/xen/xen-asm.S (limited to 'arch') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 27a776c9044..a7c2947b396 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -17,6 +17,8 @@ #include #include +#include + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -115,4 +117,10 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif } diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile index 7bf2ce399a2..343df246bd3 100644 --- a/arch/i386/xen/Makefile +++ b/arch/i386/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o manage.o + events.o time.o manage.o xen-asm.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index e33fa0990ed..4fa62a4cb7c 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -115,6 +115,7 @@ static void __init xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. */ per_cpu(xen_vcpu, cpu) = vcpup; + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", cpu, vcpup); } @@ -177,20 +178,6 @@ static unsigned long xen_save_fl(void) return (-flags) & X86_EFLAGS_IF; } -static unsigned long xen_save_fl_direct(void) -{ - unsigned long flags; - - /* flag has opposite sense of mask */ - flags = !x86_read_percpu(xen_vcpu_info.evtchn_upcall_mask); - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - static void xen_restore_fl(unsigned long flags) { struct vcpu_info *vcpu; @@ -217,25 +204,6 @@ static void xen_restore_fl(unsigned long flags) } } -static void xen_restore_fl_direct(unsigned long flags) -{ - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* This is an atomic update, so no need to worry about - preemption. */ - x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, flags); - - /* If we get preempted here, then any pending event will be - handled anyway. */ - - if (flags == 0) { - barrier(); /* unmask then check (avoid races) */ - if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending))) - force_evtchn_callback(); - } -} - static void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to @@ -246,12 +214,6 @@ static void xen_irq_disable(void) preempt_enable_no_resched(); } -static void xen_irq_disable_direct(void) -{ - /* Atomic update, so preemption not a concern. */ - x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 1); -} - static void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -272,19 +234,6 @@ static void xen_irq_enable(void) force_evtchn_callback(); } -static void xen_irq_enable_direct(void) -{ - /* Atomic update, so preemption not a concern. */ - x86_write_percpu(xen_vcpu_info.evtchn_upcall_mask, 0); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(x86_read_percpu(xen_vcpu_info.evtchn_upcall_pending))) - force_evtchn_callback(); -} - static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ @@ -892,6 +841,57 @@ void __init xen_setup_vcpu_info_placement(void) } } +static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(x) \ + case PARAVIRT_PATCH(x): \ + if (have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(irq_enable); + SITE(irq_disable); + SITE(save_fl); + SITE(restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insns, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insns + reloc_off); + long delta = start - (char *)insns; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insns, len); + break; + } + + return ret; +} + static const struct paravirt_ops xen_paravirt_ops __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -899,7 +899,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .name = "Xen", .banner = xen_banner, - .patch = paravirt_patch_default, + .patch = xen_patch, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, @@ -1076,6 +1076,7 @@ static const struct machine_ops __initdata xen_machine_ops = { .emergency_restart = xen_emergency_restart, }; + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 00000000000..dc4d36d51bc --- /dev/null +++ b/arch/i386/xen/xen-asm.S @@ -0,0 +1,114 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include +#include +#include +#include +#include +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Clear mask and test pending */ + andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + jz 1f +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz %al + movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for pending but unmasked */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 5b56f7fecd1..33e4c8a1628 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -54,4 +54,17 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); + +/* Declare an asm function, along with symbols needed to make it + inlineable */ +#define DECL_ASM(ret, name, ...) \ + ret name(__VA_ARGS__); \ + extern char name##_end[]; \ + extern char name##_reloc[] \ + +DECL_ASM(void, xen_irq_enable_direct, void); +DECL_ASM(void, xen_irq_disable_direct, void); +DECL_ASM(unsigned long, xen_save_fl_direct, void); +DECL_ASM(void, xen_restore_fl_direct, unsigned long); + #endif /* XEN_OPS_H */ -- cgit v1.2.3 From 600b2fc242992e552e0b4e24c8c1f084b341f39b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: suppress abs symbol warnings for unused reloc pointers arch/i386/xen/xen-asm.S defines some small pieces of code which are used to implement a few paravirt_ops. They're designed so they can be used either in-place, or be inline patched into their callsites if there's enough space. Some of those operations need to make calls out (specifically, if you re-enable events [interrupts], and there's a pending event at that time). These calls need the call instruction to be relocated if the code is patched inline. In this case xen_foo_reloc is a section-relative symbol which points to xen_foo's required relocation. Other operations have no need of a relocation, and so their corresponding xen_bar_reloc is absolute 0. These are the cases which are triggering the warning. This patch adds those symbols to the list of safe abs symbols. Signed-off-by: Jeremy Fitzhardinge Cc: Adrian Bunk --- arch/i386/boot/compressed/relocs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index ce4fda261aa..b0e21c3cee5 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -31,6 +31,8 @@ static const char* safe_abs_relocs[] = { "__kernel_rt_sigreturn", "__kernel_sigreturn", "SYSENTER_RETURN", + "xen_irq_disable_direct_reloc", + "xen_save_fl_direct_reloc", }; static int is_safe_abs_reloc(const char* sym_name) -- cgit v1.2.3 From 9ec2b804e099e8a326369e6cccab10dee1d172ee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: use iret directly when possible Most of the time we can simply use the iret instruction to exit the kernel, rather than having to use the iret hypercall - the only exception is if we're returning into vm86 mode, or from delivering an NMI (which we don't support yet). When running native, iret has the behaviour of testing for a pending interrupt atomically with re-enabling interrupts. Unfortunately there's no way to do this with Xen, so there's a window in which we could get a recursive exception after enabling events but before actually returning to userspace. This causes a problem: if the nested interrupt causes one of the task's TIF_WORK_MASK flags to be set, they will not be checked again before returning to userspace. This means that pending work may be left pending indefinitely, until the process enters and leaves the kernel again. The net effect is that a pending signal or reschedule event could be delayed for an unbounded amount of time. To deal with this, the xen event upcall handler checks to see if the EIP is within the critical section of the iret code, after events are (potentially) enabled up to the iret itself. If its within this range, it calls the iret critical section fixup, which adjusts the stack to deal with any unrestored registers, and then shifts the stack frame up to replace the previous invocation. Signed-off-by: Jeremy Fitzhardinge --- arch/i386/kernel/asm-offsets.c | 1 + arch/i386/kernel/entry.S | 16 +++- arch/i386/xen/enlighten.c | 1 + arch/i386/xen/xen-asm.S | 185 ++++++++++++++++++++++++++++++++++++++++- arch/i386/xen/xen-ops.h | 1 + 5 files changed, 199 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index a7c2947b396..25f7eb51392 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -61,6 +61,7 @@ void foo(void) OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); BLANK(); OFFSET(GDS_size, Xgt_desc_struct, size); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index ffb23654427..32980b83493 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1030,7 +1030,21 @@ ENTRY(xen_hypervisor_callback) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL TRACE_IRQS_OFF - mov %esp, %eax + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr CFI_ENDPROC diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 4fa62a4cb7c..9a8c1181c00 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c @@ -838,6 +838,7 @@ void __init xen_setup_vcpu_info_placement(void) paravirt_ops.irq_disable = xen_irq_disable_direct; paravirt_ops.irq_enable = xen_irq_enable_direct; paravirt_ops.read_cr2 = xen_read_cr2_direct; + paravirt_ops.iret = xen_iret_direct; } } diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S index dc4d36d51bc..1a43b60c0c6 100644 --- a/arch/i386/xen/xen-asm.S +++ b/arch/i386/xen/xen-asm.S @@ -12,15 +12,21 @@ */ #include + #include #include #include -#include #include +#include + +#include #define RELOC(x, v) .globl x##_reloc; x##_reloc=v #define ENDPATCH(x) .globl x##_end; x##_end=. +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + /* Enable events. This clears the event mask and tests the pending event status with one and operation. If there are pending @@ -81,13 +87,12 @@ ENDPATCH(xen_save_fl_direct) */ ENTRY(xen_restore_fl_direct) testb $X86_EFLAGS_IF>>8, %ah - setz %al - movb %al, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask /* Preempt here doesn't matter because that will deal with any pending interrupts. The pending check may end up being run on the wrong CPU, but that doesn't hurt. */ - /* check for pending but unmasked */ + /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending jz 1f 2: call check_events @@ -97,6 +102,178 @@ ENDPATCH(xen_restore_fl_direct) ENDPROC(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + + Non-direct iret could be done in the same way, but it would + require an annoying amount of code duplication. We'll assume + that direct mode will be the common case once the hypervisor + support becomes commonplace. + */ +ENTRY(xen_iret_direct) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + lea per_cpu__xen_vcpu_info(%eax),%eax +#else + movl $per_cpu__xen_vcpu_info, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + + iret +xen_iret_end_crit: + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx } + ---------------- + return addr <- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* offsets +4 for return address */ + + /* + Paranoia: Make sure we're really coming from userspace. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS+4(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX+4(%esp), %esi + lea PT_EFLAGS+4(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX */ + movl %eax, PT_EAX+4(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: ret /* diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 33e4c8a1628..b9aaea45f07 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h @@ -67,4 +67,5 @@ DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); +void xen_iret_direct(void); #endif /* XEN_OPS_H */ -- cgit v1.2.3 From dfdcdd42fdf63452ddd1bed6f49ae2a35dfb5d6c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Jul 2007 18:37:07 -0700 Subject: xen: disable all non-virtual drivers A domU Xen environment has no non-virtual drivers, so make sure they're all disabled at once. Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell --- arch/i386/xen/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c index 3f8684eba62..2fe6eac510f 100644 --- a/arch/i386/xen/setup.c +++ b/arch/i386/xen/setup.c @@ -91,4 +91,6 @@ void __init xen_arch_setup(void) /* fill cpus_possible with all available cpus */ xen_fill_possible_map(); #endif + + paravirt_disable_iospace(); } -- cgit v1.2.3