diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/lguest.h | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/lguest_hcall.h | 15 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_32_types.h | 4 | ||||
-rw-r--r-- | arch/x86/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/module.c (renamed from arch/x86/kernel/module_64.c) | 82 | ||||
-rw-r--r-- | arch/x86/kernel/module_32.c | 152 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 15 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux.lds.S | 2 | ||||
-rw-r--r-- | arch/x86/lguest/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 158 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 60 |
12 files changed, 265 insertions, 234 deletions
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9..313389cd50d 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,13 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M for ease of mapping into the guest (one PTE page). */ +/* We map at -4M (-2M when PAE is activated) for ease of mapping + * into the guest (one PTE page). */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_ADDR 0xFFE00000 +#else #define SWITCHER_ADDR 0xFFC00000 +#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487..d31c4a68407 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -12,11 +12,13 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 +#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 +#define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F @@ -32,10 +34,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Eighteen hypercalls are + * We use the KVM hypercall mechanism. Seventeen hypercalls are * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx and %edx. If a return - * value makes sense, it's returned in %eax. + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's @@ -47,8 +49,9 @@ #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3; + /* These map directly onto eax, ebx, ecx, edx and esi + * in struct lguest_regs */ + unsigned long arg0, arg1, arg2, arg3, arg4; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 2733fad45f9..5e67c153231 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -46,6 +46,10 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ # define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) #endif +#define MODULES_VADDR VMALLOC_START +#define MODULES_END VMALLOC_END +#define MODULES_LEN (MODULES_VADDR - MODULES_END) + #define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) #endif /* _ASM_X86_PGTABLE_32_DEFS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f78bd68212..f3477bb8456 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -73,7 +73,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd701..dfdbf640389 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module.c index c23880b90b5..89f386f044e 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module.c @@ -1,6 +1,5 @@ -/* Kernel module help for x86-64 +/* Kernel module help for x86. Copyright (C) 2001 Rusty Russell. - Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,23 +21,18 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h> #include <linux/bug.h> +#include <linux/mm.h> #include <asm/system.h> #include <asm/page.h> #include <asm/pgtable.h> +#if 0 +#define DEBUGP printk +#else #define DEBUGP(fmt...) - -#ifndef CONFIG_UML -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} +#endif void *module_alloc(unsigned long size) { @@ -54,9 +48,15 @@ void *module_alloc(unsigned long size) if (!area) return NULL; - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC); +} + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); } -#endif /* We don't need anything special. */ int module_frob_arch_sections(Elf_Ehdr *hdr, @@ -67,6 +67,58 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} +#else /*X86_64*/ int apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, @@ -147,6 +199,8 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } +#endif + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c deleted file mode 100644 index 0edd819050e..00000000000 --- a/arch/x86/kernel/module_32.c +++ /dev/null @@ -1,152 +0,0 @@ -/* Kernel module help for i386. - Copyright (C) 2001 Rusty Russell. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ -#include <linux/moduleloader.h> -#include <linux/elf.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/bug.h> - -#if 0 -#define DEBUGP printk -#else -#define DEBUGP(fmt...) -#endif - -void *module_alloc(unsigned long size) -{ - if (size == 0) - return NULL; - return vmalloc_exec(size); -} - - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) -{ - vfree(module_region); - /* FIXME: If module_region == mod->init_region, trim exception - table entries. */ -} - -/* We don't need anything special. */ -int module_frob_arch_sections(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - char *secstrings, - struct module *mod) -{ - return 0; -} - -int apply_relocate(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - unsigned int i; - Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; - Elf32_Sym *sym; - uint32_t *location; - - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); - for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { - /* This is where to make the change */ - location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr - + rel[i].r_offset; - /* This is the symbol it is referring to. Note that all - undefined symbols have been resolved. */ - sym = (Elf32_Sym *)sechdrs[symindex].sh_addr - + ELF32_R_SYM(rel[i].r_info); - - switch (ELF32_R_TYPE(rel[i].r_info)) { - case R_386_32: - /* We add the value into the location given */ - *location += sym->st_value; - break; - case R_386_PC32: - /* Add the value, subtract its postition */ - *location += sym->st_value - (uint32_t)location; - break; - default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", - me->name, ELF32_R_TYPE(rel[i].r_info)); - return -ENOEXEC; - } - } - return 0; -} - -int apply_relocate_add(Elf32_Shdr *sechdrs, - const char *strtab, - unsigned int symindex, - unsigned int relsec, - struct module *me) -{ - printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", - me->name); - return -ENOEXEC; -} - -int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; - if (!strcmp(".altinstructions", secstrings + s->sh_name)) - alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; - } - - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (locks && text) { - void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; - alternatives_smp_module_add(me, me->name, - lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); - } - - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - return module_bug_finalize(hdr, sechdrs, me); -} - -void module_arch_cleanup(struct module *mod) -{ - alternatives_smp_module_del(mod); - module_bug_cleanup(mod); -} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d1c636bf31a..be5ae80f897 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -301,15 +301,13 @@ static void __init reserve_brk(void) #ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; @@ -365,14 +363,13 @@ static void __init relocate_initrd(void) ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } -#endif static void __init reserve_initrd(void) { u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = ramdisk_image + ramdisk_size; - u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT; + u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -402,14 +399,8 @@ static void __init reserve_initrd(void) return; } -#ifdef CONFIG_X86_32 relocate_initrd(); -#else - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08llx > 0x%08llx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; -#endif + free_early(ramdisk_image, ramdisk_end); } #else diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c85b2e2bb6..367e8788204 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -108,6 +108,8 @@ SECTIONS /* Data */ . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; DATA_DATA CONSTRUCTORS diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d..38718041efc 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,7 +2,6 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 - depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4e0c2655939..7bc65f0f62c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,7 +87,7 @@ struct lguest_data lguest_data = { /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall + * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * @@ -96,7 +96,8 @@ struct lguest_data lguest_data = { * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */ static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3) + unsigned long arg2, unsigned long arg3, + unsigned long arg4) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; @@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall3(call, arg1, arg2, arg3); + kvm_hypercall4(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; + lguest_data.hcalls[next_call].arg4 = arg4; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall1(call, arg1); else - async_hcall(call, arg1, 0, 0); + async_hcall(call, arg1, 0, 0, 0); } static void lazy_hcall2(unsigned long call, @@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall2(call, arg1, arg2); else - async_hcall(call, arg1, arg2, 0); + async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, @@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall3(call, arg1, arg2, arg3); else - async_hcall(call, arg1, arg2, arg3); + async_hcall(call, arg1, arg2, arg3, 0); } +#ifdef CONFIG_X86_PAE +static void lazy_hcall4(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) +{ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) + kvm_hypercall4(call, arg1, arg2, arg3, arg4); + else + async_hcall(call, arg1, arg2, arg3, arg4); +} +#endif + /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mmu_mode(void) @@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next) paravirt_end_context_switch(next); } -/*G:033 +/*G:032 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * @@ -199,30 +215,28 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } -PV_CALLEE_SAVE_REGS_THUNK(save_fl); - -/* restore_flags() just sets the flags back to the value given. */ -static void restore_fl(unsigned long flags) -{ - lguest_data.irq_enabled = flags; -} -PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } + +/* Let's pause a moment. Remember how I said these are called so often? + * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to + * break some rules. In particular, these functions are assumed to save their + * own registers if they need to: normal C functions assume they can trash the + * eax register. To use normal C functions, we use + * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the + * C function, then restores it. */ +PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); +/*:*/ -/* Interrupts go on... */ -static void irq_enable(void) -{ - lguest_data.irq_enabled = X86_EFLAGS_IF; -} -PV_CALLEE_SAVE_REGS_THUNK(irq_enable); +/* These are in i386_head.S */ +extern void lg_irq_enable(void); +extern void lg_restore_fl(unsigned long flags); -/*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer @@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ - *dx &= 0x07808111; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ + *dx &= 0x07808151; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, if (*ax > 0x80000008) *ax = 0x80000008; break; + case 0x80000001: + /* Here we should fix nx cap depending on host. */ + /* For this version of PAE, we just clear NX bit. */ + *dx &= ~(1 << 20); + break; } } @@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val) static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_X86_PAE + lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, + ptep->pte_low, ptep->pte_high); +#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); +#endif } static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); lguest_pte_update(mm, addr, ptep); } -/* The Guest calls this to set a top-level entry. Again, we set the entry then - * tell the Host which top-level page we changed, and the index of the entry we - * changed. */ +/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd + * to set a middle-level entry when PAE is activated. + * Again, we set the entry then tell the Host which page we changed, + * and the index of the entry we changed. */ +#ifdef CONFIG_X86_PAE +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + native_set_pud(pudp, pudval); + + /* 32 bytes aligned pdpt address and the index. */ + lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, + (__pa(pudp) & 0x1F) / sizeof(pud_t)); +} + static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - *pmdp = pmdval; + native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } +#else + +/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not + * activated. */ +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + native_set_pmd(pmdp, pmdval); + lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); +} +#endif /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't @@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * which brings boot back to 0.25 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); + if (cr3_changed) + lazy_hcall1(LHCALL_FLUSH_TLB, 1); +} + +#ifdef CONFIG_X86_PAE +static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte_atomic(ptep, pte); if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } +void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + native_pte_clear(mm, addr, ptep); + lguest_pte_update(mm, addr, ptep); +} + +void lguest_pmd_clear(pmd_t *pmdp) +{ + lguest_set_pmd(pmdp, __pmd(0)); +} +#endif + /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do @@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void) { unsigned int i; - for (i = 0; i < LGUEST_IRQS; i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ - __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ @@ -973,10 +1038,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. */ @@ -987,8 +1052,6 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; @@ -1026,6 +1089,7 @@ __init void lguest_init(void) pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; + pv_info.shared_kernel_pmd = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ @@ -1033,9 +1097,9 @@ __init void lguest_init(void) /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); + pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); + pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ @@ -1071,6 +1135,12 @@ __init void lguest_init(void) pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; +#ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; + pv_mmu_ops.pte_clear = lguest_pte_clear; + pv_mmu_ops.pmd_clear = lguest_pmd_clear; + pv_mmu_ops.set_pud = lguest_set_pud; +#endif pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f7954198947..a9c8cfe61cd 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -46,10 +46,64 @@ ENTRY(lguest_entry) .globl lgstart_##name; .globl lgend_##name LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*:*/ + +/*G:033 But using those wrappers is inefficient (we'll see why that doesn't + * matter for save_fl and irq_disable later). If we write our routines + * carefully in assembler, we can avoid clobbering any registers and avoid + * jumping through the wrapper functions. + * + * I skipped over our first piece of assembler, but this one is worth studying + * in a bit more detail so I'll describe in easy stages. First, the routine + * to enable interrupts: */ +ENTRY(lg_irq_enable) + /* The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled + /* But now we need to check if the Host wants to know: there might have + * been interrupts waiting to be delivered, in which case it will have + * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we + * jump to send_interrupts, otherwise we're done. */ + testl $0, lguest_data+LGUEST_DATA_irq_pending + jnz send_interrupts + /* One cool thing about x86 is that you can do many things without using + * a register. In this case, the normal path hasn't needed to save or + * restore any registers at all! */ + ret +send_interrupts: + /* OK, now we need a register: eax is used for the hypercall number, + * which is LHCALL_SEND_INTERRUPTS. + * + * We used not to bother with this pending detection at all, which was + * much simpler. Sooner or later the Host would realize it had to + * send us an interrupt. But that turns out to make performance 7 + * times worse on a simple tcp benchmark. So now we do this the hard + * way. */ + pushl %eax + movl $LHCALL_SEND_INTERRUPTS, %eax + /* This is a vmcall instruction (same thing that KVM uses). Older + * assembler versions might not know the "vmcall" instruction, so we + * create one manually here. */ + .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + popl %eax + ret + +/* Finally, the "popf" or "restore flags" routine. The %eax register holds the + * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're + * enabling interrupts again, if it's 0 we're leaving them off. */ +ENTRY(lg_restore_fl) + /* This is just "lguest_data.irq_enabled = flags;" */ + movl %eax, lguest_data+LGUEST_DATA_irq_enabled + /* Now, if the %eax value has enabled interrupts and + * lguest_data.irq_pending is set, we want to tell the Host so it can + * deliver any outstanding interrupts. Fortunately, both values will + * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" + * instruction will AND them together for us. If both are set, we + * jump to send_interrupts. */ + testl lguest_data+LGUEST_DATA_irq_pending, %eax + jnz send_interrupts + /* Again, the normal path has used no extra registers. Clever, huh? */ + ret /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start |