From 47436aa4ad054c1c7c8231618e86ebd9305308dc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 22 Oct 2007 11:03:36 +1000 Subject: Boot with virtual == physical to get closer to native Linux. 1) This allows us to get a lot closer to booting bzImages. 2) It means we don't have to know page_offset. 3) The Guest needs to modify the boot pagetables to create the PAGE_OFFSET mapping before jumping to C code. 4) guest_pa() walks the page tables rather than using page_offset. 5) We don't use page_offset to figure out whether to emulate: it was always kinda questionable, and won't work for instructions done before remapping (bzImage unpacking in particular). 6) We still want the kernel address for tlb flushing: have the initial hypercall give us that, too. Signed-off-by: Rusty Russell --- drivers/lguest/hypercalls.c | 8 +++--- drivers/lguest/interrupts_and_traps.c | 13 +++++++--- drivers/lguest/lg.h | 8 +++--- drivers/lguest/lguest_user.c | 11 ++------ drivers/lguest/page_tables.c | 47 +++++++++++++++++++++++++++++------ drivers/lguest/x86/core.c | 7 +++--- 6 files changed, 62 insertions(+), 32 deletions(-) (limited to 'drivers/lguest') diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 02d0ae26826..13b5f2f813d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -181,15 +181,15 @@ static void initialize(struct lguest *lg) /* The Guest tells us where we're not to deliver interrupts by putting * the range of addresses into "struct lguest_data". */ if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) - || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) - /* We tell the Guest that it can't use the top 4MB of virtual - * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)) + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) kill_guest(lg, "bad guest page %p", lg->lguest_data); /* We write the current time into the Guest's data page once now. 
*/ write_timestamp(lg); + /* page_tables.c will also do some setup. */ + page_table_guest_data_init(lg); + /* This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the Guest might be referring to the old (read-only) diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index a57d757eab6..3271c0031a1 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -62,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) * it). */ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) { - unsigned long gstack; + unsigned long gstack, origstack; u32 eflags, ss, irq_enable; + unsigned long virtstack; /* There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in @@ -71,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) if ((lg->regs->ss&0x3) != GUEST_PL) { /* The Guest told us their kernel stack with the SET_STACK * hypercall: both the virtual address and the segment */ - gstack = guest_pa(lg, lg->esp1); + virtstack = lg->esp1; ss = lg->ss1; + + origstack = gstack = guest_pa(lg, virtstack); /* We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege @@ -81,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) push_guest_stack(lg, &gstack, lg->regs->esp); } else { /* We're staying on the same Guest (kernel) stack. 
*/ - gstack = guest_pa(lg, lg->regs->esp); + virtstack = lg->regs->esp; ss = lg->regs->ss; + + origstack = gstack = guest_pa(lg, virtstack); } /* Remember that we never let the Guest actually disable interrupts, so @@ -108,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) /* Now we've pushed all the old state, we change the stack, the code * segment and the address to execute. */ lg->regs->ss = ss; - lg->regs->esp = gstack + lg->page_offset; + lg->regs->esp = virtstack + (gstack - origstack); lg->regs->cs = (__KERNEL_CS|GUEST_PL); lg->regs->eip = idt_address(lo, hi); diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 7408cebe995..e4845d7f068 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -63,7 +63,7 @@ struct lguest /* This provides the offset to the base of guest-physical * memory in the Launcher. */ void __user *mem_base; - u32 page_offset; + unsigned long kernel_address; u32 cr2; int halted; int ts; @@ -165,6 +165,8 @@ void guest_set_pte(struct lguest *lg, unsigned long gpgdir, void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); int demand_page(struct lguest *info, unsigned long cr2, int errcode); void pin_page(struct lguest *lg, unsigned long vaddr); +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); +void page_table_guest_data_init(struct lguest *lg); /* /core.c: */ void lguest_arch_host_init(void); @@ -229,9 +231,5 @@ do { \ } while(0) /* (End of aside) :*/ -static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) -{ - return vaddr - lg->page_offset; -} #endif /* __ASSEMBLY__ */ #endif /* _LGUEST_H */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b184652e45d..61b177e1e64 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -111,7 +111,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(lg, (unsigned long __user *)user); } 
-/*L:020 The initialization write supplies 5 pointer sized (32 or 64 bit) +/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) * values (in addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. @@ -124,12 +124,6 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) * pagetables (which are set up by the Launcher). * * start: The first instruction to execute ("eip" in x86-speak). - * - * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should - * probably wean the code off this, but it's a very useful constant! Any - * address above this is within the Guest kernel, and any kernel address can - * quickly converted from physical to virtual by adding PAGE_OFFSET. It's - * 0xC0000000 (3G) by default, but it's configurable at kernel build time. */ static int initialize(struct file *file, const unsigned long __user *input) { @@ -137,7 +131,7 @@ static int initialize(struct file *file, const unsigned long __user *input) * Guest. */ struct lguest *lg; int err; - unsigned long args[5]; + unsigned long args[4]; /* We grab the Big Lguest lock, which protects against multiple * simultaneous initializations. */ @@ -162,7 +156,6 @@ static int initialize(struct file *file, const unsigned long __user *input) /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)(long)args[0]; lg->pfn_limit = args[1]; - lg->page_offset = args[4]; /* We need a complete page for the Guest registers: they are accessible * to the Guest and we can only grant it access to whole pages. */ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index bfe3650b28d..fe3c7575647 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "lg.h" /*M:008 We hold reference to pages, which prevents them from being swapped. 
@@ -345,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; /* Release every pgd entry up to the kernel's address. */ - for (i = 0; i < pgd_index(lg->page_offset); i++) + for (i = 0; i < pgd_index(lg->kernel_address); i++) release_pgd(lg, lg->pgdirs[idx].pgdir + i); } @@ -358,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg) } /*:*/ +/* We walk down the guest page tables to get a guest-physical address */ +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) +{ + pgd_t gpgd; + pte_t gpte; + + /* First step: get the top-level Guest page table entry. */ + gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + kill_guest(lg, "Bad address %#lx", vaddr); + + gpte = __pte(lgread_u32(lg, gpte_addr(lg, gpgd, vaddr))); + if (!(pte_flags(gpte) & _PAGE_PRESENT)) + kill_guest(lg, "Bad address %#lx", vaddr); + + return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); +} + /* We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given * us. */ @@ -500,7 +520,7 @@ void guest_set_pte(struct lguest *lg, { /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ - if (vaddr >= lg->page_offset) { + if (vaddr >= lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) if (lg->pgdirs[i].pgdir) @@ -550,11 +570,6 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) * its first page table is. We set some things up here: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) { - /* In flush_user_mappings() we loop from 0 to - * "pgd_index(lg->page_offset)". This assumes it won't hit - * the Switcher mappings, so check that now. 
*/ - if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) - return -EINVAL; /* We start on the first shadow page table, and give it a blank PGD * page. */ lg->pgdidx = 0; @@ -565,6 +580,24 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) return 0; } +/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +void page_table_guest_data_init(struct lguest *lg) +{ + /* We get the kernel address: above this is all kernel memory. */ + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 4MB of virtual + * addresses used by the Switcher. */ + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); + + /* In flush_user_mappings() we loop from 0 to + * "pgd_index(lg->kernel_address)". This assumes it won't hit the + * Switcher mappings, so check that now. */ + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); +} + /* When a Guest dies, our cleanup is fairly simple. */ void free_guest_pagetable(struct lguest *lg) { diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index a125109446d..39f64c95de1 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -216,9 +216,10 @@ static int emulate_insn(struct lguest *lg) * guest_pa just subtracts the Guest's page_offset. */ unsigned long physaddr = guest_pa(lg, lg->regs->eip); - /* The guest_pa() function only works for Guest kernel addresses, but - * that's all we're trying to do anyway. */ - if (lg->regs->eip < lg->page_offset) + /* This must be the Guest kernel trying to do something, not userspace! + * The bottom two bits of the CS segment register are the privilege + * level. */ + if ((lg->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ -- cgit v1.2.3