From 9499f5e7ed5224c40706f0cec6542a9916bc7606 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:16:35 -0600
Subject: virtio: add names to virtqueue struct, mapping from devices to
 queues.

Add a linked list of all virtqueues for a virtio device: this helps for
debugging and is also needed for upcoming interface change.

Also, add a "name" field for clearer debug messages.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lguest_device.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index df44d962626..4babed899d5 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -228,7 +228,8 @@ extern void lguest_setup_irq(unsigned int irq);
  * function. */
 static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 				    unsigned index,
-				    void (*callback)(struct virtqueue *vq))
+				    void (*callback)(struct virtqueue *vq),
+				    const char *name)
 {
 	struct lguest_device *ldev = to_lgdev(vdev);
 	struct lguest_vq_info *lvq;
@@ -263,7 +264,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	/* OK, tell virtio_ring.c to set up a virtqueue now we know its size
 	 * and we've got a pointer to its pages. */
 	vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
-				 vdev, lvq->pages, lg_notify, callback);
+				 vdev, lvq->pages, lg_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto unmap;
-- 
cgit v1.2.3


From d2a7ddda9ffb1c8961abff6714b0f1eb925c120f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 12 Jun 2009 22:16:36 -0600
Subject: virtio: find_vqs/del_vqs virtio operations

This replaces find_vq/del_vq with find_vqs/del_vqs virtio operations,
and updates all drivers. This is needed for MSI support, because MSI
needs to know the total number of vectors upfront.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ lguest/9p compile fixes)
---
 drivers/lguest/lguest_device.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 4babed899d5..e082cdac88b 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -313,6 +313,38 @@ static void lg_del_vq(struct virtqueue *vq)
 	kfree(lvq);
 }
 
+static void lg_del_vqs(struct virtio_device *vdev)
+{
+	struct virtqueue *vq, *n;
+
+	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+		lg_del_vq(vq);
+}
+
+static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+		       struct virtqueue *vqs[],
+		       vq_callback_t *callbacks[],
+		       const char *names[])
+{
+	struct lguest_device *ldev = to_lgdev(vdev);
+	int i;
+
+	/* We must have this many virtqueues. */
+	if (nvqs > ldev->desc->num_vq)
+		return -ENOENT;
+
+	for (i = 0; i < nvqs; ++i) {
+		vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
+		if (IS_ERR(vqs[i]))
+			goto error;
+	}
+	return 0;
+
+error:
+	lg_del_vqs(vdev);
+	return PTR_ERR(vqs[i]);
+}
+
 /* The ops structure which hooks everything together. */
 static struct virtio_config_ops lguest_config_ops = {
 	.get_features = lg_get_features,
@@ -322,8 +354,8 @@ static struct virtio_config_ops lguest_config_ops = {
 	.get_status = lg_get_status,
 	.set_status = lg_set_status,
 	.reset = lg_reset,
-	.find_vq = lg_find_vq,
-	.del_vq = lg_del_vq,
+	.find_vqs = lg_find_vqs,
+	.del_vqs = lg_del_vqs,
 };
 
 /* The root device for the lguest virtio devices.  This makes them appear as
-- 
cgit v1.2.3


From a6c372de6e4b9a8188b66badcee3e3792eccdd26 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:01 -0600
Subject: lguest: fix lguest wake on guest clock tick, or fd activity

The Launcher could be inside the Guest on another CPU; wake_up_process
will do nothing because it is "running".  kick_process will knock it
back into our kernel in this case, otherwise we'll miss it until the
next guest exit.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/interrupts_and_traps.c | 6 +++---
 drivers/lguest/lguest_user.c          | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 6e99adbe194..9ea26ad88c9 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -511,9 +511,9 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 
 	/* Remember the first interrupt is the timer interrupt. */
 	set_bit(0, cpu->irqs_pending);
-	/* If the Guest is actually stopped, we need to wake it up. */
-	if (cpu->halted)
-		wake_up_process(cpu->tsk);
+	/* Guest may be stopped or running on another CPU. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
 	return HRTIMER_NORESTART;
 }
 
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b8ee103eed5..bcdcf3453e7 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -24,8 +24,8 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 
 	if (on) {
 		cpu->break_out = 1;
-		/* Pop it out of the Guest (may be running on different CPU) */
-		wake_up_process(cpu->tsk);
+		if (!wake_up_process(cpu->tsk))
+			kick_process(cpu->tsk);
 		/* Wait for them to reset it */
 		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
 	} else {
-- 
cgit v1.2.3


From abd41f037e1a64543000ed73b42f616d04d92700 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:02 -0600
Subject: lguest: fix race in halt code

When the Guest does the LHCALL_HALT hypercall, we go to sleep, expecting
that a timer or the Waker will wake_up_process() us.

But we do it in a stupid way, leaving a classic missing wakeup race.

So split maybe_do_interrupt() into interrupt_pending() and
try_deliver_interrupt(), and check maybe_do_interrupt() and the
"break_out" flag before calling schedule.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c                 | 14 ++++++++++++--
 drivers/lguest/interrupts_and_traps.c | 26 +++++++++++++++++---------
 drivers/lguest/lg.h                   |  3 ++-
 3 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3cf74..8ca1def5b14 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -188,6 +188,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
+		unsigned int irq;
+
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
 			do_hypercalls(cpu);
@@ -211,7 +213,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		maybe_do_interrupt(cpu);
+		irq = interrupt_pending(cpu);
+		if (irq < LGUEST_IRQS)
+			try_deliver_interrupt(cpu, irq);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -227,7 +231,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			/* Just before we sleep, make sure nothing snuck in
+			 * which we should be doing. */
+			if (interrupt_pending(cpu) < LGUEST_IRQS
+			    || cpu->break_out)
+				set_current_state(TASK_RUNNING);
+			else
+				schedule();
 			continue;
 		}
 
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 9ea26ad88c9..a8c966fee1e 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,38 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
 /*H:205
  * Virtual Interrupts.
  *
- * maybe_do_interrupt() gets called before every entry to the Guest, to see if
- * we should divert the Guest to running an interrupt handler. */
-void maybe_do_interrupt(struct lg_cpu *cpu)
+ * interrupt_pending() returns the first pending interrupt which isn't blocked
+ * by the Guest.  It is called before every entry to the Guest, and just before
+ * we go to sleep when the Guest has halted itself. */
+unsigned int interrupt_pending(struct lg_cpu *cpu)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
-	struct desc_struct *idt;
 
 	/* If the Guest hasn't even initialized yet, we can do nothing. */
 	if (!cpu->lg->lguest_data)
-		return;
+		return LGUEST_IRQS;
 
 	/* Take our "irqs_pending" array and remove any interrupts the Guest
 	 * wants blocked: the result ends up in "blk". */
 	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
 			   sizeof(blk)))
-		return;
+		return LGUEST_IRQS;
 	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
-	/* None?  Nothing to do */
-	if (irq >= LGUEST_IRQS)
-		return;
+
+	return irq;
+}
+
+/* This actually diverts the Guest to running an interrupt handler, once an
+ * interrupt has been identified by interrupt_pending(). */
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	struct desc_struct *idt;
+
+	BUG_ON(irq >= LGUEST_IRQS);
 
 	/* They may be in the middle of an iret, where they asked us never to
 	 * deliver interrupts. */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index af92a176697..6743cf147d9 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -139,7 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lg_cpu *cpu);
+unsigned int interrupt_pending(struct lg_cpu *cpu);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
-- 
cgit v1.2.3


From a32a8813d0173163ba44d8f9556e0d89fdc4fb46 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:02 -0600
Subject: lguest: improve interrupt handling, speed up stream networking

lguest never checked for pending interrupts when enabling interrupts, and
things still worked.  However, it makes a significant difference to TCP
performance, so it's time we fixed it by introducing a pending_irq flag
and checking it on irq_restore and irq_enable.

These two routines are now too big to patch into the 8/10 bytes
patch space, so we drop that code.

Note: The high latency on interrupt delivery had a very curious
effect: once everything else was optimized, networking without GSO was
faster than networking with GSO, since more interrupts were sent and
hence a greater chance of one getting through to the Guest!

Note2: (Almost) Closing the same loophole for iret doesn't have any
measurable effect, so I'm leaving that patch for the moment.

Before:
	1GB tcpblast Guest->Host:		30.7 seconds
	1GB tcpblast Guest->Host (no GSO):	76.0 seconds

After:
	1GB tcpblast Guest->Host:		6.8 seconds
	1GB tcpblast Guest->Host (no GSO):	27.8 seconds

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c                 |  7 ++++---
 drivers/lguest/hypercalls.c           |  4 ++++
 drivers/lguest/interrupts_and_traps.c | 16 +++++++++++++---
 drivers/lguest/lg.h                   |  4 ++--
 4 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 8ca1def5b14..03fbc88c002 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -189,6 +189,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
 		unsigned int irq;
+		bool more;
 
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
@@ -213,9 +214,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		irq = interrupt_pending(cpu);
+		irq = interrupt_pending(cpu, &more);
 		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq);
+			try_deliver_interrupt(cpu, irq, more);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -233,7 +234,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			set_current_state(TASK_INTERRUPTIBLE);
 			/* Just before we sleep, make sure nothing snuck in
 			 * which we should be doing. */
-			if (interrupt_pending(cpu) < LGUEST_IRQS
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS
 			    || cpu->break_out)
 				set_current_state(TASK_RUNNING);
 			else
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 54d66f05fef..f252b71ae79 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* This call does nothing, except by breaking out of the Guest
 		 * it makes us process all the asynchronous hypercalls. */
 		break;
+	case LHCALL_SEND_INTERRUPTS:
+		/* This call does nothing too, but by breaking out of the Guest
+		 * it makes us process any pending interrupts. */
+		break;
 	case LHCALL_LGUEST_INIT:
 		/* You can't get here unless you're already initialized.  Don't
 		 * do that. */
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index a8c966fee1e..5a10754b479 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -131,7 +131,7 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
  * interrupt_pending() returns the first pending interrupt which isn't blocked
  * by the Guest.  It is called before every entry to the Guest, and just before
  * we go to sleep when the Guest has halted itself. */
-unsigned int interrupt_pending(struct lg_cpu *cpu)
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
@@ -149,13 +149,14 @@ unsigned int interrupt_pending(struct lg_cpu *cpu)
 
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
+	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
 
 	return irq;
 }
 
 /* This actually diverts the Guest to running an interrupt handler, once an
  * interrupt has been identified by interrupt_pending(). */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
 {
 	struct desc_struct *idt;
 
@@ -178,8 +179,12 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
 		u32 irq_enabled;
 		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
 			irq_enabled = 0;
-		if (!irq_enabled)
+		if (!irq_enabled) {
+			/* Make sure they know an IRQ is pending. */
+			put_user(X86_EFLAGS_IF,
+				 &cpu->lg->lguest_data->irq_pending);
 			return;
+		}
 	}
 
 	/* Look at the IDT entry the Guest gave us for this interrupt.  The
@@ -202,6 +207,11 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq)
 	 * here is a compromise which means at least it gets updated every
 	 * timer interrupt. */
 	write_timestamp(cpu);
+
+	/* If there are no other interrupts we want to deliver, clear
+	 * the pending flag. */
+	if (!more)
+		put_user(0, &cpu->lg->lguest_data->irq_pending);
 }
 /*:*/
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 6743cf147d9..573896533ac 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -139,8 +139,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq);
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
-- 
cgit v1.2.3


From 81b79b01d057f8c5a378c38d2f738775b972934a Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 20 May 2009 01:45:45 +0200
Subject: lguest: beyond ARRAY_SIZE of cpu->arch.gdt

Do not go beyond ARRAY_SIZE of cpu->arch.gdt

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/segments.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 7ede64ffeef..482ed5a1875 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
 {
 	/* We assume the Guest has the same number of GDT entries as the
 	 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
-	if (num > ARRAY_SIZE(cpu->arch.gdt))
+	if (num >= ARRAY_SIZE(cpu->arch.gdt))
 		kill_guest(cpu, "too many gdt entries %i", num);
 
 	/* Set it up, then fix it. */
-- 
cgit v1.2.3


From f086122bb6e885f926f935b1418fca3b293375f0 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:04 -0600
Subject: lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition.

If GDT_ENTRIES were every > 256, this could become a problem.

Signed-off-by: Matias Zabaljauregui <zabaljauregui at gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/lg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 573896533ac..74af503ad63 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@ struct lg_cpu {
 	u32 cr2;
 	int ts;
 	u32 esp1;
-	u8 ss1;
+	u16 ss1;
 
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
-- 
cgit v1.2.3


From ed1dc77810159a733240ba6751c1b31023bf8dd7 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Sat, 30 May 2009 15:35:49 -0300
Subject: lguest: map switcher with executable page table entries

Map switcher with executable page table entries.
(This bug didn't matter before PAE and hence NX support -- RR)

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        | 2 +-
 drivers/lguest/page_tables.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 03fbc88c002..d0298dc45d9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care. */
 	pagep = switcher_page;
-	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a059cf9980f..496995370fb 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -714,7 +714,7 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
-	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL);
+	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
 
 	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
-- 
cgit v1.2.3


From 90603d15fa95605d1d08235b73e220d766f04bb0 Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:06 -0600
Subject: lguest: use native_set_* macros, which properly handle 64-bit entries
 when PAE is activated

Some cleanups and replace direct assignment with native_set_* macros which properly handle 64-bit entries when PAE is activated

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 496995370fb..ffba723cd98 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -90,7 +90,7 @@ static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
 	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-	return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE];
+	return &page[pte_index(vaddr)];
 }
 
 /* These two functions just like the above two, except they access the Guest
@@ -105,7 +105,7 @@ static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t);
+	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 /*:*/
 
@@ -171,7 +171,7 @@ static void release_pte(pte_t pte)
 	/* Remember that get_user_pages_fast() took a reference to the page, in
 	 * get_pfn()?  We have to put it back now. */
 	if (pte_flags(pte) & _PAGE_PRESENT)
-		put_page(pfn_to_page(pte_pfn(pte)));
+		put_page(pte_page(pte));
 }
 /*:*/
 
@@ -273,7 +273,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		 * table entry, even if the Guest says it's writable.  That way
 		 * we will come back here when a write does actually occur, so
 		 * we can update the Guest's _PAGE_DIRTY flag. */
-		*spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0);
+		native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
 
 	/* Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
@@ -323,7 +323,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 }
 
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
-static void release_pgd(struct lguest *lg, pgd_t *spgd)
+static void release_pgd(pgd_t *spgd)
 {
 	/* If the entry's not present, there's nothing to release. */
 	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
@@ -350,7 +350,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
 	unsigned int i;
 	/* Release every pgd entry up to the kernel's address. */
 	for (i = 0; i < pgd_index(lg->kernel_address); i++)
-		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+		release_pgd(lg->pgdirs[idx].pgdir + i);
 }
 
 /*H:440 (v) Flushing (throwing away) page tables,
@@ -431,7 +431,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 
 /*H:430 (iv) Switching page tables
  *
- * Now we've seen all the page table setting and manipulation, let's see what
+ * Now we've seen all the page table setting and manipulation, let's see
  * what happens when the Guest changes page tables (ie. changes the top-level
  * pgdir).  This occurs on almost every context switch. */
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
@@ -463,7 +463,7 @@ static void release_all_pagetables(struct lguest *lg)
 		if (lg->pgdirs[i].pgdir)
 			/* Every PGD entry except the Switcher at the top */
 			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
-				release_pgd(lg, lg->pgdirs[i].pgdir + j);
+				release_pgd(lg->pgdirs[i].pgdir + j);
 }
 
 /* We also throw away everything when a Guest tells us it's changed a kernel
@@ -581,7 +581,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 	pgdir = find_pgdir(lg, gpgdir);
 	if (pgdir < ARRAY_SIZE(lg->pgdirs))
 		/* ... throw it away. */
-		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
 
 /* Once we know how much memory we have we can construct simple identity
@@ -726,8 +726,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	 * page is already mapped there, we don't have to copy them out
 	 * again. */
 	pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
-	regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL));
-	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
+	native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
+	native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
+			regs_pte);
 }
 /*:*/
 
@@ -752,21 +753,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
 
 	/* The first entries are easy: they map the Switcher code. */
 	for (i = 0; i < pages; i++) {
-		pte[i] = mk_pte(switcher_page[i],
-				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+		native_set_pte(&pte[i], mk_pte(switcher_page[i],
+				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 	}
 
 	/* The only other thing we map is this CPU's pair of pages. */
 	i = pages + cpu*2;
 
 	/* First page (Guest registers) is writable from the Guest */
-	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
-			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
+	native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
+			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
 
 	/* The second page contains the "struct lguest_ro_state", and is
 	 * read-only. */
-	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
-			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
+	native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
+			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
 }
 
 /* We've made it through the page table code.  Perhaps our tired brains are
-- 
cgit v1.2.3


From ebe0ba84f55950a89cb7af94c7ffc35ee3992f9e Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Sat, 30 May 2009 15:48:08 -0300
Subject: lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD

replace LHCALL_SET_PMD with LHCALL_SET_PGD hypercall name
(That's really what it is, and the confusion gets worse with PAE support)

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
---
 drivers/lguest/hypercalls.c  | 4 ++--
 drivers/lguest/lg.h          | 2 +-
 drivers/lguest/page_tables.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index f252b71ae79..51149ca1461 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -79,8 +79,8 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 	case LHCALL_SET_PTE:
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
 		break;
-	case LHCALL_SET_PMD:
-		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+	case LHCALL_SET_PGD:
+		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 74af503ad63..cacc2da2058 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -169,7 +169,7 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index ffba723cd98..6a54d76b623 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -568,7 +568,7 @@ void guest_set_pte(struct lg_cpu *cpu,
  *
  * So with that in mind here's our code to to update a (top-level) PGD entry:
  */
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-- 
cgit v1.2.3


From acdd0b6292b282c4511897ac2691a47befbf1c6a Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Fri, 12 Jun 2009 22:27:07 -0600
Subject: lguest: PAE support

This version requires that host and guest have the same PAE status.
NX cap is not offered to the guest, yet.

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/Kconfig       |   2 +-
 drivers/lguest/hypercalls.c  |  10 ++
 drivers/lguest/lg.h          |   5 +
 drivers/lguest/page_tables.c | 351 ++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 329 insertions(+), 39 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359..8f63845db83 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && FUTEX
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 51149ca1461..c29ffa19cb7 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+				__pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
 		break;
 	case LHCALL_SET_PGD:
 		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
+	case LHCALL_SET_PMD:
+		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index cacc2da2058..6201ce59e88 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
  * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
@@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 6a54d76b623..5e2c26adcf0 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -53,6 +53,17 @@
  * page.  */
 #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
 
+/* For PAE we need the PMD index as well. We use the last 2MB, so we
+ * will need the last pmd entry of the last pmd page.  */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_PMD_INDEX 	(PTRS_PER_PMD - 1)
+#define RESERVE_MEM 		2U
+#define CHECK_GPGD_MASK		_PAGE_PRESENT
+#else
+#define RESERVE_MEM 		4U
+#define CHECK_GPGD_MASK		_PAGE_TABLE
+#endif
+
 /* We actually need a separate PTE page for each CPU.  Remember that after the
  * Switcher code itself comes two pages for each CPU, and we don't want this
  * CPU's guest to see the pages of any other CPU. */
@@ -73,23 +84,58 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
 {
 	unsigned int index = pgd_index(vaddr);
 
+#ifndef CONFIG_X86_PAE
 	/* We kill any Guest trying to touch the Switcher addresses. */
 	if (index >= SWITCHER_PGD_INDEX) {
 		kill_guest(cpu, "attempt to access switcher pages");
 		index = 0;
 	}
+#endif
 	/* Return a pointer index'th pgd entry for the i'th page table. */
 	return &cpu->lg->pgdirs[i].pgdir[index];
 }
 
+#ifdef CONFIG_X86_PAE
+/* This routine then takes the PGD entry given above, which contains the
+ * address of the PMD page.  It then returns a pointer to the PMD entry for the
+ * given address. */
+static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
+{
+	unsigned int index = pmd_index(vaddr);
+	pmd_t *page;
+
+	/* We kill any Guest trying to touch the Switcher addresses. */
+	if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
+					index >= SWITCHER_PMD_INDEX) {
+		kill_guest(cpu, "attempt to access switcher pages");
+		index = 0;
+	}
+
+	/* You should never call this if the PGD entry wasn't valid */
+	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
+
+	return &page[index];
+}
+#endif
+
 /* This routine then takes the page directory entry returned above, which
  * contains the address of the page table entry (PTE) page.  It then returns a
  * pointer to the PTE entry for the given address. */
-static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr)
+static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 {
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
+	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
+
+	/* You should never call this if the PMD entry wasn't valid */
+	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
+#else
 	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
 	/* You should never call this if the PGD entry wasn't valid */
 	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
+#endif
+
 	return &page[pte_index(vaddr)];
 }
 
@@ -101,10 +147,31 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
 }
 
-static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr)
+#ifdef CONFIG_X86_PAE
+static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
+}
+#endif
+
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+				pgd_t gpgd, unsigned long vaddr)
+{
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
+	unsigned long gpage;
+
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
+	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+#else
+	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+#endif
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 /*:*/
@@ -184,11 +251,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
 
 static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
 {
-	if ((pgd_flags(gpgd) & ~_PAGE_TABLE) ||
+	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
 	   (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
 		kill_guest(cpu, "bad page directory entry");
 }
 
+#ifdef CONFIG_X86_PAE
+static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+{
+	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
+	   (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+		kill_guest(cpu, "bad page middle directory entry");
+}
+#endif
+
 /*H:330
  * (i) Looking up a page table entry when the Guest faults.
  *
@@ -207,6 +283,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pte_t gpte;
 	pte_t *spte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+	pmd_t gpmd;
+#endif
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -228,12 +309,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		check_gpgd(cpu, gpgd);
 		/* And we copy the flags to the shadow PGD entry.  The page
 		 * number in the shadow PGD is the page we just allocated. */
-		*spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd));
+		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
 	}
 
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	/* middle level not present?  We can't map it in. */
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		return false;
+
+	/* Now look at the matching shadow entry. */
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
+		/* No shadow entry: allocate a new shadow PTE page. */
+		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+
+		/* This is not really the Guest's fault, but killing it is
+		* simple for this corner case. */
+		if (!ptepage) {
+			kill_guest(cpu, "out of memory allocating pte page");
+			return false;
+		}
+
+		/* We check that the Guest pmd is OK. */
+		check_gpmd(cpu, gpmd);
+
+		/* And we copy the flags to the shadow PMD entry.  The page
+		 * number in the shadow PMD is the page we just allocated. */
+		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+	}
+#endif
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
-	gpte_ptr = gpte_addr(gpgd, vaddr);
+	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -259,7 +368,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		gpte = pte_mkdirty(gpte);
 
 	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = spte_addr(*spgd, vaddr);
+	spte = spte_addr(cpu, *spgd, vaddr);
 	/* If there was a valid shadow PTE entry here before, we release it.
 	 * This can happen with a write to a previously read-only entry. */
 	release_pte(*spte);
@@ -301,14 +410,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t *spgd;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 	/* Look at the current top level entry: is it present? */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
 	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
 		return false;
 
+#ifdef CONFIG_X86_PAE
+	spmd = spmd_addr(cpu, *spgd, vaddr);
+	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+		return false;
+#endif
+
 	/* Check the flags on the pte entry itself: it must be present and
 	 * writable. */
-	flags = pte_flags(*(spte_addr(*spgd, vaddr)));
+	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
 
 	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
 }
@@ -322,6 +440,41 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
 
+#ifdef CONFIG_X86_PAE
+static void release_pmd(pmd_t *spmd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
+		/* For each entry in the page, we might need to release it. */
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			release_pte(ptepage[i]);
+		/* Now we can free the page of PTEs */
+		free_page((long)ptepage);
+		/* And zero out the PMD entry so we never release it twice. */
+		native_set_pmd(spmd, __pmd(0));
+	}
+}
+
+static void release_pgd(pgd_t *spgd)
+{
+	/* If the entry's not present, there's nothing to release. */
+	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
+		unsigned int i;
+		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+		for (i = 0; i < PTRS_PER_PMD; i++)
+			release_pmd(&pmdpage[i]);
+
+		/* Now we can free the page of PMDs */
+		free_page((long)pmdpage);
+		/* And zero out the PGD entry so we never release it twice. */
+		set_pgd(spgd, __pgd(0));
+	}
+}
+
+#else /* !CONFIG_X86_PAE */
 /*H:450 If we chase down the release_pgd() code, it looks like this: */
 static void release_pgd(pgd_t *spgd)
 {
@@ -341,7 +494,7 @@ static void release_pgd(pgd_t *spgd)
 		*spgd = __pgd(0);
 	}
 }
-
+#endif
 /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
  * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
  * It simply releases every PTE page from 0 up to the Guest's kernel address. */
@@ -370,6 +523,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 	pgd_t gpgd;
 	pte_t gpte;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t gpmd;
+#endif
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -378,7 +534,13 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 		return -1UL;
 	}
 
-	gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t);
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#ifdef CONFIG_X86_PAE
+	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+		kill_guest(cpu, "Bad address %#lx", vaddr);
+#endif
+	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
 
@@ -405,6 +567,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 			      int *blank_pgdir)
 {
 	unsigned int next;
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd_table;
+#endif
 
 	/* We pick one entry at random to throw out.  Choosing the Least
 	 * Recently Used might be better, but this is easy. */
@@ -416,10 +581,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 		/* If the allocation fails, just keep using the one we have */
 		if (!cpu->lg->pgdirs[next].pgdir)
 			next = cpu->cpu_pgd;
-		else
-			/* This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack! */
+		else {
+#ifdef CONFIG_X86_PAE
+			/* In PAE mode, allocate a pmd page and populate the
+			 * last pgd entry. */
+			pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+			if (!pmd_table) {
+				free_page((long)cpu->lg->pgdirs[next].pgdir);
+				set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
+				next = cpu->cpu_pgd;
+			} else {
+				set_pgd(cpu->lg->pgdirs[next].pgdir +
+					SWITCHER_PGD_INDEX,
+					__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+				/* This is a blank page, so there are no kernel
+				 * mappings: caller must map the stack! */
+				*blank_pgdir = 1;
+			}
+#else
 			*blank_pgdir = 1;
+#endif
+		}
 	}
 	/* Record which Guest toplevel this shadows. */
 	cpu->lg->pgdirs[next].gpgdir = gpgdir;
@@ -460,10 +642,25 @@ static void release_all_pagetables(struct lguest *lg)
 
 	/* Every shadow pagetable this Guest has */
 	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir)
+		if (lg->pgdirs[i].pgdir) {
+#ifdef CONFIG_X86_PAE
+			pgd_t *spgd;
+			pmd_t *pmdpage;
+			unsigned int k;
+
+			/* Get the last pmd page. */
+			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
+			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
+
+			/* And release the pmd entries of that pmd page,
+			 * except for the switcher pmd. */
+			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
+				release_pmd(&pmdpage[k]);
+#endif
 			/* Every PGD entry except the Switcher at the top */
 			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
 				release_pgd(lg->pgdirs[i].pgdir + j);
+		}
 }
 
 /* We also throw away everything when a Guest tells us it's changed a kernel
@@ -504,24 +701,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
 {
 	/* Look up the matching shadow page directory entry. */
 	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
+#ifdef CONFIG_X86_PAE
+	pmd_t *spmd;
+#endif
 
 	/* If the top level isn't present, there's no entry to update. */
 	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		/* Otherwise, we start by releasing the existing entry. */
-		pte_t *spte = spte_addr(*spgd, vaddr);
-		release_pte(*spte);
-
-		/* If they're setting this entry as dirty or accessed, we might
-		 * as well put that entry they've given us in now.  This shaves
-		 * 10% off a copy-on-write micro-benchmark. */
-		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
-			check_gpte(cpu, gpte);
-			*spte = gpte_to_spte(cpu, gpte,
-					     pte_flags(gpte) & _PAGE_DIRTY);
-		} else
-			/* Otherwise kill it and we can demand_page() it in
-			 * later. */
-			*spte = __pte(0);
+#ifdef CONFIG_X86_PAE
+		spmd = spmd_addr(cpu, *spgd, vaddr);
+		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
+#endif
+			/* Otherwise, we start by releasing
+			 * the existing entry. */
+			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
+			release_pte(*spte);
+
+			/* If they're setting this entry as dirty or accessed,
+			 * we might as well put that entry they've given us
+			 * in now.  This shaves 10% off a
+			 * copy-on-write micro-benchmark. */
+			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+				check_gpte(cpu, gpte);
+				native_set_pte(spte,
+						gpte_to_spte(cpu, gpte,
+						pte_flags(gpte) & _PAGE_DIRTY));
+			} else
+				/* Otherwise kill it and we can demand_page()
+				 * it in later. */
+				native_set_pte(spte, __pte(0));
+#ifdef CONFIG_X86_PAE
+		}
+#endif
 	}
 }
 
@@ -572,8 +782,6 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 {
 	int pgdir;
 
-	/* The kernel seems to try to initialize this early on: we ignore its
-	 * attempts to map over the Switcher. */
 	if (idx >= SWITCHER_PGD_INDEX)
 		return;
 
@@ -583,6 +791,12 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 		/* ... throw it away. */
 		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
+{
+	guest_pagetable_clear_all(&lg->cpus[0]);
+}
+#endif
 
 /* Once we know how much memory we have we can construct simple identity
  * (which set virtual == physical) and linear mappings
@@ -596,8 +810,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
 {
 	pgd_t __user *pgdir;
 	pte_t __user *linear;
-	unsigned int mapped_pages, i, linear_pages, phys_linear;
 	unsigned long mem_base = (unsigned long)lg->mem_base;
+	unsigned int mapped_pages, i, linear_pages;
+#ifdef CONFIG_X86_PAE
+	pmd_t __user *pmds;
+	unsigned int j;
+	pgd_t pgd;
+	pmd_t pmd;
+#else
+	unsigned int phys_linear;
+#endif
 
 	/* We have mapped_pages frames to map, so we need
 	 * linear_pages page tables to map them. */
@@ -610,6 +832,9 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	/* Now we use the next linear_pages pages as pte pages */
 	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
 
+#ifdef CONFIG_X86_PAE
+	pmds = (void *)linear - PAGE_SIZE;
+#endif
 	/* Linear mapping is easy: put every page's address into the
 	 * mapping in order. */
 	for (i = 0; i < mapped_pages; i++) {
@@ -621,6 +846,22 @@ static unsigned long setup_pagetables(struct lguest *lg,
 
 	/* The top level points to the linear page table pages above.
 	 * We setup the identity and linear mappings here. */
+#ifdef CONFIG_X86_PAE
+	for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD;
+	     i += PTRS_PER_PTE, j++) {
+		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
+		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
+
+		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
+			return -EFAULT;
+	}
+
+	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
+		return -EFAULT;
+#else
 	phys_linear = (unsigned long)linear - mem_base;
 	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
 		pgd_t pgd;
@@ -633,6 +874,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
 				    &pgd, sizeof(pgd)))
 			return -EFAULT;
 	}
+#endif
 
 	/* We return the top level (guest-physical) address: remember where
 	 * this is. */
@@ -648,7 +890,10 @@ int init_guest_pagetable(struct lguest *lg)
 	u64 mem;
 	u32 initrd_size;
 	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-
+#ifdef CONFIG_X86_PAE
+	pgd_t *pgd;
+	pmd_t *pmd_table;
+#endif
 	/* Get the Guest memory size and the ramdisk size from the boot header
 	 * located at lg->mem_base (Guest address 0). */
 	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
@@ -663,6 +908,15 @@ int init_guest_pagetable(struct lguest *lg)
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
+#ifdef CONFIG_X86_PAE
+	pgd = lg->pgdirs[0].pgdir;
+	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
+	if (!pmd_table)
+		return -ENOMEM;
+
+	set_pgd(pgd + SWITCHER_PGD_INDEX,
+		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
+#endif
 	lg->cpus[0].cpu_pgd = 0;
 	return 0;
 }
@@ -672,17 +926,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 {
 	/* We get the kernel address: above this is all kernel memory. */
 	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-	    /* We tell the Guest that it can't use the top 4MB of virtual
-	     * addresses used by the Switcher. */
-	    || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem)
-	    || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir))
+		&cpu->lg->lguest_data->kernel_address)
+		/* We tell the Guest that it can't use the top 2 or 4 MB
+		 * of virtual addresses used by the Switcher. */
+		|| put_user(RESERVE_MEM * 1024 * 1024,
+			&cpu->lg->lguest_data->reserve_mem)
+		|| put_user(cpu->lg->pgdirs[0].gpgdir,
+			&cpu->lg->lguest_data->pgdir))
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
 
 	/* In flush_user_mappings() we loop from 0 to
 	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
 	 * Switcher mappings, so check that now. */
+#ifdef CONFIG_X86_PAE
+	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
+		pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
+#else
 	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
+#endif
 		kill_guest(cpu, "bad kernel address %#lx",
 				 cpu->lg->kernel_address);
 }
@@ -708,16 +969,30 @@ void free_guest_pagetable(struct lguest *lg)
 void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 {
 	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
-	pgd_t switcher_pgd;
 	pte_t regs_pte;
 	unsigned long pfn;
 
+#ifdef CONFIG_X86_PAE
+	pmd_t switcher_pmd;
+	pmd_t *pmd_table;
+
+	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
+		       PAGE_SHIFT, PAGE_KERNEL_EXEC));
+
+	pmd_table = __va(pgd_pfn(cpu->lg->
+			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
+								<< PAGE_SHIFT);
+	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
+#else
+	pgd_t switcher_pgd;
+
 	/* Make the last PGD entry for this Guest point to the Switcher's PTE
 	 * page for this CPU (with appropriate flags). */
 	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
 
 	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
 
+#endif
 	/* We also change the Switcher PTE page.  When we're running the Guest,
 	 * we want the Guest's "regs" page to appear where the first Switcher
 	 * page for this CPU is.  This is an optimization: when the Switcher
-- 
cgit v1.2.3


From 92b4d8df8436cdd74d22a2a5b6b23b9abc737a3e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:08 -0600
Subject: lguest: PAE fixes

1) j wasn't initialized in setup_pagetables, so they weren't set up for me
   causing immediate guest crashes.

2) gpte_addr should not re-read the pmd from the Guest.  Especially
   not BUG_ON() based on the value.  If we ever supported SMP guests,
   they could trigger that.  And the Launcher could also trigger it
   (tho currently root-only).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/page_tables.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 5e2c26adcf0..a6fe1abda24 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -154,26 +154,25 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
 	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
 }
-#endif
 
 static unsigned long gpte_addr(struct lg_cpu *cpu,
-				pgd_t gpgd, unsigned long vaddr)
+			       pmd_t gpmd, unsigned long vaddr)
 {
-#ifdef CONFIG_X86_PAE
-	pmd_t gpmd;
-#endif
-	unsigned long gpage;
+	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
 
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-#ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
 	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
+	return gpage + pte_index(vaddr) * sizeof(pte_t);
+}
 #else
-	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-#endif
+static unsigned long gpte_addr(struct lg_cpu *cpu,
+				pgd_t gpgd, unsigned long vaddr)
+{
+	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
+
+	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
+#endif
 /*:*/
 
 /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
@@ -339,10 +338,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 		 * number in the shadow PMD is the page we just allocated. */
 		native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
 	}
-#endif
+
+	/* OK, now we look at the lower level in the Guest page table: keep its
+	 * address, because we might update it later. */
+	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
+#else
 	/* OK, now we look at the lower level in the Guest page table: keep its
 	 * address, because we might update it later. */
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
+#endif
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -522,7 +526,6 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 {
 	pgd_t gpgd;
 	pte_t gpte;
-
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
@@ -534,13 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 		return -1UL;
 	}
 
-	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
 #ifdef CONFIG_X86_PAE
 	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
 	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
-#endif
+	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
+#else
 	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
+#endif
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
 		kill_guest(cpu, "Bad address %#lx", vaddr);
 
@@ -847,7 +851,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	/* The top level points to the linear page table pages above.
 	 * We setup the identity and linear mappings here. */
 #ifdef CONFIG_X86_PAE
-	for (i = 0, j; i < mapped_pages && j < PTRS_PER_PMD;
+	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
 	     i += PTRS_PER_PTE, j++) {
 		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
 		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-- 
cgit v1.2.3


From 9f155a9b3d5a5444bcc5e049ec2547bb5107150e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:08 -0600
Subject: lguest: allow any process to send interrupts

We currently only allow the Launcher process to send interrupts, but it
as we already send interrupts from the hrtimer, it's a simple matter of
extracting that code into a common set_interrupt routine.

As we switch to a thread per virtqueue, this avoids a bottleneck through the
main Launcher process.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/interrupts_and_traps.c | 19 +++++++++++++++----
 drivers/lguest/lg.h                   |  1 +
 drivers/lguest/lguest_user.c          | 10 ++--------
 3 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 5a10754b479..0e9067b0d50 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -213,6 +213,20 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
 	if (!more)
 		put_user(0, &cpu->lg->lguest_data->irq_pending);
 }
+
+/* And this is the routine when we want to set an interrupt for the Guest. */
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	/* Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt. */
+	set_bit(irq, cpu->irqs_pending);
+
+	/* Make sure it sees it; it might be asleep (eg. halted), or
+	 * running the Guest right now, in which case kick_process()
+	 * will knock it out. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
+}
 /*:*/
 
 /* Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
@@ -528,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
 
 	/* Remember the first interrupt is the timer interrupt. */
-	set_bit(0, cpu->irqs_pending);
-	/* Guest may be stopped or running on another CPU. */
-	if (!wake_up_process(cpu->tsk))
-		kick_process(cpu->tsk);
+	set_interrupt(cpu, 0);
 	return HRTIMER_NORESTART;
 }
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 6201ce59e88..040cb70780e 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -143,6 +143,7 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
 void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index bcdcf3453e7..1982b45bd93 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -45,9 +45,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 		return -EFAULT;
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
-	/* Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt. */
-	set_bit(irq, cpu->irqs_pending);
+
+	set_interrupt(cpu, irq);
 	return 0;
 }
 
@@ -252,11 +251,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		/* Once the Guest is dead, you can only read() why it died. */
 		if (lg->dead)
 			return -ENOENT;
-
-		/* If you're not the task which owns the Guest, all you can do
-		 * is break the Launcher out of running the Guest. */
-		if (current != cpu->tsk && req != LHREQ_BREAK)
-			return -EPERM;
 	}
 
 	switch (req) {
-- 
cgit v1.2.3


From df60aeef4f4fe0645d9a195a7689005520422de5 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:09 -0600
Subject: lguest: use eventfds for device notification

Currently, when a Guest wants to perform I/O it calls LHCALL_NOTIFY with
an address: the main Launcher process returns with this address, and figures
out what device to run.

A far nicer model is to let processes bind an eventfd to an address: if we
find one, we simply signal the eventfd.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Davide Libenzi <davidel@xmailserver.org>
---
 drivers/lguest/Kconfig       |  2 +-
 drivers/lguest/core.c        |  8 ++--
 drivers/lguest/lg.h          | 13 ++++++
 drivers/lguest/lguest_user.c | 98 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 115 insertions(+), 6 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 8f63845db83..0aaa0597a62 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && FUTEX
+	depends on X86_32 && EXPERIMENTAL && EVENTFD
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index d0298dc45d9..508569c9571 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -198,9 +198,11 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* It's possible the Guest did a NOTIFY hypercall to the
 		 * Launcher, in which case we return from the read() now. */
 		if (cpu->pending_notify) {
-			if (put_user(cpu->pending_notify, user))
-				return -EFAULT;
-			return sizeof(cpu->pending_notify);
+			if (!send_notify_to_eventfd(cpu)) {
+				if (put_user(cpu->pending_notify, user))
+					return -EFAULT;
+				return sizeof(cpu->pending_notify);
+			}
 		}
 
 		/* Check for signals */
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 040cb70780e..32fefdc6ad3 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -82,6 +82,16 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
 
+struct lg_eventfd {
+	unsigned long addr;
+	struct file *event;
+};
+
+struct lg_eventfd_map {
+	unsigned int num;
+	struct lg_eventfd map[];
+};
+
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
@@ -102,6 +112,8 @@ struct lguest
 	unsigned int stack_pages;
 	u32 tsc_khz;
 
+	struct lg_eventfd_map *eventfds;
+
 	/* Dead? */
 	const char *dead;
 };
@@ -154,6 +166,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
 		const unsigned long *def);
 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
 void init_clockdev(struct lg_cpu *cpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 1982b45bd93..f6bf255f183 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,6 +7,8 @@
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include "lg.h"
 
 /*L:055 When something happens, the Waker process needs a way to stop the
@@ -35,6 +37,81 @@ static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 	}
 }
 
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
+{
+	unsigned int i;
+	struct lg_eventfd_map *map;
+
+	/* lg->eventfds is RCU-protected */
+	rcu_read_lock();
+	map = rcu_dereference(cpu->lg->eventfds);
+	for (i = 0; i < map->num; i++) {
+		if (map->map[i].addr == cpu->pending_notify) {
+			eventfd_signal(map->map[i].event, 1);
+			cpu->pending_notify = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return cpu->pending_notify == 0;
+}
+
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+	struct lg_eventfd_map *new, *old = lg->eventfds;
+
+	if (!addr)
+		return -EINVAL;
+
+	/* Replace the old array with the new one, carefully: others can
+	 * be accessing it at the same time */
+	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* First make identical copy. */
+	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+	new->num = old->num;
+
+	/* Now append new entry. */
+	new->map[new->num].addr = addr;
+	new->map[new->num].event = eventfd_fget(fd);
+	if (IS_ERR(new->map[new->num].event)) {
+		kfree(new);
+		return PTR_ERR(new->map[new->num].event);
+	}
+	new->num++;
+
+	/* Now put new one in place. */
+	rcu_assign_pointer(lg->eventfds, new);
+
+	/* We're not in a big hurry.  Wait until noone's looking at old
+	 * version, then delete it. */
+	synchronize_rcu();
+	kfree(old);
+
+	return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+	unsigned long addr, fd;
+	int err;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	input++;
+	if (get_user(fd, input) != 0)
+		return -EFAULT;
+
+	mutex_lock(&lguest_lock);
+	err = add_eventfd(lg, addr, fd);
+	mutex_unlock(&lguest_lock);
+
+	return 0;
+}
+
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
  * number to /dev/lguest. */
 static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
@@ -184,6 +261,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
 
+	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+	if (!lg->eventfds) {
+		err = -ENOMEM;
+		goto free_lg;
+	}
+	lg->eventfds->num = 0;
+
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
@@ -191,7 +275,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto release_guest;
+		goto free_eventfds;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
 	 * address the Launcher gave us.  This allocates memory, so can fail. */
@@ -210,7 +294,9 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
 	/* FIXME: This should be in free_vcpu */
 	free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+	kfree(lg->eventfds);
+free_lg:
 	kfree(lg);
 unlock:
 	mutex_unlock(&lguest_lock);
@@ -260,6 +346,8 @@ static ssize_t write(struct file *file, const char __user *in,
 		return user_send_irq(cpu, input);
 	case LHREQ_BREAK:
 		return break_guest_out(cpu, input);
+	case LHREQ_EVENTFD:
+		return attach_eventfd(lg, input);
 	default:
 		return -EINVAL;
 	}
@@ -297,6 +385,12 @@ static int close(struct inode *inode, struct file *file)
 		 * the Launcher's memory management structure. */
 		mmput(lg->cpus[i].mm);
 	}
+
+	/* Release any eventfds they registered. */
+	for (i = 0; i < lg->eventfds->num; i++)
+		fput(lg->eventfds->map[i].event);
+	kfree(lg->eventfds);
+
 	/* If lg->dead doesn't contain an error code it will be NULL or a
 	 * kmalloc()ed string, either of which is ok to hand to kfree(). */
 	if (!IS_ERR(lg->dead))
-- 
cgit v1.2.3


From 5dac051bc6030963181b69faddd9e0ad04f85fa8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 12 Jun 2009 22:27:10 -0600
Subject: lguest: remove obsolete LHREQ_BREAK call

We no longer need an efficient mechanism to force the Guest back into
host userspace, as each device is serviced without bothering the main
Guest process (aka. the Launcher).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/lguest/core.c        | 11 +++--------
 drivers/lguest/lg.h          |  4 +---
 drivers/lguest/lguest_user.c | 31 -------------------------------
 3 files changed, 4 insertions(+), 42 deletions(-)

(limited to 'drivers/lguest')

diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 508569c9571..a6974e9b8eb 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -209,10 +209,6 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
-		/* If Waker set break_out, return to Launcher. */
-		if (cpu->break_out)
-			return -EAGAIN;
-
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
@@ -231,13 +227,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			break;
 
 		/* If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
+		 * clock timer will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			/* Just before we sleep, make sure nothing snuck in
+			/* Just before we sleep, make sure no interrupt snuck in
 			 * which we should be doing. */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS
-			    || cpu->break_out)
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
 				set_current_state(TASK_RUNNING);
 			else
 				schedule();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 32fefdc6ad3..d4e8979735c 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -71,9 +71,7 @@ struct lg_cpu {
 	/* Virtual clock device */
 	struct hrtimer hrt;
 
-	/* Do we need to stop what we're doing and return to userspace? */
-	int break_out;
-	wait_queue_head_t break_wq;
+	/* Did the Guest tell us to halt? */
 	int halted;
 
 	/* Pending virtual interrupts */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index f6bf255f183..32e29712105 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -11,32 +11,6 @@
 #include <linux/file.h>
 #include "lg.h"
 
-/*L:055 When something happens, the Waker process needs a way to stop the
- * kernel running the Guest and return to the Launcher.  So the Waker writes
- * LHREQ_BREAK and the value "1" to /dev/lguest to do this.  Once the Launcher
- * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
- * the Waker. */
-static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
-{
-	unsigned long on;
-
-	/* Fetch whether they're turning break on or off. */
-	if (get_user(on, input) != 0)
-		return -EFAULT;
-
-	if (on) {
-		cpu->break_out = 1;
-		if (!wake_up_process(cpu->tsk))
-			kick_process(cpu->tsk);
-		/* Wait for them to reset it */
-		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
-	} else {
-		cpu->break_out = 0;
-		wake_up(&cpu->break_wq);
-		return 0;
-	}
-}
-
 bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
 	unsigned int i;
@@ -202,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
 
-	/* Initialize the queue for the Waker to wait on */
-	init_waitqueue_head(&cpu->break_wq);
-
 	/* We keep a pointer to the Launcher task (ie. current task) for when
 	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
@@ -344,8 +315,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		return initialize(file, input);
 	case LHREQ_IRQ:
 		return user_send_irq(cpu, input);
-	case LHREQ_BREAK:
-		return break_guest_out(cpu, input);
 	case LHREQ_EVENTFD:
 		return attach_eventfd(lg, input);
 	default:
-- 
cgit v1.2.3