24 files changed, 876 insertions, 551 deletions
diff --git a/arch/mips/au1000/Kconfig b/arch/mips/au1000/Kconfig
index 29c95d97217..a23d4154da0 100644
--- a/arch/mips/au1000/Kconfig
+++ b/arch/mips/au1000/Kconfig
@@ -137,6 +137,7 @@ config SOC_AU1200
 config SOC_AU1X00
 	bool
 	select 64BIT_PHYS_ADDR
+	select IRQ_CPU
 	select SYS_HAS_CPU_MIPS32_R1
 	select SYS_SUPPORTS_32BIT_KERNEL
 	select SYS_SUPPORTS_APM_EMULATION
diff --git a/arch/mips/au1000/common/irq.c b/arch/mips/au1000/common/irq.c
index c00f308fd50..59e932a928d 100644
--- a/arch/mips/au1000/common/irq.c
+++ b/arch/mips/au1000/common/irq.c
@@ -1,11 +1,10 @@
 /*
- * BRIEF MODULE DESCRIPTION
- *	Au1000 interrupt routines.
- *
  * Copyright 2001 MontaVista Software Inc.
  * Author: MontaVista Software, Inc.
  *		ppopov@mvista.com or source@mvista.com
  *
+ * Copyright (C) 2007 Ralf Baechle (ralf@linux-mips.org)
+ *
  *  This program is free software; you can redistribute	 it and/or modify it
  *  under  the terms of	 the GNU General  Public License as published by the
  *  Free Software Foundation;  either version 2 of the	License, or (at your
@@ -32,6 +31,7 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 
+#include <asm/irq_cpu.h>
 #include <asm/mipsregs.h>
 #include <asm/mach-au1x00/au1000.h>
 #ifdef CONFIG_MIPS_PB1000
@@ -44,7 +44,7 @@
 #define EXT_INTC1_REQ1 5 /* IP 5 */
 #define MIPS_TIMER_IP  7 /* IP 7 */
 
-void	(*board_init_irq)(void);
+void (*board_init_irq)(void) __initdata = NULL;
 
 static DEFINE_SPINLOCK(irq_lock);
 
@@ -134,12 +134,14 @@ void restore_au1xxx_intctl(void)
 
 inline void local_enable_irq(unsigned int irq_nr)
 {
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
-		au_writel(1 << (irq_nr - 32), IC1_MASKSET);
-		au_writel(1 << (irq_nr - 32), IC1_WAKESET);
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	if (bit >= 32) {
+		au_writel(1 << (bit - 32), IC1_MASKSET);
+		au_writel(1 << (bit - 32), IC1_WAKESET);
 	} else {
-		au_writel(1 << irq_nr, IC0_MASKSET);
-		au_writel(1 << irq_nr, IC0_WAKESET);
+		au_writel(1 << bit, IC0_MASKSET);
+		au_writel(1 << bit, IC0_WAKESET);
 	}
 	au_sync();
 }
@@ -147,12 +149,14 @@ inline void local_enable_irq(unsigned int irq_nr)
 
 inline void local_disable_irq(unsigned int irq_nr)
 {
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
-		au_writel(1 << (irq_nr - 32), IC1_MASKCLR);
-		au_writel(1 << (irq_nr - 32), IC1_WAKECLR);
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	if (bit >= 32) {
+		au_writel(1 << (bit - 32), IC1_MASKCLR);
+		au_writel(1 << (bit - 32), IC1_WAKECLR);
 	} else {
-		au_writel(1 << irq_nr, IC0_MASKCLR);
-		au_writel(1 << irq_nr, IC0_WAKECLR);
+		au_writel(1 << bit, IC0_MASKCLR);
+		au_writel(1 << bit, IC0_WAKECLR);
 	}
 	au_sync();
 }
@@ -160,12 +164,14 @@ inline void local_disable_irq(unsigned int irq_nr)
 
 static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr)
 {
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
-		au_writel(1 << (irq_nr - 32), IC1_RISINGCLR);
-		au_writel(1 << (irq_nr - 32), IC1_MASKCLR);
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	if (bit >= 32) {
+		au_writel(1 << (bit - 32), IC1_RISINGCLR);
+		au_writel(1 << (bit - 32), IC1_MASKCLR);
 	} else {
-		au_writel(1 << irq_nr, IC0_RISINGCLR);
-		au_writel(1 << irq_nr, IC0_MASKCLR);
+		au_writel(1 << bit, IC0_RISINGCLR);
+		au_writel(1 << bit, IC0_MASKCLR);
 	}
 	au_sync();
 }
@@ -173,12 +179,14 @@ static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr)
 
 static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr)
 {
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
-		au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR);
-		au_writel(1 << (irq_nr - 32), IC1_MASKCLR);
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	if (bit >= 32) {
+		au_writel(1 << (bit - 32), IC1_FALLINGCLR);
+		au_writel(1 << (bit - 32), IC1_MASKCLR);
 	} else {
-		au_writel(1 << irq_nr, IC0_FALLINGCLR);
-		au_writel(1 << irq_nr, IC0_MASKCLR);
+		au_writel(1 << bit, IC0_FALLINGCLR);
+		au_writel(1 << bit, IC0_MASKCLR);
 	}
 	au_sync();
 }
@@ -186,17 +194,20 @@ static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr)
 
 static inline void mask_and_ack_either_edge_irq(unsigned int irq_nr)
 {
-	/* This may assume that we don't get interrupts from
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	/*
+	 * This may assume that we don't get interrupts from
 	 * both edges at once, or if we do, that we don't care.
 	 */
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
-		au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR);
-		au_writel(1 << (irq_nr - 32), IC1_RISINGCLR);
-		au_writel(1 << (irq_nr - 32), IC1_MASKCLR);
+	if (bit >= 32) {
+		au_writel(1 << (bit - 32), IC1_FALLINGCLR);
+		au_writel(1 << (bit - 32), IC1_RISINGCLR);
+		au_writel(1 << (bit - 32), IC1_MASKCLR);
 	} else {
-		au_writel(1 << irq_nr, IC0_FALLINGCLR);
-		au_writel(1 << irq_nr, IC0_RISINGCLR);
-		au_writel(1 << irq_nr, IC0_MASKCLR);
+		au_writel(1 << bit, IC0_FALLINGCLR);
+		au_writel(1 << bit, IC0_RISINGCLR);
+		au_writel(1 << bit, IC0_MASKCLR);
 	}
 	au_sync();
 }
@@ -213,10 +224,8 @@ static inline void mask_and_ack_level_irq(unsigned int irq_nr)
 		au_sync();
 	}
 #endif
-	return;
 }
 
-
 static void end_irq(unsigned int irq_nr)
 {
 	if (!(irq_desc[irq_nr].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
@@ -341,114 +350,118 @@ void startup_match20_interrupt(irq_handler_t handler)
 }
 #endif
 
-static void setup_local_irq(unsigned int irq_nr, int type, int int_req)
+static void __init setup_local_irq(unsigned int irq_nr, int type, int int_req)
 {
-	if (irq_nr > AU1000_MAX_INTR) return;
+	unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
+
+	if (irq_nr > AU1000_MAX_INTR)
+		return;
+
 	/* Config2[n], Config1[n], Config0[n] */
-	if (irq_nr > AU1000_LAST_INTC0_INT) {
+	if (bit >= 32) {
 		switch (type) {
 		case INTC_INT_RISE_EDGE: /* 0:0:1 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG2CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0SET);
+			au_writel(1 << (bit - 32), IC1_CFG2CLR);
+			au_writel(1 << (bit - 32), IC1_CFG1CLR);
+			au_writel(1 << (bit - 32), IC1_CFG0SET);
 			set_irq_chip(irq_nr, &rise_edge_irq_type);
 			break;
 		case INTC_INT_FALL_EDGE: /* 0:1:0 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG2CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1SET);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0CLR);
+			au_writel(1 << (bit - 32), IC1_CFG2CLR);
+			au_writel(1 << (bit - 32), IC1_CFG1SET);
+			au_writel(1 << (bit - 32), IC1_CFG0CLR);
 			set_irq_chip(irq_nr, &fall_edge_irq_type);
 			break;
 		case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG2CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1SET);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0SET);
+			au_writel(1 << (bit - 32), IC1_CFG2CLR);
+			au_writel(1 << (bit - 32), IC1_CFG1SET);
+			au_writel(1 << (bit - 32), IC1_CFG0SET);
 			set_irq_chip(irq_nr, &either_edge_irq_type);
 			break;
 		case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG2SET);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0SET);
+			au_writel(1 << (bit - 32), IC1_CFG2SET);
+			au_writel(1 << (bit - 32), IC1_CFG1CLR);
+			au_writel(1 << (bit - 32), IC1_CFG0SET);
 			set_irq_chip(irq_nr, &level_irq_type);
 			break;
 		case INTC_INT_LOW_LEVEL: /* 1:1:0 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG2SET);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1SET);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0CLR);
+			au_writel(1 << (bit - 32), IC1_CFG2SET);
+			au_writel(1 << (bit - 32), IC1_CFG1SET);
+			au_writel(1 << (bit - 32), IC1_CFG0CLR);
 			set_irq_chip(irq_nr, &level_irq_type);
 			break;
 		case INTC_INT_DISABLED: /* 0:0:0 */
-			au_writel(1 << (irq_nr - 32), IC1_CFG0CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG2CLR);
+			au_writel(1 << (bit - 32), IC1_CFG0CLR);
+			au_writel(1 << (bit - 32), IC1_CFG1CLR);
+			au_writel(1 << (bit - 32), IC1_CFG2CLR);
 			break;
 		default: /* disable the interrupt */
 			printk(KERN_WARNING "unexpected int type %d (irq %d)\n",
 			       type, irq_nr);
-			au_writel(1 << (irq_nr - 32), IC1_CFG0CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG1CLR);
-			au_writel(1 << (irq_nr - 32), IC1_CFG2CLR);
+			au_writel(1 << (bit - 32), IC1_CFG0CLR);
+			au_writel(1 << (bit - 32), IC1_CFG1CLR);
+			au_writel(1 << (bit - 32), IC1_CFG2CLR);
 			return;
 		}
 		if (int_req) /* assign to interrupt request 1 */
-			au_writel(1 << (irq_nr - 32), IC1_ASSIGNCLR);
+			au_writel(1 << (bit - 32), IC1_ASSIGNCLR);
 		else	     /* assign to interrupt request 0 */
-			au_writel(1 << (irq_nr - 32), IC1_ASSIGNSET);
-		au_writel(1 << (irq_nr - 32), IC1_SRCSET);
-		au_writel(1 << (irq_nr - 32), IC1_MASKCLR);
-		au_writel(1 << (irq_nr - 32), IC1_WAKECLR);
+			au_writel(1 << (bit - 32), IC1_ASSIGNSET);
+		au_writel(1 << (bit - 32), IC1_SRCSET);
+		au_writel(1 << (bit - 32), IC1_MASKCLR);
+		au_writel(1 << (bit - 32), IC1_WAKECLR);
 	} else {
 		switch (type) {
 		case INTC_INT_RISE_EDGE: /* 0:0:1 */
-			au_writel(1 << irq_nr, IC0_CFG2CLR);
-			au_writel(1 << irq_nr, IC0_CFG1CLR);
-			au_writel(1 << irq_nr, IC0_CFG0SET);
+			au_writel(1 << bit, IC0_CFG2CLR);
+			au_writel(1 << bit, IC0_CFG1CLR);
+			au_writel(1 << bit, IC0_CFG0SET);
 			set_irq_chip(irq_nr, &rise_edge_irq_type);
 			break;
 		case INTC_INT_FALL_EDGE: /* 0:1:0 */
-			au_writel(1 << irq_nr, IC0_CFG2CLR);
-			au_writel(1 << irq_nr, IC0_CFG1SET);
-			au_writel(1 << irq_nr, IC0_CFG0CLR);
+			au_writel(1 << bit, IC0_CFG2CLR);
+			au_writel(1 << bit, IC0_CFG1SET);
+			au_writel(1 << bit, IC0_CFG0CLR);
 			set_irq_chip(irq_nr, &fall_edge_irq_type);
 			break;
 		case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
-			au_writel(1 << irq_nr, IC0_CFG2CLR);
-			au_writel(1 << irq_nr, IC0_CFG1SET);
-			au_writel(1 << irq_nr, IC0_CFG0SET);
+			au_writel(1 << bit, IC0_CFG2CLR);
+			au_writel(1 << bit, IC0_CFG1SET);
+			au_writel(1 << bit, IC0_CFG0SET);
 			set_irq_chip(irq_nr, &either_edge_irq_type);
 			break;
 		case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
-			au_writel(1 << irq_nr, IC0_CFG2SET);
-			au_writel(1 << irq_nr, IC0_CFG1CLR);
-			au_writel(1 << irq_nr, IC0_CFG0SET);
+			au_writel(1 << bit, IC0_CFG2SET);
+			au_writel(1 << bit, IC0_CFG1CLR);
+			au_writel(1 << bit, IC0_CFG0SET);
 			set_irq_chip(irq_nr, &level_irq_type);
 			break;
 		case INTC_INT_LOW_LEVEL: /* 1:1:0 */
-			au_writel(1 << irq_nr, IC0_CFG2SET);
-			au_writel(1 << irq_nr, IC0_CFG1SET);
-			au_writel(1 << irq_nr, IC0_CFG0CLR);
+			au_writel(1 << bit, IC0_CFG2SET);
+			au_writel(1 << bit, IC0_CFG1SET);
+			au_writel(1 << bit, IC0_CFG0CLR);
 			set_irq_chip(irq_nr, &level_irq_type);
 			break;
 		case INTC_INT_DISABLED: /* 0:0:0 */
-			au_writel(1 << irq_nr, IC0_CFG0CLR);
-			au_writel(1 << irq_nr, IC0_CFG1CLR);
-			au_writel(1 << irq_nr, IC0_CFG2CLR);
+			au_writel(1 << bit, IC0_CFG0CLR);
+			au_writel(1 << bit, IC0_CFG1CLR);
+			au_writel(1 << bit, IC0_CFG2CLR);
 			break;
 		default: /* disable the interrupt */
 			printk(KERN_WARNING "unexpected int type %d (irq %d)\n",
 			       type, irq_nr);
-			au_writel(1 << irq_nr, IC0_CFG0CLR);
-			au_writel(1 << irq_nr, IC0_CFG1CLR);
-			au_writel(1 << irq_nr, IC0_CFG2CLR);
+			au_writel(1 << bit, IC0_CFG0CLR);
+			au_writel(1 << bit, IC0_CFG1CLR);
+			au_writel(1 << bit, IC0_CFG2CLR);
 			return;
 		}
 		if (int_req) /* assign to interrupt request 1 */
-			au_writel(1 << irq_nr, IC0_ASSIGNCLR);
+			au_writel(1 << bit, IC0_ASSIGNCLR);
 		else	     /* assign to interrupt request 0 */
-			au_writel(1 << irq_nr, IC0_ASSIGNSET);
-		au_writel(1 << irq_nr, IC0_SRCSET);
-		au_writel(1 << irq_nr, IC0_MASKCLR);
-		au_writel(1 << irq_nr, IC0_WAKECLR);
+			au_writel(1 << bit, IC0_ASSIGNSET);
+		au_writel(1 << bit, IC0_SRCSET);
+		au_writel(1 << bit, IC0_MASKCLR);
+		au_writel(1 << bit, IC0_WAKECLR);
 	}
 	au_sync();
 }
@@ -461,8 +474,8 @@ static void setup_local_irq(unsigned int irq_nr, int type, int int_req)
 
 static void intc0_req0_irqdispatch(void)
 {
-	int irq = 0;
 	static unsigned long intc0_req0;
+	unsigned int bit;
 
 	intc0_req0 |= au_readl(IC0_REQ0INT);
 
@@ -481,25 +494,25 @@ static void intc0_req0_irqdispatch(void)
 		return;
 	}
 #endif
-	irq = ffs(intc0_req0);
-	intc0_req0 &= ~(1 << irq);
-	do_IRQ(irq);
+	bit = ffs(intc0_req0);
+	intc0_req0 &= ~(1 << bit);
+	do_IRQ(MIPS_CPU_IRQ_BASE + bit);
 }
 
 
 static void intc0_req1_irqdispatch(void)
 {
-	int irq = 0;
 	static unsigned long intc0_req1;
+	unsigned int bit;
 
 	intc0_req1 |= au_readl(IC0_REQ1INT);
 
 	if (!intc0_req1)
 		return;
 
-	irq = ffs(intc0_req1);
-	intc0_req1 &= ~(1 << irq);
-	do_IRQ(irq);
+	bit = ffs(intc0_req1);
+	intc0_req1 &= ~(1 << bit);
+	do_IRQ(bit);
 }
 
 
@@ -509,43 +522,41 @@ static void intc0_req1_irqdispatch(void)
  */
 static void intc1_req0_irqdispatch(void)
 {
-	int irq = 0;
 	static unsigned long intc1_req0;
+	unsigned int bit;
 
 	intc1_req0 |= au_readl(IC1_REQ0INT);
 
 	if (!intc1_req0)
 		return;
 
-	irq = ffs(intc1_req0);
-	intc1_req0 &= ~(1 << irq);
-	irq += 32;
-	do_IRQ(irq);
+	bit = ffs(intc1_req0);
+	intc1_req0 &= ~(1 << bit);
+	do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit);
 }
 
 
 static void intc1_req1_irqdispatch(void)
 {
-	int irq = 0;
 	static unsigned long intc1_req1;
+	unsigned int bit;
 
 	intc1_req1 |= au_readl(IC1_REQ1INT);
 
 	if (!intc1_req1)
 		return;
 
-	irq = ffs(intc1_req1);
-	intc1_req1 &= ~(1 << irq);
-	irq += 32;
-	do_IRQ(irq);
+	bit = ffs(intc1_req1);
+	intc1_req1 &= ~(1 << bit);
+	do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit);
 }
 
 asmlinkage void plat_irq_dispatch(void)
 {
-	unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM;
+	unsigned int pending = read_c0_status() & read_c0_cause();
 
 	if (pending & CAUSEF_IP7)
-		do_IRQ(63);
+		do_IRQ(MIPS_CPU_IRQ_BASE + 7);
 	else if (pending & CAUSEF_IP2)
 		intc0_req0_irqdispatch();
 	else if (pending & CAUSEF_IP3)
@@ -561,17 +572,15 @@ asmlinkage void plat_irq_dispatch(void)
 void __init arch_init_irq(void)
 {
 	int i;
-	unsigned long cp0_status;
 	struct au1xxx_irqmap *imp;
 	extern struct au1xxx_irqmap au1xxx_irq_map[];
 	extern struct au1xxx_irqmap au1xxx_ic0_map[];
 	extern int au1xxx_nr_irqs;
 	extern int au1xxx_ic0_nr_irqs;
 
-	cp0_status = read_c0_status();
-
-	/* Initialize interrupt controllers to a safe state.
-	*/
+	/*
+	 * Initialize interrupt controllers to a safe state.
+	 */
 	au_writel(0xffffffff, IC0_CFG0CLR);
 	au_writel(0xffffffff, IC0_CFG1CLR);
 	au_writel(0xffffffff, IC0_CFG2CLR);
@@ -594,16 +603,20 @@ void __init arch_init_irq(void)
 	au_writel(0xffffffff, IC1_RISINGCLR);
 	au_writel(0x00000000, IC1_TESTBIT);
 
-	/* Initialize IC0, which is fixed per processor.
-	*/
+	mips_cpu_irq_init();
+
+	/*
+	 * Initialize IC0, which is fixed per processor.
+	 */
 	imp = au1xxx_ic0_map;
 	for (i = 0; i < au1xxx_ic0_nr_irqs; i++) {
 		setup_local_irq(imp->im_irq, imp->im_type, imp->im_request);
 		imp++;
 	}
 
-	/* Now set up the irq mapping for the board.
-	*/
+	/*
+	 * Now set up the irq mapping for the board.
+	 */
 	imp = au1xxx_irq_map;
 	for (i = 0; i < au1xxx_nr_irqs; i++) {
 		setup_local_irq(imp->im_irq, imp->im_type, imp->im_request);
@@ -615,5 +628,5 @@ void __init arch_init_irq(void)
 	/* Board specific IRQ initialization.
 	*/
 	if (board_init_irq)
-		(*board_init_irq)();
+		board_init_irq();
 }
diff --git a/arch/mips/au1000/common/power.c b/arch/mips/au1000/common/power.c
index 6f57f72a7d5..54047d69b82 100644
--- a/arch/mips/au1000/common/power.c
+++ b/arch/mips/au1000/common/power.c
@@ -403,9 +403,9 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file,
 	}
 
 
-	/* We don't want _any_ interrupts other than
-	 * match20. Otherwise our au1000_calibrate_delay()
-	 * calculation will be off, potentially a lot.
+	/*
+	 * We don't want _any_ interrupts other than match20. Otherwise our
+	 * au1000_calibrate_delay() calculation will be off, potentially a lot.
 	 */
 	intc0_mask = save_local_and_disable(0);
 	intc1_mask = save_local_and_disable(1);
@@ -414,6 +414,7 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file,
 	au1000_calibrate_delay();
 	restore_local_and_enable(0, intc0_mask);
 	restore_local_and_enable(1, intc1_mask);
+
 	return retval;
 }
 
diff --git a/arch/mips/au1000/pb1200/irqmap.c b/arch/mips/au1000/pb1200/irqmap.c
index 3bee274445f..5f48b060379 100644
--- a/arch/mips/au1000/pb1200/irqmap.c
+++ b/arch/mips/au1000/pb1200/irqmap.c
@@ -74,7 +74,7 @@ irqreturn_t pb1200_cascade_handler( int irq, void *dev_id)
 	bcsr->int_status = bisr;
 	for( ; bisr; bisr &= (bisr-1) )
 	{
-		extirq_nr = PB1200_INT_BEGIN + au_ffs(bisr);
+		extirq_nr = PB1200_INT_BEGIN + ffs(bisr);
 		/* Ack and dispatch IRQ */
 		do_IRQ(extirq_nr);
 	}
diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig
index 0280ef389d8..b536d7c6379 100644
--- a/arch/mips/configs/mtx1_defconfig
+++ b/arch/mips/configs/mtx1_defconfig
@@ -3021,7 +3021,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_DEBUG_FS is not set
 # CONFIG_HEADERS_CHECK is not set
 # CONFIG_DEBUG_KERNEL is not set
-# CONFIG_CROSSCOMPILE is not set
+CONFIG_CROSSCOMPILE=y
 CONFIG_CMDLINE=""
 CONFIG_SYS_SUPPORTS_KGDB=y
 
diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S
index e46782b0ebc..bf164a562ac 100644
--- a/arch/mips/kernel/head.S
+++ b/arch/mips/kernel/head.S
@@ -140,7 +140,7 @@
 
 EXPORT(_stext)
 
-#ifndef CONFIG_BOOT_RAW
+#ifdef CONFIG_BOOT_RAW
 	/*
 	 * Give us a fighting chance of running if execution beings at the
 	 * kernel load address.  This is needed because this platform does
@@ -149,6 +149,8 @@ EXPORT(_stext)
 	__INIT
 #endif
 
+	__INIT_REFOK
+
 NESTED(kernel_entry, 16, sp)			# kernel entry point
 
 	kernel_entry_setup			# cpu specific setup
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 05b365167a0..e4b5e647b14 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -391,6 +391,50 @@ static void mips_event_handler(struct clock_event_device *dev)
 {
 }
 
+/*
+ * FIXME: This doesn't hold for the relocated E9000 compare interrupt.
+ */
+static int c0_compare_int_pending(void)
+{
+	return (read_c0_cause() >> cp0_compare_irq) & 0x100;
+}
+
+static int c0_compare_int_usable(void)
+{
+	const unsigned int delta = 0x300000;
+	unsigned int cnt;
+
+	/*
+	 * IP7 already pending?  Try to clear it by acking the timer.
+	 */
+	if (c0_compare_int_pending()) {
+		write_c0_compare(read_c0_compare());
+		irq_disable_hazard();
+		if (c0_compare_int_pending())
+			return 0;
+	}
+
+	cnt = read_c0_count();
+	cnt += delta;
+	write_c0_compare(cnt);
+
+	while ((long)(read_c0_count() - cnt) <= 0)
+		;	/* Wait for expiry  */
+
+	if (!c0_compare_int_pending())
+		return 0;
+
+	write_c0_compare(read_c0_compare());
+	irq_disable_hazard();
+	if (c0_compare_int_pending())
+		return 0;
+
+	/*
+	 * Feels like a real count / compare timer.
+	 */
+	return 1;
+}
+
 void __cpuinit mips_clockevent_init(void)
 {
 	uint64_t mips_freq = mips_hpt_frequency;
@@ -412,6 +456,9 @@ void __cpuinit mips_clockevent_init(void)
 		return;
 #endif
 
+	if (!c0_compare_int_usable())
+		return;
+
 	cd = &per_cpu(mips_clockevent_device, cpu);
 
 	cd->name		= "MIPS";
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 9c0c478d71a..bbf01b81a4f 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -9,9 +9,10 @@
  * Copyright (C) 1999 Silicon Graphics, Inc.
  * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
  * Copyright (C) 2000, 01 MIPS Technologies, Inc.
- * Copyright (C) 2002, 2003, 2004, 2005  Maciej W. Rozycki
+ * Copyright (C) 2002, 2003, 2004, 2005, 2007  Maciej W. Rozycki
  */
 #include <linux/bug.h>
+#include <linux/compiler.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -410,7 +411,7 @@ asmlinkage void do_be(struct pt_regs *regs)
 }
 
 /*
- * ll/sc emulation
+ * ll/sc, rdhwr, sync emulation
  */
 
 #define OPCODE 0xfc000000
@@ -419,9 +420,11 @@ asmlinkage void do_be(struct pt_regs *regs)
 #define OFFSET 0x0000ffff
 #define LL     0xc0000000
 #define SC     0xe0000000
+#define SPEC0  0x00000000
 #define SPEC3  0x7c000000
 #define RD     0x0000f800
 #define FUNC   0x0000003f
+#define SYNC   0x0000000f
 #define RDHWR  0x0000003b
 
 /*
@@ -432,11 +435,10 @@ unsigned long ll_bit;
 
 static struct task_struct *ll_task = NULL;
 
-static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode)
+static inline int simulate_ll(struct pt_regs *regs, unsigned int opcode)
 {
 	unsigned long value, __user *vaddr;
 	long offset;
-	int signal = 0;
 
 	/*
 	 * analyse the ll instruction that just caused a ri exception
@@ -451,14 +453,10 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode)
 	vaddr = (unsigned long __user *)
 	        ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset);
 
-	if ((unsigned long)vaddr & 3) {
-		signal = SIGBUS;
-		goto sig;
-	}
-	if (get_user(value, vaddr)) {
-		signal = SIGSEGV;
-		goto sig;
-	}
+	if ((unsigned long)vaddr & 3)
+		return SIGBUS;
+	if (get_user(value, vaddr))
+		return SIGSEGV;
 
 	preempt_disable();
 
@@ -471,22 +469,16 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode)
 
 	preempt_enable();
 
-	compute_return_epc(regs);
-
 	regs->regs[(opcode & RT) >> 16] = value;
 
-	return;
-
-sig:
-	force_sig(signal, current);
+	return 0;
 }
 
-static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode)
+static inline int simulate_sc(struct pt_regs *regs, unsigned int opcode)
 {
 	unsigned long __user *vaddr;
 	unsigned long reg;
 	long offset;
-	int signal = 0;
 
 	/*
 	 * analyse the sc instruction that just caused a ri exception
@@ -502,34 +494,25 @@ static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode)
 	        ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset);
 	reg = (opcode & RT) >> 16;
 
-	if ((unsigned long)vaddr & 3) {
-		signal = SIGBUS;
-		goto sig;
-	}
+	if ((unsigned long)vaddr & 3)
+		return SIGBUS;
 
 	preempt_disable();
 
 	if (ll_bit == 0 || ll_task != current) {
-		compute_return_epc(regs);
 		regs->regs[reg] = 0;
 		preempt_enable();
-		return;
+		return 0;
 	}
 
 	preempt_enable();
 
-	if (put_user(regs->regs[reg], vaddr)) {
-		signal = SIGSEGV;
-		goto sig;
-	}
+	if (put_user(regs->regs[reg], vaddr))
+		return SIGSEGV;
 
-	compute_return_epc(regs);
 	regs->regs[reg] = 1;
 
-	return;
-
-sig:
-	force_sig(signal, current);
+	return 0;
 }
 
 /*
@@ -539,27 +522,14 @@ sig:
  * few processors such as NEC's VR4100 throw reserved instruction exceptions
  * instead, so we're doing the emulation thing in both exception handlers.
  */
-static inline int simulate_llsc(struct pt_regs *regs)
+static int simulate_llsc(struct pt_regs *regs, unsigned int opcode)
 {
-	unsigned int opcode;
-
-	if (get_user(opcode, (unsigned int __user *) exception_epc(regs)))
-		goto out_sigsegv;
-
-	if ((opcode & OPCODE) == LL) {
-		simulate_ll(regs, opcode);
-		return 0;
-	}
-	if ((opcode & OPCODE) == SC) {
-		simulate_sc(regs, opcode);
-		return 0;
-	}
-
-	return -EFAULT;			/* Strange things going on ... */
+	if ((opcode & OPCODE) == LL)
+		return simulate_ll(regs, opcode);
+	if ((opcode & OPCODE) == SC)
+		return simulate_sc(regs, opcode);
 
-out_sigsegv:
-	force_sig(SIGSEGV, current);
-	return -EFAULT;
+	return -1;			/* Must be something else ... */
 }
 
 /*
@@ -567,16 +537,9 @@ out_sigsegv:
  * registers not implemented in hardware.  The only current use of this
  * is the thread area pointer.
  */
-static inline int simulate_rdhwr(struct pt_regs *regs)
+static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode)
 {
 	struct thread_info *ti = task_thread_info(current);
-	unsigned int opcode;
-
-	if (get_user(opcode, (unsigned int __user *) exception_epc(regs)))
-		goto out_sigsegv;
-
-	if (unlikely(compute_return_epc(regs)))
-		return -EFAULT;
 
 	if ((opcode & OPCODE) == SPEC3 && (opcode & FUNC) == RDHWR) {
 		int rd = (opcode & RD) >> 11;
@@ -586,16 +549,20 @@ static inline int simulate_rdhwr(struct pt_regs *regs)
 				regs->regs[rt] = ti->tp_value;
 				return 0;
 			default:
-				return -EFAULT;
+				return -1;
 		}
 	}
 
 	/* Not ours.  */
-	return -EFAULT;
+	return -1;
+}
 
-out_sigsegv:
-	force_sig(SIGSEGV, current);
-	return -EFAULT;
+static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
+{
+	if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC)
+		return 0;
+
+	return -1;			/* Must be something else ... */
 }
 
 asmlinkage void do_ov(struct pt_regs *regs)
@@ -767,16 +734,35 @@ out_sigsegv:
 
 asmlinkage void do_ri(struct pt_regs *regs)
 {
-	die_if_kernel("Reserved instruction in kernel code", regs);
+	unsigned int __user *epc = (unsigned int __user *)exception_epc(regs);
+	unsigned long old_epc = regs->cp0_epc;
+	unsigned int opcode = 0;
+	int status = -1;
 
-	if (!cpu_has_llsc)
-		if (!simulate_llsc(regs))
-			return;
+	die_if_kernel("Reserved instruction in kernel code", regs);
 
-	if (!simulate_rdhwr(regs))
+	if (unlikely(compute_return_epc(regs) < 0))
 		return;
 
-	force_sig(SIGILL, current);
+	if (unlikely(get_user(opcode, epc) < 0))
+		status = SIGSEGV;
+
+	if (!cpu_has_llsc && status < 0)
+		status = simulate_llsc(regs, opcode);
+
+	if (status < 0)
+		status = simulate_rdhwr(regs, opcode);
+
+	if (status < 0)
+		status = simulate_sync(regs, opcode);
+
+	if (status < 0)
+		status = SIGILL;
+
+	if (unlikely(status > 0)) {
+		regs->cp0_epc = old_epc;		/* Undo skip-over.  */
+		force_sig(status, current);
+	}
 }
 
 /*
@@ -808,7 +794,11 @@ static void mt_ase_fp_affinity(void)
 
 asmlinkage void do_cpu(struct pt_regs *regs)
 {
+	unsigned int __user *epc;
+	unsigned long old_epc;
+	unsigned int opcode;
 	unsigned int cpid;
+	int status;
 
 	die_if_kernel("do_cpu invoked from kernel context!", regs);
 
@@ -816,14 +806,32 @@ asmlinkage void do_cpu(struct pt_regs *regs)
 
 	switch (cpid) {
 	case 0:
-		if (!cpu_has_llsc)
-			if (!simulate_llsc(regs))
-				return;
+		epc = (unsigned int __user *)exception_epc(regs);
+		old_epc = regs->cp0_epc;
+		opcode = 0;
+		status = -1;
 
-		if (!simulate_rdhwr(regs))
+		if (unlikely(compute_return_epc(regs) < 0))
 			return;
 
-		break;
+		if (unlikely(get_user(opcode, epc) < 0))
+			status = SIGSEGV;
+
+		if (!cpu_has_llsc && status < 0)
+			status = simulate_llsc(regs, opcode);
+
+		if (status < 0)
+			status = simulate_rdhwr(regs, opcode);
+
+		if (status < 0)
+			status = SIGILL;
+
+		if (unlikely(status > 0)) {
+			regs->cp0_epc = old_epc;	/* Undo skip-over.  */
+			force_sig(status, current);
+		}
+
+		return;
 
 	case 1:
 		if (used_math())	/* Using the FPU again.  */
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 9b9bffd2e8f..10e50549165 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -192,12 +192,3 @@ void indy_8254timer_irq(void)
 	ArcEnterInteractiveMode();
 	irq_exit();
 }
-
-void __init plat_timer_setup(struct irqaction *irq)
-{
-	/* over-write the handler, we use our own way */
-	irq->handler = no_action;
-
-	/* setup irqaction */
-	setup_irq(SGI_TIMER_IRQ, irq);
-}
diff --git a/arch/mips/sibyte/bcm1480/time.c b/arch/mips/sibyte/bcm1480/time.c
index 40d7126cd5b..5b4bfbbb5a2 100644
--- a/arch/mips/sibyte/bcm1480/time.c
+++ b/arch/mips/sibyte/bcm1480/time.c
@@ -84,7 +84,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
 	void __iomem *timer_cfg, *timer_init;
 
 	timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
-	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
+	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
diff --git a/arch/mips/sibyte/sb1250/time.c b/arch/mips/sibyte/sb1250/time.c
index 38199ad8fc5..fe11fed8e0d 100644
--- a/arch/mips/sibyte/sb1250/time.c
+++ b/arch/mips/sibyte/sb1250/time.c
@@ -83,7 +83,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
 	void __iomem *timer_cfg, *timer_init;
 
 	timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
-	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
+	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
 
 	switch(mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -111,7 +111,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt)
 	void __iomem *timer_cfg, *timer_init;
 
 	timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
-	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
+	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
 
 	__raw_writeq(0, timer_cfg);
 	__raw_writeq(delta, timer_init);
@@ -155,7 +155,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
 	void __iomem *timer_cfg, *timer_init;
 
 	timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
-	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
+	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -183,7 +183,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt)
 	void __iomem *timer_cfg, *timer_init;
 
 	timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
-	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
+	timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
 
 	__raw_writeq(0, timer_cfg);
 	__raw_writeq(delta, timer_init);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a3ae8e6c8b3..3bd2688bd44 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -390,8 +390,8 @@ void apply_paravirt(struct paravirt_patch_site *start,
 		BUG_ON(p->len > MAX_PATCH_LEN);
 		/* prep the buffer with the original instructions */
 		memcpy(insnbuf, p->instr, p->len);
-		used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
-					  (unsigned long)p->instr, p->len);
+		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+					 (unsigned long)p->instr, p->len);
 
 		BUG_ON(used > p->len);
 
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8029742c0fc..f1b7cdda82b 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)
 
 #ifdef CONFIG_PARAVIRT
 	BLANK();
-	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
-	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
-	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
-	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
-	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
-	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
 
 #ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8099fea0a72..dc7f938e501 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
 	 * is still available to implement the setting of the high
 	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
 	 */
-	cmpl $0, paravirt_ops+PARAVIRT_enabled
+	cmpl $0, pv_info+PARAVIRT_enabled
 	jne restore_nocheck
 #endif
 
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 739cfb207dd..6a80d67c212 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
 static void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-	       paravirt_ops.name);
+	       pv_info.name);
 }
 
 char *memory_setup(void)
 {
-	return paravirt_ops.memory_setup();
+	return pv_init_ops.memory_setup();
 }
 
 /* Simple instruction patching code. */
-#define DEF_NATIVE(name, code)					\
-	extern const char start_##name[], end_##name[];		\
-	asm("start_" #name ": " code "; end_" #name ":")
-
-DEF_NATIVE(irq_disable, "cli");
-DEF_NATIVE(irq_enable, "sti");
-DEF_NATIVE(restore_fl, "push %eax; popf");
-DEF_NATIVE(save_fl, "pushf; pop %eax");
-DEF_NATIVE(iret, "iret");
-DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(clts, "clts");
-DEF_NATIVE(read_tsc, "rdtsc");
-
-DEF_NATIVE(ud2a, "ud2a");
+#define DEF_NATIVE(ops, name, code)					\
+	extern const char start_##ops##_##name[], end_##ops##_##name[];	\
+	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(pv_cpu_ops, clts, "clts");
+DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+
+/* Undefined instruction for dealing with missing ops pointers. */
+static const unsigned char ud2a[] = { 0x0f, 0x0b };
 
 static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 			     unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 	unsigned ret;
 
 	switch(type) {
-#define SITE(x)	case PARAVIRT_PATCH(x):	start = start_##x; end = end_##x; goto patch_site
-		SITE(irq_disable);
-		SITE(irq_enable);
-		SITE(restore_fl);
-		SITE(save_fl);
-		SITE(iret);
-		SITE(irq_enable_sysexit);
-		SITE(read_cr2);
-		SITE(read_cr3);
-		SITE(write_cr3);
-		SITE(clts);
-		SITE(read_tsc);
+#define SITE(ops, x)						\
+	case PARAVIRT_PATCH(ops.x):				\
+		start = start_##ops##_##x;			\
+		end = end_##ops##_##x;				\
+		goto patch_site
+
+	SITE(pv_irq_ops, irq_disable);
+	SITE(pv_irq_ops, irq_enable);
+	SITE(pv_irq_ops, restore_fl);
+	SITE(pv_irq_ops, save_fl);
+	SITE(pv_cpu_ops, iret);
+	SITE(pv_cpu_ops, irq_enable_sysexit);
+	SITE(pv_mmu_ops, read_cr2);
+	SITE(pv_mmu_ops, read_cr3);
+	SITE(pv_mmu_ops, write_cr3);
+	SITE(pv_cpu_ops, clts);
+	SITE(pv_cpu_ops, read_tsc);
 #undef SITE
 
 	patch_site:
 		ret = paravirt_patch_insns(ibuf, len, start, end);
 		break;
 
-	case PARAVIRT_PATCH(make_pgd):
-	case PARAVIRT_PATCH(make_pte):
-	case PARAVIRT_PATCH(pgd_val):
-	case PARAVIRT_PATCH(pte_val):
-#ifdef CONFIG_X86_PAE
-	case PARAVIRT_PATCH(make_pmd):
-	case PARAVIRT_PATCH(pmd_val):
-#endif
-		/* These functions end up returning exactly what
-		   they're passed, in the same registers. */
-		ret = paravirt_patch_nop();
-		break;
-
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
 	return 5;
 }
 
-unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
 			    unsigned long addr, unsigned len)
 {
 	struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
 	return 5;
 }
 
+/* Neat trick to map patch type back to the call within the
+ * corresponding structure. */
+static void *get_call_destination(u8 type)
+{
+	struct paravirt_patch_template tmpl = {
+		.pv_init_ops = pv_init_ops,
+		.pv_time_ops = pv_time_ops,
+		.pv_cpu_ops = pv_cpu_ops,
+		.pv_irq_ops = pv_irq_ops,
+		.pv_apic_ops = pv_apic_ops,
+		.pv_mmu_ops = pv_mmu_ops,
+	};
+	return *((void **)&tmpl + type);
+}
+
 unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 				unsigned long addr, unsigned len)
 {
-	void *opfunc = *((void **)&paravirt_ops + type);
+	void *opfunc = get_call_destination(type);
 	unsigned ret;
 
 	if (opfunc == NULL)
 		/* If there's no function, patch it with a ud2a (BUG) */
-		ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
+		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
 	else if (opfunc == paravirt_nop)
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
-	else if (type == PARAVIRT_PATCH(iret) ||
-		 type == PARAVIRT_PATCH(irq_enable_sysexit))
+	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
 		/* If operation requires a jmp, then jmp */
-		ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
+		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
 	else
 		/* Otherwise call the function; assume target could
 		   clobber any caller-save reg */
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
 
 void init_IRQ(void)
 {
-	paravirt_ops.init_IRQ();
+	pv_irq_ops.init_IRQ();
 }
 
 static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
 
 static int __init print_banner(void)
 {
-	paravirt_ops.banner();
+	pv_init_ops.banner();
 	return 0;
 }
 core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
 	return ret;
 }
 
-struct paravirt_ops paravirt_ops = {
+static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+
+static inline void enter_lazy(enum paravirt_lazy_mode mode)
+{
+	BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+	BUG_ON(preemptible());
+
+	x86_write_percpu(paravirt_lazy_mode, mode);
+}
+
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+{
+	BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
+	BUG_ON(preemptible());
+
+	x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+}
+
+void paravirt_enter_lazy_mmu(void)
+{
+	enter_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_leave_lazy_mmu(void)
+{
+	paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_enter_lazy_cpu(void)
+{
+	enter_lazy(PARAVIRT_LAZY_CPU);
+}
+
+void paravirt_leave_lazy_cpu(void)
+{
+	paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+}
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+{
+	return x86_read_percpu(paravirt_lazy_mode);
+}
+
+struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 	.shared_kernel_pmd = 1,	/* Only used when CONFIG_X86_PAE is set */
+};
 
- 	.patch = native_patch,
+struct pv_init_ops pv_init_ops = {
+	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = paravirt_nop,
 	.memory_setup = machine_specific_memory_setup,
+};
+
+struct pv_time_ops pv_time_ops = {
+	.time_init = hpet_time_init,
 	.get_wallclock = native_get_wallclock,
 	.set_wallclock = native_set_wallclock,
-	.time_init = hpet_time_init,
+	.sched_clock = native_sched_clock,
+	.get_cpu_khz = native_calculate_cpu_khz,
+};
+
+struct pv_irq_ops pv_irq_ops = {
 	.init_IRQ = native_init_IRQ,
+	.save_fl = native_save_fl,
+	.restore_fl = native_restore_fl,
+	.irq_disable = native_irq_disable,
+	.irq_enable = native_irq_enable,
+	.safe_halt = native_safe_halt,
+	.halt = native_halt,
+};
 
+struct pv_cpu_ops pv_cpu_ops = {
 	.cpuid = native_cpuid,
 	.get_debugreg = native_get_debugreg,
 	.set_debugreg = native_set_debugreg,
 	.clts = native_clts,
 	.read_cr0 = native_read_cr0,
 	.write_cr0 = native_write_cr0,
-	.read_cr2 = native_read_cr2,
-	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
-	.write_cr3 = native_write_cr3,
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
 	.write_cr4 = native_write_cr4,
-	.save_fl = native_save_fl,
-	.restore_fl = native_restore_fl,
-	.irq_disable = native_irq_disable,
-	.irq_enable = native_irq_enable,
-	.safe_halt = native_safe_halt,
-	.halt = native_halt,
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.sched_clock = native_sched_clock,
-	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
 	.write_idt_entry = write_dt_entry,
 	.load_esp0 = native_load_esp0,
 
+	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.iret = native_iret,
+
 	.set_iopl_mask = native_set_iopl_mask,
 	.io_delay = native_io_delay,
 
+	.lazy_mode = {
+		.enter = paravirt_nop,
+		.leave = paravirt_nop,
+	},
+};
+
+struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = native_apic_write,
 	.apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
 	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.startup_ipi_hook = paravirt_nop,
 #endif
-	.set_lazy_mode = paravirt_nop,
+};
 
+struct pv_mmu_ops pv_mmu_ops = {
 	.pagetable_setup_start = native_pagetable_setup_start,
 	.pagetable_setup_done = native_pagetable_setup_done,
 
+	.read_cr2 = native_read_cr2,
+	.write_cr2 = native_write_cr2,
+	.read_cr3 = native_read_cr3,
+	.write_cr3 = native_write_cr3,
+
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
 	.make_pte = native_make_pte,
 	.make_pgd = native_make_pgd,
 
-	.irq_enable_sysexit = native_irq_enable_sysexit,
-	.iret = native_iret,
-
 	.dup_mmap = paravirt_nop,
 	.exit_mmap = paravirt_nop,
 	.activate_mm = paravirt_nop,
+
+	.lazy_mode = {
+		.enter = paravirt_nop,
+		.leave = paravirt_nop,
+	},
 };
 
-EXPORT_SYMBOL(paravirt_ops);
+EXPORT_SYMBOL_GPL(pv_time_ops);
+EXPORT_SYMBOL_GPL(pv_cpu_ops);
+EXPORT_SYMBOL_GPL(pv_mmu_ops);
+EXPORT_SYMBOL_GPL(pv_apic_ops);
+EXPORT_SYMBOL_GPL(pv_info);
+EXPORT_SYMBOL    (pv_irq_ops);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 18673e0f193..f02bad68aba 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
 			  unsigned long eip, unsigned len)
 {
 	switch (type) {
-		case PARAVIRT_PATCH(irq_disable):
+		case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
 			return patch_internal(VMI_CALL_DisableInterrupts, len,
 					      insns, eip);
-		case PARAVIRT_PATCH(irq_enable):
+		case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
 			return patch_internal(VMI_CALL_EnableInterrupts, len,
 					      insns, eip);
-		case PARAVIRT_PATCH(restore_fl):
+		case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
 			return patch_internal(VMI_CALL_SetInterruptMask, len,
 					      insns, eip);
-		case PARAVIRT_PATCH(save_fl):
+		case PARAVIRT_PATCH(pv_irq_ops.save_fl):
 			return patch_internal(VMI_CALL_GetInterruptMask, len,
 					      insns, eip);
-		case PARAVIRT_PATCH(iret):
+		case PARAVIRT_PATCH(pv_cpu_ops.iret):
 			return patch_internal(VMI_CALL_IRET, len, insns, eip);
-		case PARAVIRT_PATCH(irq_enable_sysexit):
+		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
 			return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
 		default:
 			break;
@@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void vmi_enter_lazy_cpu(void)
 {
-	static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode);
-
-	if (!vmi_ops.set_lazy_mode)
-		return;
+	paravirt_enter_lazy_cpu();
+	vmi_ops.set_lazy_mode(2);
+}
 
-	/* Modes should never nest or overlap */
-	BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE ||
-					     mode == PARAVIRT_LAZY_FLUSH));
+static void vmi_enter_lazy_mmu(void)
+{
+	paravirt_enter_lazy_mmu();
+	vmi_ops.set_lazy_mode(1);
+}
 
-	if (mode == PARAVIRT_LAZY_FLUSH) {
-		vmi_ops.set_lazy_mode(0);
-		vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
-	} else {
-		vmi_ops.set_lazy_mode(mode);
-		__get_cpu_var(lazy_mode) = mode;
-	}
+static void vmi_leave_lazy(void)
+{
+	paravirt_leave_lazy(paravirt_get_lazy_mode());
+	vmi_ops.set_lazy_mode(0);
 }
 
 static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -690,9 +688,9 @@ do {								\
 	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
 				    VMI_CALL_##vmicall);	\
 	if (rel->type == VMI_RELOCATION_CALL_REL) 		\
-		paravirt_ops.opname = (void *)rel->eip;		\
+		opname = (void *)rel->eip;			\
 	else if (rel->type == VMI_RELOCATION_NOP) 		\
-		paravirt_ops.opname = (void *)vmi_nop;		\
+		opname = (void *)vmi_nop;			\
 	else if (rel->type != VMI_RELOCATION_NONE)		\
 		printk(KERN_WARNING "VMI: Unknown relocation "	\
 				    "type %d for " #vmicall"\n",\
@@ -712,7 +710,7 @@ do {								\
 				    VMI_CALL_##vmicall);	\
 	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);		\
 	if (rel->type == VMI_RELOCATION_CALL_REL) {		\
-		paravirt_ops.opname = wrapper;			\
+		opname = wrapper;				\
 		vmi_ops.cache = (void *)rel->eip;		\
 	}							\
 } while (0)
@@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
 	}
 	savesegment(cs, kernel_cs);
 
-	paravirt_ops.paravirt_enabled = 1;
-	paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+	pv_info.paravirt_enabled = 1;
+	pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+	pv_info.name = "vmi";
 
-	paravirt_ops.patch = vmi_patch;
-	paravirt_ops.name = "vmi";
+	pv_init_ops.patch = vmi_patch;
 
 	/*
 	 * Many of these operations are ABI compatible with VMI.
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
 	 */
 
 	/* CPUID is special, so very special it gets wrapped like a present */
-	para_wrap(cpuid, vmi_cpuid, cpuid, CPUID);
-
-	para_fill(clts, CLTS);
-	para_fill(get_debugreg, GetDR);
-	para_fill(set_debugreg, SetDR);
-	para_fill(read_cr0, GetCR0);
-	para_fill(read_cr2, GetCR2);
-	para_fill(read_cr3, GetCR3);
-	para_fill(read_cr4, GetCR4);
-	para_fill(write_cr0, SetCR0);
-	para_fill(write_cr2, SetCR2);
-	para_fill(write_cr3, SetCR3);
-	para_fill(write_cr4, SetCR4);
-	para_fill(save_fl, GetInterruptMask);
-	para_fill(restore_fl, SetInterruptMask);
-	para_fill(irq_disable, DisableInterrupts);
-	para_fill(irq_enable, EnableInterrupts);
-
-	para_fill(wbinvd, WBINVD);
-	para_fill(read_tsc, RDTSC);
+	para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
+
+	para_fill(pv_cpu_ops.clts, CLTS);
+	para_fill(pv_cpu_ops.get_debugreg, GetDR);
+	para_fill(pv_cpu_ops.set_debugreg, SetDR);
+	para_fill(pv_cpu_ops.read_cr0, GetCR0);
+	para_fill(pv_mmu_ops.read_cr2, GetCR2);
+	para_fill(pv_mmu_ops.read_cr3, GetCR3);
+	para_fill(pv_cpu_ops.read_cr4, GetCR4);
+	para_fill(pv_cpu_ops.write_cr0, SetCR0);
+	para_fill(pv_mmu_ops.write_cr2, SetCR2);
+	para_fill(pv_mmu_ops.write_cr3, SetCR3);
+	para_fill(pv_cpu_ops.write_cr4, SetCR4);
+	para_fill(pv_irq_ops.save_fl, GetInterruptMask);
+	para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
+	para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
+	para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+	para_fill(pv_cpu_ops.wbinvd, WBINVD);
+	para_fill(pv_cpu_ops.read_tsc, RDTSC);
 
 	/* The following we emulate with trap and emulate for now */
 	/* paravirt_ops.read_msr = vmi_rdmsr */
@@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
 	/* paravirt_ops.rdpmc = vmi_rdpmc */
 
 	/* TR interface doesn't pass TR value, wrap */
-	para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR);
+	para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
 
 	/* LDT is special, too */
-	para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
-	para_fill(load_gdt, SetGDT);
-	para_fill(load_idt, SetIDT);
-	para_fill(store_gdt, GetGDT);
-	para_fill(store_idt, GetIDT);
-	para_fill(store_tr, GetTR);
-	paravirt_ops.load_tls = vmi_load_tls;
-	para_fill(write_ldt_entry, WriteLDTEntry);
-	para_fill(write_gdt_entry, WriteGDTEntry);
-	para_fill(write_idt_entry, WriteIDTEntry);
-	para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
-	para_fill(set_iopl_mask, SetIOPLMask);
-	para_fill(io_delay, IODelay);
-	para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
+	para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
+
+	para_fill(pv_cpu_ops.load_gdt, SetGDT);
+	para_fill(pv_cpu_ops.load_idt, SetIDT);
+	para_fill(pv_cpu_ops.store_gdt, GetGDT);
+	para_fill(pv_cpu_ops.store_idt, GetIDT);
+	para_fill(pv_cpu_ops.store_tr, GetTR);
+	pv_cpu_ops.load_tls = vmi_load_tls;
+	para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
+	para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
+	para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
+	para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
+	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
+	para_fill(pv_cpu_ops.io_delay, IODelay);
+
+	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+		  set_lazy_mode, SetLazyMode);
+	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+		  set_lazy_mode, SetLazyMode);
+
+	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
+		  set_lazy_mode, SetLazyMode);
+	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+		  set_lazy_mode, SetLazyMode);
 
 	/* user and kernel flush are just handled with different flags to FlushTLB */
-	para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
-	para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
-	para_fill(flush_tlb_single, InvalPage);
+	para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
+	para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
+	para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
 
 	/*
 	 * Until a standard flag format can be agreed on, we need to
@@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
 #endif
 
 	if (vmi_ops.set_pte) {
-		paravirt_ops.set_pte = vmi_set_pte;
-		paravirt_ops.set_pte_at = vmi_set_pte_at;
-		paravirt_ops.set_pmd = vmi_set_pmd;
+		pv_mmu_ops.set_pte = vmi_set_pte;
+		pv_mmu_ops.set_pte_at = vmi_set_pte_at;
+		pv_mmu_ops.set_pmd = vmi_set_pmd;
 #ifdef CONFIG_X86_PAE
-		paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
-		paravirt_ops.set_pte_present = vmi_set_pte_present;
-		paravirt_ops.set_pud = vmi_set_pud;
-		paravirt_ops.pte_clear = vmi_pte_clear;
-		paravirt_ops.pmd_clear = vmi_pmd_clear;
+		pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
+		pv_mmu_ops.set_pte_present = vmi_set_pte_present;
+		pv_mmu_ops.set_pud = vmi_set_pud;
+		pv_mmu_ops.pte_clear = vmi_pte_clear;
+		pv_mmu_ops.pmd_clear = vmi_pmd_clear;
 #endif
 	}
 
 	if (vmi_ops.update_pte) {
-		paravirt_ops.pte_update = vmi_update_pte;
-		paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+		pv_mmu_ops.pte_update = vmi_update_pte;
+		pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
 	}
 
 	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
 	if (vmi_ops.allocate_page) {
-		paravirt_ops.alloc_pt = vmi_allocate_pt;
-		paravirt_ops.alloc_pd = vmi_allocate_pd;
-		paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+		pv_mmu_ops.alloc_pt = vmi_allocate_pt;
+		pv_mmu_ops.alloc_pd = vmi_allocate_pd;
+		pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
 	}
 
 	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
 	if (vmi_ops.release_page) {
-		paravirt_ops.release_pt = vmi_release_pt;
-		paravirt_ops.release_pd = vmi_release_pd;
+		pv_mmu_ops.release_pt = vmi_release_pt;
+		pv_mmu_ops.release_pd = vmi_release_pd;
 	}
 
 	/* Set linear is needed in all cases */
 	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
 #ifdef CONFIG_HIGHPTE
 	if (vmi_ops.set_linear_mapping)
-		paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
+		pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
 #endif
 
 	/*
@@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
 	 * the backend.  They are performance critical anyway, so requiring
 	 * a patch is not a big problem.
 	 */
-	paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
-	paravirt_ops.iret = (void *)0xbadbab0;
+	pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+	pv_cpu_ops.iret = (void *)0xbadbab0;
 
 #ifdef CONFIG_SMP
-	para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
+	para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	para_fill(apic_read, APICRead);
-	para_fill(apic_write, APICWrite);
-	para_fill(apic_write_atomic, APICWrite);
+	para_fill(pv_apic_ops.apic_read, APICRead);
+	para_fill(pv_apic_ops.apic_write, APICWrite);
+	para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
 #endif
 
 	/*
@@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
 		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
 		vmi_timer_ops.cancel_alarm =
 			 vmi_get_function(VMI_CALL_CancelAlarm);
-		paravirt_ops.time_init = vmi_time_init;
-		paravirt_ops.get_wallclock = vmi_get_wallclock;
-		paravirt_ops.set_wallclock = vmi_set_wallclock;
+		pv_time_ops.time_init = vmi_time_init;
+		pv_time_ops.get_wallclock = vmi_get_wallclock;
+		pv_time_ops.set_wallclock = vmi_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		paravirt_ops.setup_boot_clock = vmi_time_bsp_init;
-		paravirt_ops.setup_secondary_clock = vmi_time_ap_init;
+		pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
+		pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
 #endif
-		paravirt_ops.sched_clock = vmi_sched_clock;
- 		paravirt_ops.get_cpu_khz = vmi_cpu_khz;
+		pv_time_ops.sched_clock = vmi_sched_clock;
+ 		pv_time_ops.get_cpu_khz = vmi_cpu_khz;
 
 		/* We have true wallclock functions; disable CMOS clock sync */
 		no_sync_cmos_clock = 1;
@@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
 		disable_vmi_timer = 1;
 	}
 
-	para_fill(safe_halt, Halt);
+	para_fill(pv_irq_ops.safe_halt, Halt);
 
 	/*
 	 * Alternative instruction rewriting doesn't happen soon enough
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e4e37d4f4c5..c7d19471261 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -748,24 +748,12 @@ struct kmem_cache *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
-	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-	if (PTRS_PER_PMD > 1) {
+	if (PTRS_PER_PMD > 1)
 		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					SLAB_PANIC,
-					pmd_ctor);
-		if (!SHARED_KERNEL_PMD) {
-			/* If we're in PAE mode and have a non-shared
-			   kernel pmd, then the pgd size must be a
-			   page size.  This is because the pgd_list
-			   links through the page structure, so there
-			   can only be one pgd per page for this to
-			   work. */
-			pgd_size = PAGE_SIZE;
-		}
-	}
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      SLAB_PANIC,
+					      pmd_ctor);
 }
 
 /*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 265f7dd3234..94c39aaf695 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,11 +51,25 @@
 
 EXPORT_SYMBOL_GPL(hypercall_page);
 
-DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
-
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
-DEFINE_PER_CPU(unsigned long, xen_cr3);
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
@@ -99,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
 	info.mfn = virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
-	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n",
+	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
 	       cpu, vcpup, info.mfn, info.offset);
 
 	/* Check to see if the hypervisor will put the vcpu_info
@@ -123,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
 static void __init xen_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-	       paravirt_ops.name);
+	       pv_info.name);
 	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
 }
 
@@ -248,29 +262,10 @@ static void xen_halt(void)
 		xen_safe_halt();
 }
 
-static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
+static void xen_leave_lazy(void)
 {
-	BUG_ON(preemptible());
-
-	switch (mode) {
-	case PARAVIRT_LAZY_NONE:
-		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
-		break;
-
-	case PARAVIRT_LAZY_MMU:
-	case PARAVIRT_LAZY_CPU:
-		BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
-		break;
-
-	case PARAVIRT_LAZY_FLUSH:
-		/* flush if necessary, but don't change state */
-		if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
-			xen_mc_flush();
-		return;
-	}
-
+	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
-	x86_write_percpu(xen_lazy_mode, mode);
 }
 
 static unsigned long xen_store_tr(void)
@@ -357,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	 * loaded properly.  This will go away as soon as Xen has been
 	 * modified to not save/restore %gs for normal hypercalls.
 	 */
-	if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
 		loadsegment(gs, 0);
 }
 
@@ -631,32 +626,36 @@ static unsigned long xen_read_cr3(void)
 	return x86_read_percpu(xen_cr3);
 }
 
+static void set_current_cr3(void *v)
+{
+	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+}
+
 static void xen_write_cr3(unsigned long cr3)
 {
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+	unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
 	BUG_ON(preemptible());
 
-	if (cr3 == x86_read_percpu(xen_cr3)) {
-		/* just a simple tlb flush */
-		xen_flush_tlb();
-		return;
-	}
+	mcs = xen_mc_entry(sizeof(*op));  /* disables interrupts */
 
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
 	x86_write_percpu(xen_cr3, cr3);
 
+	op = mcs.args;
+	op->cmd = MMUEXT_NEW_BASEPTR;
+	op->arg1.mfn = mfn;
 
-	{
-		struct mmuext_op *op;
-		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
-		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
-
-		op = mcs.args;
-		op->cmd = MMUEXT_NEW_BASEPTR;
-		op->arg1.mfn = mfn;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
-		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	/* Update xen_update_cr3 once the batch has actually
+	   been submitted. */
+	xen_mc_callback(set_current_cr3, (void *)cr3);
 
-		xen_mc_issue(PARAVIRT_LAZY_CPU);
-	}
+	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
 }
 
 /* Early in boot, while setting up the initial pagetable, assume
@@ -667,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
 	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
 }
 
+static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
 static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -676,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
 	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
 			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-		else
+			pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
@@ -691,8 +700,10 @@ static void xen_release_pt(u32 pfn)
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
-		if (!PageHighMem(page))
+		if (!PageHighMem(page)) {
+			pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
 	}
 }
 
@@ -737,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
 
 	/* special set_pte for pagetable initialization */
-	paravirt_ops.set_pte = xen_set_pte_init;
+	pv_mmu_ops.set_pte = xen_set_pte_init;
 
 	init_mm.pgd = base;
 	/*
@@ -784,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 {
 	/* This will work as long as patching hasn't happened yet
 	   (which it hasn't) */
-	paravirt_ops.alloc_pt = xen_alloc_pt;
-	paravirt_ops.set_pte = xen_set_pte;
+	pv_mmu_ops.alloc_pt = xen_alloc_pt;
+	pv_mmu_ops.set_pte = xen_set_pte;
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		/*
@@ -807,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
 	{
-		struct mmuext_op op;
+		unsigned level;
+
 #ifdef CONFIG_X86_PAE
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L3_TABLE;
 #else
-		op.cmd = MMUEXT_PIN_L3_TABLE;
+		level = MMUEXT_PIN_L2_TABLE;
 #endif
-		op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
-		if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-			BUG();
+
+		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
 	}
 }
 
@@ -832,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
-		paravirt_ops.save_fl = xen_save_fl_direct;
-		paravirt_ops.restore_fl = xen_restore_fl_direct;
-		paravirt_ops.irq_disable = xen_irq_disable_direct;
-		paravirt_ops.irq_enable = xen_irq_enable_direct;
-		paravirt_ops.read_cr2 = xen_read_cr2_direct;
-		paravirt_ops.iret = xen_iret_direct;
+		pv_irq_ops.save_fl = xen_save_fl_direct;
+		pv_irq_ops.restore_fl = xen_restore_fl_direct;
+		pv_irq_ops.irq_disable = xen_irq_disable_direct;
+		pv_irq_ops.irq_enable = xen_irq_enable_direct;
+		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+		pv_cpu_ops.iret = xen_iret_direct;
 	}
 }
 
@@ -849,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 
 	start = end = reloc = NULL;
 
-#define SITE(x)								\
-	case PARAVIRT_PATCH(x):						\
+#define SITE(op, x)							\
+	case PARAVIRT_PATCH(op.x):					\
 	if (have_vcpu_info_placement) {					\
 		start = (char *)xen_##x##_direct;			\
 		end = xen_##x##_direct_end;				\
@@ -859,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	goto patch_site
 
 	switch (type) {
-		SITE(irq_enable);
-		SITE(irq_disable);
-		SITE(save_fl);
-		SITE(restore_fl);
+		SITE(pv_irq_ops, irq_enable);
+		SITE(pv_irq_ops, irq_disable);
+		SITE(pv_irq_ops, save_fl);
+		SITE(pv_irq_ops, restore_fl);
 #undef SITE
 
 	patch_site:
@@ -894,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }
 
-static const struct paravirt_ops xen_paravirt_ops __initdata = {
+static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
 
 	.name = "Xen",
-	.banner = xen_banner,
+};
 
+static const struct pv_init_ops xen_init_ops __initdata = {
 	.patch = xen_patch,
 
+	.banner = xen_banner,
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
-	.init_IRQ = xen_init_IRQ,
 	.post_allocator_init = xen_mark_init_mm_pinned,
+};
 
+static const struct pv_time_ops xen_time_ops __initdata = {
 	.time_init = xen_time_init,
+
 	.set_wallclock = xen_set_wallclock,
 	.get_wallclock = xen_get_wallclock,
 	.get_cpu_khz = xen_cpu_khz,
 	.sched_clock = xen_sched_clock,
+};
 
+static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.cpuid = xen_cpuid,
 
 	.set_debugreg = xen_set_debugreg,
@@ -924,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.read_cr0 = native_read_cr0,
 	.write_cr0 = native_write_cr0,
 
-	.read_cr2 = xen_read_cr2,
-	.write_cr2 = xen_write_cr2,
-
-	.read_cr3 = xen_read_cr3,
-	.write_cr3 = xen_write_cr3,
-
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
 	.write_cr4 = xen_write_cr4,
 
-	.save_fl = xen_save_fl,
-	.restore_fl = xen_restore_fl,
-	.irq_disable = xen_irq_disable,
-	.irq_enable = xen_irq_enable,
-	.safe_halt = xen_safe_halt,
-	.halt = xen_halt,
 	.wbinvd = native_wbinvd,
 
 	.read_msr = native_read_msr_safe,
@@ -968,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.set_iopl_mask = xen_set_iopl_mask,
 	.io_delay = xen_io_delay,
 
+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_cpu,
+		.leave = xen_leave_lazy,
+	},
+};
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+	.init_IRQ = xen_init_IRQ,
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+};
+
+static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = xen_apic_write,
 	.apic_write_atomic = xen_apic_write,
@@ -976,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.setup_secondary_clock = paravirt_nop,
 	.startup_ipi_hook = paravirt_nop,
 #endif
+};
+
+static const struct pv_mmu_ops xen_mmu_ops __initdata = {
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = xen_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,
 
 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
@@ -985,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
-	.pagetable_setup_start = xen_pagetable_setup_start,
-	.pagetable_setup_done = xen_pagetable_setup_done,
-
 	.alloc_pt = xen_alloc_pt_init,
 	.release_pt = xen_release_pt,
 	.alloc_pd = paravirt_nop,
@@ -1023,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
 	.dup_mmap = xen_dup_mmap,
 	.exit_mmap = xen_exit_mmap,
 
-	.set_lazy_mode = xen_set_lazy_mode,
+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_mmu,
+		.leave = xen_leave_lazy,
+	},
 };
 
 #ifdef CONFIG_SMP
@@ -1079,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };
 
 
+static void __init xen_reserve_top(void)
+{
+	unsigned long top = HYPERVISOR_VIRT_START;
+	struct xen_platform_parameters pp;
+
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+		top = pp.virt_start;
+
+	reserve_top_address(-top + 2 * PAGE_SIZE);
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -1090,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
 	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
 
 	/* Install Xen paravirt ops */
-	paravirt_ops = xen_paravirt_ops;
+	pv_info = xen_info;
+	pv_init_ops = xen_init_ops;
+	pv_time_ops = xen_time_ops;
+	pv_cpu_ops = xen_cpu_ops;
+	pv_irq_ops = xen_irq_ops;
+	pv_apic_ops = xen_apic_ops;
+	pv_mmu_ops = xen_mmu_ops;
+
 	machine_ops = xen_machine_ops;
 
 #ifdef CONFIG_SMP
@@ -1112,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
 	/* keep using Xen gdt for now; no urgent need to change it */
 
 	x86_write_percpu(xen_cr3, __pa(pgd));
+	x86_write_percpu(xen_current_cr3, __pa(pgd));
 
 #ifdef CONFIG_SMP
 	/* Don't do the full vcpu_info placement stuff until we have a
@@ -1123,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
 	xen_setup_vcpu_info_placement();
 #endif
 
-	paravirt_ops.kernel_rpl = 1;
+	pv_info.kernel_rpl = 1;
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
-		paravirt_ops.kernel_rpl = 0;
+		pv_info.kernel_rpl = 0;
 
 	/* set the limit of our address space */
-	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+	xen_reserve_top();
 
 	/* set up basic CPUID stuff */
 	cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0bb7f001910..b2e32f9d007 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -154,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
 	if (mm == current->mm || mm == &init_mm) {
-		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);
 
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
   FIXADDR_TOP.  But the important bit is that we don't pin beyond
   there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);
 
 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);
 
 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;
 
-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}
 
-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
 
 	return flush;
 }
 
-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;
 
 		flush = 0;
 
+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return flush;
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;
 
 	xen_mc_batch();
 
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
 
 	xen_mc_issue(0);
 }
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
    that's before we have page structures to store the bits.  So do all
    the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
 
 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}
 
 	return 0;		/* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 	pgd_walk(pgd, unpin_page, TASK_SIZE);
 
@@ -514,20 +564,43 @@ static void drop_other_mm_ref(void *info)
 
 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }
 
 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
+		arch_flush_lazy_cpu_mode();
 	}
 
-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
@@ -562,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463d..5e6f36f6d87 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
 
 #include "multicalls.h"
 
+#define MC_DEBUG	1
+
 #define MC_BATCH	32
 #define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))
 
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+	struct multicall_entry debug[MC_BATCH];
+#endif
 	u64 args[MC_ARGS];
-	unsigned mcidx, argidx;
+	struct callback {
+		void (*fn)(void *);
+		void *data;
+	} callbacks[MC_BATCH];
+	unsigned mcidx, argidx, cbidx;
 };
 
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
+	int i;
 
 	BUG_ON(preemptible());
 
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
 	local_irq_save(flags);
 
 	if (b->mcidx) {
-		int i;
+#if MC_DEBUG
+		memcpy(b->debug, b->entries,
+		       b->mcidx * sizeof(struct multicall_entry));
+#endif
 
 		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
 			BUG();
 		for (i = 0; i < b->mcidx; i++)
 			if (b->entries[i].result < 0)
 				ret++;
+
+#if MC_DEBUG
+		if (ret) {
+			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+			       ret, smp_processor_id());
+			for(i = 0; i < b->mcidx; i++) {
+				printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+				       i+1, b->mcidx,
+				       b->debug[i].op,
+				       b->debug[i].args[0],
+				       b->entries[i].result);
+			}
+		}
+#endif
+
 		b->mcidx = 0;
 		b->argidx = 0;
 	} else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
 
 	local_irq_restore(flags);
 
+	for(i = 0; i < b->cbidx; i++) {
+		struct callback *cb = &b->callbacks[i];
+
+		(*cb->fn)(cb->data);
+	}
+	b->cbidx = 0;
+
 	BUG_ON(ret);
 }
 
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
 
 	return ret;
 }
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct callback *cb;
+
+	if (b->cbidx == MC_BATCH)
+		xen_mc_flush();
+
+	cb = &b->callbacks[b->cbidx++];
+	cb->fn = fn;
+	cb->data = data;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156..8bae996d99a 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
-	if ((xen_get_lazy_mode() & mode) == 0)
+	if ((paravirt_get_lazy_mode() & mode) == 0)
 		xen_mc_flush();
 
 	/* restore flags saved in xen_mc_batch */
 	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
 }
 
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
 #endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 6c058585459..c1b131bcdcb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -371,7 +371,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 			       void *info, int wait)
 {
 	struct call_data_struct data;
-	int cpus;
+	int cpus, cpu;
+	bool yield;
 
 	/* Holding any lock stops cpus from going down. */
 	spin_lock(&call_lock);
@@ -400,9 +401,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 	/* Send a message to other CPUs and wait for them to respond */
 	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
-	/* Make sure other vcpus get a chance to run.
-	   XXX too severe?  Maybe we should check the other CPU's states? */
-	HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+	/* Make sure other vcpus get a chance to run if they need to. */
+	yield = false;
+	for_each_cpu_mask(cpu, mask)
+		if (xen_vcpu_stolen(cpu))
+			yield = true;
+
+	if (yield)
+		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
 
 	/* Wait for response */
 	while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead..d083ff5ef08 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 	} while (get64(&state->state_entry_time) != state_time);
 }
 
+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+}
+
 static void setup_runstate_info(int cpu)
 {
 	struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07..b02a909bfd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
 
 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
 extern struct start_info *xen_start_info;
 extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
 
-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+bool xen_vcpu_stolen(int vcpu);
 
-static inline unsigned xen_get_lazy_mode(void)
-{
-	return x86_read_percpu(xen_lazy_mode);
-}
+void xen_mark_init_mm_pinned(void);
 
 void __init xen_fill_possible_map(void);