-rw-r--r--  Documentation/dontdiff | 1
-rw-r--r--  arch/arm/mach-orion5x/common.c | 29
-rw-r--r--  arch/arm/plat-orion/include/plat/orion5x_wdt.h | 18
-rw-r--r--  arch/ia64/include/asm/kvm.h | 49
-rw-r--r--  arch/ia64/include/asm/kvm_host.h | 18
-rw-r--r--  arch/ia64/include/asm/msidef.h | 42
-rw-r--r--  arch/ia64/kernel/msi_ia64.c | 55
-rw-r--r--  arch/ia64/kvm/Kconfig | 4
-rw-r--r--  arch/ia64/kvm/irq.h | 2
-rw-r--r--  arch/ia64/kvm/kvm-ia64.c | 125
-rw-r--r--  arch/ia64/kvm/kvm_fw.c | 151
-rw-r--r--  arch/ia64/kvm/process.c | 71
-rw-r--r--  arch/ia64/kvm/vcpu.c | 44
-rw-r--r--  arch/ia64/kvm/vcpu.h | 4
-rw-r--r--  arch/ia64/kvm/vtlb.c | 44
-rw-r--r--  arch/powerpc/include/asm/kvm.h | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_44x.h | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_asm.h | 7
-rw-r--r--  arch/powerpc/include/asm/kvm_e500.h | 67
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h | 21
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h | 15
-rw-r--r--  arch/powerpc/include/asm/mmu-fsl-booke.h | 2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 4
-rw-r--r--  arch/powerpc/kvm/44x.c | 72
-rw-r--r--  arch/powerpc/kvm/44x_emulate.c | 217
-rw-r--r--  arch/powerpc/kvm/44x_tlb.c | 39
-rw-r--r--  arch/powerpc/kvm/44x_tlb.h | 9
-rw-r--r--  arch/powerpc/kvm/Kconfig | 16
-rw-r--r--  arch/powerpc/kvm/Makefile | 10
-rw-r--r--  arch/powerpc/kvm/booke.c | 50
-rw-r--r--  arch/powerpc/kvm/booke.h | 35
-rw-r--r--  arch/powerpc/kvm/booke_emulate.c | 266
-rw-r--r--  arch/powerpc/kvm/booke_interrupts.S | 5
-rw-r--r--  arch/powerpc/kvm/e500.c | 169
-rw-r--r--  arch/powerpc/kvm/e500_emulate.c | 202
-rw-r--r--  arch/powerpc/kvm/e500_tlb.c | 757
-rw-r--r--  arch/powerpc/kvm/e500_tlb.h | 185
-rw-r--r--  arch/powerpc/kvm/emulate.c | 93
-rw-r--r--  arch/powerpc/kvm/powerpc.c | 31
-rw-r--r--  arch/s390/include/asm/kvm.h | 7
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 3
-rw-r--r--  arch/s390/kvm/Kconfig | 3
-rw-r--r--  arch/s390/kvm/intercept.c | 2
-rw-r--r--  arch/s390/kvm/interrupt.c | 7
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 4
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 2
-rw-r--r--  arch/s390/kvm/priv.c | 18
-rw-r--r--  arch/s390/kvm/sigp.c | 2
-rw-r--r--  arch/x86/include/asm/kvm.h | 24
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 61
-rw-r--r--  arch/x86/include/asm/msr-index.h | 9
-rw-r--r--  arch/x86/include/asm/svm.h | 4
-rw-r--r--  arch/x86/include/asm/virtext.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 5
-rw-r--r--  arch/x86/kvm/Kconfig | 4
-rw-r--r--  arch/x86/kvm/i8254.c | 21
-rw-r--r--  arch/x86/kvm/i8254.h | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 25
-rw-r--r--  arch/x86/kvm/irq.h | 2
-rw-r--r--  arch/x86/kvm/kvm_svm.h | 16
-rw-r--r--  arch/x86/kvm/mmu.c | 237
-rw-r--r--  arch/x86/kvm/mmu.h | 2
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 219
-rw-r--r--  arch/x86/kvm/svm.c | 916
-rw-r--r--  arch/x86/kvm/vmx.c | 393
-rw-r--r--  arch/x86/kvm/x86.c | 432
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 56
-rw-r--r--  drivers/infiniband/core/cm.c | 15
-rw-r--r--  drivers/infiniband/core/cm_msgs.h | 22
-rw-r--r--  drivers/infiniband/core/device.c | 4
-rw-r--r--  drivers/infiniband/core/mad.c | 40
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c | 2
-rw-r--r--  drivers/infiniband/core/sa_query.c | 2
-rw-r--r--  drivers/infiniband/core/sysfs.c | 19
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.c | 30
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.h | 3
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_wr.h | 6
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_cm.c | 3
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_ev.c | 5
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_qp.c | 17
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_sqp.c | 8
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_eeprom.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_init_chip.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mad.c | 95
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_rc.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_sdma.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_uc.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ud.c | 4
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_user_pages.c | 8
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_user_sdma.c | 6
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.c | 2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.h | 10
-rw-r--r--  drivers/infiniband/hw/mlx4/mad.c | 27
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c | 5
-rw-r--r--  drivers/infiniband/hw/mlx4/qp.c | 22
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mad.c | 25
-rw-r--r--  drivers/infiniband/hw/nes/nes.c | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes.h | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes_cm.c | 586
-rw-r--r--  drivers/infiniband/hw/nes/nes_cm.h | 12
-rw-r--r--  drivers/infiniband/hw/nes/nes_context.h | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.c | 17
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.h | 5
-rw-r--r--  drivers/infiniband/hw/nes/nes_nic.c | 146
-rw-r--r--  drivers/infiniband/hw/nes/nes_user.h | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes_utils.c | 2
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.c | 249
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.h | 2
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c | 9
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c | 7
-rw-r--r--  drivers/net/mlx4/Makefile | 2
-rw-r--r--  drivers/net/mlx4/catas.c | 16
-rw-r--r--  drivers/net/mlx4/eq.c | 16
-rw-r--r--  drivers/net/mlx4/main.c | 106
-rw-r--r--  drivers/net/mlx4/mlx4.h | 27
-rw-r--r--  drivers/net/mlx4/port.c | 13
-rw-r--r--  drivers/net/mlx4/sense.c | 156
-rw-r--r--  drivers/watchdog/Kconfig | 15
-rw-r--r--  drivers/watchdog/acquirewdt.c | 8
-rw-r--r--  drivers/watchdog/advantechwdt.c | 7
-rw-r--r--  drivers/watchdog/alim1535_wdt.c | 20
-rw-r--r--  drivers/watchdog/alim7101_wdt.c | 6
-rw-r--r--  drivers/watchdog/at91sam9_wdt.c | 2
-rw-r--r--  drivers/watchdog/cpwd.c | 101
-rw-r--r--  drivers/watchdog/davinci_wdt.c | 45
-rw-r--r--  drivers/watchdog/eurotechwdt.c | 22
-rw-r--r--  drivers/watchdog/geodewdt.c | 11
-rw-r--r--  drivers/watchdog/hpwdt.c | 20
-rw-r--r--  drivers/watchdog/i6300esb.c | 101
-rw-r--r--  drivers/watchdog/iTCO_vendor_support.c | 6
-rw-r--r--  drivers/watchdog/iTCO_wdt.c | 15
-rw-r--r--  drivers/watchdog/it87_wdt.c | 12
-rw-r--r--  drivers/watchdog/mpc5200_wdt.c | 4
-rw-r--r--  drivers/watchdog/mpcore_wdt.c | 2
-rw-r--r--  drivers/watchdog/mtx-1_wdt.c | 2
-rw-r--r--  drivers/watchdog/orion5x_wdt.c | 76
-rw-r--r--  drivers/watchdog/pc87413_wdt.c | 6
-rw-r--r--  drivers/watchdog/pcwd.c | 44
-rw-r--r--  drivers/watchdog/pcwd_pci.c | 108
-rw-r--r--  drivers/watchdog/pcwd_usb.c | 175
-rw-r--r--  drivers/watchdog/pnx4008_wdt.c | 18
-rw-r--r--  drivers/watchdog/rc32434_wdt.c | 153
-rw-r--r--  drivers/watchdog/riowd.c | 23
-rw-r--r--  drivers/watchdog/sa1100_wdt.c | 6
-rw-r--r--  drivers/watchdog/sbc60xxwdt.c | 2
-rw-r--r--  drivers/watchdog/sbc8360.c | 12
-rw-r--r--  drivers/watchdog/sbc_epx_c3.c | 3
-rw-r--r--  drivers/watchdog/sc1200wdt.c | 7
-rw-r--r--  drivers/watchdog/sc520_wdt.c | 9
-rw-r--r--  drivers/watchdog/smsc37b787_wdt.c | 30
-rw-r--r--  drivers/watchdog/softdog.c | 5
-rw-r--r--  drivers/watchdog/w83697hf_wdt.c | 3
-rw-r--r--  drivers/watchdog/w83697ug_wdt.c | 23
-rw-r--r--  drivers/watchdog/w83977f_wdt.c | 2
-rw-r--r--  drivers/watchdog/wd501p.h | 24
-rw-r--r--  drivers/watchdog/wdt.c | 152
-rw-r--r--  drivers/watchdog/wdt977.c | 2
-rw-r--r--  fs/cifs/CHANGES | 11
-rw-r--r--  fs/cifs/Kconfig | 21
-rw-r--r--  fs/cifs/README | 22
-rw-r--r--  fs/cifs/cifs_debug.c | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 36
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 1
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifspdu.h | 76
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 27
-rw-r--r--  fs/cifs/connect.c | 9
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 199
-rw-r--r--  fs/cifs/inode.c | 3
-rw-r--r--  fs/cifs/smbfsctl.h | 84
-rw-r--r--  fs/ext3/inode.c | 18
-rw-r--r--  include/linux/kvm.h | 115
-rw-r--r--  include/linux/kvm_host.h | 62
-rw-r--r--  include/linux/kvm_types.h | 13
-rw-r--r--  include/linux/mlx4/cmd.h | 1
-rw-r--r--  include/linux/mlx4/device.h | 6
-rw-r--r--  include/rdma/ib_cm.h | 12
-rw-r--r--  include/rdma/ib_mad.h | 4
-rw-r--r--  include/rdma/ib_smi.h | 34
-rw-r--r--  virt/kvm/ioapic.c | 39
-rw-r--r--  virt/kvm/ioapic.h | 2
-rw-r--r--  virt/kvm/irq_comm.c | 297
-rw-r--r--  virt/kvm/kvm_main.c | 141
185 files changed, 7036 insertions, 2692 deletions
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 1e89a51ea49..88519daab6e 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,7 +62,6 @@ aic7*reg_print.c*
aic7*seq.h*
aicasm
aicdb.h*
-asm
asm-offsets.h
asm_offsets.h
autoconf.h*
diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c
index 8373736c24d..68cc3efae56 100644
--- a/arch/arm/mach-orion5x/common.c
+++ b/arch/arm/mach-orion5x/common.c
@@ -31,6 +31,7 @@
#include <plat/ehci-orion.h>
#include <plat/mv_xor.h>
#include <plat/orion_nand.h>
+#include <plat/orion5x_wdt.h>
#include <plat/time.h>
#include "common.h"
@@ -536,6 +537,29 @@ void __init orion5x_xor_init(void)
/*****************************************************************************
+ * Watchdog
+ ****************************************************************************/
+static struct orion5x_wdt_platform_data orion5x_wdt_data = {
+ .tclk = 0,
+};
+
+static struct platform_device orion5x_wdt_device = {
+ .name = "orion5x_wdt",
+ .id = -1,
+ .dev = {
+ .platform_data = &orion5x_wdt_data,
+ },
+ .num_resources = 0,
+};
+
+void __init orion5x_wdt_init(void)
+{
+ orion5x_wdt_data.tclk = orion5x_tclk;
+ platform_device_register(&orion5x_wdt_device);
+}
+
+
+/*****************************************************************************
* Time handling
****************************************************************************/
int orion5x_tclk;
@@ -634,6 +658,11 @@ void __init orion5x_init(void)
printk(KERN_INFO "Orion: Applying 5281 D0 WFI workaround.\n");
disable_hlt();
}
+
+ /*
+ * Register watchdog driver
+ */
+ orion5x_wdt_init();
}
/*
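[Note: the hunks above register an "orion5x_wdt" platform device whose only platform data is the tclk rate, filled in at init time. A minimal sketch of the consuming side, assuming the standard platform-driver probe shape; wdt_tclk is an illustrative name, not necessarily what drivers/watchdog/orion5x_wdt.c uses:]

#include <linux/platform_device.h>
#include <plat/orion5x_wdt.h>

static u32 wdt_tclk;	/* illustrative: timer clock rate in Hz */

static int __devinit orion5x_wdt_probe(struct platform_device *pdev)
{
	struct orion5x_wdt_platform_data *pdata = pdev->dev.platform_data;

	if (!pdata)
		return -ENODEV;
	wdt_tclk = pdata->tclk;	/* used to convert seconds into timer ticks */
	return 0;
}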
diff --git a/arch/arm/plat-orion/include/plat/orion5x_wdt.h b/arch/arm/plat-orion/include/plat/orion5x_wdt.h
new file mode 100644
index 00000000000..3c9cf6a305e
--- /dev/null
+++ b/arch/arm/plat-orion/include/plat/orion5x_wdt.h
@@ -0,0 +1,18 @@
+/*
+ * arch/arm/plat-orion/include/plat/orion5x_wdt.h
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef __PLAT_ORION5X_WDT_H
+#define __PLAT_ORION5X_WDT_H
+
+struct orion5x_wdt_platform_data {
+ u32 tclk; /* no <linux/clk.h> support yet */
+};
+
+
+#endif
+
diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index bfa86b6af7c..0ee5bd7a988 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -166,7 +166,40 @@ struct saved_vpd {
unsigned long vcpuid[5];
unsigned long vpsr;
unsigned long vpr;
- unsigned long vcr[128];
+ union {
+ unsigned long vcr[128];
+ struct {
+ unsigned long dcr;
+ unsigned long itm;
+ unsigned long iva;
+ unsigned long rsv1[5];
+ unsigned long pta;
+ unsigned long rsv2[7];
+ unsigned long ipsr;
+ unsigned long isr;
+ unsigned long rsv3;
+ unsigned long iip;
+ unsigned long ifa;
+ unsigned long itir;
+ unsigned long iipa;
+ unsigned long ifs;
+ unsigned long iim;
+ unsigned long iha;
+ unsigned long rsv4[38];
+ unsigned long lid;
+ unsigned long ivr;
+ unsigned long tpr;
+ unsigned long eoi;
+ unsigned long irr[4];
+ unsigned long itv;
+ unsigned long pmv;
+ unsigned long cmcv;
+ unsigned long rsv5[5];
+ unsigned long lrr0;
+ unsigned long lrr1;
+ unsigned long rsv6[46];
+ };
+ };
};
struct kvm_regs {
@@ -214,4 +247,18 @@ struct kvm_sregs {
struct kvm_fpu {
};
+#define KVM_IA64_VCPU_STACK_SHIFT 16
+#define KVM_IA64_VCPU_STACK_SIZE (1UL << KVM_IA64_VCPU_STACK_SHIFT)
+
+struct kvm_ia64_vcpu_stack {
+ unsigned char stack[KVM_IA64_VCPU_STACK_SIZE];
+};
+
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
#endif
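[Note: wrapping vcr[128] in an anonymous union keeps the layout and any existing array-based save/restore code unchanged, while new code can address the control registers by name. A minimal illustration of the aliasing (values are arbitrary):]

static void vpd_alias_example(struct saved_vpd *vpd)
{
	vpd->vcr[0] = 0x1234;	/* raw array view: CR0 */
	/* named view of the same storage: vpd->dcr is now 0x1234 */
	vpd->itm = vpd->vcr[1];	/* CR1 is itm, so this is a no-op */
}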
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 34866366165..4542651e6ac 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -112,7 +112,11 @@
#define VCPU_STRUCT_SHIFT 16
#define VCPU_STRUCT_SIZE (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
-#define KVM_STK_OFFSET VCPU_STRUCT_SIZE
+/*
+ * This must match KVM_IA64_VCPU_STACK_{SHIFT,SIZE} in arch/ia64/include/asm/kvm.h
+ */
+#define KVM_STK_SHIFT 16
+#define KVM_STK_OFFSET (__IA64_UL_CONST(1) << KVM_STK_SHIFT)
#define KVM_VM_STRUCT_SHIFT 19
#define KVM_VM_STRUCT_SIZE (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
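[Note: the comment above states an invariant that now spans two headers; a compile-time guard would catch drift. This is a suggestion, not part of the patch:]

/* hypothetical check, callable from e.g. arch/ia64/kvm/kvm-ia64.c init */
static inline void kvm_ia64_check_stack_size(void)
{
	BUILD_BUG_ON(KVM_STK_OFFSET != KVM_IA64_VCPU_STACK_SIZE);
}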
@@ -153,10 +157,10 @@ struct kvm_vm_data {
struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
};
-#define VCPU_BASE(n) KVM_VM_DATA_BASE + \
- offsetof(struct kvm_vm_data, vcpu_data[n])
-#define VM_BASE KVM_VM_DATA_BASE + \
- offsetof(struct kvm_vm_data, kvm_vm_struct)
+#define VCPU_BASE(n) (KVM_VM_DATA_BASE + \
+ offsetof(struct kvm_vm_data, vcpu_data[n]))
+#define KVM_VM_BASE (KVM_VM_DATA_BASE + \
+ offsetof(struct kvm_vm_data, kvm_vm_struct))
#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \
offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
@@ -235,8 +239,6 @@ struct kvm_vm_data {
struct kvm;
struct kvm_vcpu;
-struct kvm_guest_debug{
-};
struct kvm_mmio_req {
uint64_t addr; /* physical address */
@@ -462,6 +464,8 @@ struct kvm_arch {
unsigned long metaphysical_rr4;
unsigned long vmm_init_rr;
+ int online_vcpus;
+
struct kvm_ioapic *vioapic;
struct kvm_vm_stat stat;
struct kvm_sal_data rdv_sal_data;
diff --git a/arch/ia64/include/asm/msidef.h b/arch/ia64/include/asm/msidef.h
new file mode 100644
index 00000000000..592c1047a0c
--- /dev/null
+++ b/arch/ia64/include/asm/msidef.h
@@ -0,0 +1,42 @@
+#ifndef _IA64_MSI_DEF_H
+#define _IA64_MSI_DEF_H
+
+/*
+ * Shifts for APIC-based data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT 0
+#define MSI_DATA_VECTOR(v) (((u8)v) << MSI_DATA_VECTOR_SHIFT)
+#define MSI_DATA_VECTOR_MASK 0xffffff00
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT 8
+#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT 14
+#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
+#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT 15
+#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
+#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for APIC-based bus address
+ */
+
+#define MSI_ADDR_DEST_ID_SHIFT 4
+#define MSI_ADDR_HEADER 0xfee00000
+
+#define MSI_ADDR_DEST_ID_MASK 0xfff0000f
+#define MSI_ADDR_DEST_ID_CPU(cpu) ((cpu) << MSI_ADDR_DEST_ID_SHIFT)
+
+#define MSI_ADDR_DEST_MODE_SHIFT 2
+#define MSI_ADDR_DEST_MODE_PHYS (0 << MSI_ADDR_DEST_MODE_SHIFT)
+#define MSI_ADDR_DEST_MODE_LOGIC (1 << MSI_ADDR_DEST_MODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT 3
+#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+
+#endif /* _IA64_MSI_DEF_H */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 89033933903..368ee4e5266 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -7,44 +7,7 @@
#include <linux/msi.h>
#include <linux/dmar.h>
#include <asm/smp.h>
-
-/*
- * Shifts for APIC-based data
- */
-
-#define MSI_DATA_VECTOR_SHIFT 0
-#define MSI_DATA_VECTOR(v) (((u8)v) << MSI_DATA_VECTOR_SHIFT)
-#define MSI_DATA_VECTOR_MASK 0xffffff00
-
-#define MSI_DATA_DELIVERY_SHIFT 8
-#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_SHIFT)
-#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_SHIFT)
-
-#define MSI_DATA_LEVEL_SHIFT 14
-#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
-#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
-
-#define MSI_DATA_TRIGGER_SHIFT 15
-#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
-#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for APIC-based bus address
- */
-
-#define MSI_TARGET_CPU_SHIFT 4
-#define MSI_ADDR_HEADER 0xfee00000
-
-#define MSI_ADDR_DESTID_MASK 0xfff0000f
-#define MSI_ADDR_DESTID_CPU(cpu) ((cpu) << MSI_TARGET_CPU_SHIFT)
-
-#define MSI_ADDR_DESTMODE_SHIFT 2
-#define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT)
-#define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT)
-
-#define MSI_ADDR_REDIRECTION_SHIFT 3
-#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
-#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+#include <asm/msidef.h>
static struct irq_chip ia64_msi_chip;
@@ -65,8 +28,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
read_msi_msg(irq, &msg);
addr = msg.address_lo;
- addr &= MSI_ADDR_DESTID_MASK;
- addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ addr &= MSI_ADDR_DEST_ID_MASK;
+ addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
msg.address_lo = addr;
data = msg.data;
@@ -98,9 +61,9 @@ int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
msg.address_hi = 0;
msg.address_lo =
MSI_ADDR_HEADER |
- MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_DEST_MODE_PHYS |
MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DESTID_CPU(dest_phys_id);
+ MSI_ADDR_DEST_ID_CPU(dest_phys_id);
msg.data =
MSI_DATA_TRIGGER_EDGE |
@@ -183,8 +146,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
- msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
dmar_msi_write(irq, &msg);
irq_desc[irq].affinity = *mask;
@@ -215,9 +178,9 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
msg->address_hi = 0;
msg->address_lo =
MSI_ADDR_HEADER |
- MSI_ADDR_DESTMODE_PHYS |
+ MSI_ADDR_DEST_MODE_PHYS |
MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DESTID_CPU(dest);
+ MSI_ADDR_DEST_ID_CPU(dest);
msg->data =
MSI_DATA_TRIGGER_EDGE |
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index f833a0b4188..0a2d6b86075 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -4,6 +4,10 @@
config HAVE_KVM
bool
+config HAVE_KVM_IRQCHIP
+ bool
+ default y
+
menuconfig VIRTUALIZATION
bool "Virtualization"
depends on HAVE_KVM || IA64
diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
index c6786e8b1bf..c0785a72827 100644
--- a/arch/ia64/kvm/irq.h
+++ b/arch/ia64/kvm/irq.h
@@ -23,6 +23,8 @@
#ifndef __IRQ_H
#define __IRQ_H
+#include "lapic.h"
+
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return 1;
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 28f982045f2..076b00d1dbf 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -182,7 +182,7 @@ int kvm_dev_ioctl_check_extension(long ext)
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_MP_STATE:
-
+ case KVM_CAP_IRQ_INJECT_STATUS:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -314,7 +314,7 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
union ia64_lid lid;
int i;
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
+ for (i = 0; i < kvm->arch.online_vcpus; i++) {
if (kvm->vcpus[i]) {
lid.val = VCPU_LID(kvm->vcpus[i]);
if (lid.id == id && lid.eid == eid)
@@ -388,7 +388,7 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
call_data.ptc_g_data = p->u.ptc_g_data;
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
+ for (i = 0; i < kvm->arch.online_vcpus; i++) {
if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state ==
KVM_MP_STATE_UNINITIALIZED ||
vcpu == kvm->vcpus[i])
@@ -788,6 +788,8 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
kvm_init_vm(kvm);
+ kvm->arch.online_vcpus = 0;
+
return kvm;
}
@@ -919,7 +921,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_ioapic_init(kvm);
if (r)
goto out;
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kfree(kvm->arch.vioapic);
+ goto out;
+ }
break;
+ case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
@@ -927,10 +935,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
if (irqchip_in_kernel(kvm)) {
+ __s32 status;
mutex_lock(&kvm->lock);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
mutex_unlock(&kvm->lock);
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ irq_event.status = status;
+ if (copy_to_user(argp, &irq_event,
+ sizeof irq_event))
+ goto out;
+ }
r = 0;
}
break;
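[Note: KVM_IRQ_LINE_STATUS behaves like KVM_IRQ_LINE but copies the kvm_set_irq() return value back to userspace through the same buffer (irq and status share a union in struct kvm_irq_level). A hedged userspace sketch, assuming vm_fd is an open VM descriptor:]

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int inject_gsi(int vm_fd, unsigned int gsi)
{
	struct kvm_irq_level irq_level;

	irq_level.irq = gsi;	/* overwritten with the status on return */
	irq_level.level = 1;
	if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq_level) < 0)
		return -1;
	return irq_level.status;	/* 0: coalesced, > 0: delivered */
}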
@@ -1149,7 +1164,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
/*Initialize itc offset for vcpus*/
itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
+ for (i = 0; i < kvm->arch.online_vcpus; i++) {
v = (struct kvm_vcpu *)((char *)vcpu +
sizeof(struct kvm_vcpu_data) * i);
v->arch.itc_offset = itc_offset;
@@ -1283,6 +1298,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
goto fail;
}
+ kvm->arch.online_vcpus++;
+
return vcpu;
fail:
return ERR_PTR(r);
@@ -1303,8 +1320,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return -EINVAL;
}
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
{
return -EINVAL;
}
@@ -1421,6 +1438,23 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
return 0;
}
+int kvm_arch_vcpu_ioctl_get_stack(struct kvm_vcpu *vcpu,
+ struct kvm_ia64_vcpu_stack *stack)
+{
+ memcpy(stack, vcpu, sizeof(struct kvm_ia64_vcpu_stack));
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_stack(struct kvm_vcpu *vcpu,
+ struct kvm_ia64_vcpu_stack *stack)
+{
+ memcpy(vcpu + 1, &stack->stack[0] + sizeof(struct kvm_vcpu),
+ sizeof(struct kvm_ia64_vcpu_stack) - sizeof(struct kvm_vcpu));
+
+ vcpu->arch.exit_data = ((struct kvm_vcpu *)stack)->arch.exit_data;
+ return 0;
+}
+
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
@@ -1430,9 +1464,78 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
long kvm_arch_vcpu_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+ unsigned int ioctl, unsigned long arg)
{
- return -EINVAL;
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ struct kvm_ia64_vcpu_stack *stack = NULL;
+ long r;
+
+ switch (ioctl) {
+ case KVM_IA64_VCPU_GET_STACK: {
+ struct kvm_ia64_vcpu_stack __user *user_stack;
+ void __user *first_p = argp;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_stack, first_p, sizeof(void *)))
+ goto out;
+
+ if (!access_ok(VERIFY_WRITE, user_stack,
+ sizeof(struct kvm_ia64_vcpu_stack))) {
+ printk(KERN_INFO "KVM_IA64_VCPU_GET_STACK: "
+ "Illegal user destination address for stack\n");
+ goto out;
+ }
+ stack = kzalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL);
+ if (!stack) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ r = kvm_arch_vcpu_ioctl_get_stack(vcpu, stack);
+ if (r)
+ goto out;
+
+ if (copy_to_user(user_stack, stack,
+ sizeof(struct kvm_ia64_vcpu_stack)))
+ goto out;
+
+ break;
+ }
+ case KVM_IA64_VCPU_SET_STACK: {
+ struct kvm_ia64_vcpu_stack __user *user_stack;
+ void __user *first_p = argp;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_stack, first_p, sizeof(void *)))
+ goto out;
+
+ if (!access_ok(VERIFY_READ, user_stack,
+ sizeof(struct kvm_ia64_vcpu_stack))) {
+ printk(KERN_INFO "KVM_IA64_VCPU_SET_STACK: "
+ "Illegal user address for stack\n");
+ goto out;
+ }
+ stack = kmalloc(sizeof(struct kvm_ia64_vcpu_stack), GFP_KERNEL);
+ if (!stack) {
+ r = -ENOMEM;
+ goto out;
+ }
+ if (copy_from_user(stack, user_stack,
+ sizeof(struct kvm_ia64_vcpu_stack)))
+ goto out;
+
+ r = kvm_arch_vcpu_ioctl_set_stack(vcpu, stack);
+ break;
+ }
+
+ default:
+ r = -EINVAL;
+ }
+
+out:
+ kfree(stack);
+ return r;
}
int kvm_arch_set_memory_region(struct kvm *kvm,
@@ -1472,7 +1575,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
}
long kvm_arch_dev_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+ unsigned int ioctl, unsigned long arg)
{
return -EINVAL;
}
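[Note: the KVM_IA64_VCPU_GET_STACK/SET_STACK handlers above first copy a user pointer (sizeof(void *)) out of the ioctl argument and then transfer the 64 KiB stack image through it, so userspace must pass the address of its buffer pointer, not the buffer itself. A hedged sketch:]

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int get_vcpu_stack(int vcpu_fd, struct kvm_ia64_vcpu_stack *stack)
{
	/* the argument is the address of the buffer pointer */
	return ioctl(vcpu_fd, KVM_IA64_VCPU_GET_STACK, &stack);
}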
@@ -1737,7 +1840,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
struct kvm_vcpu *lvcpu = kvm->vcpus[0];
int i;
- for (i = 1; i < KVM_MAX_VCPUS; i++) {
+ for (i = 1; i < kvm->arch.online_vcpus; i++) {
if (!kvm->vcpus[i])
continue;
if (lvcpu->arch.xtp > kvm->vcpus[i]->arch.xtp)
diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
index cb7600bdff9..a8ae52ed563 100644
--- a/arch/ia64/kvm/kvm_fw.c
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -227,6 +227,18 @@ static struct ia64_pal_retval pal_proc_get_features(struct kvm_vcpu *vcpu)
return result;
}
+static struct ia64_pal_retval pal_register_info(struct kvm_vcpu *vcpu)
+{
+
+ struct ia64_pal_retval result = {0, 0, 0, 0};
+ long in0, in1, in2, in3;
+
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+ result.status = ia64_pal_register_info(in1, &result.v1, &result.v2);
+
+ return result;
+}
+
static struct ia64_pal_retval pal_cache_info(struct kvm_vcpu *vcpu)
{
@@ -268,8 +280,12 @@ static struct ia64_pal_retval pal_vm_summary(struct kvm_vcpu *vcpu)
static struct ia64_pal_retval pal_vm_info(struct kvm_vcpu *vcpu)
{
struct ia64_pal_retval result;
+ unsigned long in0, in1, in2, in3;
- INIT_PAL_STATUS_UNIMPLEMENTED(result);
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+
+ result.status = ia64_pal_vm_info(in1, in2,
+ (pal_tc_info_u_t *)&result.v1, &result.v2);
return result;
}
@@ -292,6 +308,108 @@ static void prepare_for_halt(struct kvm_vcpu *vcpu)
vcpu->arch.timer_fired = 0;
}
+static struct ia64_pal_retval pal_perf_mon_info(struct kvm_vcpu *vcpu)
+{
+ long status;
+ unsigned long in0, in1, in2, in3, r9;
+ unsigned long pm_buffer[16];
+
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+ status = ia64_pal_perf_mon_info(pm_buffer,
+ (pal_perf_mon_info_u_t *) &r9);
+ if (status != 0) {
+ printk(KERN_DEBUG"PAL_PERF_MON_INFO fails ret=%ld\n", status);
+ } else {
+ if (in1)
+ memcpy((void *)in1, pm_buffer, sizeof(pm_buffer));
+ else {
+ status = PAL_STATUS_EINVAL;
+ printk(KERN_WARNING"Invalid parameters "
+ "for PAL call:0x%lx!\n", in0);
+ }
+ }
+ return (struct ia64_pal_retval){status, r9, 0, 0};
+}
+
+static struct ia64_pal_retval pal_halt_info(struct kvm_vcpu *vcpu)
+{
+ unsigned long in0, in1, in2, in3;
+ long status;
+ unsigned long res = 1000UL | (1000UL << 16) | (10UL << 32)
+ | (1UL << 61) | (1UL << 60);
+
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+ if (in1) {
+ memcpy((void *)in1, &res, sizeof(res));
+ status = 0;
+ } else {
+ status = PAL_STATUS_EINVAL;
+ printk(KERN_WARNING"Invalid parameters "
+ "for PAL call:0x%lx!\n", in0);
+ }
+
+ return (struct ia64_pal_retval){status, 0, 0, 0};
+}
+
+static struct ia64_pal_retval pal_mem_attrib(struct kvm_vcpu *vcpu)
+{
+ unsigned long r9;
+ long status;
+
+ status = ia64_pal_mem_attrib(&r9);
+
+ return (struct ia64_pal_retval){status, r9, 0, 0};
+}
+
+static void remote_pal_prefetch_visibility(void *v)
+{
+ s64 trans_type = (s64)v;
+ ia64_pal_prefetch_visibility(trans_type);
+}
+
+static struct ia64_pal_retval pal_prefetch_visibility(struct kvm_vcpu *vcpu)
+{
+ struct ia64_pal_retval result = {0, 0, 0, 0};
+ unsigned long in0, in1, in2, in3;
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+ result.status = ia64_pal_prefetch_visibility(in1);
+ if (result.status == 0) {
+ /* Must be performed on all remote processors
+ in the coherence domain. */
+ smp_call_function(remote_pal_prefetch_visibility,
+ (void *)in1, 1);
+ /* Unnecessary on remote processor for other vcpus!*/
+ result.status = 1;
+ }
+ return result;
+}
+
+static void remote_pal_mc_drain(void *v)
+{
+ ia64_pal_mc_drain();
+}
+
+static struct ia64_pal_retval pal_get_brand_info(struct kvm_vcpu *vcpu)
+{
+ struct ia64_pal_retval result = {0, 0, 0, 0};
+ unsigned long in0, in1, in2, in3;
+
+ kvm_get_pal_call_data(vcpu, &in0, &in1, &in2, &in3);
+
+ if (in1 == 0 && in2) {
+ char brand_info[128];
+ result.status = ia64_pal_get_brand_info(brand_info);
+ if (result.status == PAL_STATUS_SUCCESS)
+ memcpy((void *)in2, brand_info, 128);
+ } else {
+ result.status = PAL_STATUS_REQUIRES_MEMORY;
+ printk(KERN_WARNING"Invalid parameters for "
+ "PAL call:0x%lx!\n", in0);
+ }
+
+ return result;
+}
+
int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
@@ -300,14 +418,22 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
int ret = 1;
gr28 = kvm_get_pal_call_index(vcpu);
- /*printk("pal_call index:%lx\n",gr28);*/
switch (gr28) {
case PAL_CACHE_FLUSH:
result = pal_cache_flush(vcpu);
break;
+ case PAL_MEM_ATTRIB:
+ result = pal_mem_attrib(vcpu);
+ break;
case PAL_CACHE_SUMMARY:
result = pal_cache_summary(vcpu);
break;
+ case PAL_PERF_MON_INFO:
+ result = pal_perf_mon_info(vcpu);
+ break;
+ case PAL_HALT_INFO:
+ result = pal_halt_info(vcpu);
+ break;
case PAL_HALT_LIGHT:
{
INIT_PAL_STATUS_SUCCESS(result);
@@ -317,6 +443,16 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
}
break;
+ case PAL_PREFETCH_VISIBILITY:
+ result = pal_prefetch_visibility(vcpu);
+ break;
+ case PAL_MC_DRAIN:
+ result.status = ia64_pal_mc_drain();
+ /* FIXME: All vcpus likely call PAL_MC_DRAIN.
+ That causes the congestion. */
+ smp_call_function(remote_pal_mc_drain, NULL, 1);
+ break;
+
case PAL_FREQ_RATIOS:
result = pal_freq_ratios(vcpu);
break;
@@ -346,6 +482,9 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
INIT_PAL_STATUS_SUCCESS(result);
result.v1 = (1L << 32) | 1L;
break;
+ case PAL_REGISTER_INFO:
+ result = pal_register_info(vcpu);
+ break;
case PAL_VM_PAGE_SIZE:
result.status = ia64_pal_vm_page_size(&result.v0,
&result.v1);
@@ -365,12 +504,18 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *run)
result.status = ia64_pal_version(
(pal_version_u_t *)&result.v0,
(pal_version_u_t *)&result.v1);
-
break;
case PAL_FIXED_ADDR:
result.status = PAL_STATUS_SUCCESS;
result.v0 = vcpu->vcpu_id;
break;
+ case PAL_BRAND_INFO:
+ result = pal_get_brand_info(vcpu);
+ break;
+ case PAL_GET_PSTATE:
+ case PAL_CACHE_SHARED_INFO:
+ INIT_PAL_STATUS_UNIMPLEMENTED(result);
+ break;
default:
INIT_PAL_STATUS_UNIMPLEMENTED(result);
printk(KERN_WARNING"kvm: Unsupported pal call,"
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 230eae482f3..b1dc80952d9 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -167,7 +167,6 @@ static u64 vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, u64 ifa)
return (rr1.val);
}
-
/*
* Set vIFA & vITIR & vIHA, when vPSR.ic =1
* Parameter:
@@ -222,8 +221,6 @@ void itlb_fault(struct kvm_vcpu *vcpu, u64 vadr)
inject_guest_interruption(vcpu, IA64_INST_TLB_VECTOR);
}
-
-
/*
* Data Nested TLB Fault
* @ Data Nested TLB Vector
@@ -245,7 +242,6 @@ void alt_dtlb(struct kvm_vcpu *vcpu, u64 vadr)
inject_guest_interruption(vcpu, IA64_ALT_DATA_TLB_VECTOR);
}
-
/*
* Data TLB Fault
* @ Data TLB vector
@@ -265,8 +261,6 @@ static void _vhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
/* If vPSR.ic, IFA, ITIR, IHA*/
set_ifa_itir_iha(vcpu, vadr, 1, 1, 1);
inject_guest_interruption(vcpu, IA64_VHPT_TRANS_VECTOR);
-
-
}
/*
@@ -279,7 +273,6 @@ void ivhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
_vhpt_fault(vcpu, vadr);
}
-
/*
* VHPT Data Fault
* @ VHPT Translation vector
@@ -290,8 +283,6 @@ void dvhpt_fault(struct kvm_vcpu *vcpu, u64 vadr)
_vhpt_fault(vcpu, vadr);
}
-
-
/*
* Deal with:
* General Exception vector
@@ -301,7 +292,6 @@ void _general_exception(struct kvm_vcpu *vcpu)
inject_guest_interruption(vcpu, IA64_GENEX_VECTOR);
}
-
/*
* Illegal Operation Fault
* @ General Exception Vector
@@ -419,19 +409,16 @@ static void __page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
inject_guest_interruption(vcpu, IA64_PAGE_NOT_PRESENT_VECTOR);
}
-
void data_page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
{
__page_not_present(vcpu, vadr);
}
-
void inst_page_not_present(struct kvm_vcpu *vcpu, u64 vadr)
{
__page_not_present(vcpu, vadr);
}
-
/* Deal with
* Data access rights vector
*/
@@ -563,22 +550,64 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim,
inject_guest_interruption(vcpu, vector);
}
+static unsigned long kvm_trans_pal_call_args(struct kvm_vcpu *vcpu,
+ unsigned long arg)
+{
+ struct thash_data *data;
+ unsigned long gpa, poff;
+
+ if (!is_physical_mode(vcpu)) {
+ /* Depends on caller to provide the DTR or DTC mapping.*/
+ data = vtlb_lookup(vcpu, arg, D_TLB);
+ if (data)
+ gpa = data->page_flags & _PAGE_PPN_MASK;
+ else {
+ data = vhpt_lookup(arg);
+ if (!data)
+ return 0;
+ gpa = data->gpaddr & _PAGE_PPN_MASK;
+ }
+
+ poff = arg & (PSIZE(data->ps) - 1);
+ arg = PAGEALIGN(gpa, data->ps) | poff;
+ }
+ arg = kvm_gpa_to_mpa(arg << 1 >> 1);
+
+ return (unsigned long)__va(arg);
+}
+
static void set_pal_call_data(struct kvm_vcpu *vcpu)
{
struct exit_ctl_data *p = &vcpu->arch.exit_data;
+ unsigned long gr28 = vcpu_get_gr(vcpu, 28);
+ unsigned long gr29 = vcpu_get_gr(vcpu, 29);
+ unsigned long gr30 = vcpu_get_gr(vcpu, 30);
/*FIXME:For static and stacked convention, firmware
* has put the parameters in gr28-gr31 before
* break to vmm !!*/
- p->u.pal_data.gr28 = vcpu_get_gr(vcpu, 28);
- p->u.pal_data.gr29 = vcpu_get_gr(vcpu, 29);
- p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30);
+ switch (gr28) {
+ case PAL_PERF_MON_INFO:
+ case PAL_HALT_INFO:
+ p->u.pal_data.gr29 = kvm_trans_pal_call_args(vcpu, gr29);
+ p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30);
+ break;
+ case PAL_BRAND_INFO:
+ p->u.pal_data.gr29 = gr29;
+ p->u.pal_data.gr30 = kvm_trans_pal_call_args(vcpu, gr30);
+ break;
+ default:
+ p->u.pal_data.gr29 = gr29;
+ p->u.pal_data.gr30 = vcpu_get_gr(vcpu, 30);
+ }
+ p->u.pal_data.gr28 = gr28;
p->u.pal_data.gr31 = vcpu_get_gr(vcpu, 31);
+
p->exit_reason = EXIT_REASON_PAL_CALL;
}
-static void set_pal_call_result(struct kvm_vcpu *vcpu)
+static void get_pal_call_result(struct kvm_vcpu *vcpu)
{
struct exit_ctl_data *p = &vcpu->arch.exit_data;
@@ -606,7 +635,7 @@ static void set_sal_call_data(struct kvm_vcpu *vcpu)
p->exit_reason = EXIT_REASON_SAL_CALL;
}
-static void set_sal_call_result(struct kvm_vcpu *vcpu)
+static void get_sal_call_result(struct kvm_vcpu *vcpu)
{
struct exit_ctl_data *p = &vcpu->arch.exit_data;
@@ -629,13 +658,13 @@ void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
if (iim == DOMN_PAL_REQUEST) {
set_pal_call_data(v);
vmm_transition(v);
- set_pal_call_result(v);
+ get_pal_call_result(v);
vcpu_increment_iip(v);
return;
} else if (iim == DOMN_SAL_REQUEST) {
set_sal_call_data(v);
vmm_transition(v);
- set_sal_call_result(v);
+ get_sal_call_result(v);
vcpu_increment_iip(v);
return;
}
@@ -703,7 +732,6 @@ void vhpi_detection(struct kvm_vcpu *vcpu)
}
}
-
void leave_hypervisor_tail(void)
{
struct kvm_vcpu *v = current_vcpu;
@@ -737,7 +765,6 @@ void leave_hypervisor_tail(void)
}
}
-
static inline void handle_lds(struct kvm_pt_regs *regs)
{
regs->cr_ipsr |= IA64_PSR_ED;
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index ecd526b5532..d4d28050587 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -112,7 +112,6 @@ void switch_to_physical_rid(struct kvm_vcpu *vcpu)
return;
}
-
void switch_to_virtual_rid(struct kvm_vcpu *vcpu)
{
unsigned long psr;
@@ -166,8 +165,6 @@ void switch_mm_mode(struct kvm_vcpu *vcpu, struct ia64_psr old_psr,
return;
}
-
-
/*
* In physical mode, insert tc/tr for region 0 and 4 uses
* RID[0] and RID[4] which is for physical mode emulation.
@@ -269,7 +266,6 @@ static inline unsigned long fph_index(struct kvm_pt_regs *regs,
return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR));
}
-
/*
* The inverse of the above: given bspstore and the number of
* registers, calculate ar.bsp.
@@ -811,12 +807,15 @@ static inline void vcpu_set_itm(struct kvm_vcpu *vcpu, u64 val);
static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
{
struct kvm_vcpu *v;
+ struct kvm *kvm;
int i;
long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC);
unsigned long vitv = VCPU(vcpu, itv);
+ kvm = (struct kvm *)KVM_VM_BASE;
+
if (vcpu->vcpu_id == 0) {
- for (i = 0; i < KVM_MAX_VCPUS; i++) {
+ for (i = 0; i < kvm->arch.online_vcpus; i++) {
v = (struct kvm_vcpu *)((char *)vcpu +
sizeof(struct kvm_vcpu_data) * i);
VMX(v, itc_offset) = itc_offset;
@@ -1039,8 +1038,6 @@ u64 vcpu_tak(struct kvm_vcpu *vcpu, u64 vadr)
return key;
}
-
-
void kvm_thash(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long thash, vadr;
@@ -1050,7 +1047,6 @@ void kvm_thash(struct kvm_vcpu *vcpu, INST64 inst)
vcpu_set_gr(vcpu, inst.M46.r1, thash, 0);
}
-
void kvm_ttag(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long tag, vadr;
@@ -1131,7 +1127,6 @@ int vcpu_tpa(struct kvm_vcpu *vcpu, u64 vadr, u64 *padr)
return IA64_NO_FAULT;
}
-
int kvm_tpa(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long r1, r3;
@@ -1154,7 +1149,6 @@ void kvm_tak(struct kvm_vcpu *vcpu, INST64 inst)
vcpu_set_gr(vcpu, inst.M46.r1, r1, 0);
}
-
/************************************
* Insert/Purge translation register/cache
************************************/
@@ -1385,7 +1379,6 @@ void kvm_mov_to_ar_reg(struct kvm_vcpu *vcpu, INST64 inst)
vcpu_set_itc(vcpu, r2);
}
-
void kvm_mov_from_ar_reg(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long r1;
@@ -1393,8 +1386,9 @@ void kvm_mov_from_ar_reg(struct kvm_vcpu *vcpu, INST64 inst)
r1 = vcpu_get_itc(vcpu);
vcpu_set_gr(vcpu, inst.M31.r1, r1, 0);
}
+
/**************************************************************************
- struct kvm_vcpu*protection key register access routines
+ struct kvm_vcpu protection key register access routines
**************************************************************************/
unsigned long vcpu_get_pkr(struct kvm_vcpu *vcpu, unsigned long reg)
@@ -1407,20 +1401,6 @@ void vcpu_set_pkr(struct kvm_vcpu *vcpu, unsigned long reg, unsigned long val)
ia64_set_pkr(reg, val);
}
-
-unsigned long vcpu_get_itir_on_fault(struct kvm_vcpu *vcpu, unsigned long ifa)
-{
- union ia64_rr rr, rr1;
-
- rr.val = vcpu_get_rr(vcpu, ifa);
- rr1.val = 0;
- rr1.ps = rr.ps;
- rr1.rid = rr.rid;
- return (rr1.val);
-}
-
-
-
/********************************
* Moves to privileged registers
********************************/
@@ -1464,8 +1444,6 @@ unsigned long vcpu_set_rr(struct kvm_vcpu *vcpu, unsigned long reg,
return (IA64_NO_FAULT);
}
-
-
void kvm_mov_to_rr(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long r3, r2;
@@ -1510,8 +1488,6 @@ void kvm_mov_to_pkr(struct kvm_vcpu *vcpu, INST64 inst)
vcpu_set_pkr(vcpu, r3, r2);
}
-
-
void kvm_mov_from_rr(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long r3, r1;
@@ -1557,7 +1533,6 @@ void kvm_mov_from_pmc(struct kvm_vcpu *vcpu, INST64 inst)
vcpu_set_gr(vcpu, inst.M43.r1, r1, 0);
}
-
unsigned long vcpu_get_cpuid(struct kvm_vcpu *vcpu, unsigned long reg)
{
/* FIXME: This could get called as a result of a rsvd-reg fault */
@@ -1609,7 +1584,6 @@ unsigned long kvm_mov_to_cr(struct kvm_vcpu *vcpu, INST64 inst)
return 0;
}
-
unsigned long kvm_mov_from_cr(struct kvm_vcpu *vcpu, INST64 inst)
{
unsigned long tgt = inst.M33.r1;
@@ -1633,8 +1607,6 @@ unsigned long kvm_mov_from_cr(struct kvm_vcpu *vcpu, INST64 inst)
return 0;
}
-
-
void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
{
@@ -1776,9 +1748,6 @@ void vcpu_bsw1(struct kvm_vcpu *vcpu)
}
}
-
-
-
void vcpu_rfi(struct kvm_vcpu *vcpu)
{
unsigned long ifs, psr;
@@ -1796,7 +1765,6 @@ void vcpu_rfi(struct kvm_vcpu *vcpu)
regs->cr_iip = VCPU(vcpu, iip);
}
-
/*
VPSR can't keep track of below bits of guest PSR
This function gets guest PSR
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index b2f12a562bd..042af92ced8 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -703,7 +703,7 @@ extern u64 guest_vhpt_lookup(u64 iha, u64 *pte);
extern void thash_purge_entries(struct kvm_vcpu *v, u64 va, u64 ps);
extern void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps);
extern u64 translate_phy_pte(u64 *pte, u64 itir, u64 va);
-extern int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte,
+extern void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte,
u64 itir, u64 ifa, int type);
extern void thash_purge_all(struct kvm_vcpu *v);
extern struct thash_data *vtlb_lookup(struct kvm_vcpu *v,
@@ -738,7 +738,7 @@ void kvm_init_vhpt(struct kvm_vcpu *v);
void thash_init(struct thash_cb *hcb, u64 sz);
void panic_vm(struct kvm_vcpu *v, const char *fmt, ...);
-
+u64 kvm_gpa_to_mpa(u64 gpa);
extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
u64 arg4, u64 arg5, u64 arg6, u64 arg7);
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 6b6307a3bd5..38232b37668 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -164,11 +164,11 @@ static void vhpt_insert(u64 pte, u64 itir, u64 ifa, u64 gpte)
unsigned long ps, gpaddr;
ps = itir_ps(itir);
+ rr.val = ia64_get_rr(ifa);
- gpaddr = ((gpte & _PAGE_PPN_MASK) >> ps << ps) |
- (ifa & ((1UL << ps) - 1));
+ gpaddr = ((gpte & _PAGE_PPN_MASK) >> ps << ps) |
+ (ifa & ((1UL << ps) - 1));
- rr.val = ia64_get_rr(ifa);
head = (struct thash_data *)ia64_thash(ifa);
head->etag = INVALID_TI_TAG;
ia64_mf();
@@ -412,16 +412,14 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
/*
* Purge overlap TCs and then insert the new entry to emulate itc ops.
- * Notes: Only TC entry can purge and insert.
- * 1 indicates this is MMIO
+ * Notes: Only TC entry can purge and insert.
*/
-int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
+void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
u64 ifa, int type)
{
u64 ps;
u64 phy_pte, io_mask, index;
union ia64_rr vrr, mrr;
- int ret = 0;
ps = itir_ps(itir);
vrr.val = vcpu_get_rr(v, ifa);
@@ -441,25 +439,19 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
phy_pte &= ~_PAGE_MA_MASK;
}
- if (pte & VTLB_PTE_IO)
- ret = 1;
-
vtlb_purge(v, ifa, ps);
vhpt_purge(v, ifa, ps);
- if (ps == mrr.ps) {
- if (!(pte&VTLB_PTE_IO)) {
- vhpt_insert(phy_pte, itir, ifa, pte);
- } else {
- vtlb_insert(v, pte, itir, ifa);
- vcpu_quick_region_set(VMX(v, tc_regions), ifa);
- }
- } else if (ps > mrr.ps) {
+ if ((ps != mrr.ps) || (pte & VTLB_PTE_IO)) {
vtlb_insert(v, pte, itir, ifa);
vcpu_quick_region_set(VMX(v, tc_regions), ifa);
- if (!(pte&VTLB_PTE_IO))
- vhpt_insert(phy_pte, itir, ifa, pte);
- } else {
+ }
+ if (pte & VTLB_PTE_IO)
+ return;
+
+ if (ps >= mrr.ps)
+ vhpt_insert(phy_pte, itir, ifa, pte);
+ else {
u64 psr;
phy_pte &= ~PAGE_FLAGS_RV_MASK;
psr = ia64_clear_ic();
@@ -469,7 +461,6 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
if (!(pte&VTLB_PTE_IO))
mark_pages_dirty(v, pte, ps);
- return ret;
}
/*
@@ -509,7 +500,6 @@ void thash_purge_all(struct kvm_vcpu *v)
local_flush_tlb_all();
}
-
/*
* Lookup the hash table and its collision chain to find an entry
* covering this address rid:va or the entry.
@@ -517,7 +507,6 @@ void thash_purge_all(struct kvm_vcpu *v)
* INPUT:
* in: TLB format for both VHPT & TLB.
*/
-
struct thash_data *vtlb_lookup(struct kvm_vcpu *v, u64 va, int is_data)
{
struct thash_data *cch;
@@ -547,7 +536,6 @@ struct thash_data *vtlb_lookup(struct kvm_vcpu *v, u64 va, int is_data)
return NULL;
}
-
/*
* Initialize internal control data before service.
*/
@@ -573,6 +561,10 @@ void thash_init(struct thash_cb *hcb, u64 sz)
u64 kvm_get_mpt_entry(u64 gpfn)
{
u64 *base = (u64 *) KVM_P2M_BASE;
+
+ if (gpfn >= (KVM_P2M_SIZE >> 3))
+ panic_vm(current_vcpu, "Invalid gpfn =%lx\n", gpfn);
+
return *(base + gpfn);
}
@@ -589,7 +581,6 @@ u64 kvm_gpa_to_mpa(u64 gpa)
return (pte >> PAGE_SHIFT << PAGE_SHIFT) | (gpa & ~PAGE_MASK);
}
-
/*
* Fetch guest bundle code.
* INPUT:
@@ -631,7 +622,6 @@ int fetch_code(struct kvm_vcpu *vcpu, u64 gip, IA64_BUNDLE *pbundle)
return IA64_NO_FAULT;
}
-
void kvm_init_vhpt(struct kvm_vcpu *v)
{
v->arch.vhpt.num = VHPT_NUM_ENTRIES;
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index f993e4198d5..755f1b1948c 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -52,4 +52,11 @@ struct kvm_fpu {
__u64 fpr[32];
};
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
index f49031b632c..d22d39942a9 100644
--- a/arch/powerpc/include/asm/kvm_44x.h
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -28,6 +28,13 @@
* need to find some way of advertising it. */
#define KVM44x_GUEST_TLB_SIZE 64
+struct kvmppc_44x_tlbe {
+ u32 tid; /* Only the low 8 bits are used. */
+ u32 word0;
+ u32 word1;
+ u32 word2;
+};
+
struct kvmppc_44x_shadow_ref {
struct page *page;
u16 gtlb_index;
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 2197764796d..56bfae59837 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -42,7 +42,12 @@
#define BOOKE_INTERRUPT_DTLB_MISS 13
#define BOOKE_INTERRUPT_ITLB_MISS 14
#define BOOKE_INTERRUPT_DEBUG 15
-#define BOOKE_MAX_INTERRUPT 15
+
+/* E500 */
+#define BOOKE_INTERRUPT_SPE_UNAVAIL 32
+#define BOOKE_INTERRUPT_SPE_FP_DATA 33
+#define BOOKE_INTERRUPT_SPE_FP_ROUND 34
+#define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35
#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h
new file mode 100644
index 00000000000..9d497ce4972
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_e500.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/include/asm/kvm_44x.h,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_KVM_E500_H__
+#define __ASM_KVM_E500_H__
+
+#include <linux/kvm_host.h>
+
+#define BOOKE_INTERRUPT_SIZE 36
+
+#define E500_PID_NUM 3
+#define E500_TLB_NUM 2
+
+struct tlbe {
+ u32 mas1;
+ u32 mas2;
+ u32 mas3;
+ u32 mas7;
+};
+
+struct kvmppc_vcpu_e500 {
+ /* Unmodified copy of the guest's TLB. */
+ struct tlbe *guest_tlb[E500_TLB_NUM];
+ /* TLB that's actually used when the guest is running. */
+ struct tlbe *shadow_tlb[E500_TLB_NUM];
+ /* Pages which are referenced in the shadow TLB. */
+ struct page **shadow_pages[E500_TLB_NUM];
+
+ unsigned int guest_tlb_size[E500_TLB_NUM];
+ unsigned int shadow_tlb_size[E500_TLB_NUM];
+ unsigned int guest_tlb_nv[E500_TLB_NUM];
+
+ u32 host_pid[E500_PID_NUM];
+ u32 pid[E500_PID_NUM];
+
+ u32 mas0;
+ u32 mas1;
+ u32 mas2;
+ u32 mas3;
+ u32 mas4;
+ u32 mas5;
+ u32 mas6;
+ u32 mas7;
+ u32 l1csr1;
+ u32 hid0;
+ u32 hid1;
+
+ struct kvm_vcpu vcpu;
+};
+
+static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
+}
+
+#endif /* __ASM_KVM_E500_H__ */
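[Note: to_e500() is the standard container_of() embedding idiom: the e500 code allocates the outer struct kvmppc_vcpu_e500 and hands core KVM only &vcpu_e500->vcpu, recovering the wrapper later. A sketch under that assumption (the allocator used by the real e500.c may differ):]

static struct kvm_vcpu *e500_vcpu_create_sketch(void)
{
	struct kvmppc_vcpu_e500 *vcpu_e500;

	vcpu_e500 = kzalloc(sizeof(*vcpu_e500), GFP_KERNEL);
	if (!vcpu_e500)
		return ERR_PTR(-ENOMEM);
	/* to_e500(&vcpu_e500->vcpu) round-trips back to vcpu_e500 */
	return &vcpu_e500->vcpu;
}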
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c1e436fe773..dfdf13c9fef 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -64,13 +64,6 @@ struct kvm_vcpu_stat {
u32 halt_wakeup;
};
-struct kvmppc_44x_tlbe {
- u32 tid; /* Only the low 8 bits are used. */
- u32 word0;
- u32 word1;
- u32 word2;
-};
-
enum kvm_exit_types {
MMIO_EXITS,
DCR_EXITS,
@@ -118,11 +111,6 @@ struct kvm_arch {
struct kvm_vcpu_arch {
u32 host_stack;
u32 host_pid;
- u32 host_dbcr0;
- u32 host_dbcr1;
- u32 host_dbcr2;
- u32 host_iac[4];
- u32 host_msr;
u64 fpr[32];
ulong gpr[32];
@@ -157,7 +145,7 @@ struct kvm_vcpu_arch {
u32 tbu;
u32 tcr;
u32 tsr;
- u32 ivor[16];
+ u32 ivor[64];
ulong ivpr;
u32 pir;
@@ -170,6 +158,7 @@ struct kvm_vcpu_arch {
u32 ccr1;
u32 dbcr0;
u32 dbcr1;
+ u32 dbsr;
#ifdef CONFIG_KVM_EXIT_TIMING
struct kvmppc_exit_timing timing_exit;
@@ -200,10 +189,4 @@ struct kvm_vcpu_arch {
unsigned long pending_exceptions;
};
-struct kvm_guest_debug {
- int enabled;
- unsigned long bp[4];
- int singlestep;
-};
-
#endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 36d2a50a848..2c6ee349df5 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -52,13 +52,19 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
+/* Core-specific hooks */
+
extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
- u64 asid, u32 flags, u32 max_bytes,
unsigned int gtlb_idx);
extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
-
-/* Core-specific hooks */
+extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
+ gva_t eaddr);
+extern void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu);
extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
unsigned int id);
@@ -71,9 +77,6 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
-extern void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu);
-extern void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu);
-
extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/mmu-fsl-booke.h b/arch/powerpc/include/asm/mmu-fsl-booke.h
index 3f941c0f7e8..4285b64a65e 100644
--- a/arch/powerpc/include/asm/mmu-fsl-booke.h
+++ b/arch/powerpc/include/asm/mmu-fsl-booke.h
@@ -75,6 +75,8 @@
#ifndef __ASSEMBLY__
+extern unsigned int tlbcam_index;
+
typedef struct {
unsigned int id;
unsigned int active;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 19ee491e9e2..42fe4da4e8a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -49,7 +49,7 @@
#include <asm/iseries/alpaca.h>
#endif
#ifdef CONFIG_KVM
-#include <asm/kvm_44x.h>
+#include <linux/kvm_host.h>
#endif
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
@@ -361,8 +361,6 @@ int main(void)
DEFINE(PTE_SIZE, sizeof(pte_t));
#ifdef CONFIG_KVM
- DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
-
DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index a66bec57265..0cef809cec2 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -28,72 +28,6 @@
#include "44x_tlb.h"
-/* Note: clearing MSR[DE] just means that the debug interrupt will not be
- * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
- * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
- * will be delivered as an "imprecise debug event" (which is indicated by
- * DBSR[IDE].
- */
-static void kvm44x_disable_debug_interrupts(void)
-{
- mtmsr(mfmsr() & ~MSR_DE);
-}
-
-void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
-{
- kvm44x_disable_debug_interrupts();
-
- mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
- mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
- mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
- mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
- mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
- mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
- mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
- mtmsr(vcpu->arch.host_msr);
-}
-
-void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
-{
- struct kvm_guest_debug *dbg = &vcpu->guest_debug;
- u32 dbcr0 = 0;
-
- vcpu->arch.host_msr = mfmsr();
- kvm44x_disable_debug_interrupts();
-
- /* Save host debug register state. */
- vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
- vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
- vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
- vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
- vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
- vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
- vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
-
- /* set registers up for guest */
-
- if (dbg->bp[0]) {
- mtspr(SPRN_IAC1, dbg->bp[0]);
- dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
- }
- if (dbg->bp[1]) {
- mtspr(SPRN_IAC2, dbg->bp[1]);
- dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
- }
- if (dbg->bp[2]) {
- mtspr(SPRN_IAC3, dbg->bp[2]);
- dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
- }
- if (dbg->bp[3]) {
- mtspr(SPRN_IAC4, dbg->bp[3]);
- dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
- }
-
- mtspr(SPRN_DBCR0, dbcr0);
- mtspr(SPRN_DBCR1, 0);
- mtspr(SPRN_DBCR2, 0);
-}
-
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvmppc_44x_tlb_load(vcpu);
@@ -149,8 +83,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr)
{
- struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
- struct kvmppc_44x_tlbe *gtlbe;
int index;
gva_t eaddr;
u8 pid;
@@ -166,9 +98,7 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
return 0;
}
- gtlbe = &vcpu_44x->guest_tlb[index];
-
- tr->physical_address = tlb_xlate(gtlbe, eaddr);
+ tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
/* XXX what does "writeable" and "usermode" even mean? */
tr->valid = 1;
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 82489a743a6..61af58fcece 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -27,25 +27,12 @@
#include "booke.h"
#include "44x_tlb.h"
-#define OP_RFI 19
-
-#define XOP_RFI 50
-#define XOP_MFMSR 83
-#define XOP_WRTEE 131
-#define XOP_MTMSR 146
-#define XOP_WRTEEI 163
#define XOP_MFDCR 323
#define XOP_MTDCR 451
#define XOP_TLBSX 914
#define XOP_ICCCI 966
#define XOP_TLBWE 978
-static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.pc = vcpu->arch.srr0;
- kvmppc_set_msr(vcpu, vcpu->arch.srr1);
-}
-
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance)
{
@@ -59,48 +46,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
int ws;
switch (get_op(inst)) {
- case OP_RFI:
- switch (get_xop(inst)) {
- case XOP_RFI:
- kvmppc_emul_rfi(vcpu);
- kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
- *advance = 0;
- break;
-
- default:
- emulated = EMULATE_FAIL;
- break;
- }
- break;
-
case 31:
switch (get_xop(inst)) {
- case XOP_MFMSR:
- rt = get_rt(inst);
- vcpu->arch.gpr[rt] = vcpu->arch.msr;
- kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
- break;
-
- case XOP_MTMSR:
- rs = get_rs(inst);
- kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
- kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
- break;
-
- case XOP_WRTEE:
- rs = get_rs(inst);
- vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
- | (vcpu->arch.gpr[rs] & MSR_EE);
- kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
- break;
-
- case XOP_WRTEEI:
- vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
- | (inst & MSR_EE);
- kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
- break;
-
case XOP_MFDCR:
dcrn = get_dcrn(inst);
rt = get_rt(inst);
@@ -186,186 +134,51 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
emulated = EMULATE_FAIL;
}
+ if (emulated == EMULATE_FAIL)
+ emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance);
+
return emulated;
}
int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
{
+ int emulated = EMULATE_DONE;
+
switch (sprn) {
- case SPRN_MMUCR:
- vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
case SPRN_PID:
kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
+ case SPRN_MMUCR:
+ vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
case SPRN_CCR0:
vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
case SPRN_CCR1:
vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
- case SPRN_DEAR:
- vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
- case SPRN_ESR:
- vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
- case SPRN_DBCR0:
- vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
- case SPRN_DBCR1:
- vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
- case SPRN_TSR:
- vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
- case SPRN_TCR:
- vcpu->arch.tcr = vcpu->arch.gpr[rs];
- kvmppc_emulate_dec(vcpu);
- break;
-
- /* Note: SPRG4-7 are user-readable. These values are
- * loaded into the real SPRGs when resuming the
- * guest. */
- case SPRN_SPRG4:
- vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
- case SPRN_SPRG5:
- vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
- case SPRN_SPRG6:
- vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
- case SPRN_SPRG7:
- vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
-
- case SPRN_IVPR:
- vcpu->arch.ivpr = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR0:
- vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR1:
- vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR2:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR3:
- vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR4:
- vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR5:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR6:
- vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR7:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR8:
- vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR9:
- vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR10:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR11:
- vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR12:
- vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR13:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR14:
- vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
- break;
- case SPRN_IVOR15:
- vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
- break;
-
default:
- return EMULATE_FAIL;
+ emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
}
kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
- return EMULATE_DONE;
+ return emulated;
}
int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
{
+ int emulated = EMULATE_DONE;
+
switch (sprn) {
- /* 440 */
+ case SPRN_PID:
+ vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
case SPRN_MMUCR:
vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
case SPRN_CCR0:
vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
case SPRN_CCR1:
vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
-
- /* Book E */
- case SPRN_PID:
- vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
- case SPRN_IVPR:
- vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
- case SPRN_DEAR:
- vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
- case SPRN_ESR:
- vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
- case SPRN_DBCR0:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
- case SPRN_DBCR1:
- vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
-
- case SPRN_IVOR0:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
- break;
- case SPRN_IVOR1:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
- break;
- case SPRN_IVOR2:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
- break;
- case SPRN_IVOR3:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
- break;
- case SPRN_IVOR4:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
- break;
- case SPRN_IVOR5:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
- break;
- case SPRN_IVOR6:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
- break;
- case SPRN_IVOR7:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
- break;
- case SPRN_IVOR8:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
- break;
- case SPRN_IVOR9:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
- break;
- case SPRN_IVOR10:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
- break;
- case SPRN_IVOR11:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
- break;
- case SPRN_IVOR12:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
- break;
- case SPRN_IVOR13:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
- break;
- case SPRN_IVOR14:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
- break;
- case SPRN_IVOR15:
- vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
- break;
-
default:
- return EMULATE_FAIL;
+ emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
}
kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
- return EMULATE_DONE;
+ return emulated;
}
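
As a reading aid for the opcode switches above and below: get_op() and
get_xop() extract the PowerPC primary and extended opcode fields. A minimal
sketch of their shape (the real definitions live in asm/disassemble.h):

    /* Primary opcode is the top 6 bits; the extended opcode for the
     * op-19 and op-31 instruction forms sits in bits 21-30 (IBM
     * numbering), i.e. just above the record bit. */
    static inline unsigned int get_op(u32 inst)
    {
            return inst >> 26;
    }

    static inline unsigned int get_xop(u32 inst)
    {
            return (inst >> 1) & 0x3ff;
    }

The XOP_* constants in these files are simply the decoded values of those
two fields.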
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 9a34b8edb9e..4a16f472cc1 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -208,20 +208,38 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
return -1;
}
-int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
+ gva_t eaddr)
+{
+ struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+ struct kvmppc_44x_tlbe *gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
+ unsigned int pgmask = get_tlb_bytes(gtlbe) - 1;
+
+ return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
+}
+
+int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
{
unsigned int as = !!(vcpu->arch.msr & MSR_IS);
return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
}
-int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
{
unsigned int as = !!(vcpu->arch.msr & MSR_DS);
return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
}
+void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
+{
+}
+
static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
unsigned int stlb_index)
{
@@ -248,7 +266,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler);
}
-void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
int i;
@@ -269,15 +287,19 @@ void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
* Caller must ensure that the specified guest TLB entry is safe to insert into
* the shadow TLB.
*/
-void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
- u32 flags, u32 max_bytes, unsigned int gtlb_index)
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
+ unsigned int gtlb_index)
{
struct kvmppc_44x_tlbe stlbe;
struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+ struct kvmppc_44x_tlbe *gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
struct kvmppc_44x_shadow_ref *ref;
struct page *new_page;
hpa_t hpaddr;
gfn_t gfn;
+ u32 asid = gtlbe->tid;
+ u32 flags = gtlbe->word2;
+ u32 max_bytes = get_tlb_bytes(gtlbe);
unsigned int victim;
/* Select TLB entry to clobber. Indirectly guard against races with the TLB
@@ -448,10 +470,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
}
if (tlbe_is_host_safe(vcpu, tlbe)) {
- u64 asid;
gva_t eaddr;
gpa_t gpaddr;
- u32 flags;
u32 bytes;
eaddr = get_tlb_eaddr(tlbe);
@@ -462,10 +482,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
eaddr &= ~(bytes - 1);
gpaddr &= ~(bytes - 1);
- asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
- flags = tlbe->word2 & 0xffff;
-
- kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes, gtlb_index);
+ kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
}
KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
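
The new kvmppc_mmu_xlate() splices the page offset of the effective address
into the real address from the guest TLB entry. A worked sketch with assumed
numbers, for a 256MB mapping:

    /* Illustrative values only: raddr 0x10000000, 256MB page size, so
     * pgmask = get_tlb_bytes(gtlbe) - 1 = 0x0fffffff. */
    u64   raddr  = 0x10000000;
    u64   pgmask = 0x10000000 - 1;
    gva_t eaddr  = 0x02345678;
    gpa_t gpa    = raddr | (eaddr & pgmask);   /* = 0x12345678 */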
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index 772191f29e6..a9ff80e5152 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -25,8 +25,6 @@
extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
unsigned int pid, unsigned int as);
-extern int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
-extern int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb,
u8 rc);
@@ -85,11 +83,4 @@ static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu)
return (vcpu->arch.mmucr >> 16) & 0x1;
}
-static inline gpa_t tlb_xlate(struct kvmppc_44x_tlbe *tlbe, gva_t eaddr)
-{
- unsigned int pgmask = get_tlb_bytes(tlbe) - 1;
-
- return get_tlb_raddr(tlbe) | (eaddr & pgmask);
-}
-
#endif /* __KVM_POWERPC_TLB_H__ */
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 6dbdc4817d8..5a152a52796 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -2,6 +2,9 @@
# KVM configuration
#
+config HAVE_KVM_IRQCHIP
+ bool
+
menuconfig VIRTUALIZATION
bool "Virtualization"
---help---
@@ -43,6 +46,19 @@ config KVM_EXIT_TIMING
If unsure, say N.
+config KVM_E500
+ bool "KVM support for PowerPC E500 processors"
+ depends on EXPERIMENTAL && E500
+ select KVM
+ ---help---
+ Support running unmodified E500 guest kernels in virtual machines on
+ E500 host processors.
+
+ This module provides access to the hardware capabilities through
+ a character device node named /dev/kvm.
+
+ If unsure, say N.
+
config KVM_TRACE
bool "KVM trace support"
depends on KVM && MARKERS && SYSFS
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index df7ba59e6d5..4b2df66c79d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -16,8 +16,18 @@ AFLAGS_booke_interrupts.o := -I$(obj)
kvm-440-objs := \
booke.o \
+ booke_emulate.o \
booke_interrupts.o \
44x.o \
44x_tlb.o \
44x_emulate.o
obj-$(CONFIG_KVM_440) += kvm-440.o
+
+kvm-e500-objs := \
+ booke.o \
+ booke_emulate.o \
+ booke_interrupts.o \
+ e500.o \
+ e500_tlb.o \
+ e500_emulate.o
+obj-$(CONFIG_KVM_E500) += kvm-e500.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 35485dd6927..642e4204cf2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -30,10 +30,8 @@
#include <asm/kvm_ppc.h>
#include "timing.h"
#include <asm/cacheflush.h>
-#include <asm/kvm_44x.h>
#include "booke.h"
-#include "44x_tlb.h"
unsigned long kvmppc_booke_handlers;
@@ -120,6 +118,9 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
case BOOKE_IRQPRIO_DATA_STORAGE:
case BOOKE_IRQPRIO_INST_STORAGE:
case BOOKE_IRQPRIO_FP_UNAVAIL:
+ case BOOKE_IRQPRIO_SPE_UNAVAIL:
+ case BOOKE_IRQPRIO_SPE_FP_DATA:
+ case BOOKE_IRQPRIO_SPE_FP_ROUND:
case BOOKE_IRQPRIO_AP_UNAVAIL:
case BOOKE_IRQPRIO_ALIGNMENT:
allowed = 1;
@@ -165,7 +166,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
unsigned int priority;
priority = __ffs(*pending);
- while (priority <= BOOKE_MAX_INTERRUPT) {
+ while (priority <= BOOKE_IRQPRIO_MAX) {
if (kvmppc_booke_irqprio_deliver(vcpu, priority))
break;
@@ -263,6 +264,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
+ case BOOKE_INTERRUPT_SPE_UNAVAIL:
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_UNAVAIL);
+ r = RESUME_GUEST;
+ break;
+
+ case BOOKE_INTERRUPT_SPE_FP_DATA:
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
+ r = RESUME_GUEST;
+ break;
+
+ case BOOKE_INTERRUPT_SPE_FP_ROUND:
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
+ r = RESUME_GUEST;
+ break;
+
case BOOKE_INTERRUPT_DATA_STORAGE:
vcpu->arch.dear = vcpu->arch.fault_dear;
vcpu->arch.esr = vcpu->arch.fault_esr;
@@ -284,29 +300,27 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
- /* XXX move to a 440-specific file. */
case BOOKE_INTERRUPT_DTLB_MISS: {
- struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
- struct kvmppc_44x_tlbe *gtlbe;
unsigned long eaddr = vcpu->arch.fault_dear;
int gtlb_index;
+ gpa_t gpaddr;
gfn_t gfn;
/* Check the guest TLB. */
- gtlb_index = kvmppc_44x_dtlb_index(vcpu, eaddr);
+ gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
if (gtlb_index < 0) {
/* The guest didn't have a mapping for it. */
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
vcpu->arch.dear = vcpu->arch.fault_dear;
vcpu->arch.esr = vcpu->arch.fault_esr;
+ kvmppc_mmu_dtlb_miss(vcpu);
kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
r = RESUME_GUEST;
break;
}
- gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
- vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
- gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
+ gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+ gfn = gpaddr >> PAGE_SHIFT;
if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
/* The guest TLB had a mapping, but the shadow TLB
@@ -315,13 +329,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
* b) the guest used a large mapping which we're faking
* Either way, we need to satisfy the fault without
* invoking the guest. */
- kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
- gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
+ kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
r = RESUME_GUEST;
} else {
/* Guest has mapped and accessed a page which is not
* actually RAM. */
+ vcpu->arch.paddr_accessed = gpaddr;
r = kvmppc_emulate_mmio(run, vcpu);
kvmppc_account_exit(vcpu, MMIO_EXITS);
}
@@ -329,10 +343,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
- /* XXX move to a 440-specific file. */
case BOOKE_INTERRUPT_ITLB_MISS: {
- struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
- struct kvmppc_44x_tlbe *gtlbe;
unsigned long eaddr = vcpu->arch.pc;
gpa_t gpaddr;
gfn_t gfn;
@@ -341,18 +352,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
/* Check the guest TLB. */
- gtlb_index = kvmppc_44x_itlb_index(vcpu, eaddr);
+ gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
if (gtlb_index < 0) {
/* The guest didn't have a mapping for it. */
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+ kvmppc_mmu_itlb_miss(vcpu);
kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
break;
}
kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
- gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
- gpaddr = tlb_xlate(gtlbe, eaddr);
+ gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
gfn = gpaddr >> PAGE_SHIFT;
if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
@@ -362,8 +373,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
* b) the guest used a large mapping which we're faking
* Either way, we need to satisfy the fault without
* invoking the guest. */
- kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid,
- gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
+ kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
} else {
/* Guest mapped and leaped at non-RAM! */
kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index cf7c94ca24b..d59bcca1f9d 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_ppc.h>
#include "timing.h"
/* interrupt priority ordering */
@@ -30,17 +31,24 @@
#define BOOKE_IRQPRIO_ALIGNMENT 2
#define BOOKE_IRQPRIO_PROGRAM 3
#define BOOKE_IRQPRIO_FP_UNAVAIL 4
-#define BOOKE_IRQPRIO_SYSCALL 5
-#define BOOKE_IRQPRIO_AP_UNAVAIL 6
-#define BOOKE_IRQPRIO_DTLB_MISS 7
-#define BOOKE_IRQPRIO_ITLB_MISS 8
-#define BOOKE_IRQPRIO_MACHINE_CHECK 9
-#define BOOKE_IRQPRIO_DEBUG 10
-#define BOOKE_IRQPRIO_CRITICAL 11
-#define BOOKE_IRQPRIO_WATCHDOG 12
-#define BOOKE_IRQPRIO_EXTERNAL 13
-#define BOOKE_IRQPRIO_FIT 14
-#define BOOKE_IRQPRIO_DECREMENTER 15
+#define BOOKE_IRQPRIO_SPE_UNAVAIL 5
+#define BOOKE_IRQPRIO_SPE_FP_DATA 6
+#define BOOKE_IRQPRIO_SPE_FP_ROUND 7
+#define BOOKE_IRQPRIO_SYSCALL 8
+#define BOOKE_IRQPRIO_AP_UNAVAIL 9
+#define BOOKE_IRQPRIO_DTLB_MISS 10
+#define BOOKE_IRQPRIO_ITLB_MISS 11
+#define BOOKE_IRQPRIO_MACHINE_CHECK 12
+#define BOOKE_IRQPRIO_DEBUG 13
+#define BOOKE_IRQPRIO_CRITICAL 14
+#define BOOKE_IRQPRIO_WATCHDOG 15
+#define BOOKE_IRQPRIO_EXTERNAL 16
+#define BOOKE_IRQPRIO_FIT 17
+#define BOOKE_IRQPRIO_DECREMENTER 18
+#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
+#define BOOKE_IRQPRIO_MAX 19
+
+extern unsigned long kvmppc_booke_handlers;
/* Helper function for "full" MSR writes. No need to call this if only EE is
* changing. */
@@ -57,4 +65,9 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
};
}
+int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int inst, int *advance);
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
+
#endif /* __KVM_BOOKE_H__ */
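
Because kvmppc_core_deliver_interrupts() scans the pending bitmap with
__ffs(), a lower BOOKE_IRQPRIO_* value means higher delivery priority; the
renumbering above slots the SPE priorities in without changing the relative
order of the rest. A simplified sketch of the scan (details differ slightly
in booke.c):

    /* __ffs() returns the least significant set bit, so lower
     * BOOKE_IRQPRIO_* values are considered first. Assumes at least
     * one exception is pending. */
    unsigned long *pending = &vcpu->arch.pending_exceptions;
    unsigned int priority = __ffs(*pending);

    while (priority <= BOOKE_IRQPRIO_MAX) {
            if (kvmppc_booke_irqprio_deliver(vcpu, priority))
                    break;
            priority = find_next_bit(pending,
                                     BITS_PER_BYTE * sizeof(*pending),
                                     priority + 1);
    }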
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
new file mode 100644
index 00000000000..aebc65e93f4
--- /dev/null
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -0,0 +1,266 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/disassemble.h>
+
+#include "booke.h"
+
+#define OP_19_XOP_RFI 50
+
+#define OP_31_XOP_MFMSR 83
+#define OP_31_XOP_WRTEE 131
+#define OP_31_XOP_MTMSR 146
+#define OP_31_XOP_WRTEEI 163
+
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pc = vcpu->arch.srr0;
+ kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+}
+
+int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int inst, int *advance)
+{
+ int emulated = EMULATE_DONE;
+ int rs;
+ int rt;
+
+ switch (get_op(inst)) {
+ case 19:
+ switch (get_xop(inst)) {
+ case OP_19_XOP_RFI:
+ kvmppc_emul_rfi(vcpu);
+ kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
+ *advance = 0;
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ break;
+ }
+ break;
+
+ case 31:
+ switch (get_xop(inst)) {
+
+ case OP_31_XOP_MFMSR:
+ rt = get_rt(inst);
+ vcpu->arch.gpr[rt] = vcpu->arch.msr;
+ kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
+ break;
+
+ case OP_31_XOP_MTMSR:
+ rs = get_rs(inst);
+ kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
+ kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+ break;
+
+ case OP_31_XOP_WRTEE:
+ rs = get_rs(inst);
+ vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+ | (vcpu->arch.gpr[rs] & MSR_EE);
+ kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+ break;
+
+ case OP_31_XOP_WRTEEI:
+ vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+ | (inst & MSR_EE);
+ kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ return emulated;
+}
+
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+ int emulated = EMULATE_DONE;
+
+ switch (sprn) {
+ case SPRN_DEAR:
+ vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+ case SPRN_ESR:
+ vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+ case SPRN_DBCR0:
+ vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+ case SPRN_DBCR1:
+ vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+ case SPRN_DBSR:
+ vcpu->arch.dbsr &= ~vcpu->arch.gpr[rs]; break;
+ case SPRN_TSR:
+ vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+ case SPRN_TCR:
+ vcpu->arch.tcr = vcpu->arch.gpr[rs];
+ kvmppc_emulate_dec(vcpu);
+ break;
+
+ /* Note: SPRG4-7 are user-readable. These values are
+ * loaded into the real SPRGs when resuming the
+ * guest. */
+ case SPRN_SPRG4:
+ vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+ case SPRN_SPRG5:
+ vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+ case SPRN_SPRG6:
+ vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+ case SPRN_SPRG7:
+ vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+
+ case SPRN_IVPR:
+ vcpu->arch.ivpr = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR0:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR1:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR2:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR3:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR4:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR5:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR6:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR7:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR8:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR9:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR10:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR11:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR12:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR13:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR14:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR15:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ return emulated;
+}
+
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+ int emulated = EMULATE_DONE;
+
+ switch (sprn) {
+ case SPRN_IVPR:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+ case SPRN_DEAR:
+ vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+ case SPRN_ESR:
+ vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+ case SPRN_DBCR0:
+ vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+ case SPRN_DBCR1:
+ vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+ case SPRN_DBSR:
+ vcpu->arch.gpr[rt] = vcpu->arch.dbsr; break;
+
+ case SPRN_IVOR0:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+ break;
+ case SPRN_IVOR1:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+ break;
+ case SPRN_IVOR2:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+ break;
+ case SPRN_IVOR3:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+ break;
+ case SPRN_IVOR4:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+ break;
+ case SPRN_IVOR5:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+ break;
+ case SPRN_IVOR6:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+ break;
+ case SPRN_IVOR7:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+ break;
+ case SPRN_IVOR8:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+ break;
+ case SPRN_IVOR9:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+ break;
+ case SPRN_IVOR10:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+ break;
+ case SPRN_IVOR11:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+ break;
+ case SPRN_IVOR12:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+ break;
+ case SPRN_IVOR13:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+ break;
+ case SPRN_IVOR14:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+ break;
+ case SPRN_IVOR15:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ return emulated;
+}
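
The OP_31_XOP_WRTEEI case above masks MSR_EE straight out of the instruction
image. That works because the E field of wrteei occupies the same bit
position as MSR[EE] (mask 0x8000), so no shifting is needed. A worked
sketch, with the instruction word assumed from the Book E encoding:

    /* Assumed encoding: wrteei 1 = op 31, xop 163, E bit at mask 0x8000,
     * i.e. inst = 0x7c008146; MSR_EE is also mask 0x8000. */
    u32 inst   = 0x7c008146;
    u32 msr    = 0x00001000;                        /* EE currently clear */
    u32 newmsr = (msr & ~0x8000) | (inst & 0x8000); /* EE now set */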
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 084ebcd7dd8..d0c6f841bbd 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -86,6 +86,9 @@ KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG
KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS
KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS
KVM_HANDLER BOOKE_INTERRUPT_DEBUG
+KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND
_GLOBAL(kvmppc_handler_len)
.long kvmppc_handler_1 - kvmppc_handler_0
@@ -347,7 +350,9 @@ lightweight_exit:
lwz r3, VCPU_SHADOW_PID(r4)
mtspr SPRN_PID, r3
+#ifdef CONFIG_44x
iccci 0, 0 /* XXX hack */
+#endif
/* Load some guest volatiles. */
lwz r0, VCPU_GPR(r0)(r4)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
new file mode 100644
index 00000000000..d8067fd81cd
--- /dev/null
+++ b/arch/powerpc/kvm/e500.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_e500.h>
+#include <asm/kvm_ppc.h>
+
+#include "booke.h"
+#include "e500_tlb.h"
+
+void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ kvmppc_e500_tlb_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ kvmppc_e500_tlb_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+ int r;
+
+ if (strcmp(cur_cpu_spec->cpu_name, "e500v2") == 0)
+ r = 0;
+ else
+ r = -ENOTSUPP;
+
+ return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+ kvmppc_e500_tlb_setup(vcpu_e500);
+
+ /* Use the same core version as the host's */
+ vcpu->arch.pvr = mfspr(SPRN_PVR);
+
+ return 0;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR. */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ int index;
+ gva_t eaddr;
+ u8 pid;
+ u8 as;
+
+ eaddr = tr->linear_address;
+ pid = (tr->linear_address >> 32) & 0xff;
+ as = (tr->linear_address >> 40) & 0x1;
+
+ index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
+ if (index < 0) {
+ tr->valid = 0;
+ return 0;
+ }
+
+ tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
+ /* XXX what do "writeable" and "usermode" even mean? */
+ tr->valid = 1;
+
+ return 0;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500;
+ struct kvm_vcpu *vcpu;
+ int err;
+
+ vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ if (!vcpu_e500) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ vcpu = &vcpu_e500->vcpu;
+ err = kvm_vcpu_init(vcpu, kvm, id);
+ if (err)
+ goto free_vcpu;
+
+ err = kvmppc_e500_tlb_init(vcpu_e500);
+ if (err)
+ goto uninit_vcpu;
+
+ return vcpu;
+
+uninit_vcpu:
+ kvm_vcpu_uninit(vcpu);
+free_vcpu:
+ kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+ return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+ kvmppc_e500_tlb_uninit(vcpu_e500);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+static int kvmppc_e500_init(void)
+{
+ int r, i;
+ unsigned long ivor[3];
+ unsigned long max_ivor = 0;
+
+ r = kvmppc_booke_init();
+ if (r)
+ return r;
+
+ /* copy extra E500 exception handlers */
+ ivor[0] = mfspr(SPRN_IVOR32);
+ ivor[1] = mfspr(SPRN_IVOR33);
+ ivor[2] = mfspr(SPRN_IVOR34);
+ for (i = 0; i < 3; i++) {
+ if (ivor[i] > max_ivor)
+ max_ivor = ivor[i];
+
+ memcpy((void *)kvmppc_booke_handlers + ivor[i],
+ kvmppc_handlers_start + (i + 16) * kvmppc_handler_len,
+ kvmppc_handler_len);
+ }
+ flush_icache_range(kvmppc_booke_handlers,
+ kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
+
+ return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
+}
+
+static void kvmppc_e500_exit(void)
+{
+ kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500_init);
+module_exit(kvmppc_e500_exit);
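
kvmppc_core_vcpu_translate() above decodes AS and PID from the high bits of
kvm_translation.linear_address. A hypothetical userspace-side packing helper
(not part of this patch) mirroring that layout:

    /* AS in bit 40, PID in bits 32-39, effective address in the low
     * 32 bits -- the inverse of the unpacking in the function above. */
    static u64 e500_pack_linear_address(u32 eaddr, u8 pid, u8 as)
    {
            return ((u64)(as & 0x1) << 40) | ((u64)pid << 32) | eaddr;
    }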
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
new file mode 100644
index 00000000000..3f760414b9f
--- /dev/null
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x_emulate.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_e500.h>
+
+#include "booke.h"
+#include "e500_tlb.h"
+
+#define XOP_TLBIVAX 786
+#define XOP_TLBSX 914
+#define XOP_TLBRE 946
+#define XOP_TLBWE 978
+
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int inst, int *advance)
+{
+ int emulated = EMULATE_DONE;
+ int ra;
+ int rb;
+
+ switch (get_op(inst)) {
+ case 31:
+ switch (get_xop(inst)) {
+
+ case XOP_TLBRE:
+ emulated = kvmppc_e500_emul_tlbre(vcpu);
+ break;
+
+ case XOP_TLBWE:
+ emulated = kvmppc_e500_emul_tlbwe(vcpu);
+ break;
+
+ case XOP_TLBSX:
+ rb = get_rb(inst);
+ emulated = kvmppc_e500_emul_tlbsx(vcpu, rb);
+ break;
+
+ case XOP_TLBIVAX:
+ ra = get_ra(inst);
+ rb = get_rb(inst);
+ emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb);
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ break;
+
+ default:
+ emulated = EMULATE_FAIL;
+ }
+
+ if (emulated == EMULATE_FAIL)
+ emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance);
+
+ return emulated;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int emulated = EMULATE_DONE;
+
+ switch (sprn) {
+ case SPRN_PID:
+ vcpu_e500->pid[0] = vcpu->arch.shadow_pid =
+ vcpu->arch.pid = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_PID1:
+ vcpu_e500->pid[1] = vcpu->arch.gpr[rs]; break;
+ case SPRN_PID2:
+ vcpu_e500->pid[2] = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS0:
+ vcpu_e500->mas0 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS1:
+ vcpu_e500->mas1 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS2:
+ vcpu_e500->mas2 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS3:
+ vcpu_e500->mas3 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS4:
+ vcpu_e500->mas4 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS6:
+ vcpu_e500->mas6 = vcpu->arch.gpr[rs]; break;
+ case SPRN_MAS7:
+ vcpu_e500->mas7 = vcpu->arch.gpr[rs]; break;
+ case SPRN_L1CSR1:
+ vcpu_e500->l1csr1 = vcpu->arch.gpr[rs]; break;
+ case SPRN_HID0:
+ vcpu_e500->hid0 = vcpu->arch.gpr[rs]; break;
+ case SPRN_HID1:
+ vcpu_e500->hid1 = vcpu->arch.gpr[rs]; break;
+
+ case SPRN_MMUCSR0:
+ emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
+ vcpu->arch.gpr[rs]);
+ break;
+
+ /* extra exceptions */
+ case SPRN_IVOR32:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR33:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR34:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = vcpu->arch.gpr[rs];
+ break;
+ case SPRN_IVOR35:
+ vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = vcpu->arch.gpr[rs];
+ break;
+
+ default:
+ emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
+ }
+
+ return emulated;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int emulated = EMULATE_DONE;
+
+ switch (sprn) {
+ case SPRN_PID:
+ vcpu->arch.gpr[rt] = vcpu_e500->pid[0]; break;
+ case SPRN_PID1:
+ vcpu->arch.gpr[rt] = vcpu_e500->pid[1]; break;
+ case SPRN_PID2:
+ vcpu->arch.gpr[rt] = vcpu_e500->pid[2]; break;
+ case SPRN_MAS0:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas0; break;
+ case SPRN_MAS1:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas1; break;
+ case SPRN_MAS2:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas2; break;
+ case SPRN_MAS3:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas3; break;
+ case SPRN_MAS4:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas4; break;
+ case SPRN_MAS6:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas6; break;
+ case SPRN_MAS7:
+ vcpu->arch.gpr[rt] = vcpu_e500->mas7; break;
+
+ case SPRN_TLB0CFG:
+ vcpu->arch.gpr[rt] = mfspr(SPRN_TLB0CFG);
+ vcpu->arch.gpr[rt] &= ~0xfffUL;
+ vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[0];
+ break;
+
+ case SPRN_TLB1CFG:
+ vcpu->arch.gpr[rt] = mfspr(SPRN_TLB1CFG);
+ vcpu->arch.gpr[rt] &= ~0xfffUL;
+ vcpu->arch.gpr[rt] |= vcpu_e500->guest_tlb_size[1];
+ break;
+
+ case SPRN_L1CSR1:
+ vcpu->arch.gpr[rt] = vcpu_e500->l1csr1; break;
+ case SPRN_HID0:
+ vcpu->arch.gpr[rt] = vcpu_e500->hid0; break;
+ case SPRN_HID1:
+ vcpu->arch.gpr[rt] = vcpu_e500->hid1; break;
+
+ case SPRN_MMUCSR0:
+ vcpu->arch.gpr[rt] = 0; break;
+
+ /* extra exceptions */
+ case SPRN_IVOR32:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+ break;
+ case SPRN_IVOR33:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+ break;
+ case SPRN_IVOR34:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+ break;
+ case SPRN_IVOR35:
+ vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+ break;
+ default:
+ emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
+ }
+
+ return emulated;
+}
+
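The SPRN_TLB0CFG/SPRN_TLB1CFG reads above keep the host's configuration bits
but substitute the guest-visible entry count in the low 12 bits (the NENTRY
field), so the guest sees its virtualized TLB geometry. Condensed into a
sketch, assuming NENTRY is exactly mask 0xfff:

    /* Pass host TLBnCFG fields through, but report the guest's size. */
    static u32 virtualize_tlbncfg(u32 host_cfg, u32 guest_entries)
    {
            return (host_cfg & ~0xfffu) | (guest_entries & 0xfffu);
    }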
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
new file mode 100644
index 00000000000..0e773fc2d5e
--- /dev/null
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_e500.h>
+
+#include "../mm/mmu_decl.h"
+#include "e500_tlb.h"
+
+#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
+
+static unsigned int tlb1_entry_num;
+
+void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ struct tlbe *tlbe;
+ int i, tlbsel;
+
+ printk("| %8s | %8s | %8s | %8s | %8s |\n",
+ "nr", "mas1", "mas2", "mas3", "mas7");
+
+ for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+ printk("Guest TLB%d:\n", tlbsel);
+ for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
+ tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+ if (tlbe->mas1 & MAS1_VALID)
+ printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n",
+ tlbsel, i, tlbe->mas1, tlbe->mas2,
+ tlbe->mas3, tlbe->mas7);
+ }
+ }
+
+ for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+ printk("Shadow TLB%d:\n", tlbsel);
+ for (i = 0; i < vcpu_e500->shadow_tlb_size[tlbsel]; i++) {
+ tlbe = &vcpu_e500->shadow_tlb[tlbsel][i];
+ if (tlbe->mas1 & MAS1_VALID)
+ printk(" S[%d][%3d] | %08X | %08X | %08X | %08X |\n",
+ tlbsel, i, tlbe->mas1, tlbe->mas2,
+ tlbe->mas3, tlbe->mas7);
+ }
+ }
+}
+
+static inline unsigned int tlb0_get_next_victim(
+ struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ unsigned int victim;
+
+ victim = vcpu_e500->guest_tlb_nv[0]++;
+ if (unlikely(vcpu_e500->guest_tlb_nv[0] >= KVM_E500_TLB0_WAY_NUM))
+ vcpu_e500->guest_tlb_nv[0] = 0;
+
+ return victim;
+}
+
+static inline unsigned int tlb1_max_shadow_size(void)
+{
+ return tlb1_entry_num - tlbcam_index;
+}
+
+static inline int tlbe_is_writable(struct tlbe *tlbe)
+{
+ return tlbe->mas3 & (MAS3_SW|MAS3_UW);
+}
+
+static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
+{
+ /* Mask off reserved bits. */
+ mas3 &= MAS3_ATTRIB_MASK;
+
+ if (!usermode) {
+ /* Guest is in supervisor mode,
+ * so we need to translate guest
+ * supervisor permissions into user permissions. */
+ mas3 &= ~E500_TLB_USER_PERM_MASK;
+ mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
+ }
+
+ return mas3 | E500_TLB_SUPER_PERM_MASK;
+}
+
+static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
+{
+#ifdef CONFIG_SMP
+ return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
+#else
+ return mas2 & MAS2_ATTRIB_MASK;
+#endif
+}
+
+/*
+ * Write a shadow TLB entry into the host TLB.
+ */
+static inline void __write_host_tlbe(struct tlbe *stlbe)
+{
+ mtspr(SPRN_MAS1, stlbe->mas1);
+ mtspr(SPRN_MAS2, stlbe->mas2);
+ mtspr(SPRN_MAS3, stlbe->mas3);
+ mtspr(SPRN_MAS7, stlbe->mas7);
+ __asm__ __volatile__ ("tlbwe\n" : : );
+}
+
+static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int esel)
+{
+ struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+
+ local_irq_disable();
+ if (tlbsel == 0) {
+ __write_host_tlbe(stlbe);
+ } else {
+ unsigned register mas0;
+
+ mas0 = mfspr(SPRN_MAS0);
+
+ mtspr(SPRN_MAS0, MAS0_TLBSEL(1) | MAS0_ESEL(to_htlb1_esel(esel)));
+ __write_host_tlbe(stlbe);
+
+ mtspr(SPRN_MAS0, mas0);
+ }
+ local_irq_enable();
+}
+
+void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int i;
+ unsigned register mas0;
+
+ /* Load all valid TLB1 entries to reduce guest TLB miss faults */
+ local_irq_disable();
+ mas0 = mfspr(SPRN_MAS0);
+ for (i = 0; i < tlb1_max_shadow_size(); i++) {
+ struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
+
+ if (get_tlb_v(stlbe)) {
+ mtspr(SPRN_MAS0, MAS0_TLBSEL(1)
+ | MAS0_ESEL(to_htlb1_esel(i)));
+ __write_host_tlbe(stlbe);
+ }
+ }
+ mtspr(SPRN_MAS0, mas0);
+ local_irq_enable();
+}
+
+void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
+{
+ _tlbil_all();
+}
+
+/* Search the guest TLB for a matching entry. */
+static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
+ gva_t eaddr, int tlbsel, unsigned int pid, int as)
+{
+ int i;
+
+ /* XXX Replace loop with fancy data structures. */
+ for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++) {
+ struct tlbe *tlbe = &vcpu_e500->guest_tlb[tlbsel][i];
+ unsigned int tid;
+
+ if (eaddr < get_tlb_eaddr(tlbe))
+ continue;
+
+ if (eaddr > get_tlb_end(tlbe))
+ continue;
+
+ tid = get_tlb_tid(tlbe);
+ if (tid && (tid != pid))
+ continue;
+
+ if (!get_tlb_v(tlbe))
+ continue;
+
+ if (get_tlb_ts(tlbe) != as && as != -1)
+ continue;
+
+ return i;
+ }
+
+ return -1;
+}
+
+static void kvmppc_e500_shadow_release(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int esel)
+{
+ struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+ struct page *page = vcpu_e500->shadow_pages[tlbsel][esel];
+
+ if (page) {
+ vcpu_e500->shadow_pages[tlbsel][esel] = NULL;
+
+ if (get_tlb_v(stlbe)) {
+ if (tlbe_is_writable(stlbe))
+ kvm_release_page_dirty(page);
+ else
+ kvm_release_page_clean(page);
+ }
+ }
+}
+
+static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int esel)
+{
+ struct tlbe *stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+
+ kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
+ stlbe->mas1 = 0;
+ KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel),
+ stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7,
+ handler);
+}
+
+static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
+ gva_t eaddr, gva_t eend, u32 tid)
+{
+ unsigned int pid = tid & 0xff;
+ unsigned int i;
+
+ /* XXX Replace loop with fancy data structures. */
+ for (i = 0; i < vcpu_e500->guest_tlb_size[1]; i++) {
+ struct tlbe *stlbe = &vcpu_e500->shadow_tlb[1][i];
+ unsigned int tid;
+
+ if (!get_tlb_v(stlbe))
+ continue;
+
+ if (eend < get_tlb_eaddr(stlbe))
+ continue;
+
+ if (eaddr > get_tlb_end(stlbe))
+ continue;
+
+ tid = get_tlb_tid(stlbe);
+ if (tid && (tid != pid))
+ continue;
+
+ kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
+ write_host_tlbe(vcpu_e500, 1, i);
+ }
+}
+
+static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
+ unsigned int eaddr, int as)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ unsigned int victim, pidsel, tsized;
+ int tlbsel;
+
+ /* since we only have two TLBs, only the lower bit is used. */
+ tlbsel = (vcpu_e500->mas4 >> 28) & 0x1;
+ victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
+ pidsel = (vcpu_e500->mas4 >> 16) & 0xf;
+ tsized = (vcpu_e500->mas4 >> 8) & 0xf;
+
+ vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+ | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+ vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
+ | MAS1_TID(vcpu_e500->pid[pidsel])
+ | MAS1_TSIZE(tsized);
+ vcpu_e500->mas2 = (eaddr & MAS2_EPN)
+ | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK);
+ vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+ vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1)
+ | (get_cur_pid(vcpu) << 16)
+ | (as ? MAS6_SAS : 0);
+ vcpu_e500->mas7 = 0;
+}
+
+static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+ u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel)
+{
+ struct page *new_page;
+ struct tlbe *stlbe;
+ hpa_t hpaddr;
+
+ stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
+
+ /* Get reference to new page. */
+ new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
+ if (is_error_page(new_page)) {
+ printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+ kvm_release_page_clean(new_page);
+ return;
+ }
+ hpaddr = page_to_phys(new_page);
+
+ /* Drop reference to old page. */
+ kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
+
+ vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
+
+ /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
+ stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K)
+ | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
+ stlbe->mas2 = (gvaddr & MAS2_EPN)
+ | e500_shadow_mas2_attrib(gtlbe->mas2,
+ vcpu_e500->vcpu.arch.msr & MSR_PR);
+ stlbe->mas3 = (hpaddr & MAS3_RPN)
+ | e500_shadow_mas3_attrib(gtlbe->mas3,
+ vcpu_e500->vcpu.arch.msr & MSR_PR);
+ stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
+
+ KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel),
+ stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7,
+ handler);
+}
+
+/* XXX only map the one-to-one case, for now use TLB0 */
+static int kvmppc_e500_stlbe_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int esel)
+{
+ struct tlbe *gtlbe;
+
+ gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+
+ kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
+ get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
+ gtlbe, tlbsel, esel);
+
+ return esel;
+}
+
+/* Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB. */
+/* XXX for both one-to-one and one-to-many, for now use TLB1 */
+static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+ u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe)
+{
+ unsigned int victim;
+
+ victim = vcpu_e500->guest_tlb_nv[1]++;
+
+ if (unlikely(vcpu_e500->guest_tlb_nv[1] >= tlb1_max_shadow_size()))
+ vcpu_e500->guest_tlb_nv[1] = 0;
+
+ kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim);
+
+ return victim;
+}
+
+/* Invalidate all guest kernel mappings when entering usermode,
+ * so that when they fault back in they will get the
+ * proper permission bits. */
+void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+{
+ if (usermode) {
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int i;
+
+ /* XXX Replace loop with fancy data structures. */
+ for (i = 0; i < tlb1_max_shadow_size(); i++)
+ kvmppc_e500_stlbe_invalidate(vcpu_e500, 1, i);
+
+ _tlbil_all();
+ }
+}
+
+static int kvmppc_e500_gtlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel, int esel)
+{
+ struct tlbe *gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+
+ if (unlikely(get_tlb_iprot(gtlbe)))
+ return -1;
+
+ if (tlbsel == 1) {
+ kvmppc_e500_tlb1_invalidate(vcpu_e500, get_tlb_eaddr(gtlbe),
+ get_tlb_end(gtlbe),
+ get_tlb_tid(gtlbe));
+ } else {
+ kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel);
+ }
+
+ gtlbe->mas1 = 0;
+
+ return 0;
+}
+
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
+{
+ int esel;
+
+ if (value & MMUCSR0_TLB0FI)
+ for (esel = 0; esel < vcpu_e500->guest_tlb_size[0]; esel++)
+ kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
+ if (value & MMUCSR0_TLB1FI)
+ for (esel = 0; esel < vcpu_e500->guest_tlb_size[1]; esel++)
+ kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
+
+ _tlbil_all();
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ unsigned int ia;
+ int esel, tlbsel;
+ gva_t ea;
+
+ ea = ((ra) ? vcpu->arch.gpr[ra] : 0) + vcpu->arch.gpr[rb];
+
+ ia = (ea >> 2) & 0x1;
+
+ /* since we only have two TLBs, only the lower bit is used. */
+ tlbsel = (ea >> 3) & 0x1;
+
+ if (ia) {
+ /* invalidate all entries */
+ for (esel = 0; esel < vcpu_e500->guest_tlb_size[tlbsel]; esel++)
+ kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+ } else {
+ ea &= 0xfffff000;
+ esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel,
+ get_cur_pid(vcpu), -1);
+ if (esel >= 0)
+ kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+ }
+
+ _tlbil_all();
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int tlbsel, esel;
+ struct tlbe *gtlbe;
+
+ tlbsel = get_tlb_tlbsel(vcpu_e500);
+ esel = get_tlb_esel(vcpu_e500, tlbsel);
+
+ gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+ vcpu_e500->mas0 &= ~MAS0_NV(~0);
+ vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+ vcpu_e500->mas1 = gtlbe->mas1;
+ vcpu_e500->mas2 = gtlbe->mas2;
+ vcpu_e500->mas3 = gtlbe->mas3;
+ vcpu_e500->mas7 = gtlbe->mas7;
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int as = !!get_cur_sas(vcpu_e500);
+ unsigned int pid = get_cur_spid(vcpu_e500);
+ int esel, tlbsel;
+ struct tlbe *gtlbe = NULL;
+ gva_t ea;
+
+ ea = vcpu->arch.gpr[rb];
+
+ for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+ esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
+ if (esel >= 0) {
+ gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+ break;
+ }
+ }
+
+ if (gtlbe) {
+ vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
+ | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+ vcpu_e500->mas1 = gtlbe->mas1;
+ vcpu_e500->mas2 = gtlbe->mas2;
+ vcpu_e500->mas3 = gtlbe->mas3;
+ vcpu_e500->mas7 = gtlbe->mas7;
+ } else {
+ int victim;
+
+ /* since we only have two TLBs, only the lower bit is used. */
+ tlbsel = vcpu_e500->mas4 >> 28 & 0x1;
+ victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
+
+ vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+ | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
+ vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0)
+ | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+ | (vcpu_e500->mas4 & MAS4_TSIZED(~0));
+ vcpu_e500->mas2 &= MAS2_EPN;
+ vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK;
+ vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+ vcpu_e500->mas7 = 0;
+ }
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ u64 eaddr;
+ u64 raddr;
+ u32 tid;
+ struct tlbe *gtlbe;
+ int tlbsel, esel, stlbsel, sesel;
+
+ tlbsel = get_tlb_tlbsel(vcpu_e500);
+ esel = get_tlb_esel(vcpu_e500, tlbsel);
+
+ gtlbe = &vcpu_e500->guest_tlb[tlbsel][esel];
+
+ if (get_tlb_v(gtlbe) && tlbsel == 1) {
+ eaddr = get_tlb_eaddr(gtlbe);
+ tid = get_tlb_tid(gtlbe);
+ kvmppc_e500_tlb1_invalidate(vcpu_e500, eaddr,
+ get_tlb_end(gtlbe), tid);
+ }
+
+ gtlbe->mas1 = vcpu_e500->mas1;
+ gtlbe->mas2 = vcpu_e500->mas2;
+ gtlbe->mas3 = vcpu_e500->mas3;
+ gtlbe->mas7 = vcpu_e500->mas7;
+
+ KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0,
+ gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7,
+ handler);
+
+ /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+ if (tlbe_is_host_safe(vcpu, gtlbe)) {
+ switch (tlbsel) {
+ case 0:
+ /* TLB0 */
+ gtlbe->mas1 &= ~MAS1_TSIZE(~0);
+ gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K);
+
+ stlbsel = 0;
+ sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
+
+ break;
+
+ case 1:
+ /* TLB1 */
+ eaddr = get_tlb_eaddr(gtlbe);
+ raddr = get_tlb_raddr(gtlbe);
+
+ /* Create a 4KB mapping on the host.
+ * If the guest wanted a large page,
+ * only the first 4KB is mapped here and the rest
+ * are mapped on the fly. */
+ stlbsel = 1;
+ sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
+ raddr >> PAGE_SHIFT, gtlbe);
+ break;
+
+ default:
+ BUG();
+ }
+ write_host_tlbe(vcpu_e500, stlbsel, sesel);
+ }
+
+ return EMULATE_DONE;
+}
+
+int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+ unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+
+ return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+ unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+
+ return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
+{
+ unsigned int as = !!(vcpu->arch.msr & MSR_IS);
+
+ kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as);
+}
+
+void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
+{
+ unsigned int as = !!(vcpu->arch.msr & MSR_DS);
+
+ kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
+}
+
+gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
+ gva_t eaddr)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ struct tlbe *gtlbe =
+ &vcpu_e500->guest_tlb[tlbsel_of(index)][esel_of(index)];
+ u64 pgmask = get_tlb_bytes(gtlbe) - 1;
+
+ return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int tlbsel, i;
+
+ for (tlbsel = 0; tlbsel < 2; tlbsel++)
+ for (i = 0; i < vcpu_e500->guest_tlb_size[tlbsel]; i++)
+ kvmppc_e500_shadow_release(vcpu_e500, tlbsel, i);
+
+ /* discard all guest mappings */
+ _tlbil_all();
+}
+
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
+ unsigned int index)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int tlbsel = tlbsel_of(index);
+ int esel = esel_of(index);
+ int stlbsel, sesel;
+
+ switch (tlbsel) {
+ case 0:
+ stlbsel = 0;
+ sesel = esel;
+ break;
+
+ case 1: {
+ gfn_t gfn = gpaddr >> PAGE_SHIFT;
+ struct tlbe *gtlbe
+ = &vcpu_e500->guest_tlb[tlbsel][esel];
+
+ stlbsel = 1;
+ sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe);
+ break;
+ }
+
+ default:
+ BUG();
+ break;
+ }
+ write_host_tlbe(vcpu_e500, stlbsel, sesel);
+}
+
+int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
+ gva_t eaddr, unsigned int pid, int as)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ int esel, tlbsel;
+
+ for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+ esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
+ if (esel >= 0)
+ return index_of(tlbsel, esel);
+ }
+
+ return -1;
+}
+
+void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ struct tlbe *tlbe;
+
+ /* Insert large initial mapping for guest. */
+ tlbe = &vcpu_e500->guest_tlb[1][0];
+ tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M);
+ tlbe->mas2 = 0;
+ tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
+ tlbe->mas7 = 0;
+
+ /* 4K map for serial output. Used by kernel wrapper. */
+ tlbe = &vcpu_e500->guest_tlb[1][1];
+ tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K);
+ tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+ tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+ tlbe->mas7 = 0;
+}
+
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF;
+
+ vcpu_e500->guest_tlb_size[0] = KVM_E500_TLB0_SIZE;
+ vcpu_e500->guest_tlb[0] =
+ kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
+ if (vcpu_e500->guest_tlb[0] == NULL)
+ goto err_out;
+
+ vcpu_e500->shadow_tlb_size[0] = KVM_E500_TLB0_SIZE;
+ vcpu_e500->shadow_tlb[0] =
+ kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
+ if (vcpu_e500->shadow_tlb[0] == NULL)
+ goto err_out_guest0;
+
+ vcpu_e500->guest_tlb_size[1] = KVM_E500_TLB1_SIZE;
+ vcpu_e500->guest_tlb[1] =
+ kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL);
+ if (vcpu_e500->guest_tlb[1] == NULL)
+ goto err_out_shadow0;
+
+ vcpu_e500->shadow_tlb_size[1] = tlb1_entry_num;
+ vcpu_e500->shadow_tlb[1] =
+ kzalloc(sizeof(struct tlbe) * tlb1_entry_num, GFP_KERNEL);
+ if (vcpu_e500->shadow_tlb[1] == NULL)
+ goto err_out_guest1;
+
+ vcpu_e500->shadow_pages[0] = (struct page **)
+ kzalloc(sizeof(struct page *) * KVM_E500_TLB0_SIZE, GFP_KERNEL);
+ if (vcpu_e500->shadow_pages[0] == NULL)
+ goto err_out_shadow1;
+
+ vcpu_e500->shadow_pages[1] = (struct page **)
+ kzalloc(sizeof(struct page *) * tlb1_entry_num, GFP_KERNEL);
+ if (vcpu_e500->shadow_pages[1] == NULL)
+ goto err_out_page0;
+
+ return 0;
+
+err_out_page0:
+ kfree(vcpu_e500->shadow_pages[0]);
+err_out_shadow1:
+ kfree(vcpu_e500->shadow_tlb[1]);
+err_out_guest1:
+ kfree(vcpu_e500->guest_tlb[1]);
+err_out_shadow0:
+ kfree(vcpu_e500->shadow_tlb[0]);
+err_out_guest0:
+ kfree(vcpu_e500->guest_tlb[0]);
+err_out:
+ return -1;
+}
+
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ kfree(vcpu_e500->shadow_pages[1]);
+ kfree(vcpu_e500->shadow_pages[0]);
+ kfree(vcpu_e500->shadow_tlb[1]);
+ kfree(vcpu_e500->guest_tlb[1]);
+ kfree(vcpu_e500->shadow_tlb[0]);
+ kfree(vcpu_e500->guest_tlb[0]);
+}
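
One layout detail worth spelling out: to_htlb1_esel() reflects shadow TLB1
indices so guest-backed entries are installed at the top of the host TLB1,
growing downward, while the low entries (up to tlbcam_index) stay reserved
for the host's own CAM mappings. Worked numbers, assuming a 16-entry host
TLB1:

    /* to_htlb1_esel(esel) = tlb1_entry_num - esel - 1
     * With tlb1_entry_num = 16 (assumed):
     *   shadow esel 0 -> host entry 15
     *   shadow esel 1 -> host entry 14
     *   ... leaving entries 0..tlbcam_index-1 untouched. */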
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
new file mode 100644
index 00000000000..45b064b7690
--- /dev/null
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2008 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.h,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __KVM_E500_TLB_H__
+#define __KVM_E500_TLB_H__
+
+#include <linux/kvm_host.h>
+#include <asm/mmu-fsl-booke.h>
+#include <asm/tlb.h>
+#include <asm/kvm_e500.h>
+
+#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */
+#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT)
+#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1)
+
+#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */
+#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT)
+#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1)
+
+#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
+#define KVM_E500_TLB1_SIZE 16
+
+#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF))
+#define tlbsel_of(index) ((index) >> 16)
+#define esel_of(index) ((index) & 0xFFFF)
+
+#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
+#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
+#define MAS2_ATTRIB_MASK \
+ (MAS2_X0 | MAS2_X1)
+#define MAS3_ATTRIB_MASK \
+ (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
+ | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
+
+extern void kvmppc_dump_tlbs(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *, ulong);
+extern int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_tlbre(struct kvm_vcpu *);
+extern int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *, int, int);
+extern int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *, int);
+extern int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
+extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
+extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
+extern int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *);
+extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
+
+/* TLB helper functions */
+static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+{
+ return (tlbe->mas1 >> 8) & 0xf;
+}
+
+static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+{
+ return tlbe->mas2 & 0xfffff000;
+}
+
+static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
+{
+ unsigned int pgsize = get_tlb_size(tlbe);
+ return 1ULL << 10 << (pgsize << 1);
+}
+
+static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+{
+ u64 bytes = get_tlb_bytes(tlbe);
+ return get_tlb_eaddr(tlbe) + bytes - 1;
+}
+
+static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+{
+ u64 rpn = tlbe->mas7;
+ return (rpn << 32) | (tlbe->mas3 & 0xfffff000);
+}
+
+static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+{
+ return (tlbe->mas1 >> 16) & 0xff;
+}
+
+static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+{
+ return (tlbe->mas1 >> 12) & 0x1;
+}
+
+static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+{
+ return (tlbe->mas1 >> 31) & 0x1;
+}
+
+static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe)
+{
+ return (tlbe->mas1 >> 30) & 0x1;
+}
+
+static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pid & 0xff;
+}
+
+static inline unsigned int get_cur_spid(
+ const struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ return (vcpu_e500->mas6 >> 16) & 0xff;
+}
+
+static inline unsigned int get_cur_sas(
+ const struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ return vcpu_e500->mas6 & 0x1;
+}
+
+static inline unsigned int get_tlb_tlbsel(
+ const struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ /*
+	 * The manual says that tlbsel is 2 bits wide.
+	 * Since we only have two TLBs, only the lower bit is used.
+ */
+ return (vcpu_e500->mas0 >> 28) & 0x1;
+}
+
+static inline unsigned int get_tlb_nv_bit(
+ const struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ return vcpu_e500->mas0 & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel_bit(
+ const struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+ return (vcpu_e500->mas0 >> 16) & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel(
+ const struct kvmppc_vcpu_e500 *vcpu_e500,
+ int tlbsel)
+{
+ unsigned int esel = get_tlb_esel_bit(vcpu_e500);
+
+ if (tlbsel == 0) {
+ esel &= KVM_E500_TLB0_WAY_NUM_MASK;
+ esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK)
+ << KVM_E500_TLB0_WAY_NUM_BIT;
+ } else {
+ esel &= KVM_E500_TLB1_SIZE - 1;
+ }
+
+ return esel;
+}
+
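For TLB0 the flat entry index is split across two registers: MAS0[ESEL] supplies the way and EPN bits 12..18 of MAS2 supply the set; get_tlb_esel() recombines them. A standalone sketch with the default 2-way, 128-set geometry defined above (register values are made up):

	#include <assert.h>

	#define WAY_NUM_BIT   1
	#define WAY_NUM_MASK  ((1u << WAY_NUM_BIT) - 1)	/* 2 ways   */
	#define WAY_SIZE_MASK ((1u << 7) - 1)			/* 128 sets */

	/* Flat-index computation for TLB0, mirroring get_tlb_esel(). */
	static unsigned int tlb0_esel(unsigned int mas0, unsigned int mas2)
	{
		unsigned int way = (mas0 >> 16) & WAY_NUM_MASK;
		unsigned int set = (mas2 >> 12) & WAY_SIZE_MASK;

		return (set << WAY_NUM_BIT) | way;
	}

	int main(void)
	{
		/* ESEL = 1 in MAS0, set bits = 9 in MAS2 -> index 19 */
		assert(tlb0_esel(1u << 16, 9u << 12) == 19);
		return 0;
	}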
+static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+ const struct tlbe *tlbe)
+{
+ gpa_t gpa;
+
+ if (!get_tlb_v(tlbe))
+ return 0;
+
+ /* Does it match current guest AS? */
+ /* XXX what about IS != DS? */
+ if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+ return 0;
+
+ gpa = get_tlb_raddr(tlbe);
+ if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+ /* Mapping is not for RAM. */
+ return 0;
+
+ return 1;
+}
+
+#endif /* __KVM_E500_TLB_H__ */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index d1d38daa93f..a561d6e8da1 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -30,6 +30,39 @@
#include <asm/disassemble.h>
#include "timing.h"
+#define OP_TRAP 3
+
+#define OP_31_XOP_LWZX 23
+#define OP_31_XOP_LBZX 87
+#define OP_31_XOP_STWX 151
+#define OP_31_XOP_STBX 215
+#define OP_31_XOP_STBUX 247
+#define OP_31_XOP_LHZX 279
+#define OP_31_XOP_LHZUX 311
+#define OP_31_XOP_MFSPR 339
+#define OP_31_XOP_STHX 407
+#define OP_31_XOP_STHUX 439
+#define OP_31_XOP_MTSPR 467
+#define OP_31_XOP_DCBI 470
+#define OP_31_XOP_LWBRX 534
+#define OP_31_XOP_TLBSYNC 566
+#define OP_31_XOP_STWBRX 662
+#define OP_31_XOP_LHBRX 790
+#define OP_31_XOP_STHBRX 918
+
+#define OP_LWZ 32
+#define OP_LWZU 33
+#define OP_LBZ 34
+#define OP_LBZU 35
+#define OP_STW 36
+#define OP_STWU 37
+#define OP_STB 38
+#define OP_STBU 39
+#define OP_LHZ 40
+#define OP_LHZU 41
+#define OP_STH 44
+#define OP_STHU 45
+
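These constants name the primary opcode (top 6 bits of the instruction word) and, for opcode 31, the extended opcode field that the switch below dispatches on. A standalone decoder sketch (the kernel itself uses the helpers from asm/disassemble.h; the shifts here are my own restatement of them):

	#include <assert.h>
	#include <stdint.h>

	/* Primary opcode: top 6 bits of the instruction word. */
	static unsigned int get_op(uint32_t inst)  { return inst >> 26; }
	/* Extended opcode for op-31 forms: bits 21:30. */
	static unsigned int get_xop(uint32_t inst) { return (inst >> 1) & 0x3ff; }

	int main(void)
	{
		uint32_t lwzx = 0x7c64482e;	/* lwzx r3,r4,r9 */

		assert(get_op(lwzx) == 31);
		assert(get_xop(lwzx) == 23);	/* OP_31_XOP_LWZX */
		return 0;
	}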
void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.tcr & TCR_DIE) {
@@ -78,7 +111,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
switch (get_op(inst)) {
- case 3: /* trap */
+ case OP_TRAP:
vcpu->arch.esr |= ESR_PTR;
kvmppc_core_queue_program(vcpu);
advance = 0;
@@ -87,31 +120,31 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case 31:
switch (get_xop(inst)) {
- case 23: /* lwzx */
+ case OP_31_XOP_LWZX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
break;
- case 87: /* lbzx */
+ case OP_31_XOP_LBZX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
break;
- case 151: /* stwx */
+ case OP_31_XOP_STWX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
vcpu->arch.gpr[rs],
4, 1);
break;
- case 215: /* stbx */
+ case OP_31_XOP_STBX:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu,
vcpu->arch.gpr[rs],
1, 1);
break;
- case 247: /* stbux */
+ case OP_31_XOP_STBUX:
rs = get_rs(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -126,12 +159,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.gpr[rs] = ea;
break;
- case 279: /* lhzx */
+ case OP_31_XOP_LHZX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
break;
- case 311: /* lhzux */
+ case OP_31_XOP_LHZUX:
rt = get_rt(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -144,7 +177,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.gpr[ra] = ea;
break;
- case 339: /* mfspr */
+ case OP_31_XOP_MFSPR:
sprn = get_sprn(inst);
rt = get_rt(inst);
@@ -185,7 +218,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
break;
- case 407: /* sthx */
+ case OP_31_XOP_STHX:
rs = get_rs(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -195,7 +228,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
2, 1);
break;
- case 439: /* sthux */
+ case OP_31_XOP_STHUX:
rs = get_rs(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -210,7 +243,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.gpr[ra] = ea;
break;
- case 467: /* mtspr */
+ case OP_31_XOP_MTSPR:
sprn = get_sprn(inst);
rs = get_rs(inst);
switch (sprn) {
@@ -246,7 +279,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
break;
- case 470: /* dcbi */
+ case OP_31_XOP_DCBI:
/* Do nothing. The guest is performing dcbi because
* hardware DMA is not snooped by the dcache, but
* emulated DMA either goes through the dcache as
@@ -254,15 +287,15 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
* coherence. */
break;
- case 534: /* lwbrx */
+ case OP_31_XOP_LWBRX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0);
break;
- case 566: /* tlbsync */
+ case OP_31_XOP_TLBSYNC:
break;
- case 662: /* stwbrx */
+ case OP_31_XOP_STWBRX:
rs = get_rs(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -272,12 +305,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
4, 0);
break;
- case 790: /* lhbrx */
+ case OP_31_XOP_LHBRX:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
break;
- case 918: /* sthbrx */
+ case OP_31_XOP_STHBRX:
rs = get_rs(inst);
ra = get_ra(inst);
rb = get_rb(inst);
@@ -293,37 +326,37 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
break;
- case 32: /* lwz */
+ case OP_LWZ:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
break;
- case 33: /* lwzu */
+ case OP_LWZU:
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
break;
- case 34: /* lbz */
+ case OP_LBZ:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
break;
- case 35: /* lbzu */
+ case OP_LBZU:
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
break;
- case 36: /* stw */
+ case OP_STW:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
4, 1);
break;
- case 37: /* stwu */
+ case OP_STWU:
ra = get_ra(inst);
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
@@ -331,13 +364,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
break;
- case 38: /* stb */
+ case OP_STB:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
1, 1);
break;
- case 39: /* stbu */
+ case OP_STBU:
ra = get_ra(inst);
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
@@ -345,25 +378,25 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
break;
- case 40: /* lhz */
+ case OP_LHZ:
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
break;
- case 41: /* lhzu */
+ case OP_LHZU:
ra = get_ra(inst);
rt = get_rt(inst);
emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
vcpu->arch.gpr[ra] = vcpu->arch.paddr_accessed;
break;
- case 44: /* sth */
+ case OP_STH:
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
2, 1);
break;
- case 45: /* sthu */
+ case OP_STHU:
ra = get_ra(inst);
rs = get_rs(inst);
emulated = kvmppc_handle_store(run, vcpu, vcpu->arch.gpr[rs],
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 5f81256287f..9057335fdc6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -216,46 +216,23 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
- kvmppc_core_destroy_mmu(vcpu);
+ kvmppc_mmu_destroy(vcpu);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- if (vcpu->guest_debug.enabled)
- kvmppc_core_load_guest_debugstate(vcpu);
-
kvmppc_core_vcpu_load(vcpu, cpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- if (vcpu->guest_debug.enabled)
- kvmppc_core_load_host_debugstate(vcpu);
-
- /* Don't leave guest TLB entries resident when being de-scheduled. */
- /* XXX It would be nice to differentiate between heavyweight exit and
- * sched_out here, since we could avoid the TLB flush for heavyweight
- * exits. */
- _tlbil_all();
kvmppc_core_vcpu_put(vcpu);
}
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
{
- int i;
-
- vcpu->guest_debug.enabled = dbg->enabled;
- if (vcpu->guest_debug.enabled) {
- for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
- if (dbg->breakpoints[i].enabled)
- vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
- else
- vcpu->guest_debug.bp[i] = 0;
- }
- }
-
- return 0;
+ return -EINVAL;
}
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index e1f54654e3a..0b2f829f6d5 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -42,4 +42,11 @@ struct kvm_fpu {
__u64 fprs[16];
};
+struct kvm_debug_exit_arch {
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+};
+
#endif
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 3c55e4107dc..c6e674f5fca 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -21,9 +21,6 @@
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
-struct kvm_guest_debug {
-};
-
struct sca_entry {
atomic_t scn;
__u64 reserved;
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index e051cad1f1e..3e260b7e37b 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -4,6 +4,9 @@
config HAVE_KVM
bool
+config HAVE_KVM_IRQCHIP
+ bool
+
menuconfig VIRTUALIZATION
bool "Virtualization"
default y
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 61236102203..9d19803111b 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -103,7 +103,7 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
static intercept_handler_t instruction_handlers[256] = {
[0x83] = kvm_s390_handle_diag,
[0xae] = kvm_s390_handle_sigp,
- [0xb2] = kvm_s390_handle_priv,
+ [0xb2] = kvm_s390_handle_b2,
[0xb7] = handle_lctl,
[0xeb] = handle_lctlg,
};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f4fe28a2521..0189356fe20 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -555,9 +555,14 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)",
s390int->parm);
break;
+ case KVM_S390_SIGP_SET_PREFIX:
+ inti->prefix.address = s390int->parm;
+ inti->type = s390int->type;
+ VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)",
+ s390int->parm);
+ break;
case KVM_S390_SIGP_STOP:
case KVM_S390_RESTART:
- case KVM_S390_SIGP_SET_PREFIX:
case KVM_S390_INT_EMERGENCY:
VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
inti->type = s390int->type;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0d33893e1e8..cbfe91e1012 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -422,8 +422,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return -EINVAL; /* not implemented yet */
}
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
{
return -EINVAL; /* not implemented yet */
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 3893cf12eac..00bbe69b78d 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -50,7 +50,7 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
/* implemented in priv.c */
-int kvm_s390_handle_priv(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 3605df45dd4..4b88834b8dd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -304,12 +304,24 @@ static intercept_handler_t priv_handlers[256] = {
[0xb1] = handle_stfl,
};
-int kvm_s390_handle_priv(struct kvm_vcpu *vcpu)
+int kvm_s390_handle_b2(struct kvm_vcpu *vcpu)
{
intercept_handler_t handler;
+ /*
+	 * A lot of B2 instructions are privileged. We first check for
+	 * the privileged ones that we can handle in the kernel. If the
+	 * kernel can handle this instruction, we check for the problem
+	 * state bit and either (a) handle the instruction or (b) send a
+	 * code 2 program check.
+	 * Anything else goes to userspace.
+	 */
handler = priv_handlers[vcpu->arch.sie_block->ipa & 0x00ff];
- if (handler)
- return handler(vcpu);
+ if (handler) {
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu,
+ PGM_PRIVILEGED_OPERATION);
+ else
+ return handler(vcpu);
+ }
return -ENOTSUPP;
}
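A standalone model of the dispatch policy this function implements (a sketch, not kernel code): no registered handler forwards the instruction to userspace, a handler plus problem state injects a privileged-operation program check, and only supervisor state reaches the in-kernel handler.

	#include <assert.h>
	#include <stddef.h>

	typedef int (*handler_t)(void);

	enum { TO_USERSPACE = -1, PROGRAM_CHECK = -2 };

	static int handle_b2(handler_t h, int problem_state)
	{
		if (!h)
			return TO_USERSPACE;	/* let userspace emulate it */
		if (problem_state)
			return PROGRAM_CHECK;	/* privileged-op program check */
		return h();			/* run the in-kernel handler */
	}

	static int fake_handler(void) { return 0; }

	int main(void)
	{
		assert(handle_b2(NULL, 0) == TO_USERSPACE);
		assert(handle_b2(fake_handler, 1) == PROGRAM_CHECK);
		assert(handle_b2(fake_handler, 0) == 0);
		return 0;
	}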
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 2a01b9e0280..f27dbedf086 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -153,8 +153,6 @@ static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
switch (parameter & 0xff) {
case 0:
- printk(KERN_WARNING "kvm: request to switch to ESA/390 mode"
- " not supported");
rc = 3; /* not operational */
break;
case 1:
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 886c9402ec4..dc3f6cf1170 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -15,6 +15,7 @@
#define __KVM_HAVE_DEVICE_ASSIGNMENT
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -212,7 +213,30 @@ struct kvm_pit_channel_state {
__s64 count_load_time;
};
+struct kvm_debug_exit_arch {
+ __u32 exception;
+ __u32 pad;
+ __u64 pc;
+ __u64 dr6;
+ __u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ __u64 debugreg[8];
+};
+
struct kvm_pit_state {
struct kvm_pit_channel_state channels[3];
};
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 730843d1d2f..f0faf58044f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,7 @@
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
+#include <asm/msr-index.h>
#define KVM_MAX_VCPUS 16
#define KVM_MEMORY_SLOTS 32
@@ -134,11 +135,18 @@ enum {
#define KVM_NR_MEM_OBJS 40
-struct kvm_guest_debug {
- int enabled;
- unsigned long bp[4];
- int singlestep;
-};
+#define KVM_NR_DB_REGS 4
+
+#define DR6_BD (1 << 13)
+#define DR6_BS (1 << 14)
+#define DR6_FIXED_1 0xffff0ff0
+#define DR6_VOLATILE 0x0000e00f
+
+#define DR7_BP_EN_MASK 0x000000ff
+#define DR7_GE (1 << 9)
+#define DR7_GD (1 << 13)
+#define DR7_FIXED_1 0x00000400
+#define DR7_VOLATILE 0xffff23ff
/*
* We don't want allocation failures within the mmu code, so we preallocate
@@ -162,7 +170,8 @@ struct kvm_pte_chain {
* bits 0:3 - total guest paging levels (2-4, or zero for real mode)
* bits 4:7 - page table level for this shadow (1-4)
* bits 8:9 - page table quadrant for 2-level guests
- * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ * bit 16 - direct mapping of virtual to physical at gfn,
+ *          used for real mode and two-dimensional paging
* bits 17:19 - common access permissions for all ptes in this shadow page
*/
union kvm_mmu_page_role {
@@ -172,9 +181,10 @@ union kvm_mmu_page_role {
unsigned level:4;
unsigned quadrant:2;
unsigned pad_for_nice_hex_output:6;
- unsigned metaphysical:1;
+ unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
+ unsigned cr4_pge:1;
};
};
@@ -218,6 +228,18 @@ struct kvm_pv_mmu_op_buffer {
char buf[512] __aligned(sizeof(long));
};
+struct kvm_pio_request {
+ unsigned long count;
+ int cur_count;
+ gva_t guest_gva;
+ int in;
+ int port;
+ int size;
+ int string;
+ int down;
+ int rep;
+};
+
/*
* x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
* 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -236,6 +258,7 @@ struct kvm_mmu {
hpa_t root_hpa;
int root_level;
int shadow_root_level;
+ union kvm_mmu_page_role base_role;
u64 *pae_root;
};
@@ -258,6 +281,7 @@ struct kvm_vcpu_arch {
unsigned long cr3;
unsigned long cr4;
unsigned long cr8;
+ u32 hflags;
u64 pdptrs[4]; /* pae */
u64 shadow_efer;
u64 apic_base;
@@ -338,6 +362,15 @@ struct kvm_vcpu_arch {
struct mtrr_state_type mtrr_state;
u32 pat;
+
+ int switch_db_regs;
+ unsigned long host_db[KVM_NR_DB_REGS];
+ unsigned long host_dr6;
+ unsigned long host_dr7;
+ unsigned long db[KVM_NR_DB_REGS];
+ unsigned long dr6;
+ unsigned long dr7;
+ unsigned long eff_db[KVM_NR_DB_REGS];
};
struct kvm_mem_alias {
@@ -378,6 +411,7 @@ struct kvm_arch{
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
+ u64 vm_init_tsc;
};
struct kvm_vm_stat {
@@ -446,8 +480,7 @@ struct kvm_x86_ops {
void (*vcpu_put)(struct kvm_vcpu *vcpu);
int (*set_guest_debug)(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg);
- void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
+ struct kvm_guest_debug *dbg);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -583,16 +616,12 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_set_irq(void *opaque, int irq, int level);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void fx_init(struct kvm_vcpu *vcpu);
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
@@ -737,6 +766,10 @@ enum {
TASK_SWITCH_GATE = 3,
};
+#define HF_GIF_MASK (1 << 0)
+#define HF_HIF_MASK (1 << 1)
+#define HF_VINTR_MASK (1 << 2)
+
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..f4e505f286b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -18,11 +18,15 @@
#define _EFER_LME 8 /* Long mode enable */
#define _EFER_LMA 10 /* Long mode active (read-only) */
#define _EFER_NX 11 /* No execute enable */
+#define _EFER_SVME 12 /* Enable virtualization */
+#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
#define EFER_SCE (1<<_EFER_SCE)
#define EFER_LME (1<<_EFER_LME)
#define EFER_LMA (1<<_EFER_LMA)
#define EFER_NX (1<<_EFER_NX)
+#define EFER_SVME (1<<_EFER_SVME)
+#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
#define MSR_IA32_PERFCTR0 0x000000c1
@@ -360,4 +364,9 @@
#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+/* AMD-V MSRs */
+
+#define MSR_VM_CR 0xc0010114
+#define MSR_VM_HSAVE_PA 0xc0010117
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e86..82ada75f3eb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -174,10 +174,6 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CPUID_FEATURE_SHIFT 2
#define SVM_CPUID_FUNC 0x8000000a
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR 0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
#define SVM_VM_CR_SVM_DISABLE 4
#define SVM_SELECTOR_S_SHIFT 4
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 59363627523..e0f9aa16358 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -118,7 +118,7 @@ static inline void cpu_svm_disable(void)
wrmsrl(MSR_VM_HSAVE_PA, 0);
rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
}
/** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index d0238e6151d..498f944010b 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -270,8 +270,9 @@ enum vmcs_field {
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
-#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
+#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */
#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
+#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */
/* GUEST_INTERRUPTIBILITY_INFO flags. */
#define GUEST_INTR_STATE_STI 0x00000001
@@ -311,7 +312,7 @@ enum vmcs_field {
#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
#define TYPE_MOV_TO_DR (0 << 4)
#define TYPE_MOV_FROM_DR (1 << 4)
-#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
+#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
/* segment AR */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f0bde..0a303c3ed11 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -4,6 +4,10 @@
config HAVE_KVM
bool
+config HAVE_KVM_IRQCHIP
+ bool
+ default y
+
menuconfig VIRTUALIZATION
bool "Virtualization"
depends on HAVE_KVM || X86
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 72bd275a9b5..c13bb92d315 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
+ if (!pt->reinject)
+ atomic_set(&pt->pending, 1);
+
if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
@@ -536,6 +539,16 @@ void kvm_pit_reset(struct kvm_pit *pit)
pit->pit_state.irq_ack = 1;
}
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+ struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+ if (!mask) {
+ atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ pit->pit_state.irq_ack = 1;
+ }
+}
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
{
struct kvm_pit *pit;
@@ -545,9 +558,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
if (!pit)
return NULL;
- mutex_lock(&kvm->lock);
pit->irq_source_id = kvm_request_irq_source_id(kvm);
- mutex_unlock(&kvm->lock);
if (pit->irq_source_id < 0) {
kfree(pit);
return NULL;
@@ -580,10 +591,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
+ pit_state->pit_timer.reinject = true;
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
+	pit->mask_notifier.func = pit_mask_notifier;
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+
return pit;
}
@@ -592,6 +607,8 @@ void kvm_free_pit(struct kvm *kvm)
struct hrtimer *timer;
if (kvm->arch.vpit) {
+ kvm_unregister_irq_mask_notifier(kvm, 0,
+ &kvm->arch.vpit->mask_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 4178022b97a..6acbe4b505d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -9,6 +9,7 @@ struct kvm_kpit_timer {
s64 period; /* unit: ns */
s64 scheduled;
atomic_t pending;
+ bool reinject;
};
struct kvm_kpit_channel_state {
@@ -45,6 +46,7 @@ struct kvm_pit {
struct kvm *kvm;
struct kvm_kpit_state pit_state;
int irq_source_id;
+ struct kvm_irq_mask_notifier mask_notifier;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 179dcb0103f..1ccb50c74f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -32,11 +32,13 @@
#include <linux/kvm_host.h>
static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
{
spin_lock(&s->lock);
}
static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
{
struct kvm *kvm = s->kvm;
unsigned acks = s->pending_acks;
@@ -49,7 +51,8 @@ static void pic_unlock(struct kvm_pic *s)
spin_unlock(&s->lock);
while (acks) {
- kvm_notify_acked_irq(kvm, __ffs(acks));
+ kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
+ __ffs(acks));
acks &= acks - 1;
}
@@ -76,12 +79,13 @@ void kvm_pic_clear_isr_ack(struct kvm *kvm)
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
-static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
{
- int mask;
+ int mask, ret = 1;
mask = 1 << irq;
if (s->elcr & mask) /* level triggered */
if (level) {
+ ret = !(s->irr & mask);
s->irr |= mask;
s->last_irr |= mask;
} else {
@@ -90,11 +94,15 @@ static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
}
else /* edge triggered */
if (level) {
- if ((s->last_irr & mask) == 0)
+ if ((s->last_irr & mask) == 0) {
+ ret = !(s->irr & mask);
s->irr |= mask;
+ }
s->last_irr |= mask;
} else
s->last_irr &= ~mask;
+
+ return (s->imr & mask) ? -1 : ret;
}
/*
@@ -171,16 +179,19 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-void kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(void *opaque, int irq, int level)
{
struct kvm_pic *s = opaque;
+ int ret = -1;
pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
- pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
}
pic_unlock(s);
+
+ return ret;
}
/*
@@ -232,7 +243,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
}
pic_update_irq(s);
pic_unlock(s);
- kvm_notify_acked_irq(kvm, irq);
+ kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
return intno;
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 82579ee538d..9f593188129 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -32,6 +32,8 @@
#include "lapic.h"
#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
struct kvm;
struct kvm_vcpu;
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 8e5ee99551f..ed66e4c078d 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -18,7 +18,6 @@ static const u32 host_save_user_msrs[] = {
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-#define NUM_DB_REGS 4
struct kvm_vcpu;
@@ -29,18 +28,23 @@ struct vcpu_svm {
struct svm_cpu_data *svm_data;
uint64_t asid_generation;
- unsigned long db_regs[NUM_DB_REGS];
-
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
u64 host_gs_base;
unsigned long host_cr2;
- unsigned long host_db_regs[NUM_DB_REGS];
- unsigned long host_dr6;
- unsigned long host_dr7;
u32 *msrpm;
+ struct vmcb *hsave;
+ u64 hsave_msr;
+
+ u64 nested_vmcb;
+
+ /* These are the merged vectors */
+ u32 *nested_msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 nested_vmcb_msrpm;
};
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2d4477c7147..2a36f7f7c4c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,11 +145,20 @@ struct kvm_rmap_desc {
struct kvm_rmap_desc *more;
};
-struct kvm_shadow_walk {
- int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
- u64 addr, u64 *spte, int level);
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ int level;
+ u64 *sptep;
+ unsigned index;
};
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+
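for_each_shadow_entry() replaces the callback-based walk_shadow() with an open-coded loop, so walker state lives in local variables instead of in a container_of() struct. A toy standalone analogue of the idiom (levels and "tables" here are fake):

	#include <stdio.h>

	struct walk_iter { int level; };

	static void walk_init(struct walk_iter *it) { it->level = 4; }
	static int  walk_okay(struct walk_iter *it) { return it->level >= 1; }
	static void walk_next(struct walk_iter *it) { --it->level; }

	#define for_each_entry(it) \
		for (walk_init(&(it)); walk_okay(&(it)); walk_next(&(it)))

	int main(void)
	{
		struct walk_iter it;

		/* Visits levels 4 down to 1, like a page-table descent. */
		for_each_entry(it)
			printf("visiting level %d\n", it.level);
		return 0;
	}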
struct kvm_unsync_walk {
int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
};
@@ -343,7 +352,6 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
BUG_ON(!mc->nobjs);
p = mc->objects[--mc->nobjs];
- memset(p, 0, size);
return p;
}
@@ -794,10 +802,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&sp->oos_link);
- ASSERT(is_empty_shadow_page(sp->spt));
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
- sp->global = 1;
sp->parent_pte = parent_pte;
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
@@ -983,8 +989,8 @@ struct kvm_mmu_pages {
idx < 512; \
idx = find_next_bit(bitmap, 512, idx+1))
-int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
- int idx)
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
{
int i;
@@ -1059,7 +1065,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical
+ if (sp->gfn == gfn && !sp->role.direct
&& !sp->role.invalid) {
pgprintk("%s: found role %x\n",
__func__, sp->role.word);
@@ -1115,8 +1121,9 @@ struct mmu_page_path {
i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
i = mmu_pages_next(&pvec, &parents, i))
-int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
- int i)
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
{
int n;
@@ -1135,7 +1142,7 @@ int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
return n;
}
-void mmu_pages_clear_parents(struct mmu_page_path *parents)
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
struct kvm_mmu_page *sp;
unsigned int level = 0;
@@ -1193,7 +1200,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
unsigned level,
- int metaphysical,
+ int direct,
unsigned access,
u64 *parent_pte)
{
@@ -1204,10 +1211,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct hlist_node *node, *tmp;
- role.word = 0;
- role.glevels = vcpu->arch.mmu.root_level;
+ role = vcpu->arch.mmu.base_role;
role.level = level;
- role.metaphysical = metaphysical;
+ role.direct = direct;
role.access = access;
if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1242,8 +1248,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
+ sp->global = role.cr4_pge;
hlist_add_head(&sp->hash_link, bucket);
- if (!metaphysical) {
+ if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
account_shadowed(vcpu->kvm, gfn);
@@ -1255,35 +1262,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}
-static int walk_shadow(struct kvm_shadow_walk *walker,
- struct kvm_vcpu *vcpu, u64 addr)
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
{
- hpa_t shadow_addr;
- int level;
- int r;
- u64 *sptep;
- unsigned index;
-
- shadow_addr = vcpu->arch.mmu.root_hpa;
- level = vcpu->arch.mmu.shadow_root_level;
- if (level == PT32E_ROOT_LEVEL) {
- shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
- shadow_addr &= PT64_BASE_ADDR_MASK;
- if (!shadow_addr)
- return 1;
- --level;
+ iterator->addr = addr;
+ iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+ iterator->level = vcpu->arch.mmu.shadow_root_level;
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ iterator->shadow_addr
+ = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
}
+}
- while (level >= PT_PAGE_TABLE_LEVEL) {
- index = SHADOW_PT_INDEX(addr, level);
- sptep = ((u64 *)__va(shadow_addr)) + index;
- r = walker->entry(walker, vcpu, addr, sptep, level);
- if (r)
- return r;
- shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
- --level;
- }
- return 0;
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PT_PAGE_TABLE_LEVEL)
+ return false;
+ iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+ --iterator->level;
}
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1388,7 +1395,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
kvm_flush_remote_tlbs(kvm);
- if (!sp->role.invalid && !sp->role.metaphysical)
+ if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
@@ -1451,7 +1458,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical) {
+ if (sp->gfn == gfn && !sp->role.direct) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
@@ -1463,11 +1470,20 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
+ unsigned index;
+ struct hlist_head *bucket;
struct kvm_mmu_page *sp;
+ struct hlist_node *node, *nn;
- while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
+ index = kvm_page_table_hashfn(gfn);
+ bucket = &kvm->arch.mmu_page_hash[index];
+ hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
+ if (sp->gfn == gfn && !sp->role.direct
+ && !sp->role.invalid) {
+ pgprintk("%s: zap %lx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_zap_page(kvm, sp);
+ }
}
}
@@ -1622,7 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.metaphysical)
+ if (s->gfn != sp->gfn || s->role.direct)
continue;
if (s->role.word != sp->role.word)
return 1;
@@ -1669,8 +1685,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
u64 mt_mask = shadow_mt_mask;
struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
- if (!(vcpu->arch.cr4 & X86_CR4_PGE))
- global = 0;
if (!global && sp->global) {
sp->global = 0;
if (sp->unsync) {
@@ -1777,12 +1791,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*shadow_pte), pfn);
rmap_remove(vcpu->kvm, shadow_pte);
- } else {
- if (largepage)
- was_rmapped = is_large_pte(*shadow_pte);
- else
- was_rmapped = 1;
- }
+ } else
+ was_rmapped = 1;
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
dirty, largepage, global, gfn, pfn, speculative, true)) {
@@ -1820,67 +1830,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-struct direct_shadow_walk {
- struct kvm_shadow_walk walker;
- pfn_t pfn;
- int write;
- int largepage;
- int pt_write;
-};
-
-static int direct_map_entry(struct kvm_shadow_walk *_walk,
- struct kvm_vcpu *vcpu,
- u64 addr, u64 *sptep, int level)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn)
{
- struct direct_shadow_walk *walk =
- container_of(_walk, struct direct_shadow_walk, walker);
+ struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
+ int pt_write = 0;
gfn_t pseudo_gfn;
- gfn_t gfn = addr >> PAGE_SHIFT;
-
- if (level == PT_PAGE_TABLE_LEVEL
- || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
- mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
- 0, walk->write, 1, &walk->pt_write,
- walk->largepage, 0, gfn, walk->pfn, false);
- ++vcpu->stat.pf_fixed;
- return 1;
- }
- if (*sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
- sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
- 1, ACC_ALL, sptep);
- if (!sp) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(walk->pfn);
- return -ENOMEM;
+ for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
+ if (iterator.level == PT_PAGE_TABLE_LEVEL
+ || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write,
+ largepage, 0, gfn, pfn, false);
+ ++vcpu->stat.pf_fixed;
+ break;
}
- set_shadow_pte(sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- return 0;
-}
+ if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+ pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
+ iterator.level - 1,
+ 1, ACC_ALL, iterator.sptep);
+ if (!sp) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_pfn_clean(pfn);
+ return -ENOMEM;
+ }
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn)
-{
- int r;
- struct direct_shadow_walk walker = {
- .walker = { .entry = direct_map_entry, },
- .pfn = pfn,
- .largepage = largepage,
- .write = write,
- .pt_write = 0,
- };
-
- r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
- if (r < 0)
- return r;
- return walker.pt_write;
+ set_shadow_pte(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
+ }
+ }
+ return pt_write;
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1962,7 +1947,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
int i;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
- int metaphysical = 0;
+ int direct = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1971,18 +1956,18 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
- metaphysical = 1;
+ direct = 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, metaphysical,
+ PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
return;
}
- metaphysical = !is_paging(vcpu);
+ direct = !is_paging(vcpu);
if (tdp_enabled)
- metaphysical = 1;
+ direct = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1996,7 +1981,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, metaphysical,
+ PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
@@ -2251,17 +2236,23 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
+ int r;
+
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
- return nonpaging_init_context(vcpu);
+ r = nonpaging_init_context(vcpu);
else if (is_long_mode(vcpu))
- return paging64_init_context(vcpu);
+ r = paging64_init_context(vcpu);
else if (is_pae(vcpu))
- return paging32E_init_context(vcpu);
+ r = paging32E_init_context(vcpu);
else
- return paging32_init_context(vcpu);
+ r = paging32_init_context(vcpu);
+
+ vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+
+ return r;
}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -2492,7 +2483,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
+ if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
continue;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -3130,7 +3121,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
gfn_t gfn;
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.metaphysical)
+ if (sp->role.direct)
continue;
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 258e5d56298..eaab2145f62 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,7 +54,7 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
- return vcpu->arch.shadow_efer & EFER_LME;
+ return vcpu->arch.shadow_efer & EFER_LMA;
#else
return 0;
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9fd78b6e17a..6bd70206c56 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,7 +25,6 @@
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
- #define shadow_walker shadow_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
@@ -42,7 +41,6 @@
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
- #define shadow_walker shadow_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
@@ -73,18 +71,6 @@ struct guest_walker {
u32 error_code;
};
-struct shadow_walker {
- struct kvm_shadow_walk walker;
- struct guest_walker *guest_walker;
- int user_fault;
- int write_fault;
- int largepage;
- int *ptwrite;
- pfn_t pfn;
- u64 *sptep;
- gpa_t pte_gpa;
-};
-
static gfn_t gpte_to_gfn(pt_element_t gpte)
{
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -283,91 +269,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
-static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
- struct kvm_vcpu *vcpu, u64 addr,
- u64 *sptep, int level)
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+ struct guest_walker *gw,
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, pfn_t pfn)
{
- struct shadow_walker *sw =
- container_of(_sw, struct shadow_walker, walker);
- struct guest_walker *gw = sw->guest_walker;
unsigned access = gw->pt_access;
struct kvm_mmu_page *shadow_page;
- u64 spte;
- int metaphysical;
+ u64 spte, *sptep;
+ int direct;
gfn_t table_gfn;
int r;
+ int level;
pt_element_t curr_pte;
+ struct kvm_shadow_walk_iterator iterator;
- if (level == PT_PAGE_TABLE_LEVEL
- || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
- mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
- sw->user_fault, sw->write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- sw->ptwrite, sw->largepage,
- gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
- gw->gfn, sw->pfn, false);
- sw->sptep = sptep;
- return 1;
- }
+ if (!is_present_pte(gw->ptes[gw->level - 1]))
+ return NULL;
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- return 0;
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, access,
+ gw->pte_access & access,
+ user_fault, write_fault,
+ gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+ ptwrite, largepage,
+ gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+ gw->gfn, pfn, false);
+ break;
+ }
- if (is_large_pte(*sptep)) {
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- rmap_remove(vcpu->kvm, sptep);
- }
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+ continue;
- if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
- metaphysical = 1;
- if (!is_dirty_pte(gw->ptes[level - 1]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
- } else {
- metaphysical = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
- metaphysical, access, sptep);
- if (!metaphysical) {
- r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(sw->pfn);
- sw->sptep = NULL;
- return 1;
+ if (is_large_pte(*sptep)) {
+ rmap_remove(vcpu->kvm, sptep);
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
- }
- spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
- return 0;
-}
-
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
- struct guest_walker *guest_walker,
- int user_fault, int write_fault, int largepage,
- int *ptwrite, pfn_t pfn)
-{
- struct shadow_walker walker = {
- .walker = { .entry = FNAME(shadow_walk_entry), },
- .guest_walker = guest_walker,
- .user_fault = user_fault,
- .write_fault = write_fault,
- .largepage = largepage,
- .ptwrite = ptwrite,
- .pfn = pfn,
- };
-
- if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
- return NULL;
+ if (level == PT_DIRECTORY_LEVEL
+ && gw->level == PT_DIRECTORY_LEVEL) {
+ direct = 1;
+ if (!is_dirty_pte(gw->ptes[level - 1]))
+ access &= ~ACC_WRITE_MASK;
+ table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+ } else {
+ direct = 0;
+ table_gfn = gw->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+ direct, access, sptep);
+ if (!direct) {
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gw->pte_gpa[level - 2],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != gw->ptes[level - 2]) {
+ kvm_mmu_put_page(shadow_page, sptep);
+ kvm_release_pfn_clean(pfn);
+ sptep = NULL;
+ break;
+ }
+ }
- walk_shadow(&walker.walker, vcpu, addr);
+ spte = __pa(shadow_page->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *sptep = spte;
+ }
- return walker.sptep;
+ return sptep;
}
/*
@@ -465,54 +439,56 @@ out_unlock:
return 0;
}
-static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
- struct kvm_vcpu *vcpu, u64 addr,
- u64 *sptep, int level)
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
- struct shadow_walker *sw =
- container_of(_sw, struct shadow_walker, walker);
-
- /* FIXME: properly handle invlpg on large guest pages */
- if (level == PT_PAGE_TABLE_LEVEL ||
- ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_shadow_walk_iterator iterator;
+ pt_element_t gpte;
+ gpa_t pte_gpa = -1;
+ int level;
+ u64 *sptep;
+ int need_flush = 0;
- sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
- sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+ spin_lock(&vcpu->kvm->mmu_lock);
- if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- if (is_large_pte(*sptep))
- --vcpu->kvm->stat.lpages;
+ for_each_shadow_entry(vcpu, gva, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+
+ /* FIXME: properly handle invlpg on large guest pages */
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ pte_gpa = (sp->gfn << PAGE_SHIFT);
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (is_shadow_present_pte(*sptep)) {
+ rmap_remove(vcpu->kvm, sptep);
+ if (is_large_pte(*sptep))
+ --vcpu->kvm->stat.lpages;
+ need_flush = 1;
+ }
+ set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+ break;
}
- set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
- return 1;
- }
- if (!is_shadow_present_pte(*sptep))
- return 1;
- return 0;
-}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
-{
- pt_element_t gpte;
- struct shadow_walker walker = {
- .walker = { .entry = FNAME(shadow_invlpg_entry), },
- .pte_gpa = -1,
- };
+ if (!is_shadow_present_pte(*sptep))
+ break;
+ }
- spin_lock(&vcpu->kvm->mmu_lock);
- walk_shadow(&walker.walker, vcpu, gva);
+ if (need_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
spin_unlock(&vcpu->kvm->mmu_lock);
- if (walker.pte_gpa == -1)
+
+ if (pte_gpa == -1)
return;
- if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return;
if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
if (mmu_topup_memory_caches(vcpu))
return;
- kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+ kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
sizeof(pt_element_t), 0);
}
}
@@ -540,7 +516,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
pt_element_t pt[256 / sizeof(pt_element_t)];
gpa_t pte_gpa;
- if (sp->role.metaphysical
+ if (sp->role.direct
|| (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
nonpaging_prefetch_page(vcpu, sp);
return;
@@ -619,7 +595,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef pt_element_t
#undef guest_walker
-#undef shadow_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a9e769e4e25..1821c207819 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,9 +38,6 @@ MODULE_LICENSE("GPL");
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
-#define DR7_GD_MASK (1 << 13)
-#define DR6_BD_MASK (1 << 13)
-
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
@@ -50,6 +47,15 @@ MODULE_LICENSE("GPL");
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+/* Turn on to get debugging output */
+/* #define NESTED_DEBUG */
+
+#ifdef NESTED_DEBUG
+#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
+#else
+#define nsvm_printk(fmt, args...) do {} while (0)
+#endif
+
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
@@ -60,14 +66,29 @@ static int npt = 1;
module_param(npt, int, S_IRUGO);
+static int nested = 0;
+module_param(nested, int, S_IRUGO);
+
static void kvm_reput_irq(struct vcpu_svm *svm);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_vmexit(struct vcpu_svm *svm);
+static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
+ void *arg2, void *opaque);
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
+static inline bool is_nested(struct vcpu_svm *svm)
+{
+ return svm->nested_vmcb;
+}
+
static unsigned long iopm_base;
struct kvm_ldttss_desc {
@@ -157,32 +178,6 @@ static inline void kvm_write_cr2(unsigned long val)
asm volatile ("mov %0, %%cr2" :: "r" (val));
}
-static inline unsigned long read_dr6(void)
-{
- unsigned long dr6;
-
- asm volatile ("mov %%dr6, %0" : "=r" (dr6));
- return dr6;
-}
-
-static inline void write_dr6(unsigned long val)
-{
- asm volatile ("mov %0, %%dr6" :: "r" (val));
-}
-
-static inline unsigned long read_dr7(void)
-{
- unsigned long dr7;
-
- asm volatile ("mov %%dr7, %0" : "=r" (dr7));
- return dr7;
-}
-
-static inline void write_dr7(unsigned long val)
-{
- asm volatile ("mov %0, %%dr7" :: "r" (val));
-}
-
static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->asid_generation--;
@@ -198,7 +193,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
- to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
+ to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
vcpu->arch.shadow_efer = efer;
}
@@ -207,6 +202,11 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /* If we are within a nested VM we'd better #VMEXIT and let the
+ guest handle the exception */
+ if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
svm->vmcb->control.event_inj = nr
| SVM_EVTINJ_VALID
| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -242,7 +242,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
kvm_rip_write(vcpu, svm->next_rip);
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
- vcpu->arch.interrupt_window_open = 1;
+ vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static int has_svm(void)
@@ -250,7 +250,7 @@ static int has_svm(void)
const char *msg;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svn: %s\n", msg);
+ printk(KERN_INFO "has_svm: %s\n", msg);
return 0;
}
@@ -292,7 +292,7 @@ static void svm_hardware_enable(void *garbage)
svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
wrmsrl(MSR_VM_HSAVE_PA,
page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
@@ -417,6 +417,14 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME);
+ }
+
for_each_online_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
@@ -559,7 +567,7 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- save->efer = MSR_EFER_SVME_MASK;
+ save->efer = EFER_SVME;
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
save->rflags = 2;
@@ -591,6 +599,9 @@ static void init_vmcb(struct vcpu_svm *svm)
save->cr4 = 0;
}
force_new_asid(&svm->vcpu);
+
+ svm->nested_vmcb = 0;
+ svm->vcpu.arch.hflags = HF_GIF_MASK;
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -615,6 +626,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct vcpu_svm *svm;
struct page *page;
struct page *msrpm_pages;
+ struct page *hsave_page;
+ struct page *nested_msrpm_pages;
int err;
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -637,14 +650,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto uninit;
+
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!nested_msrpm_pages)
+ goto uninit;
+
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
+ hsave_page = alloc_page(GFP_KERNEL);
+ if (!hsave_page)
+ goto uninit;
+ svm->hsave = page_address(hsave_page);
+
+ svm->nested_msrpm = page_address(nested_msrpm_pages);
+
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- memset(svm->db_regs, 0, sizeof(svm->db_regs));
init_vmcb(svm);
fx_init(&svm->vcpu);
@@ -669,6 +693,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->hsave));
+ __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -718,6 +744,16 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+}
+
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
@@ -760,20 +796,37 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- if (seg == VCPU_SREG_CS)
+ switch (seg) {
+ case VCPU_SREG_CS:
+ /*
+ * SVM always stores 0 for the 'G' bit in the CS selector in
+ * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+ * Intel's VMENTRY has a check on the 'G' bit.
+ */
var->g = s->limit > 0xfffff;
-
- /*
- * Work around a bug where the busy flag in the tr selector
- * isn't exposed
- */
- if (seg == VCPU_SREG_TR)
+ break;
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache; even when it is cleared in the
+ * descriptor, the cached bit remains 1. Since Intel's
+ * VMENTRY checks this bit, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ }
var->unusable = !var->present;
}
@@ -905,9 +958,37 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
}
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
- return -EOPNOTSUPP;
+ int old_debug = vcpu->guest_debug;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+
+ svm->vmcb->control.intercept_exceptions &=
+ ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ svm->vmcb->control.intercept_exceptions |=
+ 1 << DB_VECTOR;
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ svm->vmcb->control.intercept_exceptions |=
+ 1 << BP_VECTOR;
+ } else
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
+ else
+ svm->vmcb->save.dr7 = vcpu->arch.dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+ else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
+ svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ return 0;
}
static int svm_get_irq(struct kvm_vcpu *vcpu)
@@ -949,7 +1030,29 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
- unsigned long val = to_svm(vcpu)->db_regs[dr];
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long val;
+
+ switch (dr) {
+ case 0 ... 3:
+ val = vcpu->arch.db[dr];
+ break;
+ case 6:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ val = vcpu->arch.dr6;
+ else
+ val = svm->vmcb->save.dr6;
+ break;
+ case 7:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ val = vcpu->arch.dr7;
+ else
+ val = svm->vmcb->save.dr7;
+ break;
+ default:
+ val = 0;
+ }
+
KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
return val;
}
@@ -959,33 +1062,40 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
{
struct vcpu_svm *svm = to_svm(vcpu);
- *exception = 0;
+ KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
- if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
- svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
- svm->vmcb->save.dr6 |= DR6_BD_MASK;
- *exception = DB_VECTOR;
- return;
- }
+ *exception = 0;
switch (dr) {
case 0 ... 3:
- svm->db_regs[dr] = value;
+ vcpu->arch.db[dr] = value;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = value;
return;
case 4 ... 5:
- if (vcpu->arch.cr4 & X86_CR4_DE) {
+ if (vcpu->arch.cr4 & X86_CR4_DE)
*exception = UD_VECTOR;
+ return;
+ case 6:
+ if (value & 0xffffffff00000000ULL) {
+ *exception = GP_VECTOR;
return;
}
- case 7: {
- if (value & ~((1ULL << 32) - 1)) {
+ vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
+ return;
+ case 7:
+ if (value & 0xffffffff00000000ULL) {
*exception = GP_VECTOR;
return;
}
- svm->vmcb->save.dr7 = value;
+ vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ svm->vmcb->save.dr7 = vcpu->arch.dr7;
+ vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
+ }
return;
- }
default:
+ /* FIXME: Possible case? */
printk(KERN_DEBUG "%s: unexpected dr %u\n",
__func__, dr);
*exception = UD_VECTOR;
@@ -1031,6 +1141,27 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
+static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (!(svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+}
+
+static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
int er;
@@ -1080,7 +1211,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
- int size, down, in, string, rep;
+ int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
@@ -1099,8 +1230,6 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
- rep = (io_info & SVM_IOIO_REP_MASK) != 0;
- down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
@@ -1139,6 +1268,567 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+ if (!(svm->vcpu.arch.shadow_efer & EFER_SVME)
+ || !is_paging(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (svm->vmcb->save.cpl) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ if (is_nested(svm)) {
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+ if (nested_svm_exit_handled(svm, false)) {
+ nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
+
+ nested_svm_vmexit(svm);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int nested_svm_intr(struct vcpu_svm *svm)
+{
+ if (is_nested(svm)) {
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return 0;
+
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+
+ if (nested_svm_exit_handled(svm, false)) {
+ nsvm_printk("VMexit -> INTR\n");
+ nested_svm_vmexit(svm);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
+{
+ struct page *page;
+
+ down_read(&current->mm->mmap_sem);
+ page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(page)) {
+ printk(KERN_INFO "%s: could not find page at 0x%llx\n",
+ __func__, gpa);
+ kvm_release_page_clean(page);
+ kvm_inject_gp(&svm->vcpu, 0);
+ return NULL;
+ }
+ return page;
+}
+
+static int nested_svm_do(struct vcpu_svm *svm,
+ u64 arg1_gpa, u64 arg2_gpa, void *opaque,
+ int (*handler)(struct vcpu_svm *svm,
+ void *arg1,
+ void *arg2,
+ void *opaque))
+{
+ struct page *arg1_page;
+ struct page *arg2_page = NULL;
+ void *arg1;
+ void *arg2 = NULL;
+ int retval;
+
+ arg1_page = nested_svm_get_page(svm, arg1_gpa);
+ if (arg1_page == NULL)
+ return 1;
+
+ if (arg2_gpa) {
+ arg2_page = nested_svm_get_page(svm, arg2_gpa);
+ if (arg2_page == NULL) {
+ kvm_release_page_clean(arg1_page);
+ return 1;
+ }
+ }
+
+ arg1 = kmap_atomic(arg1_page, KM_USER0);
+ if (arg2_gpa)
+ arg2 = kmap_atomic(arg2_page, KM_USER1);
+
+ retval = handler(svm, arg1, arg2, opaque);
+
+ kunmap_atomic(arg1, KM_USER0);
+ if (arg2_gpa)
+ kunmap_atomic(arg2, KM_USER1);
+
+ kvm_release_page_dirty(arg1_page);
+ if (arg2_gpa)
+ kvm_release_page_dirty(arg2_page);
+
+ return retval;
+}
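nested_svm_do() is a small map-run-unmap combinator that the rest of the patch feeds different handlers into, e.g. nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload) further down. A user-space analogue of its shape, with hypothetical names, just to make the calling convention explicit:

    /* Hypothetical stand-in for the combinator above: the real function
     * brackets the handler call with gfn_to_page()/kmap_atomic() before
     * and kunmap_atomic()/kvm_release_page_dirty() after. */
    typedef int (*nested_handler_t)(void *arg1, void *arg2, void *opaque);

    static int with_guest_pages(void *arg1, void *arg2, void *opaque,
                                nested_handler_t handler)
    {
        /* a zero arg2_gpa in the kernel version means only one page
         * is mapped and arg2 stays NULL */
        return handler(arg1, arg2, opaque);
    }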
+
+static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
+ void *arg1,
+ void *arg2,
+ void *opaque)
+{
+ struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+ bool kvm_overrides = *(bool *)opaque;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ if (kvm_overrides) {
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ return 0;
+ /* For now we are always handling NPFs when using them */
+ case SVM_EXIT_NPF:
+ if (npt_enabled)
+ return 0;
+ break;
+ /* When we're shadowing, trap PFs */
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ if (!npt_enabled)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ }
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
+ u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
+ if (nested_vmcb->control.intercept_cr_read & cr_bits)
+ return 1;
+ break;
+ }
+ case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
+ if (nested_vmcb->control.intercept_cr_write & cr_bits)
+ return 1;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
+ u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
+ if (nested_vmcb->control.intercept_dr_read & dr_bits)
+ return 1;
+ break;
+ }
+ case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
+ if (nested_vmcb->control.intercept_dr_write & dr_bits)
+ return 1;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+ if (nested_vmcb->control.intercept_exceptions & excp_bits)
+ return 1;
+ break;
+ }
+ default: {
+ u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+ nsvm_printk("exit code: 0x%x\n", exit_code);
+ if (nested_vmcb->control.intercept & exit_bits)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
+ void *arg1, void *arg2,
+ void *opaque)
+{
+ struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+ u8 *msrpm = (u8 *)arg2;
+ u32 t0, t1;
+ u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ u32 param = svm->vmcb->control.exit_info_1 & 1;
+
+ if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return 0;
+
+ switch (msr) {
+ case 0 ... 0x1fff:
+ t0 = (msr * 2) % 8;
+ t1 = msr / 4;
+ break;
+ case 0xc0000000 ... 0xc0001fff:
+ t0 = (8192 + msr - 0xc0000000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ case 0xc0010000 ... 0xc0011fff:
+ t0 = (16384 + msr - 0xc0010000) * 2;
+ t1 = (t0 / 8);
+ t0 %= 8;
+ break;
+ default:
+ return 1;
+ }
+ if (msrpm[t1] & ((1 << param) << t0))
+ return 1;
+
+ return 0;
+}
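For reference, a minimal stand-alone sketch of the MSR-permission-bitmap arithmetic this handler walks: two consecutive bits per MSR (read intercept, then write intercept) in three 2-Kbyte vectors covering 0x0-0x1fff, 0xc0000000-0xc0001fff and 0xc0010000-0xc0011fff. msrpm_bit_offset() is a hypothetical helper for illustration, not a kernel function:

    #include <stdint.h>
    #include <stdio.h>

    /* Map an MSR number to its byte and read-bit position in the MSRPM. */
    static int msrpm_bit_offset(uint32_t msr, unsigned int *byte,
                                unsigned int *bit)
    {
        uint32_t pos;

        if (msr <= 0x1fff)
            pos = msr * 2;
        else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
            pos = (8192 + msr - 0xc0000000) * 2;
        else if (msr >= 0xc0010000 && msr <= 0xc0011fff)
            pos = (16384 + msr - 0xc0010000) * 2;
        else
            return -1;    /* outside the bitmap: always intercepted */

        *byte = pos / 8;
        *bit  = pos % 8;
        return 0;
    }

    int main(void)
    {
        unsigned int byte, bit;

        if (!msrpm_bit_offset(0xc0000082, &byte, &bit))    /* MSR_LSTAR */
            printf("byte %u, read bit %u, write bit %u\n",
                   byte, bit, bit + 1);
        return 0;
    }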
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+{
+ bool k = kvm_override;
+
+ switch (svm->vmcb->control.exit_code) {
+ case SVM_EXIT_MSR:
+ return nested_svm_do(svm, svm->nested_vmcb,
+ svm->nested_vmcb_msrpm, NULL,
+ nested_svm_exit_handled_msr);
+ default:
+ break;
+ }
+
+ return nested_svm_do(svm, svm->nested_vmcb, 0, &k,
+ nested_svm_exit_handled_real);
+}
+
+static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
+ void *arg2, void *opaque)
+{
+ struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+ struct vmcb *hsave = svm->hsave;
+ u64 nested_save[] = { nested_vmcb->save.cr0,
+ nested_vmcb->save.cr3,
+ nested_vmcb->save.cr4,
+ nested_vmcb->save.efer,
+ nested_vmcb->control.intercept_cr_read,
+ nested_vmcb->control.intercept_cr_write,
+ nested_vmcb->control.intercept_dr_read,
+ nested_vmcb->control.intercept_dr_write,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept,
+ nested_vmcb->control.msrpm_base_pa,
+ nested_vmcb->control.iopm_base_pa,
+ nested_vmcb->control.tsc_offset };
+
+ /* Give the current vmcb to the guest */
+ memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb));
+ nested_vmcb->save.cr0 = nested_save[0];
+ if (!npt_enabled)
+ nested_vmcb->save.cr3 = nested_save[1];
+ nested_vmcb->save.cr4 = nested_save[2];
+ nested_vmcb->save.efer = nested_save[3];
+ nested_vmcb->control.intercept_cr_read = nested_save[4];
+ nested_vmcb->control.intercept_cr_write = nested_save[5];
+ nested_vmcb->control.intercept_dr_read = nested_save[6];
+ nested_vmcb->control.intercept_dr_write = nested_save[7];
+ nested_vmcb->control.intercept_exceptions = nested_save[8];
+ nested_vmcb->control.intercept = nested_save[9];
+ nested_vmcb->control.msrpm_base_pa = nested_save[10];
+ nested_vmcb->control.iopm_base_pa = nested_save[11];
+ nested_vmcb->control.tsc_offset = nested_save[12];
+
+ /* We always set V_INTR_MASKING and remember the old value in hflags */
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
+ (nested_vmcb->control.int_vector)) {
+ nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
+ nested_vmcb->control.int_vector);
+ }
+
+ /* Restore the original control entries */
+ svm->vmcb->control = hsave->control;
+
+ /* Kill any pending exceptions */
+ if (svm->vcpu.arch.exception.pending)
+ nsvm_printk("WARNING: Pending Exception\n");
+ svm->vcpu.arch.exception.pending = false;
+
+ /* Restore selected save entries */
+ svm->vmcb->save.es = hsave->save.es;
+ svm->vmcb->save.cs = hsave->save.cs;
+ svm->vmcb->save.ss = hsave->save.ss;
+ svm->vmcb->save.ds = hsave->save.ds;
+ svm->vmcb->save.gdtr = hsave->save.gdtr;
+ svm->vmcb->save.idtr = hsave->save.idtr;
+ svm->vmcb->save.rflags = hsave->save.rflags;
+ svm_set_efer(&svm->vcpu, hsave->save.efer);
+ svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = hsave->save.cr3;
+ svm->vcpu.arch.cr3 = hsave->save.cr3;
+ } else {
+ kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ }
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.cpl = 0;
+ svm->vmcb->control.exit_int_info = 0;
+
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ /* Exit nested SVM mode */
+ svm->nested_vmcb = 0;
+
+ return 0;
+}
+
+static int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ nsvm_printk("VMexit\n");
+ if (nested_svm_do(svm, svm->nested_vmcb, 0,
+ NULL, nested_svm_vmexit_real))
+ return 1;
+
+ kvm_mmu_reset_context(&svm->vcpu);
+ kvm_mmu_load(&svm->vcpu);
+
+ return 0;
+}
+
+static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
+ void *arg2, void *opaque)
+{
+ int i;
+ u32 *nested_msrpm = (u32 *)arg1;
+ for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
+ svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+ svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
+
+ return 0;
+}
+
+static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
+ void *arg2, void *opaque)
+{
+ struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+ struct vmcb *hsave = svm->hsave;
+
+ /* nested_vmcb is our indicator if nested SVM is activated */
+ svm->nested_vmcb = svm->vmcb->save.rax;
+
+ /* Clear internal status */
+ svm->vcpu.arch.exception.pending = false;
+
+ /* Save the old vmcb, so we don't need to pick what we save, but
+ can restore everything when a VMEXIT occurs */
+ memcpy(hsave, svm->vmcb, sizeof(struct vmcb));
+ /* We need to remember the original CR3 in the SPT case */
+ if (!npt_enabled)
+ hsave->save.cr3 = svm->vcpu.arch.cr3;
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rip = svm->next_rip;
+
+ if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
+ svm->vcpu.arch.hflags |= HF_HIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+ /* Load the nested guest state */
+ svm->vmcb->save.es = nested_vmcb->save.es;
+ svm->vmcb->save.cs = nested_vmcb->save.cs;
+ svm->vmcb->save.ss = nested_vmcb->save.ss;
+ svm->vmcb->save.ds = nested_vmcb->save.ds;
+ svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+ svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+ svm->vmcb->save.rflags = nested_vmcb->save.rflags;
+ svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+ svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+ svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+ } else {
+ kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+ kvm_mmu_reset_context(&svm->vcpu);
+ }
+ svm->vmcb->save.cr2 = nested_vmcb->save.cr2;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ svm->vmcb->save.rax = nested_vmcb->save.rax;
+ svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+ svm->vmcb->save.rip = nested_vmcb->save.rip;
+ svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+ svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+ svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+ /* We don't want a nested guest to be more powerful than the
+ guest it runs in, so all intercepts are ORed */
+ svm->vmcb->control.intercept_cr_read |=
+ nested_vmcb->control.intercept_cr_read;
+ svm->vmcb->control.intercept_cr_write |=
+ nested_vmcb->control.intercept_cr_write;
+ svm->vmcb->control.intercept_dr_read |=
+ nested_vmcb->control.intercept_dr_read;
+ svm->vmcb->control.intercept_dr_write |=
+ nested_vmcb->control.intercept_dr_write;
+ svm->vmcb->control.intercept_exceptions |=
+ nested_vmcb->control.intercept_exceptions;
+
+ svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+ svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+
+ force_new_asid(&svm->vcpu);
+ svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
+ svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
+ svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+ if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
+ nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
+ nested_vmcb->control.int_ctl);
+ }
+ if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+ svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+ nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
+ nested_vmcb->control.exit_int_info,
+ nested_vmcb->control.int_state);
+
+ svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+ svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+ svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
+ if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
+ nsvm_printk("Injecting Event: 0x%x\n",
+ nested_vmcb->control.event_inj);
+ svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+ svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+
+ return 0;
+}
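In outline, nested_svm_vmrun() performs the guest-entry half of the nested world switch; the comment-only sketch below summarizes the ordering of the function above (a restatement for readability, not additional kernel code):

    static void vmrun_outline(void)
    {
        /* 1. record the nested VMCB gpa (a nonzero svm->nested_vmcb is
         *    what is_nested() tests)                                   */
        /* 2. snapshot the current VMCB into hsave for the later VMEXIT */
        /* 3. latch EFLAGS.IF into HF_HIF_MASK                          */
        /* 4. load the nested guest's segment/control/GP state          */
        /* 5. OR all intercept masks so L2 cannot escape KVM's traps    */
        /* 6. force V_INTR_MASKING and accumulate the TSC offset        */
        /* 7. set HF_GIF_MASK so interrupts may be taken again          */
    }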
+
+static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+
+ return 1;
+}
+
+static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
+ void *arg2, void *opaque)
+{
+ return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
+}
+
+static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
+ void *arg2, void *opaque)
+{
+ return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
+}
+
+static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload);
+
+ return 1;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave);
+
+ return 1;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ nsvm_printk("VMrun\n");
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ if (nested_svm_do(svm, svm->vmcb->save.rax, 0,
+ NULL, nested_svm_vmrun))
+ return 1;
+
+ if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0,
+ NULL, nested_svm_vmrun_msrpm))
+ return 1;
+
+ return 1;
+}
+
+static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+
+ return 1;
+}
+
+static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+
+ /* After a CLGI no interrupts should come */
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
@@ -1250,6 +1940,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_IA32_LASTINTTOIP:
*data = svm->vmcb->save.last_excp_to;
break;
+ case MSR_VM_HSAVE_PA:
+ *data = svm->hsave_msr;
+ break;
+ case MSR_VM_CR:
+ *data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ *data = 0x01000065;
+ break;
default:
return kvm_get_msr_common(vcpu, ecx, data);
}
@@ -1344,6 +2043,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_VM_HSAVE_PA:
+ svm->hsave_msr = data;
+ break;
default:
return kvm_set_msr_common(vcpu, ecx, data);
}
@@ -1380,7 +2082,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
{
KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+ svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
/*
* If the user space waits to inject interrupts, exit as soon as
@@ -1417,6 +2119,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
@@ -1436,12 +2140,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
[SVM_EXIT_SHUTDOWN] = shutdown_interception,
- [SVM_EXIT_VMRUN] = invalid_op_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
[SVM_EXIT_VMMCALL] = vmmcall_interception,
- [SVM_EXIT_VMLOAD] = invalid_op_interception,
- [SVM_EXIT_VMSAVE] = invalid_op_interception,
- [SVM_EXIT_STGI] = invalid_op_interception,
- [SVM_EXIT_CLGI] = invalid_op_interception,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT] = invalid_op_interception,
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
@@ -1457,6 +2161,17 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
(u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
+ if (is_nested(svm)) {
+ nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
+ exit_code, svm->vmcb->control.exit_info_1,
+ svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+ if (nested_svm_exit_handled(svm, true)) {
+ nested_svm_vmexit(svm);
+ nsvm_printk("-> #VMEXIT\n");
+ return 1;
+ }
+ }
+
if (npt_enabled) {
int mmu_reload = 0;
if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1544,6 +2259,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ nested_svm_intr(svm);
+
svm_inject_irq(svm, irq);
}
@@ -1589,11 +2306,17 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
if (!kvm_cpu_has_interrupt(vcpu))
goto out;
+ if (nested_svm_intr(svm))
+ goto out;
+
+ if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
+ goto out;
+
if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
(vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
/* unable to deliver irq, set pending irq */
- vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
+ svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
goto out;
}
@@ -1615,7 +2338,8 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
}
svm->vcpu.arch.interrupt_window_open =
- !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
+ !(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ (svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static void svm_do_inject_vector(struct vcpu_svm *svm)
@@ -1637,9 +2361,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
+ if (nested_svm_intr(svm))
+ return;
+
svm->vcpu.arch.interrupt_window_open =
(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
- (svm->vmcb->save.rflags & X86_EFLAGS_IF));
+ (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
+ (svm->vcpu.arch.hflags & HF_GIF_MASK));
if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
/*
@@ -1652,9 +2380,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
*/
if (!svm->vcpu.arch.interrupt_window_open &&
(svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
- control->intercept |= 1ULL << INTERCEPT_VINTR;
- else
- control->intercept &= ~(1ULL << INTERCEPT_VINTR);
+ svm_set_vintr(svm);
+ else
+ svm_clear_vintr(svm);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -1662,22 +2390,6 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
-static void save_db_regs(unsigned long *db_regs)
-{
- asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
- asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
- asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
- asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
-}
-
-static void load_db_regs(unsigned long *db_regs)
-{
- asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
- asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
- asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
- asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
-}
-
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
force_new_asid(vcpu);
@@ -1736,19 +2448,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
gs_selector = kvm_read_gs();
ldt_selector = kvm_read_ldt();
svm->host_cr2 = kvm_read_cr2();
- svm->host_dr6 = read_dr6();
- svm->host_dr7 = read_dr7();
- svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ if (!is_nested(svm))
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
/* required for live migration with NPT */
if (npt_enabled)
svm->vmcb->save.cr3 = vcpu->arch.cr3;
- if (svm->vmcb->save.dr7 & 0xff) {
- write_dr7(0);
- save_db_regs(svm->host_db_regs);
- load_db_regs(svm->db_regs);
- }
-
clgi();
local_irq_enable();
@@ -1824,16 +2529,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif
);
- if ((svm->vmcb->save.dr7 & 0xff))
- load_db_regs(svm->host_db_regs);
-
vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- write_dr6(svm->host_dr6);
- write_dr7(svm->host_dr7);
kvm_write_cr2(svm->host_cr2);
kvm_load_fs(fs_selector);
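Taken together, the SVM changes make the interrupt window depend on the interrupt shadow, EFLAGS.IF and the software-tracked GIF. A stand-alone restatement of the predicate from do_interrupt_requests(); the constant values mirror this era's svm.h and kvm_host.h and are an assumption of the sketch:

    #include <stdbool.h>

    #define SVM_INTERRUPT_SHADOW_MASK  1          /* assumed values */
    #define X86_EFLAGS_IF              (1UL << 9)
    #define HF_GIF_MASK                (1UL << 1)

    static bool interrupt_window_open(unsigned long int_state,
                                      unsigned long rflags,
                                      unsigned long hflags)
    {
        return !(int_state & SVM_INTERRUPT_SHADOW_MASK) &&
               (rflags & X86_EFLAGS_IF) &&
               (hflags & HF_GIF_MASK);    /* new with this patch */
    }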
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7611af57682..bb481330716 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -91,6 +91,7 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
+ enum emulation_result invalid_state_emulation_result;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
@@ -189,21 +190,21 @@ static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}
static inline int is_no_device(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
static inline int is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
static inline int is_external_interrupt(u32 intr_info)
@@ -480,8 +481,13 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
- if (vcpu->guest_debug.enabled)
- eb |= 1u << DB_VECTOR;
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ eb |= 1u << DB_VECTOR;
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ eb |= 1u << BP_VECTOR;
+ }
if (vcpu->arch.rmode.active)
eb = ~0;
if (vm_need_ept())
@@ -747,29 +753,33 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
- if (has_error_code)
+ if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = nr;
vmx->rmode.irq.rip = kvm_rip_read(vcpu);
- if (nr == BP_VECTOR)
+ if (nr == BP_VECTOR || nr == OF_VECTOR)
vmx->rmode.irq.rip++;
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- nr | INTR_TYPE_SOFT_INTR
- | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
- | INTR_INFO_VALID_MASK);
+ intr_info |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
return;
}
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- nr | INTR_TYPE_EXCEPTION
- | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
- | INTR_INFO_VALID_MASK);
+ if (nr == BP_VECTOR || nr == OF_VECTOR) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
}
static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -856,11 +866,8 @@ static u64 guest_read_tsc(void)
* writes 'guest_tsc' into guest's timestamp counter "register"
* guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
*/
-static void guest_write_tsc(u64 guest_tsc)
+static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
{
- u64 host_tsc;
-
- rdtscll(host_tsc);
vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}
@@ -925,14 +932,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr;
+ u64 host_tsc;
int ret = 0;
switch (msr_index) {
-#ifdef CONFIG_X86_64
case MSR_EFER:
vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
break;
+#ifdef CONFIG_X86_64
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
break;
@@ -950,7 +958,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TIME_STAMP_COUNTER:
- guest_write_tsc(data);
+ rdtscll(host_tsc);
+ guest_write_tsc(data, host_tsc);
break;
case MSR_P6_PERFCTR0:
case MSR_P6_PERFCTR1:
@@ -999,40 +1008,28 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
+static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
- unsigned long dr7 = 0x400;
- int old_singlestep;
-
- old_singlestep = vcpu->guest_debug.singlestep;
-
- vcpu->guest_debug.enabled = dbg->enabled;
- if (vcpu->guest_debug.enabled) {
- int i;
+ int old_debug = vcpu->guest_debug;
+ unsigned long flags;
- dr7 |= 0x200; /* exact */
- for (i = 0; i < 4; ++i) {
- if (!dbg->breakpoints[i].enabled)
- continue;
- vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
- dr7 |= 2 << (i*2); /* global enable */
- dr7 |= 0 << (i*4+16); /* execution breakpoint */
- }
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
- vcpu->guest_debug.singlestep = dbg->singlestep;
- } else
- vcpu->guest_debug.singlestep = 0;
-
- if (old_singlestep && !vcpu->guest_debug.singlestep) {
- unsigned long flags;
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
+ else
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
- flags = vmcs_readl(GUEST_RFLAGS);
+ flags = vmcs_readl(GUEST_RFLAGS);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+ else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
- vmcs_writel(GUEST_RFLAGS, flags);
- }
+ vmcs_writel(GUEST_RFLAGS, flags);
update_exception_bitmap(vcpu);
- vmcs_writel(GUEST_DR7, dr7);
return 0;
}
@@ -1433,6 +1430,29 @@ continue_rmode:
init_rmode(vcpu->kvm);
}
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+ vcpu->arch.shadow_efer = efer;
+ if (!msr)
+ return;
+ if (efer & EFER_LMA) {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) |
+ VM_ENTRY_IA32E_MODE);
+ msr->data = efer;
+ } else {
+ vmcs_write32(VM_ENTRY_CONTROLS,
+ vmcs_read32(VM_ENTRY_CONTROLS) &
+ ~VM_ENTRY_IA32E_MODE);
+
+ msr->data = efer & ~EFER_LME;
+ }
+ setup_msrs(vmx);
+}
+
#ifdef CONFIG_X86_64
static void enter_lmode(struct kvm_vcpu *vcpu)
@@ -1447,13 +1467,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
-
vcpu->arch.shadow_efer |= EFER_LMA;
-
- find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS)
- | VM_ENTRY_IA32E_MODE);
+ vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
@@ -1612,30 +1627,6 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
}
-static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
- vcpu->arch.shadow_efer = efer;
- if (!msr)
- return;
- if (efer & EFER_LMA) {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) |
- VM_ENTRY_IA32E_MODE);
- msr->data = efer;
-
- } else {
- vmcs_write32(VM_ENTRY_CONTROLS,
- vmcs_read32(VM_ENTRY_CONTROLS) &
- ~VM_ENTRY_IA32E_MODE);
-
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
-}
-
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1653,7 +1644,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->limit = vmcs_read32(sf->limit);
var->selector = vmcs_read16(sf->selector);
ar = vmcs_read32(sf->ar_bytes);
- if (ar & AR_UNUSABLE_MASK)
+ if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
@@ -1788,14 +1779,16 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+ if (cs.unusable)
+ return false;
if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
return false;
if (!cs.s)
return false;
- if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+ if (cs.type & AR_TYPE_WRITEABLE_MASK) {
if (cs.dpl > cs_rpl)
return false;
- } else if (cs.type & AR_TYPE_CODE_MASK) {
+ } else {
if (cs.dpl != cs_rpl)
return false;
}
@@ -1814,7 +1807,9 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
ss_rpl = ss.selector & SELECTOR_RPL_MASK;
- if ((ss.type != 3) || (ss.type != 7))
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
return false;
if (!ss.s)
return false;
@@ -1834,6 +1829,8 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
vmx_get_segment(vcpu, &var, seg);
rpl = var.selector & SELECTOR_RPL_MASK;
+ if (var.unusable)
+ return true;
if (!var.s)
return false;
if (!var.present)
@@ -1855,9 +1852,11 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+ if (tr.unusable)
+ return false;
if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
- if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
return false;
if (!tr.present)
return false;
@@ -1871,6 +1870,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+ if (ldtr.unusable)
+ return true;
if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
if (ldtr.type != 2)
@@ -2112,7 +2113,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
- u64 host_pat;
+ u64 host_pat, tsc_this, tsc_base;
unsigned long a;
struct descriptor_table dt;
int i;
@@ -2240,6 +2241,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
+ tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
+ rdtscll(tsc_this);
+ if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
+ tsc_base = tsc_this;
+
+ guest_write_tsc(0, tsc_base);
return 0;
}
@@ -2319,7 +2326,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_rip_write(vcpu, 0);
kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
- /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -2332,8 +2338,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
- guest_write_tsc(0);
-
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
@@ -2486,6 +2490,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
{
vmx_update_window_states(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS);
+
if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
if (vcpu->arch.interrupt.pending) {
enable_nmi_window(vcpu);
@@ -2536,24 +2545,6 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return 0;
}
-static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
-{
- struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-
- set_debugreg(dbg->bp[0], 0);
- set_debugreg(dbg->bp[1], 1);
- set_debugreg(dbg->bp[2], 2);
- set_debugreg(dbg->bp[3], 3);
-
- if (dbg->singlestep) {
- unsigned long flags;
-
- flags = vmcs_readl(GUEST_RFLAGS);
- flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
- vmcs_writel(GUEST_RFLAGS, flags);
- }
-}
-
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
@@ -2570,9 +2561,17 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* the required debugging infrastructure rework.
*/
switch (vec) {
- case DE_VECTOR:
case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return 0;
+ kvm_queue_exception(vcpu, vec);
+ return 1;
case BP_VECTOR:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return 0;
+ /* fall through */
+ case DE_VECTOR:
case OF_VECTOR:
case BR_VECTOR:
case UD_VECTOR:
@@ -2589,8 +2588,8 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 intr_info, error_code;
- unsigned long cr2, rip;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, rip, dr6;
u32 vect_info;
enum emulation_result er;
@@ -2649,14 +2648,30 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
- if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
- (INTR_TYPE_EXCEPTION | 1)) {
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+ switch (ex_no) {
+ case DB_VECTOR:
+ dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ /* fall through */
+ case BP_VECTOR:
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- return 0;
+ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
}
- kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
- kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
- kvm_run->ex.error_code = error_code;
return 0;
}
@@ -2677,7 +2692,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
unsigned long exit_qualification;
- int size, down, in, string, rep;
+ int size, in, string;
unsigned port;
++vcpu->stat.io_exits;
@@ -2693,8 +2708,6 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
size = (exit_qualification & 7) + 1;
in = (exit_qualification & 8) != 0;
- down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
- rep = (exit_qualification & 32) != 0;
port = exit_qualification >> 16;
skip_emulated_instruction(vcpu);
@@ -2795,21 +2808,44 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
unsigned long val;
int dr, reg;
- /*
- * FIXME: this code assumes the host is debugging the guest.
- * need to deal with guest debugging itself too.
- */
+ dr = vmcs_readl(GUEST_DR7);
+ if (dr & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
+ kvm_run->debug.arch.dr7 = dr;
+ kvm_run->debug.arch.pc =
+ vmcs_readl(GUEST_CS_BASE) +
+ vmcs_readl(GUEST_RIP);
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ vcpu->arch.dr6 |= DR6_BD;
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ }
+
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & 7;
- reg = (exit_qualification >> 8) & 15;
- if (exit_qualification & 16) {
- /* mov from dr */
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
switch (dr) {
+ case 0 ... 3:
+ val = vcpu->arch.db[dr];
+ break;
case 6:
- val = 0xffff0ff0;
+ val = vcpu->arch.dr6;
break;
case 7:
- val = 0x400;
+ val = vcpu->arch.dr7;
break;
default:
val = 0;
@@ -2817,7 +2853,38 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_register_write(vcpu, reg, val);
KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
- /* mov to dr */
+ val = vcpu->arch.regs[reg];
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[dr] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4 ... 5:
+ if (vcpu->arch.cr4 & X86_CR4_DE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ break;
+ case 6:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_queue_exception(vcpu, GP_VECTOR);
+ break;
+ }
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ break;
+ case 7:
+ if (val & 0xffffffff00000000ULL) {
+ kvm_queue_exception(vcpu, GP_VECTOR);
+ break;
+ }
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
+ vcpu->arch.switch_db_regs =
+ (val & DR7_BP_EN_MASK);
+ }
+ break;
+ }
+ KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
}
skip_emulated_instruction(vcpu);
return 1;
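Both the SVM and the VMX paths now funnel debug-register writes through the same validation: any upper-half bit raises #GP, and only the architecturally volatile bits are taken from the guest while the fixed bits are forced to 1. A minimal sketch, assuming the DR6 constants below match the architectural layout:

    #include <stdint.h>

    #define DR6_FIXED_1   0xffff0ff0ULL    /* assumed architectural values */
    #define DR6_VOLATILE  0x0000e00fULL

    /* Returns 0 on success, -1 if the write must raise #GP. */
    static int emulate_dr6_write(uint64_t val, uint64_t *dr6)
    {
        if (val & 0xffffffff00000000ULL)
            return -1;
        *dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
        return 0;
    }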
@@ -2968,17 +3035,25 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
tss_selector = exit_qualification;
- return kvm_task_switch(vcpu, tss_selector, reason);
+ if (!kvm_task_switch(vcpu, tss_selector, reason))
+ return 0;
+
+ /* clear all local breakpoint enable flags */
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+
+ return 1;
}
static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u64 exit_qualification;
- enum emulation_result er;
gpa_t gpa;
- unsigned long hva;
int gla_validity;
- int r;
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
@@ -3001,32 +3076,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
- if (!kvm_is_error_hva(hva)) {
- r = kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
- if (r < 0) {
- printk(KERN_ERR "EPT: Not enough memory!\n");
- return -ENOMEM;
- }
- return 1;
- } else {
- /* must be MMIO */
- er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
-
- if (er == EMULATE_FAIL) {
- printk(KERN_ERR
- "EPT: Fail to handle EPT violation vmexit!er is %d\n",
- er);
- printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
- (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
- (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
- printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
- (long unsigned int)exit_qualification);
- return -ENOTSUPP;
- } else if (er == EMULATE_DO_MMIO)
- return 0;
- }
- return 1;
+ return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
}
static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
@@ -3046,7 +3096,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int err;
+ enum emulation_result err = EMULATE_DONE;
preempt_enable();
local_irq_enable();
@@ -3071,10 +3121,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
local_irq_disable();
preempt_disable();
- /* Guest state should be valid now except if we need to
- * emulate an MMIO */
- if (guest_state_valid(vcpu))
- vmx->emulation_required = 0;
+ vmx->invalid_state_emulation_result = err;
}
/*
@@ -3123,8 +3170,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* If we need to emulate an MMIO from handle_invalid_guest_state
* we just return 0 */
- if (vmx->emulation_required && emulate_invalid_guest_state)
- return 0;
+ if (vmx->emulation_required && emulate_invalid_guest_state) {
+ if (guest_state_valid(vcpu))
+ vmx->emulation_required = 0;
+ return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
+ }
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
@@ -3238,7 +3288,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
vmx->vcpu.arch.nmi_injected = false;
}
kvm_clear_exception_queue(&vmx->vcpu);
- if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+ if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
+ type == INTR_TYPE_SOFT_EXCEPTION)) {
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
kvm_queue_exception_e(&vmx->vcpu, vector, error);
@@ -3259,6 +3310,11 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
vmx_update_window_states(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS);
+
if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
if (vcpu->arch.interrupt.pending) {
enable_nmi_window(vcpu);
@@ -3347,6 +3403,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
*/
vmcs_writel(HOST_CR0, read_cr0());
+ set_debugreg(vcpu->arch.dr6, 6);
+
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -3441,6 +3499,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
vcpu->arch.regs_dirty = 0;
+ get_debugreg(vcpu->arch.dr6, 6);
+
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
@@ -3595,7 +3655,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.vcpu_put = vmx_vcpu_put,
.set_guest_debug = set_guest_debug,
- .guest_debug_pre = kvm_guest_debug_pre,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
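The TSC rework in this file reduces to two pieces of arithmetic: the VMCS stores a guest-minus-host delta, and vmx_vcpu_setup() picks the base the guest starts counting from. A stand-alone restatement, with the raw TSC reading supplied by the caller instead of rdtscll():

    #include <stdint.h>

    /* guest_tsc = host_tsc + TSC_OFFSET, so programming a guest value
     * means writing the difference. */
    static inline uint64_t vmcs_tsc_offset(uint64_t guest_tsc,
                                           uint64_t host_tsc)
    {
        return guest_tsc - host_tsc;
    }

    /* Start the guest TSC at zero relative to VM creation, unless the
     * current reading is below vm_init_tsc (the host TSC went backwards),
     * in which case fall back to the current reading. */
    static inline uint64_t pick_tsc_base(uint64_t vm_init_tsc,
                                         uint64_t tsc_now)
    {
        return tsc_now < vm_init_tsc ? tsc_now : vm_init_tsc;
    }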
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 758b7a155ae..8ca100a9eca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,6 +36,7 @@
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
@@ -69,6 +70,8 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -173,6 +176,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
+
if (vcpu->arch.exception.pending) {
if (vcpu->arch.exception.nr == PF_VECTOR) {
printk(KERN_DEBUG "kvm: inject_page_fault:"
@@ -361,6 +365,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
+ vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
}
@@ -442,6 +447,11 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
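bit() works because the X86_FEATURE_* constants encode word * 32 + bit; masking with 31 keeps only the position within one 32-bit CPUID register. A self-contained check (word 6 is the 0x80000001 ECX word per the kvm_supported_word6 list below; the exact encoding is an assumption of this example):

    #include <assert.h>
    #include <stdint.h>

    static inline uint32_t bit(int bitno)
    {
        return 1u << (bitno & 31);
    }

    int main(void)
    {
        int x86_feature_svm = 6 * 32 + 2;    /* word 6, bit 2: SVM */

        assert(bit(x86_feature_svm) == (1u << 2));
        return 0;
    }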
+
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -456,7 +466,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
+ MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
static unsigned num_msrs_to_save;
@@ -481,6 +491,28 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
return;
}
+ if (efer & EFER_FFXSR) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
+ printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+
+ if (efer & EFER_SVME) {
+ struct kvm_cpuid_entry2 *feat;
+
+ feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
+ printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
+ kvm_inject_gp(vcpu, 0);
+ return;
+ }
+ }
+
kvm_x86_ops->set_efer(vcpu, efer);
efer &= ~EFER_LMA;
@@ -586,6 +618,8 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
hv_clock->tsc_to_system_mul);
}
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -596,9 +630,9 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
- if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
- kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
- vcpu->hv_clock_tsc_khz = tsc_khz;
+ if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
+ kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
}
/* Keep irq disabled to prevent changes to the clock */
@@ -629,6 +663,16 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
}
+static int kvm_request_guest_time_update(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+
+ if (!vcpu->time_page)
+ return 0;
+ set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ return 1;
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
@@ -722,6 +766,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
break;
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
@@ -758,7 +803,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
vcpu->arch.time_page = NULL;
}
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
break;
}
default:
@@ -843,6 +888,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
+ case MSR_VM_HSAVE_PA:
data = 0;
break;
case MSR_MTRRcap:
@@ -967,10 +1013,13 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
case KVM_CAP_PIT:
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -991,9 +1040,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOMMU:
r = iommu_found();
break;
- case KVM_CAP_CLOCKSOURCE:
- r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
- break;
default:
r = 0;
break;
@@ -1044,7 +1090,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
@@ -1064,7 +1110,7 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
- kvm_write_guest_time(vcpu);
+ kvm_request_guest_time_update(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1142,8 +1188,8 @@ out:
}
static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
int r;
@@ -1162,8 +1208,8 @@ out:
}
static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
int r;
@@ -1172,7 +1218,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
goto out;
r = -EFAULT;
if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
return 0;
@@ -1181,18 +1227,13 @@ out:
return r;
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
+ u32 index)
{
entry->function = function;
entry->index = index;
cpuid_count(entry->function, entry->index,
- &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
entry->flags = 0;
}
@@ -1222,15 +1263,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
#ifdef CONFIG_X86_64
bit(X86_FEATURE_LM) |
#endif
+ bit(X86_FEATURE_FXSR_OPT) |
bit(X86_FEATURE_MMXEXT) |
bit(X86_FEATURE_3DNOWEXT) |
bit(X86_FEATURE_3DNOW);
const u32 kvm_supported_word3_x86_features =
bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
const u32 kvm_supported_word6_x86_features =
- bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+ bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
+ bit(X86_FEATURE_SVM);
- /* all func 2 cpuid_count() should be called on the same cpu */
+ /* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
do_cpuid_1_ent(entry, function, index);
++*nent;
@@ -1304,7 +1347,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
+ struct kvm_cpuid_entry2 __user *entries)
{
struct kvm_cpuid_entry2 *cpuid_entries;
int limit, nent = 0, r = -E2BIG;
@@ -1321,7 +1364,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
limit = cpuid_entries[0].eax;
for (func = 1; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
+ &nent, cpuid->nent);
r = -E2BIG;
if (nent >= cpuid->nent)
goto out_free;
@@ -1330,10 +1373,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
limit = cpuid_entries[nent - 1].eax;
for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
+ &nent, cpuid->nent);
r = -EFAULT;
if (copy_to_user(entries, cpuid_entries,
- nent * sizeof(struct kvm_cpuid_entry2)))
+ nent * sizeof(struct kvm_cpuid_entry2)))
goto out_free;
cpuid->nent = nent;
r = 0;
@@ -1477,7 +1520,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
break;
@@ -1490,7 +1533,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
- cpuid_arg->entries);
+ cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
@@ -1710,6 +1753,15 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
return r;
}
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+ struct kvm_reinject_control *control)
+{
+ if (!kvm->arch.vpit)
+ return -ENXIO;
+ kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ return 0;
+}
+
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
@@ -1807,13 +1859,26 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
} else
goto out;
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kfree(kvm->arch.vpic);
+ kfree(kvm->arch.vioapic);
+ goto out;
+ }
break;
case KVM_CREATE_PIT:
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
r = -ENOMEM;
kvm->arch.vpit = kvm_create_pit(kvm);
if (kvm->arch.vpit)
r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->lock);
break;
+ case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
@@ -1821,10 +1886,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&irq_event, argp, sizeof irq_event))
goto out;
if (irqchip_in_kernel(kvm)) {
+ __s32 status;
mutex_lock(&kvm->lock);
- kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
+ status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event.irq, irq_event.level);
mutex_unlock(&kvm->lock);
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ irq_event.status = status;
+ if (copy_to_user(argp, &irq_event,
+ sizeof irq_event))
+ goto out;
+ }
r = 0;
}
break;
@@ -1907,6 +1979,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_REINJECT_CONTROL: {
+ struct kvm_reinject_control control;
+ r = -EFAULT;
+ if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
default:
;
}
@@ -1960,10 +2043,38 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
return dev;
}
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
+static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
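+ /* Copy at most one page per iteration; contiguous gvas need not map to contiguous gpas. */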
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA) {
+ r = X86EMUL_PROPAGATE_FAULT;
+ goto out;
+ }
+ ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ if (ret < 0) {
+ r = X86EMUL_UNHANDLEABLE;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -1971,27 +2082,27 @@ int emulator_read_std(unsigned long addr,
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
- unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
if (gpa == UNMAPPED_GVA) {
r = X86EMUL_PROPAGATE_FAULT;
goto out;
}
- ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+ ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_UNHANDLEABLE;
goto out;
}
- bytes -= tocopy;
- data += tocopy;
- addr += tocopy;
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
}
out:
return r;
}
-EXPORT_SYMBOL_GPL(emulator_read_std);
+
static int emulator_read_emulated(unsigned long addr,
void *val,
@@ -2013,8 +2124,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (emulator_read_std(addr, val, bytes, vcpu)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -2217,7 +2328,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
- emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+ kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2225,7 +2336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static struct x86_emulate_ops emulate_ops = {
- .read_std = emulator_read_std,
+ .read_std = kvm_read_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -2327,40 +2438,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
}
EXPORT_SYMBOL_GPL(emulate_instruction);
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
- if (vcpu->arch.pio.guest_pages[i]) {
- kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
- vcpu->arch.pio.guest_pages[i] = NULL;
- }
-}
-
static int pio_copy_data(struct kvm_vcpu *vcpu)
{
void *p = vcpu->arch.pio_data;
- void *q;
+ gva_t q = vcpu->arch.pio.guest_gva;
unsigned bytes;
- int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
+ int ret;
- q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
- PAGE_KERNEL);
- if (!q) {
- free_pio_guest_pages(vcpu);
- return -ENOMEM;
- }
- q += vcpu->arch.pio.guest_page_offset;
bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
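+ /* For 'in' operations the data lands in guest memory; for 'out' it is read from guest memory. */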
if (vcpu->arch.pio.in)
- memcpy(q, p, bytes);
+ ret = kvm_write_guest_virt(q, p, bytes, vcpu);
else
- memcpy(p, q, bytes);
- q -= vcpu->arch.pio.guest_page_offset;
- vunmap(q);
- free_pio_guest_pages(vcpu);
- return 0;
+ ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+ return ret;
}
int complete_pio(struct kvm_vcpu *vcpu)
@@ -2471,7 +2561,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.in = in;
vcpu->arch.pio.string = 0;
vcpu->arch.pio.down = 0;
- vcpu->arch.pio.guest_page_offset = 0;
vcpu->arch.pio.rep = 0;
if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2499,9 +2588,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
gva_t address, int rep, unsigned port)
{
unsigned now, in_page;
- int i, ret = 0;
- int nr_pages = 1;
- struct page *page;
+ int ret = 0;
struct kvm_io_device *pio_dev;
vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2513,7 +2600,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.in = in;
vcpu->arch.pio.string = 1;
vcpu->arch.pio.down = down;
- vcpu->arch.pio.guest_page_offset = offset_in_page(address);
vcpu->arch.pio.rep = rep;
if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2533,15 +2619,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
else
in_page = offset_in_page(address) + size;
now = min(count, (unsigned long)in_page / size);
- if (!now) {
- /*
- * String I/O straddles page boundary. Pin two guest pages
- * so that we satisfy atomicity constraints. Do just one
- * transaction to avoid complexity.
- */
- nr_pages = 2;
+ if (!now)
now = 1;
- }
if (down) {
/*
* String I/O in reverse. Yuck. Kill the guest, fix later.
@@ -2556,15 +2635,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
kvm_x86_ops->skip_emulated_instruction(vcpu);
- for (i = 0; i < nr_pages; ++i) {
- page = gva_to_page(vcpu, address + i * PAGE_SIZE);
- vcpu->arch.pio.guest_pages[i] = page;
- if (!page) {
- kvm_inject_gp(vcpu, 0);
- free_pio_guest_pages(vcpu);
- return 1;
- }
- }
+ vcpu->arch.pio.guest_gva = address;
pio_dev = vcpu_find_pio_dev(vcpu, port,
vcpu->arch.pio.cur_count,
@@ -2572,7 +2643,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
- if (ret >= 0 && pio_dev) {
+ if (ret == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ if (ret == 0 && pio_dev) {
pio_string_write(pio_dev, vcpu);
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
@@ -2587,9 +2662,72 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
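+/*
+ * Empty IPI handler: the interrupt itself is the point, as it kicks
+ * the target cpu out of guest mode so kvmclock is refreshed on the
+ * next guest entry.
+ */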
+static void bounce_off(void *info)
+{
+ /* nothing */
+}
+
+static unsigned int ref_freq;
+static unsigned long tsc_khz_ref;
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, send_ipi = 0;
+
+ if (!ref_freq)
+ ref_freq = freq->old;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+ per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
+ spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (!vcpu)
+ continue;
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ if (!kvm_request_guest_time_update(vcpu))
+ continue;
+ if (vcpu->cpu != smp_processor_id())
+ send_ipi++;
+ }
+ }
+ spin_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We are scaling the frequency up. We must make sure the
+ * guest does not see old kvmclock values while running at
+ * the new frequency; otherwise it risks seeing time go
+ * backwards.
+ *
+ * When we update the frequency for another cpu (which may
+ * be running in guest context), send an interrupt to kick
+ * that cpu out of guest context. The next time guest
+ * context is entered, kvmclock will be updated, so the
+ * guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
int kvm_arch_init(void *opaque)
{
- int r;
+ int r, cpu;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
if (kvm_x86_ops) {
@@ -2620,6 +2758,15 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
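+ /* Without a constant TSC, the per-cpu rate must track cpufreq transitions. */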
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ tsc_khz_ref = tsc_khz;
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+
return 0;
out:
@@ -2827,25 +2974,20 @@ static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
return 0;
if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
- !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
return 0;
return 1;
}
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
{
int i;
- u32 function, index;
- struct kvm_cpuid_entry2 *e, *best;
+ struct kvm_cpuid_entry2 *best = NULL;
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
- best = NULL;
for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
e = &vcpu->arch.cpuid_entries[i];
if (is_matching_cpuid_entry(e, function, index)) {
if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
@@ -2860,6 +3002,21 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
if (!best || e->function > best->function)
best = e;
}
+ return best;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 function, index;
+ struct kvm_cpuid_entry2 *best;
+
+ function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+ best = kvm_find_cpuid_entry(vcpu, function, index);
if (best) {
kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -2945,6 +3102,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ kvm_write_guest_time(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
@@ -2979,9 +3138,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
goto out;
}
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
vcpu->guest_mode = 1;
/*
* Make sure that guest_mode assignment won't happen after
@@ -3002,10 +3158,34 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
kvm_guest_enter();
+ get_debugreg(vcpu->arch.host_dr6, 6);
+ get_debugreg(vcpu->arch.host_dr7, 7);
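+ /*
+ * If the guest uses hardware breakpoints, swap in its debug
+ * registers after saving the host's; dr7 is cleared first so no
+ * stale breakpoint fires while the address registers change.
+ */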
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ get_debugreg(vcpu->arch.host_db[0], 0);
+ get_debugreg(vcpu->arch.host_db[1], 1);
+ get_debugreg(vcpu->arch.host_db[2], 2);
+ get_debugreg(vcpu->arch.host_db[3], 3);
+
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ }
KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.host_db[0], 0);
+ set_debugreg(vcpu->arch.host_db[1], 1);
+ set_debugreg(vcpu->arch.host_db[2], 2);
+ set_debugreg(vcpu->arch.host_db[3], 3);
+ }
+ set_debugreg(vcpu->arch.host_dr6, 6);
+ set_debugreg(vcpu->arch.host_dr7, 7);
+
vcpu->guest_mode = 0;
local_irq_enable();
@@ -3192,7 +3372,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
/*
* Don't leak debug flags in case they were set for guest debugging
*/
- if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
vcpu_put(vcpu);
@@ -3811,15 +3991,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return 0;
}
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg)
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
{
- int r;
+ int i, r;
vcpu_load(vcpu);
+ if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.switch_db_regs =
+ (dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
+ }
+
r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
+ kvm_queue_exception(vcpu, BP_VECTOR);
+
vcpu_put(vcpu);
return r;
@@ -4007,6 +4204,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.nmi_pending = false;
vcpu->arch.nmi_injected = false;
+ vcpu->arch.switch_db_regs = 0;
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ vcpu->arch.dr6 = DR6_FIXED_1;
+ vcpu->arch.dr7 = DR7_FIXED_1;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -4100,6 +4302,8 @@ struct kvm *kvm_arch_create_vm(void)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ rdtscll(kvm->arch.vm_init_tsc);
+
return kvm;
}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d174db7a337..ca91749d208 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -178,7 +178,7 @@ static u32 opcode_table[256] = {
0, ImplicitOps | Stack, 0, 0,
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
- 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
/* 0xD0 - 0xD7 */
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -1136,18 +1136,19 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+ struct x86_emulate_ops *ops,
+ void *dest, int len)
{
struct decode_cache *c = &ctxt->decode;
int rc;
rc = ops->read_emulated(register_address(c, ss_base(ctxt),
c->regs[VCPU_REGS_RSP]),
- &c->src.val, c->src.bytes, ctxt->vcpu);
+ dest, len, ctxt->vcpu);
if (rc != 0)
return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
return rc;
}
@@ -1157,11 +1158,9 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- c->src.bytes = c->dst.bytes;
- rc = emulate_pop(ctxt, ops);
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
if (rc != 0)
return rc;
- c->dst.val = c->src.val;
return 0;
}
@@ -1279,6 +1278,25 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
return 0;
}
+static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ unsigned long cs;
+
+ rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
+ if (rc)
+ return rc;
+ if (c->op_bytes == 4)
+ c->eip = (u32)c->eip;
+ rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
+ if (rc)
+ return rc;
+ rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS);
+ return rc;
+}
+
static inline int writeback(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
@@ -1467,11 +1485,9 @@ special_insn:
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
- c->src.bytes = c->op_bytes;
- rc = emulate_pop(ctxt, ops);
+ rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
if (rc != 0)
goto done;
- c->dst.val = c->src.val;
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1738,6 +1754,11 @@ special_insn:
mov:
c->dst.val = c->src.val;
break;
+ case 0xcb: /* ret far */
+ rc = emulate_ret_far(ctxt, ops);
+ if (rc)
+ goto done;
+ break;
case 0xd0 ... 0xd1: /* Grp2 */
c->src.val = 1;
emulate_grp2(ctxt);
@@ -1908,11 +1929,16 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 3: /* lidt/vmmcall */
- if (c->modrm_mod == 3 && c->modrm_rm == 1) {
- rc = kvm_fix_hypercall(ctxt->vcpu);
- if (rc)
- goto done;
- kvm_emulate_hypercall(ctxt->vcpu);
+ if (c->modrm_mod == 3) {
+ switch (c->modrm_rm) {
+ case 1:
+ rc = kvm_fix_hypercall(ctxt->vcpu);
+ if (rc)
+ goto done;
+ break;
+ default:
+ goto cannot_emulate;
+ }
} else {
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address,
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index f1e82a92e61..5130fc55b8e 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -927,8 +927,7 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
unsigned long flags;
int ret = 0;
- service_mask = service_mask ? service_mask :
- __constant_cpu_to_be64(~0ULL);
+ service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
service_id &= service_mask;
if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
(service_id != IB_CM_ASSIGN_SERVICE_ID))
@@ -954,7 +953,7 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
- cm_id->service_mask = __constant_cpu_to_be64(~0ULL);
+ cm_id->service_mask = ~cpu_to_be64(0);
} else {
cm_id->service_id = service_id;
cm_id->service_mask = service_mask;
@@ -1134,7 +1133,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
goto error1;
}
cm_id->service_id = param->service_id;
- cm_id->service_mask = __constant_cpu_to_be64(~0ULL);
+ cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = cm_convert_to_ms(
param->primary_path->packet_life_time) * 2 +
cm_convert_to_ms(
@@ -1545,7 +1544,7 @@ static int cm_req_handler(struct cm_work *work)
cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
cm_id_priv->id.context = listen_cm_id_priv->id.context;
cm_id_priv->id.service_id = req_msg->service_id;
- cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL);
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
@@ -2898,7 +2897,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
goto out;
cm_id->service_id = param->service_id;
- cm_id->service_mask = __constant_cpu_to_be64(~0ULL);
+ cm_id->service_mask = ~cpu_to_be64(0);
cm_id_priv->timeout_ms = param->timeout_ms;
cm_id_priv->max_cm_retries = param->max_cm_retries;
ret = cm_alloc_msg(cm_id_priv, &msg);
@@ -2992,7 +2991,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
cm_id_priv->id.context = cur_cm_id_priv->id.context;
cm_id_priv->id.service_id = sidr_req_msg->service_id;
- cm_id_priv->id.service_mask = __constant_cpu_to_be64(~0ULL);
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
cm_process_work(cm_id_priv, work);
@@ -3789,7 +3788,7 @@ static int __init ib_cm_init(void)
rwlock_init(&cm.device_lock);
spin_lock_init(&cm.lock);
cm.listen_service_table = RB_ROOT;
- cm.listen_service_id = __constant_be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+ cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
cm.remote_id_table = RB_ROOT;
cm.remote_qp_table = RB_ROOT;
cm.remote_sidr_table = RB_ROOT;
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
index aec9c7af825..7e63c08f697 100644
--- a/drivers/infiniband/core/cm_msgs.h
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -44,17 +44,17 @@
#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
-#define CM_REQ_ATTR_ID __constant_htons(0x0010)
-#define CM_MRA_ATTR_ID __constant_htons(0x0011)
-#define CM_REJ_ATTR_ID __constant_htons(0x0012)
-#define CM_REP_ATTR_ID __constant_htons(0x0013)
-#define CM_RTU_ATTR_ID __constant_htons(0x0014)
-#define CM_DREQ_ATTR_ID __constant_htons(0x0015)
-#define CM_DREP_ATTR_ID __constant_htons(0x0016)
-#define CM_SIDR_REQ_ATTR_ID __constant_htons(0x0017)
-#define CM_SIDR_REP_ATTR_ID __constant_htons(0x0018)
-#define CM_LAP_ATTR_ID __constant_htons(0x0019)
-#define CM_APR_ATTR_ID __constant_htons(0x001A)
+#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
enum cm_msg_sequence {
CM_MSG_SEQUENCE_REQ,
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7913b804311..d1fba415333 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -193,7 +193,7 @@ void ib_dealloc_device(struct ib_device *device)
BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
- ib_device_unregister_sysfs(device);
+ kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
@@ -348,6 +348,8 @@ void ib_unregister_device(struct ib_device *device)
mutex_unlock(&device_mutex);
+ ib_device_unregister_sysfs(device);
+
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 5c54fc2350b..de922a04ca2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -301,6 +301,16 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
mad_agent_priv->agent.context = context;
mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
mad_agent_priv->agent.port_num = port_num;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
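+
+ /*
+ * All of the above must be initialized before the agent is added
+ * to port_priv->agent_list below, where it becomes visible to
+ * other contexts.
+ */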
spin_lock_irqsave(&port_priv->reg_lock, flags);
mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
@@ -350,17 +360,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
- spin_lock_init(&mad_agent_priv->lock);
- INIT_LIST_HEAD(&mad_agent_priv->send_list);
- INIT_LIST_HEAD(&mad_agent_priv->wait_list);
- INIT_LIST_HEAD(&mad_agent_priv->done_list);
- INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
- INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
- INIT_LIST_HEAD(&mad_agent_priv->local_list);
- INIT_WORK(&mad_agent_priv->local_work, local_completions);
- atomic_set(&mad_agent_priv->refcount, 1);
- init_completion(&mad_agent_priv->comp);
-
return &mad_agent_priv->agent;
error4:
@@ -743,9 +742,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
break;
case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
kmem_cache_free(ib_mad_cache, mad_priv);
- kfree(local);
- ret = 1;
- goto out;
+ break;
case IB_MAD_RESULT_SUCCESS:
/* Treat like an incoming receive MAD */
port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
@@ -756,10 +753,12 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
&mad_priv->mad.mad);
}
if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent, so drop the packet and
+ * generate a send completion.
+ */
kmem_cache_free(ib_mad_cache, mad_priv);
- kfree(local);
- ret = 0;
- goto out;
+ break;
}
local->mad_priv = mad_priv;
local->recv_mad_agent = recv_mad_agent;
@@ -2356,7 +2355,7 @@ static void local_completions(struct work_struct *work)
struct ib_mad_local_private *local;
struct ib_mad_agent_private *recv_mad_agent;
unsigned long flags;
- int recv = 0;
+ int free_mad;
struct ib_wc wc;
struct ib_mad_send_wc mad_send_wc;
@@ -2370,14 +2369,15 @@ static void local_completions(struct work_struct *work)
completion_list);
list_del(&local->completion_list);
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ free_mad = 0;
if (local->mad_priv) {
recv_mad_agent = local->recv_mad_agent;
if (!recv_mad_agent) {
printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+ free_mad = 1;
goto local_send_completion;
}
- recv = 1;
/*
* Defined behavior is to complete response
* before request
@@ -2422,7 +2422,7 @@ local_send_completion:
spin_lock_irqsave(&mad_agent_priv->lock, flags);
atomic_dec(&mad_agent_priv->refcount);
- if (!recv)
+ if (free_mad)
kmem_cache_free(ib_mad_cache, local->mad_priv);
kfree(local);
}
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
index 3af2b84cd83..57a3c6f947b 100644
--- a/drivers/infiniband/core/mad_rmpp.c
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -735,7 +735,7 @@ process_rmpp_data(struct ib_mad_agent_private *agent,
goto bad;
}
- if (rmpp_hdr->seg_num == __constant_htonl(1)) {
+ if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
goto bad;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 7863a50d56f..1865049e80f 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -395,6 +395,8 @@ static void update_sm_ah(struct work_struct *work)
}
spin_lock_irq(&port->ah_lock);
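+ /* Release the reference on the old SM address handle, if any, before installing the new one. */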
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
port->sm_ah = new_ah;
spin_unlock_irq(&port->ah_lock);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index b43f7d3682d..5c04cfb54cb 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -66,11 +66,6 @@ struct port_table_attribute {
int index;
};
-static inline int ibdev_is_alive(const struct ib_device *dev)
-{
- return dev->reg_state == IB_DEV_REGISTERED;
-}
-
static ssize_t port_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
@@ -80,8 +75,6 @@ static ssize_t port_attr_show(struct kobject *kobj,
if (!port_attr->show)
return -EIO;
- if (!ibdev_is_alive(p->ibdev))
- return -ENODEV;
return port_attr->show(p, port_attr, buf);
}
@@ -562,9 +555,6 @@ static ssize_t show_node_type(struct device *device,
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
- if (!ibdev_is_alive(dev))
- return -ENODEV;
-
switch (dev->node_type) {
case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
@@ -581,9 +571,6 @@ static ssize_t show_sys_image_guid(struct device *device,
struct ib_device_attr attr;
ssize_t ret;
- if (!ibdev_is_alive(dev))
- return -ENODEV;
-
ret = ib_query_device(dev, &attr);
if (ret)
return ret;
@@ -600,9 +587,6 @@ static ssize_t show_node_guid(struct device *device,
{
struct ib_device *dev = container_of(device, struct ib_device, dev);
- if (!ibdev_is_alive(dev))
- return -ENODEV;
-
return sprintf(buf, "%04x:%04x:%04x:%04x\n",
be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
@@ -848,6 +832,9 @@ void ib_device_unregister_sysfs(struct ib_device *device)
struct kobject *p, *t;
struct ib_port *port;
+ /* Hold kobject until ib_dealloc_device() */
+ kobject_get(&device->dev.kobj);
+
list_for_each_entry_safe(p, t, &device->port_list, entry) {
list_del(&p->entry);
port = container_of(p, struct ib_port, kobj);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 11efd3528ce..a4a82bff710 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -450,7 +450,7 @@ static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq)
if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
return 0;
- if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) &&
+ if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) &&
Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
return 0;
@@ -941,6 +941,23 @@ int cxio_rdev_open(struct cxio_rdev *rdev_p)
if (!rdev_p->t3cdev_p)
rdev_p->t3cdev_p = dev2t3cdev(netdev_p);
rdev_p->t3cdev_p->ulp = (void *) rdev_p;
+
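+ /*
+ * Query the firmware version up front and refuse to attach when
+ * the major version does not match what this driver expects.
+ */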
+ err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO,
+ &(rdev_p->fw_info));
+ if (err) {
+ printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+ __func__, rdev_p->t3cdev_p, err);
+ goto err1;
+ }
+ if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) {
+ printk(KERN_ERR MOD "fatal firmware version mismatch: "
+ "need version %u but adapter has version %u\n",
+ CXIO_FW_MAJ,
+ G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers));
+ err = -EINVAL;
+ goto err1;
+ }
+
err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
&(rdev_p->rnic_info));
if (err) {
@@ -1207,11 +1224,12 @@ int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
}
/* incoming SEND with no receive posted failures */
- if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) &&
+ if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) &&
Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
ret = -1;
goto skip_cqe;
}
+ BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe));
goto proc_cqe;
}
@@ -1226,6 +1244,13 @@ int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
* then we complete this with TPT_ERR_MSN and mark the wq in
* error.
*/
+
+ if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+ wq->error = 1;
+ ret = -1;
+ goto skip_cqe;
+ }
+
if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
wq->error = 1;
hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
@@ -1280,6 +1305,7 @@ proc_cqe:
cxio_hal_pblpool_free(wq->rdev,
wq->rq[Q_PTR2IDX(wq->rq_rptr,
wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
+ BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr));
wq->rq_rptr++;
}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index 9ed65b05517..094a66d1480 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -61,6 +61,8 @@
#define T3_MAX_DEV_NAME_LEN 32
+#define CXIO_FW_MAJ 7
+
struct cxio_hal_ctrl_qp {
u32 wptr;
u32 rptr;
@@ -108,6 +110,7 @@ struct cxio_rdev {
struct gen_pool *pbl_pool;
struct gen_pool *rqt_pool;
struct list_head entry;
+ struct ch_embedded_info fw_info;
u32 flags;
#define CXIO_ERROR_FATAL 1
};
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index 04618f7bfbb..ff9be1a1310 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -604,6 +604,12 @@ struct t3_cqe {
#define CQE_STATUS(x) (G_CQE_STATUS(be32_to_cpu((x).header)))
#define CQE_OPCODE(x) (G_CQE_OPCODE(be32_to_cpu((x).header)))
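+/* True for any of the four SEND opcode variants. */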
+#define CQE_SEND_OPCODE(x) ( \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV))
+
#define CQE_LEN(x) (be32_to_cpu((x).len))
/* used for RQ completion processing */
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 44e936e48a3..8699947aaf6 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -1678,6 +1678,9 @@ static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
{
struct iwch_ep *ep = ctx;
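+
+ /* Only meaningful while the connection is in FPDU mode; otherwise just consume the buffer. */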
+ if (state_read(&ep->com) != FPDU_MODE)
+ return CPL_RET_BUF_DONE;
+
PDBG("%s ep %p\n", __func__, ep);
skb_pull(skb, sizeof(struct cpl_rdma_terminate));
PDBG("%s saving %d bytes of term msg\n", __func__, skb->len);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
index 7b67a677172..743c5d8b880 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_ev.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c
@@ -179,11 +179,6 @@ void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
case TPT_ERR_BOUND:
case TPT_ERR_INVALIDATE_SHARED_MR:
case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
- printk(KERN_ERR "%s - CQE Err qpid 0x%x opcode %d status 0x%x "
- "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __func__,
- CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
- CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
- CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
break;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 19661b2f040..c758fbd5847 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -99,8 +99,8 @@ static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
plen = 4;
wqe->write.sgl[0].stag = wr->ex.imm_data;
- wqe->write.sgl[0].len = __constant_cpu_to_be32(0);
- wqe->write.num_sgle = __constant_cpu_to_be32(0);
+ wqe->write.sgl[0].len = cpu_to_be32(0);
+ wqe->write.num_sgle = cpu_to_be32(0);
*flit_cnt = 6;
} else {
plen = 0;
@@ -195,15 +195,12 @@ static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr,
return 0;
}
-/*
- * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now.
- */
static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
u32 num_sgle, u32 * pbl_addr, u8 * page_size)
{
int i;
struct iwch_mr *mhp;
- u32 offset;
+ u64 offset;
for (i = 0; i < num_sgle; i++) {
mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
@@ -235,8 +232,8 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
return -EINVAL;
}
offset = sg_list[i].addr - mhp->attr.va_fbo;
- offset += ((u32) mhp->attr.va_fbo) %
- (1UL << (12 + mhp->attr.page_size));
+ offset += mhp->attr.va_fbo &
+ ((1UL << (12 + mhp->attr.page_size)) - 1);
pbl_addr[i] = ((mhp->attr.pbl_addr -
rhp->rdev.rnic_info.pbl_base) >> 3) +
(offset >> (12 + mhp->attr.page_size));
@@ -266,8 +263,8 @@ static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
/* to in the WQE == the offset into the page */
- wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) %
- (1UL << (12 + page_size[i])));
+ wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) &
+ ((1UL << (12 + page_size[i])) - 1));
/* pbl_addr is the adapters address in the PBL */
wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
index 44447aaa550..c568b28f4e2 100644
--- a/drivers/infiniband/hw/ehca/ehca_sqp.c
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -46,11 +46,11 @@
#include "ehca_iverbs.h"
#include "hcp_if.h"
-#define IB_MAD_STATUS_REDIRECT __constant_htons(0x0002)
-#define IB_MAD_STATUS_UNSUP_VERSION __constant_htons(0x0004)
-#define IB_MAD_STATUS_UNSUP_METHOD __constant_htons(0x0008)
+#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008)
-#define IB_PMA_CLASS_PORT_INFO __constant_htons(0x0001)
+#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
/**
* ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
index dc37277f1c8..fc7181985e8 100644
--- a/drivers/infiniband/hw/ipath/ipath_eeprom.c
+++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c
@@ -772,8 +772,8 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
"0x%x, not 0x%x\n", csum, ifp->if_csum);
goto done;
}
- if (*(__be64 *) ifp->if_guid == 0ULL ||
- *(__be64 *) ifp->if_guid == __constant_cpu_to_be64(-1LL)) {
+ if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+ *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
ipath_dev_err(dd, "Invalid GUID %llx from flash; "
"ignoring\n",
*(unsigned long long *) ifp->if_guid);
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
index 64aeefbd2a5..077879c0bdb 100644
--- a/drivers/infiniband/hw/ipath/ipath_init_chip.c
+++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c
@@ -455,7 +455,7 @@ static void init_shadow_tids(struct ipath_devdata *dd)
if (!addrs) {
ipath_dev_err(dd, "failed to allocate shadow dma handle "
"array, no expected sends!\n");
- vfree(dd->ipath_pageshadow);
+ vfree(pages);
dd->ipath_pageshadow = NULL;
return;
}
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index 17a12319747..16a702d4601 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -37,10 +37,10 @@
#include "ipath_verbs.h"
#include "ipath_common.h"
-#define IB_SMP_UNSUP_VERSION __constant_htons(0x0004)
-#define IB_SMP_UNSUP_METHOD __constant_htons(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR __constant_htons(0x000C)
-#define IB_SMP_INVALID_FIELD __constant_htons(0x001C)
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
static int reply(struct ib_smp *smp)
{
@@ -789,12 +789,12 @@ static int recv_subn_set_pkeytable(struct ib_smp *smp,
return recv_subn_get_pkeytable(smp, ibdev);
}
-#define IB_PMA_CLASS_PORT_INFO __constant_htons(0x0001)
-#define IB_PMA_PORT_SAMPLES_CONTROL __constant_htons(0x0010)
-#define IB_PMA_PORT_SAMPLES_RESULT __constant_htons(0x0011)
-#define IB_PMA_PORT_COUNTERS __constant_htons(0x0012)
-#define IB_PMA_PORT_COUNTERS_EXT __constant_htons(0x001D)
-#define IB_PMA_PORT_SAMPLES_RESULT_EXT __constant_htons(0x001E)
+#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
+#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010)
+#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011)
+#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012)
+#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D)
+#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E)
struct ib_perf {
u8 base_version;
@@ -884,19 +884,19 @@ struct ib_pma_portcounters {
__be32 port_rcv_packets;
} __attribute__ ((packed));
-#define IB_PMA_SEL_SYMBOL_ERROR __constant_htons(0x0001)
-#define IB_PMA_SEL_LINK_ERROR_RECOVERY __constant_htons(0x0002)
-#define IB_PMA_SEL_LINK_DOWNED __constant_htons(0x0004)
-#define IB_PMA_SEL_PORT_RCV_ERRORS __constant_htons(0x0008)
-#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS __constant_htons(0x0010)
-#define IB_PMA_SEL_PORT_XMIT_DISCARDS __constant_htons(0x0040)
-#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS __constant_htons(0x0200)
-#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS __constant_htons(0x0400)
-#define IB_PMA_SEL_PORT_VL15_DROPPED __constant_htons(0x0800)
-#define IB_PMA_SEL_PORT_XMIT_DATA __constant_htons(0x1000)
-#define IB_PMA_SEL_PORT_RCV_DATA __constant_htons(0x2000)
-#define IB_PMA_SEL_PORT_XMIT_PACKETS __constant_htons(0x4000)
-#define IB_PMA_SEL_PORT_RCV_PACKETS __constant_htons(0x8000)
+#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001)
+#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002)
+#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004)
+#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008)
+#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010)
+#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040)
+#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200)
+#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400)
+#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800)
+#define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000)
+#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000)
+#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000)
+#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000)
struct ib_pma_portcounters_ext {
u8 reserved;
@@ -913,14 +913,14 @@ struct ib_pma_portcounters_ext {
__be64 port_multicast_rcv_packets;
} __attribute__ ((packed));
-#define IB_PMA_SELX_PORT_XMIT_DATA __constant_htons(0x0001)
-#define IB_PMA_SELX_PORT_RCV_DATA __constant_htons(0x0002)
-#define IB_PMA_SELX_PORT_XMIT_PACKETS __constant_htons(0x0004)
-#define IB_PMA_SELX_PORT_RCV_PACKETS __constant_htons(0x0008)
-#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS __constant_htons(0x0010)
-#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS __constant_htons(0x0020)
-#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS __constant_htons(0x0040)
-#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS __constant_htons(0x0080)
+#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004)
+#define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008)
+#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010)
+#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020)
+#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040)
+#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080)
static int recv_pma_get_classportinfo(struct ib_perf *pmp)
{
@@ -933,7 +933,7 @@ static int recv_pma_get_classportinfo(struct ib_perf *pmp)
pmp->status |= IB_SMP_INVALID_FIELD;
/* Indicate AllPortSelect is valid (only one port anyway) */
- p->cap_mask = __constant_cpu_to_be16(1 << 8);
+ p->cap_mask = cpu_to_be16(1 << 8);
p->base_version = 1;
p->class_version = 1;
/*
@@ -951,12 +951,11 @@ static int recv_pma_get_classportinfo(struct ib_perf *pmp)
* We support 5 counters which only count the mandatory quantities.
*/
#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 \
- __constant_cpu_to_be32(COUNTER_MASK(1, 0) | \
- COUNTER_MASK(1, 1) | \
- COUNTER_MASK(1, 2) | \
- COUNTER_MASK(1, 3) | \
- COUNTER_MASK(1, 4))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp,
struct ib_device *ibdev, u8 port)
@@ -1137,7 +1136,7 @@ static int recv_pma_get_portsamplesresult_ext(struct ib_perf *pmp,
status = dev->pma_sample_status;
p->sample_status = cpu_to_be16(status);
/* 64 bits */
- p->extended_width = __constant_cpu_to_be32(0x80000000);
+ p->extended_width = cpu_to_be32(0x80000000);
for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
cpu_to_be64(
@@ -1185,7 +1184,7 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp,
pmp->status |= IB_SMP_INVALID_FIELD;
if (cntrs.symbol_error_counter > 0xFFFFUL)
- p->symbol_error_counter = __constant_cpu_to_be16(0xFFFF);
+ p->symbol_error_counter = cpu_to_be16(0xFFFF);
else
p->symbol_error_counter =
cpu_to_be16((u16)cntrs.symbol_error_counter);
@@ -1199,17 +1198,17 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp,
else
p->link_downed_counter = (u8)cntrs.link_downed_counter;
if (cntrs.port_rcv_errors > 0xFFFFUL)
- p->port_rcv_errors = __constant_cpu_to_be16(0xFFFF);
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
else
p->port_rcv_errors =
cpu_to_be16((u16) cntrs.port_rcv_errors);
if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
- p->port_rcv_remphys_errors = __constant_cpu_to_be16(0xFFFF);
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
else
p->port_rcv_remphys_errors =
cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
if (cntrs.port_xmit_discards > 0xFFFFUL)
- p->port_xmit_discards = __constant_cpu_to_be16(0xFFFF);
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
else
p->port_xmit_discards =
cpu_to_be16((u16)cntrs.port_xmit_discards);
@@ -1220,24 +1219,24 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp,
p->lli_ebor_errors = (cntrs.local_link_integrity_errors << 4) |
cntrs.excessive_buffer_overrun_errors;
if (cntrs.vl15_dropped > 0xFFFFUL)
- p->vl15_dropped = __constant_cpu_to_be16(0xFFFF);
+ p->vl15_dropped = cpu_to_be16(0xFFFF);
else
p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
- p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF);
+ p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
else
p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
- p->port_rcv_data = __constant_cpu_to_be32(0xFFFFFFFF);
+ p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
else
p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
- p->port_xmit_packets = __constant_cpu_to_be32(0xFFFFFFFF);
+ p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
else
p->port_xmit_packets =
cpu_to_be32((u32)cntrs.port_xmit_packets);
if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
- p->port_rcv_packets = __constant_cpu_to_be32(0xFFFFFFFF);
+ p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
else
p->port_rcv_packets =
cpu_to_be32((u32) cntrs.port_rcv_packets);
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index 9170710b950..79b3dbc9717 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -1744,7 +1744,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
/* Signal completion event if the solicited bit is set. */
ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
(ohdr->bth[0] &
- __constant_cpu_to_be32(1 << 23)) != 0);
+ cpu_to_be32(1 << 23)) != 0);
break;
case OP(RDMA_WRITE_FIRST):
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
index 8e255adf5d9..4b069859085 100644
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -781,10 +781,10 @@ retry:
descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
descqp -= 2;
/* SDmaLastDesc */
- descqp[0] |= __constant_cpu_to_le64(1ULL << 11);
+ descqp[0] |= cpu_to_le64(1ULL << 11);
if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
/* SDmaIntReq */
- descqp[0] |= __constant_cpu_to_le64(1ULL << 15);
+ descqp[0] |= cpu_to_le64(1ULL << 15);
}
/* Commit writes to memory and advance the tail on the chip */
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 82cc588b8bf..22e60998f1a 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -419,7 +419,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
/* Signal completion event if the solicited bit is set. */
ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
(ohdr->bth[0] &
- __constant_cpu_to_be32(1 << 23)) != 0);
+ cpu_to_be32(1 << 23)) != 0);
break;
case OP(RDMA_WRITE_FIRST):
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 91c74cc797a..6076cb61bf6 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -370,7 +370,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
*/
ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
ah_attr->dlid != IPATH_PERMISSIVE_LID ?
- __constant_cpu_to_be32(IPATH_MULTICAST_QPN) :
+ cpu_to_be32(IPATH_MULTICAST_QPN) :
cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
/*
@@ -573,7 +573,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
/* Signal completion event if the solicited bit is set. */
ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
(ohdr->bth[0] &
- __constant_cpu_to_be32(1 << 23)) != 0);
+ cpu_to_be32(1 << 23)) != 0);
bail:;
}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
index 0190edc8044..855911e7396 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -209,20 +209,20 @@ void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
mm = get_task_mm(current);
if (!mm)
- goto bail;
+ return;
work = kmalloc(sizeof(*work), GFP_KERNEL);
if (!work)
goto bail_mm;
- goto bail;
-
INIT_WORK(&work->work, user_pages_account);
work->mm = mm;
work->num_pages = num_pages;
+ schedule_work(&work->work);
+ return;
+
bail_mm:
mmput(mm);
-bail:
return;
}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index 82d9a0b5ca2..7bff4b9baa0 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -667,13 +667,13 @@ static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
{
- return descq | __constant_cpu_to_le64(1ULL << 12);
+ return descq | cpu_to_le64(1ULL << 12);
}
static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
{
/* last */ /* dma head */
- return descq | __constant_cpu_to_le64(1ULL << 11 | 1ULL << 13);
+ return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
}
static inline __le64 ipath_sdma_make_desc1(u64 addr)
@@ -763,7 +763,7 @@ static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
if (ofs >= IPATH_SMALLBUF_DWORDS) {
for (i = 0; i < pkt->naddr; i++) {
dd->ipath_sdma_descq[dtail].qw[0] |=
- __constant_cpu_to_le64(1ULL << 14);
+ cpu_to_le64(1ULL << 14);
if (++dtail == dd->ipath_sdma_descq_cnt)
dtail = 0;
}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index cdf0e6abd34..9289ab4b0ae 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1585,7 +1585,7 @@ static int ipath_query_port(struct ib_device *ibdev,
u64 ibcstat;
memset(props, 0, sizeof(*props));
- props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+ props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
props->lmc = dd->ipath_lmc;
props->sm_lid = dev->sm_lid;
props->sm_sl = dev->sm_sl;
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 11e3f613df9..ae6cff4abff 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -86,11 +86,11 @@
#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA __constant_htons(0x0001)
-#define IB_PMA_PORT_RCV_DATA __constant_htons(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS __constant_htons(0x0003)
-#define IB_PMA_PORT_RCV_PKTS __constant_htons(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT __constant_htons(0x0005)
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
struct ib_reth {
__be64 vaddr;
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 606f1e2ef28..19e68ab6616 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -147,7 +147,8 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
* Snoop SM MADs for port info and P_Key table sets, so we can
* synthesize LID change and P_Key change events.
*/
-static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+ u16 prev_lid)
{
struct ib_event event;
@@ -157,6 +158,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
struct ib_port_info *pinfo =
(struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ u16 lid = be16_to_cpu(pinfo->lid);
update_sm_ah(to_mdev(ibdev), port_num,
be16_to_cpu(pinfo->sm_lid),
@@ -165,12 +167,15 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad)
event.device = ibdev;
event.element.port_num = port_num;
- if (pinfo->clientrereg_resv_subnetto & 0x80)
+ if (pinfo->clientrereg_resv_subnetto & 0x80) {
event.event = IB_EVENT_CLIENT_REREGISTER;
- else
- event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
- ib_dispatch_event(&event);
+ if (prev_lid != lid) {
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
}
if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
@@ -228,8 +233,9 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
struct ib_wc *in_wc, struct ib_grh *in_grh,
struct ib_mad *in_mad, struct ib_mad *out_mad)
{
- u16 slid;
+ u16 slid, prev_lid = 0;
int err;
+ struct ib_port_attr pattr;
slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
@@ -263,6 +269,13 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
} else
return IB_MAD_RESULT_SUCCESS;
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
+
err = mlx4_MAD_IFC(to_mdev(ibdev),
mad_flags & IB_MAD_IGNORE_MKEY,
mad_flags & IB_MAD_IGNORE_BKEY,
@@ -271,7 +284,7 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
return IB_MAD_RESULT_FAILURE;
if (!out_mad->mad_hdr.status) {
- smp_snoop(ibdev, port_num, in_mad);
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
node_desc_override(ibdev, out_mad);
}
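
The mlx4 change above stops synthesizing a LID_CHANGE event on every PortInfo set: the caller samples the port LID before the MAD executes, and smp_snoop() raises IB_EVENT_LID_CHANGE only when the value actually moved; client-reregister is dispatched independently rather than as the "else" branch. A small userspace sketch of the new dispatch logic (names simplified):

    #include <stdint.h>
    #include <stdio.h>

    enum event { EV_CLIENT_REREG, EV_LID_CHANGE };

    static void dispatch(enum event e)
    {
            puts(e == EV_CLIENT_REREG ? "client rereg" : "lid change");
    }

    /* Mirrors the reworked smp_snoop() tail: the two events are
     * independent, and LID_CHANGE fires only on a real change. */
    static void snoop_port_info(uint16_t prev_lid, uint16_t lid, int rereg_bit)
    {
            if (rereg_bit)
                    dispatch(EV_CLIENT_REREG);
            if (prev_lid != lid)
                    dispatch(EV_LID_CHANGE);
    }

    int main(void)
    {
            snoop_port_info(5, 5, 1);  /* rereg only */
            snoop_port_info(5, 9, 0);  /* lid change only */
            return 0;
    }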
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 61588bd273b..2ccb9d31771 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -699,11 +699,12 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
struct mlx4_ib_dev *ibdev = ibdev_ptr;
int p;
+ mlx4_ib_mad_cleanup(ibdev);
+ ib_unregister_device(&ibdev->ib_dev);
+
for (p = 1; p <= ibdev->num_ports; ++p)
mlx4_CLOSE_PORT(dev, p);
- mlx4_ib_mad_cleanup(ibdev);
- ib_unregister_device(&ibdev->ib_dev);
iounmap(ibdev->uar_map);
mlx4_uar_free(dev, &ibdev->priv_uar);
mlx4_pd_free(dev, ibdev->priv_pdn);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index a91cb4c3fa5..f385a24d31d 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -71,17 +71,17 @@ enum {
};
static const __be32 mlx4_ib_opcode[] = {
- [IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
- [IB_WR_LSO] = __constant_cpu_to_be32(MLX4_OPCODE_LSO),
- [IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
- [IB_WR_RDMA_WRITE] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
- [IB_WR_RDMA_WRITE_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
- [IB_WR_RDMA_READ] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
- [IB_WR_ATOMIC_CMP_AND_SWP] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
- [IB_WR_ATOMIC_FETCH_AND_ADD] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
- [IB_WR_SEND_WITH_INV] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
- [IB_WR_LOCAL_INV] = __constant_cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
- [IB_WR_FAST_REG_MR] = __constant_cpu_to_be32(MLX4_OPCODE_FMR),
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
};
static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
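
The mlx4_ib_opcode[] table above is built with C99 designated initializers keyed by the IB_WR_* enum; with cpu_to_be32() now usable in constant context, the table stays a static const object. A hedged analogue of the pattern (opcode values are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    enum wr { WR_SEND, WR_RDMA_WRITE, WR_RDMA_READ, WR_MAX };

    /* Designated initializers: unlisted entries default to 0 and
     * source order does not matter, so the table tracks the enum. */
    static const uint32_t opcode[WR_MAX] = {
            [WR_SEND]       = 0x0a,
            [WR_RDMA_WRITE] = 0x08,
            [WR_RDMA_READ]  = 0x10,
    };

    int main(void)
    {
            printf("send -> 0x%02x\n", opcode[WR_SEND]);
            return 0;
    }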
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 640449582ab..5648659ff0b 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -104,7 +104,8 @@ static void update_sm_ah(struct mthca_dev *dev,
*/
static void smp_snoop(struct ib_device *ibdev,
u8 port_num,
- struct ib_mad *mad)
+ struct ib_mad *mad,
+ u16 prev_lid)
{
struct ib_event event;
@@ -114,6 +115,7 @@ static void smp_snoop(struct ib_device *ibdev,
if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
struct ib_port_info *pinfo =
(struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ u16 lid = be16_to_cpu(pinfo->lid);
mthca_update_rate(to_mdev(ibdev), port_num);
update_sm_ah(to_mdev(ibdev), port_num,
@@ -123,12 +125,15 @@ static void smp_snoop(struct ib_device *ibdev,
event.device = ibdev;
event.element.port_num = port_num;
- if (pinfo->clientrereg_resv_subnetto & 0x80)
+ if (pinfo->clientrereg_resv_subnetto & 0x80) {
event.event = IB_EVENT_CLIENT_REREGISTER;
- else
- event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
- ib_dispatch_event(&event);
+ if (prev_lid != lid) {
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
}
if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
@@ -196,6 +201,8 @@ int mthca_process_mad(struct ib_device *ibdev,
int err;
u8 status;
u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+ u16 prev_lid = 0;
+ struct ib_port_attr pattr;
/* Forward locally generated traps to the SM */
if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
@@ -233,6 +240,12 @@ int mthca_process_mad(struct ib_device *ibdev,
return IB_MAD_RESULT_SUCCESS;
} else
return IB_MAD_RESULT_SUCCESS;
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
err = mthca_MAD_IFC(to_mdev(ibdev),
mad_flags & IB_MAD_IGNORE_MKEY,
@@ -252,7 +265,7 @@ int mthca_process_mad(struct ib_device *ibdev,
}
if (!out_mad->mad_hdr.status) {
- smp_snoop(ibdev, port_num, in_mad);
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
node_desc_override(ibdev, out_mad);
}
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index b9611ade9ea..ca599767ffb 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index 13a5bb1a7bc..04b12ad2339 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 4a65b96db2c..52425154acd 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -103,6 +103,7 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt);
static void nes_disconnect_worker(struct work_struct *work);
static int send_mpa_request(struct nes_cm_node *, struct sk_buff *);
+static int send_mpa_reject(struct nes_cm_node *);
static int send_syn(struct nes_cm_node *, u32, struct sk_buff *);
static int send_reset(struct nes_cm_node *, struct sk_buff *);
static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb);
@@ -113,8 +114,7 @@ static void process_packet(struct nes_cm_node *, struct sk_buff *,
static void active_open_err(struct nes_cm_node *, struct sk_buff *, int);
static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int);
static void cleanup_retrans_entry(struct nes_cm_node *);
-static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *,
- enum nes_cm_event_type);
+static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *);
static void free_retrans_entry(struct nes_cm_node *cm_node);
static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph,
struct sk_buff *skb, int optionsize, int passive);
@@ -124,6 +124,8 @@ static void cm_event_connected(struct nes_cm_event *);
static void cm_event_connect_error(struct nes_cm_event *);
static void cm_event_reset(struct nes_cm_event *);
static void cm_event_mpa_req(struct nes_cm_event *);
+static void cm_event_mpa_reject(struct nes_cm_event *);
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node);
static void print_core(struct nes_cm_core *core);
@@ -196,7 +198,6 @@ static struct nes_cm_event *create_event(struct nes_cm_node *cm_node,
*/
static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
{
- int ret;
if (!skb) {
nes_debug(NES_DBG_CM, "skb set to NULL\n");
return -1;
@@ -206,11 +207,27 @@ static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame,
cm_node->mpa_frame_size, SET_ACK);
- ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
- if (ret < 0)
- return ret;
+ return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+}
- return 0;
+
+
+static int send_mpa_reject(struct nes_cm_node *cm_node)
+{
+ struct sk_buff *skb = NULL;
+
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -ENOMEM;
+ }
+
+ /* send an MPA reject frame */
+ form_cm_frame(skb, cm_node, NULL, 0, &cm_node->mpa_frame,
+ cm_node->mpa_frame_size, SET_ACK | SET_FIN);
+
+ cm_node->state = NES_CM_STATE_FIN_WAIT1;
+ return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
}
@@ -218,14 +235,17 @@ static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
* parse_mpa - process a received TCP pkt; we are expecting an
* IETF MPA frame
*/
-static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 len)
+static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,
+ u32 len)
{
struct ietf_mpa_frame *mpa_frame;
+ *type = NES_MPA_REQUEST_ACCEPT;
+
/* assume req frame is in tcp data payload */
if (len < sizeof(struct ietf_mpa_frame)) {
nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
- return -1;
+ return -EINVAL;
}
mpa_frame = (struct ietf_mpa_frame *)buffer;
@@ -234,14 +254,25 @@ static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 len)
if (cm_node->mpa_frame_size + sizeof(struct ietf_mpa_frame) != len) {
nes_debug(NES_DBG_CM, "The received ietf buffer was not right"
" complete (%x + %x != %x)\n",
- cm_node->mpa_frame_size, (u32)sizeof(struct ietf_mpa_frame), len);
- return -1;
+ cm_node->mpa_frame_size,
+ (u32)sizeof(struct ietf_mpa_frame), len);
+ return -EINVAL;
+ }
+ /* make sure it does not exceed the max size */
+ if (len > MAX_CM_BUFFER) {
+ nes_debug(NES_DBG_CM, "The received ietf buffer was too large"
+ " (%x + %x != %x)\n",
+ cm_node->mpa_frame_size,
+ (u32)sizeof(struct ietf_mpa_frame), len);
+ return -EINVAL;
}
/* copy entire MPA frame to our cm_node's frame */
memcpy(cm_node->mpa_frame_buf, buffer + sizeof(struct ietf_mpa_frame),
cm_node->mpa_frame_size);
+ if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
+ *type = NES_MPA_REQUEST_REJECT;
return 0;
}
@@ -380,7 +411,7 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
if (!new_send)
- return -1;
+ return -ENOMEM;
/* new_send->timetosend = currenttime */
new_send->retrycount = NES_DEFAULT_RETRYS;
@@ -394,9 +425,11 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
if (type == NES_TIMER_TYPE_CLOSE) {
new_send->timetosend += (HZ/10);
- spin_lock_irqsave(&cm_node->recv_list_lock, flags);
- list_add_tail(&new_send->list, &cm_node->recv_list);
- spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
+ if (cm_node->recv_entry) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ cm_node->recv_entry = new_send;
}
if (type == NES_TIMER_TYPE_SEND) {
@@ -435,24 +468,78 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
return ret;
}
+static void nes_retrans_expired(struct nes_cm_node *cm_node)
+{
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_RCVD:
+ case NES_CM_STATE_CLOSING:
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ break;
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_FIN_WAIT1:
+ case NES_CM_STATE_MPAREJ_RCVD:
+ send_reset(cm_node, NULL);
+ break;
+ default:
+ create_event(cm_node, NES_CM_EVENT_ABORTED);
+ }
+}
+
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node)
+{
+ struct nes_timer_entry *recv_entry = cm_node->recv_entry;
+ struct iw_cm_id *cm_id = cm_node->cm_id;
+ struct nes_qp *nesqp;
+ unsigned long qplockflags;
+
+ if (!recv_entry)
+ return;
+ nesqp = (struct nes_qp *)recv_entry->skb;
+ if (nesqp) {
+ spin_lock_irqsave(&nesqp->lock, qplockflags);
+ if (nesqp->cm_id) {
+ nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+ "refcount = %d: HIT A "
+ "NES_TIMER_TYPE_CLOSE with something "
+ "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+ atomic_read(&nesqp->refcount));
+ nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+ nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+ nesqp->ibqp_state = IB_QPS_ERR;
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_cm_disconn(nesqp);
+ } else {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+ "refcount = %d: HIT A "
+ "NES_TIMER_TYPE_CLOSE with nothing "
+ "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+ atomic_read(&nesqp->refcount));
+ }
+ } else if (rem_node) {
+ /* TIME_WAIT state */
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ }
+ if (cm_node->cm_id)
+ cm_id->rem_ref(cm_id);
+ kfree(recv_entry);
+ cm_node->recv_entry = NULL;
+}
/**
* nes_cm_timer_tick
*/
static void nes_cm_timer_tick(unsigned long pass)
{
- unsigned long flags, qplockflags;
+ unsigned long flags;
unsigned long nexttimeout = jiffies + NES_LONG_TIME;
- struct iw_cm_id *cm_id;
struct nes_cm_node *cm_node;
struct nes_timer_entry *send_entry, *recv_entry;
- struct list_head *list_core, *list_core_temp;
- struct list_head *list_node, *list_node_temp;
+ struct list_head *list_core_temp;
+ struct list_head *list_node;
struct nes_cm_core *cm_core = g_cm_core;
- struct nes_qp *nesqp;
u32 settimer = 0;
int ret = NETDEV_TX_OK;
- enum nes_cm_node_state last_state;
struct list_head timer_list;
INIT_LIST_HEAD(&timer_list);
@@ -461,7 +548,7 @@ static void nes_cm_timer_tick(unsigned long pass)
list_for_each_safe(list_node, list_core_temp,
&cm_core->connected_nodes) {
cm_node = container_of(list_node, struct nes_cm_node, list);
- if (!list_empty(&cm_node->recv_list) || (cm_node->send_entry)) {
+ if ((cm_node->recv_entry) || (cm_node->send_entry)) {
add_ref_cm_node(cm_node);
list_add(&cm_node->timer_entry, &timer_list);
}
@@ -471,54 +558,18 @@ static void nes_cm_timer_tick(unsigned long pass)
list_for_each_safe(list_node, list_core_temp, &timer_list) {
cm_node = container_of(list_node, struct nes_cm_node,
timer_entry);
- spin_lock_irqsave(&cm_node->recv_list_lock, flags);
- list_for_each_safe(list_core, list_node_temp,
- &cm_node->recv_list) {
- recv_entry = container_of(list_core,
- struct nes_timer_entry, list);
- if (!recv_entry)
- break;
+ recv_entry = cm_node->recv_entry;
+
+ if (recv_entry) {
if (time_after(recv_entry->timetosend, jiffies)) {
if (nexttimeout > recv_entry->timetosend ||
- !settimer) {
+ !settimer) {
nexttimeout = recv_entry->timetosend;
settimer = 1;
}
- continue;
- }
- list_del(&recv_entry->list);
- cm_id = cm_node->cm_id;
- spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
- nesqp = (struct nes_qp *)recv_entry->skb;
- spin_lock_irqsave(&nesqp->lock, qplockflags);
- if (nesqp->cm_id) {
- nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
- "refcount = %d: HIT A "
- "NES_TIMER_TYPE_CLOSE with something "
- "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
- atomic_read(&nesqp->refcount));
- nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
- nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
- nesqp->ibqp_state = IB_QPS_ERR;
- spin_unlock_irqrestore(&nesqp->lock,
- qplockflags);
- nes_cm_disconn(nesqp);
- } else {
- spin_unlock_irqrestore(&nesqp->lock,
- qplockflags);
- nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
- "refcount = %d: HIT A "
- "NES_TIMER_TYPE_CLOSE with nothing "
- "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
- atomic_read(&nesqp->refcount));
- }
- if (cm_id)
- cm_id->rem_ref(cm_id);
-
- kfree(recv_entry);
- spin_lock_irqsave(&cm_node->recv_list_lock, flags);
+ } else
+ handle_recv_entry(cm_node, 1);
}
- spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
do {
@@ -533,12 +584,11 @@ static void nes_cm_timer_tick(unsigned long pass)
nexttimeout =
send_entry->timetosend;
settimer = 1;
- break;
}
} else {
free_retrans_entry(cm_node);
- break;
}
+ break;
}
if ((cm_node->state == NES_CM_STATE_TSA) ||
@@ -550,16 +600,12 @@ static void nes_cm_timer_tick(unsigned long pass)
if (!send_entry->retranscount ||
!send_entry->retrycount) {
cm_packets_dropped++;
- last_state = cm_node->state;
- cm_node->state = NES_CM_STATE_CLOSED;
free_retrans_entry(cm_node);
+
spin_unlock_irqrestore(
&cm_node->retrans_list_lock, flags);
- if (last_state == NES_CM_STATE_SYN_RCVD)
- rem_ref_cm_node(cm_core, cm_node);
- else
- create_event(cm_node,
- NES_CM_EVENT_ABORTED);
+ nes_retrans_expired(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
spin_lock_irqsave(&cm_node->retrans_list_lock,
flags);
break;
@@ -714,7 +760,7 @@ static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb)
skb = dev_alloc_skb(MAX_CM_BUFFER);
if (!skb) {
nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
- return -1;
+ return -ENOMEM;
}
form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
@@ -778,14 +824,10 @@ static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
unsigned long flags;
struct list_head *hte;
struct nes_cm_node *cm_node;
- __be32 tmp_addr = cpu_to_be32(loc_addr);
/* get a handle on the hte */
hte = &cm_core->connected_nodes;
- nes_debug(NES_DBG_CM, "Searching for an owner node: %pI4:%x from core %p->%p\n",
- &tmp_addr, loc_port, cm_core, hte);
-
/* walk list and find cm_node associated with this session ID */
spin_lock_irqsave(&cm_core->ht_lock, flags);
list_for_each_entry(cm_node, hte, list) {
@@ -875,7 +917,8 @@ static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node
static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
struct nes_cm_listener *listener, int free_hanging_nodes)
{
- int ret = 1;
+ int ret = -EINVAL;
+ int err = 0;
unsigned long flags;
struct list_head *list_pos = NULL;
struct list_head *list_temp = NULL;
@@ -904,10 +947,60 @@ static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
list_for_each_safe(list_pos, list_temp, &reset_list) {
cm_node = container_of(list_pos, struct nes_cm_node,
- reset_entry);
- cleanup_retrans_entry(cm_node);
- send_reset(cm_node, NULL);
- rem_ref_cm_node(cm_node->cm_core, cm_node);
+ reset_entry);
+ {
+ struct nes_cm_node *loopback = cm_node->loopbackpartner;
+ if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) {
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ } else {
+ if (!loopback) {
+ cleanup_retrans_entry(cm_node);
+ err = send_reset(cm_node, NULL);
+ if (err) {
+ cm_node->state =
+ NES_CM_STATE_CLOSED;
+ WARN_ON(1);
+ } else {
+ cm_node->state =
+ NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(
+ cm_node->cm_core,
+ cm_node);
+ }
+ } else {
+ struct nes_cm_event event;
+
+ event.cm_node = loopback;
+ event.cm_info.rem_addr =
+ loopback->rem_addr;
+ event.cm_info.loc_addr =
+ loopback->loc_addr;
+ event.cm_info.rem_port =
+ loopback->rem_port;
+ event.cm_info.loc_port =
+ loopback->loc_port;
+ event.cm_info.cm_id = loopback->cm_id;
+ cm_event_connect_error(&event);
+ loopback->state = NES_CM_STATE_CLOSED;
+
+ event.cm_node = cm_node;
+ event.cm_info.rem_addr =
+ cm_node->rem_addr;
+ event.cm_info.loc_addr =
+ cm_node->loc_addr;
+ event.cm_info.rem_port =
+ cm_node->rem_port;
+ event.cm_info.loc_port =
+ cm_node->loc_port;
+ event.cm_info.cm_id = cm_node->cm_id;
+ cm_event_reset(&event);
+
+ rem_ref_cm_node(cm_node->cm_core,
+ cm_node);
+
+ }
+ }
+ }
}
spin_lock_irqsave(&cm_core->listen_list_lock, flags);
@@ -968,6 +1061,7 @@ static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
if (cm_node->accept_pend) {
BUG_ON(!cm_node->listener);
atomic_dec(&cm_node->listener->pend_accepts_cnt);
+ cm_node->accept_pend = 0;
BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
}
@@ -994,7 +1088,7 @@ static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip)
memset(&fl, 0, sizeof fl);
fl.nl_u.ip4_u.daddr = htonl(dst_ip);
if (ip_route_output_key(&init_net, &rt, &fl)) {
- printk("%s: ip_route_output_key failed for 0x%08X\n",
+ printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n",
__func__, dst_ip);
return rc;
}
@@ -1057,8 +1151,6 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
cm_node->cm_id);
spin_lock_init(&cm_node->retrans_list_lock);
- INIT_LIST_HEAD(&cm_node->recv_list);
- spin_lock_init(&cm_node->recv_list_lock);
cm_node->loopbackpartner = NULL;
atomic_set(&cm_node->ref_count, 1);
@@ -1126,10 +1218,7 @@ static int add_ref_cm_node(struct nes_cm_node *cm_node)
static int rem_ref_cm_node(struct nes_cm_core *cm_core,
struct nes_cm_node *cm_node)
{
- unsigned long flags, qplockflags;
- struct nes_timer_entry *recv_entry;
- struct iw_cm_id *cm_id;
- struct list_head *list_core, *list_node_temp;
+ unsigned long flags;
struct nes_qp *nesqp;
if (!cm_node)
@@ -1150,38 +1239,9 @@ static int rem_ref_cm_node(struct nes_cm_core *cm_core,
atomic_dec(&cm_node->listener->pend_accepts_cnt);
BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
}
- BUG_ON(cm_node->send_entry);
- spin_lock_irqsave(&cm_node->recv_list_lock, flags);
- list_for_each_safe(list_core, list_node_temp, &cm_node->recv_list) {
- recv_entry = container_of(list_core, struct nes_timer_entry,
- list);
- list_del(&recv_entry->list);
- cm_id = cm_node->cm_id;
- spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
- nesqp = (struct nes_qp *)recv_entry->skb;
- spin_lock_irqsave(&nesqp->lock, qplockflags);
- if (nesqp->cm_id) {
- nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: HIT A "
- "NES_TIMER_TYPE_CLOSE with something to do!\n",
- nesqp->hwqp.qp_id, cm_id);
- nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
- nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
- nesqp->ibqp_state = IB_QPS_ERR;
- spin_unlock_irqrestore(&nesqp->lock, qplockflags);
- nes_cm_disconn(nesqp);
- } else {
- spin_unlock_irqrestore(&nesqp->lock, qplockflags);
- nes_debug(NES_DBG_CM, "QP%u: cm_id = %p: HIT A "
- "NES_TIMER_TYPE_CLOSE with nothing to do!\n",
- nesqp->hwqp.qp_id, cm_id);
- }
- cm_id->rem_ref(cm_id);
-
- kfree(recv_entry);
- spin_lock_irqsave(&cm_node->recv_list_lock, flags);
- }
- spin_unlock_irqrestore(&cm_node->recv_list_lock, flags);
-
+ WARN_ON(cm_node->send_entry);
+ if (cm_node->recv_entry)
+ handle_recv_entry(cm_node, 0);
if (cm_node->listener) {
mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
} else {
@@ -1266,8 +1326,7 @@ static void drop_packet(struct sk_buff *skb)
dev_kfree_skb_any(skb);
}
-static void handle_fin_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
- struct tcphdr *tcph)
+static void handle_fin_pkt(struct nes_cm_node *cm_node)
{
nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
"refcnt=%d\n", cm_node, cm_node->state,
@@ -1279,23 +1338,30 @@ static void handle_fin_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
case NES_CM_STATE_SYN_SENT:
case NES_CM_STATE_ESTABLISHED:
case NES_CM_STATE_MPAREQ_SENT:
+ case NES_CM_STATE_MPAREJ_RCVD:
cm_node->state = NES_CM_STATE_LAST_ACK;
- send_fin(cm_node, skb);
+ send_fin(cm_node, NULL);
break;
case NES_CM_STATE_FIN_WAIT1:
cm_node->state = NES_CM_STATE_CLOSING;
- send_ack(cm_node, skb);
+ send_ack(cm_node, NULL);
+ /* Wait for the ACK as this is a simultaneous close;
+ * after we receive the ACK, do not send anything,
+ * just remove the node. */
break;
case NES_CM_STATE_FIN_WAIT2:
cm_node->state = NES_CM_STATE_TIME_WAIT;
- send_ack(cm_node, skb);
+ send_ack(cm_node, NULL);
+ schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0);
+ break;
+ case NES_CM_STATE_TIME_WAIT:
cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
break;
case NES_CM_STATE_TSA:
default:
nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n",
cm_node, cm_node->state);
- drop_packet(skb);
break;
}
}
@@ -1341,23 +1407,35 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
cleanup_retrans_entry(cm_node);
drop_packet(skb);
break;
+ case NES_CM_STATE_TIME_WAIT:
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ drop_packet(skb);
+ break;
+ case NES_CM_STATE_FIN_WAIT1:
+ cleanup_retrans_entry(cm_node);
+ nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
default:
drop_packet(skb);
break;
}
}
-static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb,
- enum nes_cm_event_type type)
+
+static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
{
- int ret;
+ int ret = 0;
int datasize = skb->len;
u8 *dataloc = skb->data;
- ret = parse_mpa(cm_node, dataloc, datasize);
- if (ret < 0) {
+
+ enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN;
+ u32 res_type;
+ ret = parse_mpa(cm_node, dataloc, &res_type, datasize);
+ if (ret) {
nes_debug(NES_DBG_CM, "didn't like MPA Request\n");
- if (type == NES_CM_EVENT_CONNECTED) {
+ if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) {
nes_debug(NES_DBG_CM, "%s[%u] create abort for "
"cm_node=%p listener=%p state=%d\n", __func__,
__LINE__, cm_node, cm_node->listener,
@@ -1366,18 +1444,38 @@ static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb,
} else {
passive_open_err(cm_node, skb, 1);
}
- } else {
- cleanup_retrans_entry(cm_node);
- dev_kfree_skb_any(skb);
- if (type == NES_CM_EVENT_CONNECTED)
+ return;
+ }
+
+ switch (cm_node->state) {
+ case NES_CM_STATE_ESTABLISHED:
+ if (res_type == NES_MPA_REQUEST_REJECT) {
+ /* Big problem: we are receiving the MPA request on a
+ * passive open, so it must not be a reject; a reject
+ * can only be received on an active open. */
+ WARN_ON(1);
+ }
+ cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
+ type = NES_CM_EVENT_MPA_REQ;
+ atomic_set(&cm_node->passive_state,
+ NES_PASSIVE_STATE_INDICATED);
+ break;
+ case NES_CM_STATE_MPAREQ_SENT:
+ if (res_type == NES_MPA_REQUEST_REJECT) {
+ type = NES_CM_EVENT_MPA_REJECT;
+ cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
+ } else {
+ type = NES_CM_EVENT_CONNECTED;
cm_node->state = NES_CM_STATE_TSA;
- else
- atomic_set(&cm_node->passive_state,
- NES_PASSIVE_STATE_INDICATED);
- create_event(cm_node, type);
+ }
+ break;
+ default:
+ WARN_ON(1);
+ break;
}
- return ;
+ dev_kfree_skb_any(skb);
+ create_event(cm_node, type);
}
static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb)
@@ -1465,8 +1563,6 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
break;
case NES_CM_STATE_LISTENING:
/* Passive OPEN */
- cm_node->accept_pend = 1;
- atomic_inc(&cm_node->listener->pend_accepts_cnt);
if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
cm_node->listener->backlog) {
nes_debug(NES_DBG_CM, "drop syn due to backlog "
@@ -1484,6 +1580,9 @@ static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
}
cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
BUG_ON(cm_node->send_entry);
+ cm_node->accept_pend = 1;
+ atomic_inc(&cm_node->listener->pend_accepts_cnt);
+
cm_node->state = NES_CM_STATE_SYN_RCVD;
send_syn(cm_node, 1, skb);
break;
@@ -1518,6 +1617,7 @@ static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
inc_sequence = ntohl(tcph->seq);
switch (cm_node->state) {
case NES_CM_STATE_SYN_SENT:
+ cleanup_retrans_entry(cm_node);
/* active open */
if (check_syn(cm_node, tcph, skb))
return;
@@ -1567,10 +1667,7 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
u32 rem_seq;
int ret;
int optionsize;
- u32 temp_seq = cm_node->tcp_cntxt.loc_seq_num;
-
optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
- cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
if (check_seq(cm_node, tcph, skb))
return;
@@ -1580,7 +1677,7 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
rem_seq = ntohl(tcph->seq);
rem_seq_ack = ntohl(tcph->ack_seq);
datasize = skb->len;
-
+ cleanup_retrans_entry(cm_node);
switch (cm_node->state) {
case NES_CM_STATE_SYN_RCVD:
/* Passive OPEN */
@@ -1588,7 +1685,6 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
if (ret)
break;
cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
- cm_node->tcp_cntxt.loc_seq_num = temp_seq;
if (cm_node->tcp_cntxt.rem_ack_num !=
cm_node->tcp_cntxt.loc_seq_num) {
nes_debug(NES_DBG_CM, "rem_ack_num != loc_seq_num\n");
@@ -1597,31 +1693,30 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
return;
}
cm_node->state = NES_CM_STATE_ESTABLISHED;
+ cleanup_retrans_entry(cm_node);
if (datasize) {
cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
- cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
- handle_rcv_mpa(cm_node, skb, NES_CM_EVENT_MPA_REQ);
- } else { /* rcvd ACK only */
+ handle_rcv_mpa(cm_node, skb);
+ } else { /* rcvd ACK only */
dev_kfree_skb_any(skb);
cleanup_retrans_entry(cm_node);
}
break;
case NES_CM_STATE_ESTABLISHED:
/* Passive OPEN */
- /* We expect mpa frame to be received only */
+ cleanup_retrans_entry(cm_node);
if (datasize) {
cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
- cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
- handle_rcv_mpa(cm_node, skb,
- NES_CM_EVENT_MPA_REQ);
+ handle_rcv_mpa(cm_node, skb);
} else
drop_packet(skb);
break;
case NES_CM_STATE_MPAREQ_SENT:
+ cleanup_retrans_entry(cm_node);
cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
if (datasize) {
cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
- handle_rcv_mpa(cm_node, skb, NES_CM_EVENT_CONNECTED);
+ handle_rcv_mpa(cm_node, skb);
} else { /* Could be just an ack pkt.. */
cleanup_retrans_entry(cm_node);
dev_kfree_skb_any(skb);
@@ -1632,13 +1727,24 @@ static void handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
cleanup_retrans_entry(cm_node);
send_reset(cm_node, skb);
break;
+ case NES_CM_STATE_LAST_ACK:
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ cm_node->cm_id->rem_ref(cm_node->cm_id);
+ case NES_CM_STATE_CLOSING:
+ cleanup_retrans_entry(cm_node);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ drop_packet(skb);
+ break;
case NES_CM_STATE_FIN_WAIT1:
+ cleanup_retrans_entry(cm_node);
+ drop_packet(skb);
+ cm_node->state = NES_CM_STATE_FIN_WAIT2;
+ break;
case NES_CM_STATE_SYN_SENT:
case NES_CM_STATE_FIN_WAIT2:
case NES_CM_STATE_TSA:
case NES_CM_STATE_MPAREQ_RCVD:
- case NES_CM_STATE_LAST_ACK:
- case NES_CM_STATE_CLOSING:
case NES_CM_STATE_UNKNOWN:
default:
drop_packet(skb);
@@ -1748,6 +1854,7 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
{
enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN;
struct tcphdr *tcph = tcp_hdr(skb);
+ u32 fin_set = 0;
skb_pull(skb, ip_hdr(skb)->ihl << 2);
nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
@@ -1760,10 +1867,10 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
pkt_type = NES_PKT_TYPE_SYN;
if (tcph->ack)
pkt_type = NES_PKT_TYPE_SYNACK;
- } else if (tcph->fin)
- pkt_type = NES_PKT_TYPE_FIN;
- else if (tcph->ack)
+ } else if (tcph->ack)
pkt_type = NES_PKT_TYPE_ACK;
+ if (tcph->fin)
+ fin_set = 1;
switch (pkt_type) {
case NES_PKT_TYPE_SYN:
@@ -1774,15 +1881,16 @@ static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
break;
case NES_PKT_TYPE_ACK:
handle_ack_pkt(cm_node, skb, tcph);
+ if (fin_set)
+ handle_fin_pkt(cm_node);
break;
case NES_PKT_TYPE_RST:
handle_rst_pkt(cm_node, skb, tcph);
break;
- case NES_PKT_TYPE_FIN:
- handle_fin_pkt(cm_node, skb, tcph);
- break;
default:
drop_packet(skb);
+ if (fin_set)
+ handle_fin_pkt(cm_node);
break;
}
}
@@ -1925,7 +2033,7 @@ static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
loopbackremotenode->tcp_cntxt.rcv_wscale;
loopbackremotenode->tcp_cntxt.snd_wscale =
cm_node->tcp_cntxt.rcv_wscale;
-
+ loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD;
create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
}
return cm_node;
@@ -1980,7 +2088,11 @@ static int mini_cm_reject(struct nes_cm_core *cm_core,
struct ietf_mpa_frame *mpa_frame, struct nes_cm_node *cm_node)
{
int ret = 0;
+ int err = 0;
int passive_state;
+ struct nes_cm_event event;
+ struct iw_cm_id *cm_id = cm_node->cm_id;
+ struct nes_cm_node *loopback = cm_node->loopbackpartner;
nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n",
__func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state);
@@ -1989,12 +2101,38 @@ static int mini_cm_reject(struct nes_cm_core *cm_core,
return ret;
cleanup_retrans_entry(cm_node);
- passive_state = atomic_add_return(1, &cm_node->passive_state);
- cm_node->state = NES_CM_STATE_CLOSED;
- if (passive_state == NES_SEND_RESET_EVENT)
+ if (!loopback) {
+ passive_state = atomic_add_return(1, &cm_node->passive_state);
+ if (passive_state == NES_SEND_RESET_EVENT) {
+ cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_core, cm_node);
+ } else {
+ ret = send_mpa_reject(cm_node);
+ if (ret) {
+ cm_node->state = NES_CM_STATE_CLOSED;
+ err = send_reset(cm_node, NULL);
+ if (err)
+ WARN_ON(1);
+ } else
+ cm_id->add_ref(cm_id);
+ }
+ } else {
+ cm_node->cm_id = NULL;
+ event.cm_node = loopback;
+ event.cm_info.rem_addr = loopback->rem_addr;
+ event.cm_info.loc_addr = loopback->loc_addr;
+ event.cm_info.rem_port = loopback->rem_port;
+ event.cm_info.loc_port = loopback->loc_port;
+ event.cm_info.cm_id = loopback->cm_id;
+ cm_event_mpa_reject(&event);
rem_ref_cm_node(cm_core, cm_node);
- else
- ret = send_reset(cm_node, NULL);
+ loopback->state = NES_CM_STATE_CLOSING;
+
+ cm_id = loopback->cm_id;
+ rem_ref_cm_node(cm_core, loopback);
+ cm_id->rem_ref(cm_id);
+ }
+
return ret;
}
@@ -2031,6 +2169,7 @@ static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_nod
case NES_CM_STATE_CLOSING:
ret = -1;
break;
+ case NES_CM_STATE_MPAREJ_RCVD:
case NES_CM_STATE_LISTENING:
case NES_CM_STATE_UNKNOWN:
case NES_CM_STATE_INITED:
@@ -2227,15 +2366,15 @@ static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
int ret = 0;
switch (type) {
- case NES_CM_SET_PKT_SIZE:
- cm_core->mtu = value;
- break;
- case NES_CM_SET_FREE_PKT_Q_SIZE:
- cm_core->free_tx_pkt_max = value;
- break;
- default:
- /* unknown set option */
- ret = -EINVAL;
+ case NES_CM_SET_PKT_SIZE:
+ cm_core->mtu = value;
+ break;
+ case NES_CM_SET_FREE_PKT_Q_SIZE:
+ cm_core->free_tx_pkt_max = value;
+ break;
+ default:
+ /* unknown set option */
+ ret = -EINVAL;
}
return ret;
@@ -2654,9 +2793,7 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
NES_QPCONTEXT_ORDIRD_WRPDU);
} else {
nesqp->nesqp_context->ird_ord_sizes |=
- cpu_to_le32((NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
- NES_QPCONTEXT_ORDIRD_WRPDU |
- NES_QPCONTEXT_ORDIRD_ALSMM));
+ cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU);
}
nesqp->skip_lsmm = 1;
@@ -2778,23 +2915,35 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
{
struct nes_cm_node *cm_node;
+ struct nes_cm_node *loopback;
+
struct nes_cm_core *cm_core;
atomic_inc(&cm_rejects);
cm_node = (struct nes_cm_node *) cm_id->provider_data;
+ loopback = cm_node->loopbackpartner;
cm_core = cm_node->cm_core;
+ cm_node->cm_id = cm_id;
cm_node->mpa_frame_size = sizeof(struct ietf_mpa_frame) + pdata_len;
+ if (cm_node->mpa_frame_size > MAX_CM_BUFFER)
+ return -EINVAL;
+
strcpy(&cm_node->mpa_frame.key[0], IEFT_MPA_KEY_REP);
- memcpy(&cm_node->mpa_frame.priv_data, pdata, pdata_len);
+ if (loopback) {
+ memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len);
+ loopback->mpa_frame.priv_data_len = pdata_len;
+ loopback->mpa_frame_size = sizeof(struct ietf_mpa_frame) +
+ pdata_len;
+ } else {
+ memcpy(&cm_node->mpa_frame.priv_data, pdata, pdata_len);
+ cm_node->mpa_frame.priv_data_len = cpu_to_be16(pdata_len);
+ }
- cm_node->mpa_frame.priv_data_len = cpu_to_be16(pdata_len);
cm_node->mpa_frame.rev = mpa_version;
cm_node->mpa_frame.flags = IETF_MPA_FLAGS_CRC | IETF_MPA_FLAGS_REJECT;
- cm_core->api->reject(cm_core, &cm_node->mpa_frame, cm_node);
-
- return 0;
+ return cm_core->api->reject(cm_core, &cm_node->mpa_frame, cm_node);
}
@@ -3303,13 +3452,56 @@ static void cm_event_mpa_req(struct nes_cm_event *event)
cm_event.remote_addr.sin_family = AF_INET;
cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port);
cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+ cm_event.private_data = cm_node->mpa_frame_buf;
+ cm_event.private_data_len = (u8) cm_node->mpa_frame_size;
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
+ __func__, __LINE__, ret);
+ return;
+}
+
+
+static void cm_event_mpa_reject(struct nes_cm_event *event)
+{
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ struct nes_cm_node *cm_node;
+ int ret;
+
+ cm_node = event->cm_node;
+ if (!cm_node)
+ return;
+ cm_id = cm_node->cm_id;
+
+ atomic_inc(&cm_connect_reqs);
+ nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+ cm_node, cm_id, jiffies);
+
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ cm_event.status = -ECONNREFUSED;
+ cm_event.provider_data = cm_id->provider_data;
+
+ cm_event.local_addr.sin_family = AF_INET;
+ cm_event.local_addr.sin_port = htons(event->cm_info.loc_port);
+ cm_event.local_addr.sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+ cm_event.remote_addr.sin_family = AF_INET;
+ cm_event.remote_addr.sin_port = htons(event->cm_info.rem_port);
+ cm_event.remote_addr.sin_addr.s_addr = htonl(event->cm_info.rem_addr);
- cm_event.private_data = cm_node->mpa_frame_buf;
- cm_event.private_data_len = (u8) cm_node->mpa_frame_size;
+ cm_event.private_data = cm_node->mpa_frame_buf;
+ cm_event.private_data_len = (u8) cm_node->mpa_frame_size;
+
+ nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, "
+ "remove_addr=%08x\n",
+ cm_event.local_addr.sin_addr.s_addr,
+ cm_event.remote_addr.sin_addr.s_addr);
ret = cm_id->event_handler(cm_id, &cm_event);
if (ret)
- printk("%s[%u] OFA CM event_handler returned, ret=%d\n",
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
__func__, __LINE__, ret);
return;
@@ -3374,6 +3566,14 @@ static void nes_cm_event_handler(struct work_struct *work)
cm_event_connected(event);
nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
break;
+ case NES_CM_EVENT_MPA_REJECT:
+ if ((!event->cm_node->cm_id) ||
+ (event->cm_node->state == NES_CM_STATE_TSA))
+ break;
+ cm_event_mpa_reject(event);
+ nes_debug(NES_DBG_CM, "CM Event: REJECT\n");
+ break;
+
case NES_CM_EVENT_ABORTED:
if ((!event->cm_node->cm_id) ||
(event->cm_node->state == NES_CM_STATE_TSA))
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
index fafa35042eb..d5f778202eb 100644
--- a/drivers/infiniband/hw/nes/nes_cm.h
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,9 @@
#define NES_MANAGE_APBVT_DEL 0
#define NES_MANAGE_APBVT_ADD 1
+#define NES_MPA_REQUEST_ACCEPT 1
+#define NES_MPA_REQUEST_REJECT 2
+
/* IETF MPA -- defines, enums, structs */
#define IEFT_MPA_KEY_REQ "MPA ID Req Frame"
#define IEFT_MPA_KEY_REP "MPA ID Rep Frame"
@@ -186,6 +189,7 @@ enum nes_cm_node_state {
NES_CM_STATE_ACCEPTING,
NES_CM_STATE_MPAREQ_SENT,
NES_CM_STATE_MPAREQ_RCVD,
+ NES_CM_STATE_MPAREJ_RCVD,
NES_CM_STATE_TSA,
NES_CM_STATE_FIN_WAIT1,
NES_CM_STATE_FIN_WAIT2,
@@ -278,13 +282,12 @@ struct nes_cm_node {
struct nes_timer_entry *send_entry;
spinlock_t retrans_list_lock;
- struct list_head recv_list;
- spinlock_t recv_list_lock;
+ struct nes_timer_entry *recv_entry;
int send_write0;
union {
struct ietf_mpa_frame mpa_frame;
- u8 mpa_frame_buf[NES_CM_DEFAULT_MTU];
+ u8 mpa_frame_buf[MAX_CM_BUFFER];
};
u16 mpa_frame_size;
struct iw_cm_id *cm_id;
@@ -326,6 +329,7 @@ enum nes_cm_event_type {
NES_CM_EVENT_MPA_REQ,
NES_CM_EVENT_MPA_CONNECT,
NES_CM_EVENT_MPA_ACCEPT,
+ NES_CM_EVENT_MPA_REJECT,
NES_CM_EVENT_MPA_ESTABLISHED,
NES_CM_EVENT_CONNECTED,
NES_CM_EVENT_CLOSED,
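
The nes_cm.h hunk above replaces the per-node recv_list (and its spinlock) with a single recv_entry pointer: a connection can only ever have one pending TYPE_CLOSE timer, so the list walking and lock juggling in nes_cm_timer_tick() were pure overhead. A sketch of the single-slot invariant that schedule_nes_timer() now enforces (structures trimmed to the essentials):

    #include <assert.h>
    #include <stdlib.h>

    struct timer_entry { long timetosend; };
    struct cm_node     { struct timer_entry *recv_entry; };

    static int schedule_close(struct cm_node *node, long when)
    {
            struct timer_entry *e;

            if (node->recv_entry)   /* was: list_add_tail() under a lock */
                    return -22;     /* -EINVAL: slot already occupied */

            e = malloc(sizeof(*e));
            if (!e)
                    return -12;     /* -ENOMEM */
            e->timetosend = when;
            node->recv_entry = e;
            return 0;
    }

    int main(void)
    {
            struct cm_node n = { 0 };
            assert(schedule_close(&n, 100) == 0);
            assert(schedule_close(&n, 200) == -22); /* second close rejected */
            free(n.recv_entry);
            return 0;
    }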
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
index da9daba8e66..0fb8d81d9a6 100644
--- a/drivers/infiniband/hw/nes/nes_context.h
+++ b/drivers/infiniband/hw/nes/nes_context.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index 53df9de2342..52e734042b8 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -254,6 +254,7 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
u32 adapter_size;
u32 arp_table_size;
u16 vendor_id;
+ u16 device_id;
u8 OneG_Mode;
u8 func_index;
@@ -356,6 +357,13 @@ struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
return NULL;
}
+ nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) |
+ (nesadapter->mac_addr_low >> 24);
+
+ pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn,
+ PCI_DEVICE_ID, &device_id);
+ nesadapter->vendor_part_id = device_id;
+
if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter,
OneG_Mode)) {
kfree(nesadapter);
@@ -1636,7 +1644,6 @@ int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
nesvnic->post_cqp_request = nes_post_cqp_request;
nesvnic->mcrq_mcast_filter = NULL;
- spin_lock_init(&nesvnic->nic.sq_lock);
spin_lock_init(&nesvnic->nic.rq_lock);
/* setup the RQ */
@@ -2261,6 +2268,8 @@ static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
if (++head >= aeq_size)
head = 0;
+
+ nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16);
}
while (1);
aeq->aeq_head = head;
@@ -2622,9 +2631,9 @@ void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
} else
break;
}
- if (skb)
- dev_kfree_skb_any(skb);
}
+ if (skb)
+ dev_kfree_skb_any(skb);
nesnic->sq_tail++;
nesnic->sq_tail &= nesnic->sq_size-1;
if (sq_cqes > 128) {
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
index bc0b4de0445..f41a8710d2a 100644
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+* Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -61,6 +61,7 @@ enum pci_regs {
NES_CQ_ACK = 0x0034,
NES_WQE_ALLOC = 0x0040,
NES_CQE_ALLOC = 0x0044,
+ NES_AEQ_ALLOC = 0x0048
};
enum indexed_regs {
@@ -875,7 +876,6 @@ struct nes_hw_nic {
u8 replenishing_rq;
u8 reserved;
- spinlock_t sq_lock;
spinlock_t rq_lock;
};
@@ -1147,7 +1147,6 @@ struct nes_ib_device;
struct nes_vnic {
struct nes_ib_device *nesibdev;
u64 sq_full;
- u64 sq_locked;
u64 tso_requests;
u64 segmented_tso_requests;
u64 linearized_skbs;
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index ae8c6888b53..8d3e4c6f237 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -400,8 +400,7 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
if (skb_headlen(skb) == skb->len) {
if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
- nesnic->tx_skb[nesnic->sq_head] = NULL;
- dev_kfree_skb(skb);
+ nesnic->tx_skb[nesnic->sq_head] = skb;
}
} else {
/* Deal with Fragments */
@@ -453,7 +452,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
u32 wqe_count=1;
u32 send_rc;
struct iphdr *iph;
- unsigned long flags;
__le16 *wqe_fragment_length;
u32 nr_frags;
u32 original_first_length;
@@ -480,13 +478,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
if (netif_queue_stopped(netdev))
return NETDEV_TX_BUSY;
- local_irq_save(flags);
- if (!spin_trylock(&nesnic->sq_lock)) {
- local_irq_restore(flags);
- nesvnic->sq_locked++;
- return NETDEV_TX_LOCKED;
- }
-
/* Check if SQ is full */
if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
if (!netif_queue_stopped(netdev)) {
@@ -498,7 +489,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
}
}
nesvnic->sq_full++;
- spin_unlock_irqrestore(&nesnic->sq_lock, flags);
return NETDEV_TX_BUSY;
}
@@ -531,7 +521,6 @@ sq_no_longer_full:
}
}
nesvnic->sq_full++;
- spin_unlock_irqrestore(&nesnic->sq_lock, flags);
nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
netdev->name);
return NETDEV_TX_BUSY;
@@ -656,17 +645,13 @@ tso_sq_no_longer_full:
skb_set_transport_header(skb, hoffset);
skb_set_network_header(skb, nhoffset);
send_rc = nes_nic_send(skb, netdev);
- if (send_rc != NETDEV_TX_OK) {
- spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+ if (send_rc != NETDEV_TX_OK)
return NETDEV_TX_OK;
- }
}
} else {
send_rc = nes_nic_send(skb, netdev);
- if (send_rc != NETDEV_TX_OK) {
- spin_unlock_irqrestore(&nesnic->sq_lock, flags);
+ if (send_rc != NETDEV_TX_OK)
return NETDEV_TX_OK;
- }
}
barrier();
@@ -676,7 +661,6 @@ tso_sq_no_longer_full:
(wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
netdev->trans_start = jiffies;
- spin_unlock_irqrestore(&nesnic->sq_lock, flags);
return NETDEV_TX_OK;
}
@@ -1012,7 +996,6 @@ static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = {
"Pause Frames Received",
"Internal Routing Errors",
"SQ SW Dropped SKBs",
- "SQ Locked",
"SQ Full",
"Segmented TSO Requests",
"Rx Symbol Errors",
@@ -1129,16 +1112,17 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
struct nes_device *nesdev = nesvnic->nesdev;
u32 nic_count;
u32 u32temp;
+ u32 index = 0;
target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
- target_stat_values[0] = nesvnic->nesdev->link_status_interrupts;
- target_stat_values[1] = nesvnic->linearized_skbs;
- target_stat_values[2] = nesvnic->tso_requests;
+ target_stat_values[index] = nesvnic->nesdev->link_status_interrupts;
+ target_stat_values[++index] = nesvnic->linearized_skbs;
+ target_stat_values[++index] = nesvnic->tso_requests;
u32temp = nes_read_indexed(nesdev,
NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
nesvnic->nesdev->mac_pause_frames_sent += u32temp;
- target_stat_values[3] = nesvnic->nesdev->mac_pause_frames_sent;
+ target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent;
u32temp = nes_read_indexed(nesdev,
NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
@@ -1209,60 +1193,59 @@ static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
}
- target_stat_values[4] = nesvnic->nesdev->mac_pause_frames_received;
- target_stat_values[5] = nesdev->nesadapter->nic_rx_eth_route_err;
- target_stat_values[6] = nesvnic->tx_sw_dropped;
- target_stat_values[7] = nesvnic->sq_locked;
- target_stat_values[8] = nesvnic->sq_full;
- target_stat_values[9] = nesvnic->segmented_tso_requests;
- target_stat_values[10] = nesvnic->nesdev->mac_rx_symbol_err_frames;
- target_stat_values[11] = nesvnic->nesdev->mac_rx_jabber_frames;
- target_stat_values[12] = nesvnic->nesdev->mac_rx_oversized_frames;
- target_stat_values[13] = nesvnic->nesdev->mac_rx_short_frames;
- target_stat_values[14] = nesvnic->endnode_nstat_rx_discard;
- target_stat_values[15] = nesvnic->endnode_nstat_rx_octets;
- target_stat_values[16] = nesvnic->endnode_nstat_rx_frames;
- target_stat_values[17] = nesvnic->endnode_nstat_tx_octets;
- target_stat_values[18] = nesvnic->endnode_nstat_tx_frames;
- target_stat_values[19] = mh_detected;
- target_stat_values[20] = mh_pauses_sent;
- target_stat_values[21] = nesvnic->endnode_ipv4_tcp_retransmits;
- target_stat_values[22] = atomic_read(&cm_connects);
- target_stat_values[23] = atomic_read(&cm_accepts);
- target_stat_values[24] = atomic_read(&cm_disconnects);
- target_stat_values[25] = atomic_read(&cm_connecteds);
- target_stat_values[26] = atomic_read(&cm_connect_reqs);
- target_stat_values[27] = atomic_read(&cm_rejects);
- target_stat_values[28] = atomic_read(&mod_qp_timouts);
- target_stat_values[29] = atomic_read(&qps_created);
- target_stat_values[30] = atomic_read(&sw_qps_destroyed);
- target_stat_values[31] = atomic_read(&qps_destroyed);
- target_stat_values[32] = atomic_read(&cm_closes);
- target_stat_values[33] = cm_packets_sent;
- target_stat_values[34] = cm_packets_bounced;
- target_stat_values[35] = cm_packets_created;
- target_stat_values[36] = cm_packets_received;
- target_stat_values[37] = cm_packets_dropped;
- target_stat_values[38] = cm_packets_retrans;
- target_stat_values[39] = cm_listens_created;
- target_stat_values[40] = cm_listens_destroyed;
- target_stat_values[41] = cm_backlog_drops;
- target_stat_values[42] = atomic_read(&cm_loopbacks);
- target_stat_values[43] = atomic_read(&cm_nodes_created);
- target_stat_values[44] = atomic_read(&cm_nodes_destroyed);
- target_stat_values[45] = atomic_read(&cm_accel_dropped_pkts);
- target_stat_values[46] = atomic_read(&cm_resets_recvd);
- target_stat_values[47] = int_mod_timer_init;
- target_stat_values[48] = int_mod_cq_depth_1;
- target_stat_values[49] = int_mod_cq_depth_4;
- target_stat_values[50] = int_mod_cq_depth_16;
- target_stat_values[51] = int_mod_cq_depth_24;
- target_stat_values[52] = int_mod_cq_depth_32;
- target_stat_values[53] = int_mod_cq_depth_128;
- target_stat_values[54] = int_mod_cq_depth_256;
- target_stat_values[55] = nesvnic->lro_mgr.stats.aggregated;
- target_stat_values[56] = nesvnic->lro_mgr.stats.flushed;
- target_stat_values[57] = nesvnic->lro_mgr.stats.no_desc;
+ target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received;
+ target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err;
+ target_stat_values[++index] = nesvnic->tx_sw_dropped;
+ target_stat_values[++index] = nesvnic->sq_full;
+ target_stat_values[++index] = nesvnic->segmented_tso_requests;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames;
+ target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets;
+ target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames;
+ target_stat_values[++index] = mh_detected;
+ target_stat_values[++index] = mh_pauses_sent;
+ target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits;
+ target_stat_values[++index] = atomic_read(&cm_connects);
+ target_stat_values[++index] = atomic_read(&cm_accepts);
+ target_stat_values[++index] = atomic_read(&cm_disconnects);
+ target_stat_values[++index] = atomic_read(&cm_connecteds);
+ target_stat_values[++index] = atomic_read(&cm_connect_reqs);
+ target_stat_values[++index] = atomic_read(&cm_rejects);
+ target_stat_values[++index] = atomic_read(&mod_qp_timouts);
+ target_stat_values[++index] = atomic_read(&qps_created);
+ target_stat_values[++index] = atomic_read(&sw_qps_destroyed);
+ target_stat_values[++index] = atomic_read(&qps_destroyed);
+ target_stat_values[++index] = atomic_read(&cm_closes);
+ target_stat_values[++index] = cm_packets_sent;
+ target_stat_values[++index] = cm_packets_bounced;
+ target_stat_values[++index] = cm_packets_created;
+ target_stat_values[++index] = cm_packets_received;
+ target_stat_values[++index] = cm_packets_dropped;
+ target_stat_values[++index] = cm_packets_retrans;
+ target_stat_values[++index] = cm_listens_created;
+ target_stat_values[++index] = cm_listens_destroyed;
+ target_stat_values[++index] = cm_backlog_drops;
+ target_stat_values[++index] = atomic_read(&cm_loopbacks);
+ target_stat_values[++index] = atomic_read(&cm_nodes_created);
+ target_stat_values[++index] = atomic_read(&cm_nodes_destroyed);
+ target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts);
+ target_stat_values[++index] = atomic_read(&cm_resets_recvd);
+ target_stat_values[++index] = int_mod_timer_init;
+ target_stat_values[++index] = int_mod_cq_depth_1;
+ target_stat_values[++index] = int_mod_cq_depth_4;
+ target_stat_values[++index] = int_mod_cq_depth_16;
+ target_stat_values[++index] = int_mod_cq_depth_24;
+ target_stat_values[++index] = int_mod_cq_depth_32;
+ target_stat_values[++index] = int_mod_cq_depth_128;
+ target_stat_values[++index] = int_mod_cq_depth_256;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc;
}
@@ -1589,7 +1572,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
void __iomem *mmio_addr)
{
u64 u64temp;
- struct nes_vnic *nesvnic = NULL;
+ struct nes_vnic *nesvnic;
struct net_device *netdev;
struct nic_qp_map *curr_qp_map;
u32 u32temp;
@@ -1601,6 +1584,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
printk(KERN_ERR PFX "nesvnic etherdev alloc failed");
return NULL;
}
+ nesvnic = netdev_priv(netdev);
nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
@@ -1618,10 +1602,10 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
+ netdev->vlan_rx_register = nes_netdev_vlan_rx_register;
netdev->features |= NETIF_F_LLTX;
/* Fill in the port structure */
- nesvnic = netdev_priv(netdev);
nesvnic->netdev = netdev;
nesvnic->nesdev = nesdev;
nesvnic->msg_enable = netif_msg_init(debug, default_msg);
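
The reordering above fixes a use-before-assignment: netif_napi_add() dereferences nesvnic->napi, so netdev_priv() must be taken immediately after alloc_etherdev() succeeds rather than later in the function, where nesvnic was still NULL. A userspace analog of the container/private-area pattern (names invented, not the driver's code):

#include <stdio.h>
#include <stdlib.h>

struct priv { int napi_weight; };

struct netdev {
        char name[16];
        /* private area trails the public part, as with alloc_etherdev() */
        unsigned char priv[sizeof(struct priv)];
};

static void *netdev_priv_area(struct netdev *nd) { return nd->priv; }

int main(void)
{
        struct netdev *nd = calloc(1, sizeof(*nd));
        struct priv *p;

        if (!nd)
                return 1;
        p = netdev_priv_area(nd);  /* fetch priv right after allocation... */
        p->napi_weight = 128;      /* ...so later setup can use it safely */
        printf("weight=%d\n", p->napi_weight);
        free(nd);
        return 0;
}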
diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h
index e64306bce80..cc90c14b49e 100644
--- a/drivers/infiniband/hw/nes/nes_user.h
+++ b/drivers/infiniband/hw/nes/nes_user.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
* Copyright (c) 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Cisco Systems. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index 6f3bc1b6bf2..a282031d15c 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index d93a6562817..7e5b5ba13a7 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -551,6 +551,7 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
struct nes_device *nesdev = nesvnic->nesdev;
struct nes_adapter *nesadapter = nesdev->nesadapter;
int i = 0;
+ int rc;
/* free the resources */
if (nesfmr->leaf_pbl_cnt == 0) {
@@ -572,7 +573,9 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
nesmr->ibmw.rkey = ibfmr->rkey;
nesmr->ibmw.uobject = NULL;
- if (nesfmr->nesmr.pbls_used != 0) {
+ rc = nes_dealloc_mw(&nesmr->ibmw);
+
+ if ((rc == 0) && (nesfmr->nesmr.pbls_used != 0)) {
spin_lock_irqsave(&nesadapter->pbl_lock, flags);
if (nesfmr->nesmr.pbl_4k) {
nesadapter->free_4kpbl += nesfmr->nesmr.pbls_used;
@@ -584,7 +587,7 @@ static int nes_dealloc_fmr(struct ib_fmr *ibfmr)
spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
}
- return nes_dealloc_mw(&nesmr->ibmw);
+ return rc;
}
@@ -1886,21 +1889,75 @@ static int nes_destroy_cq(struct ib_cq *ib_cq)
return ret;
}
+/**
+ * root_256
+ */
+static u32 root_256(struct nes_device *nesdev,
+ struct nes_root_vpbl *root_vpbl,
+ struct nes_root_vpbl *new_root,
+ u16 pbl_count_4k,
+ u16 pbl_count_256)
+{
+ u64 leaf_pbl;
+ int i, j, k;
+
+ if (pbl_count_4k == 1) {
+ new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+ 512, &new_root->pbl_pbase);
+
+ if (new_root->pbl_vbase == NULL)
+ return 0;
+
+ leaf_pbl = (u64)root_vpbl->pbl_pbase;
+ for (i = 0; i < 16; i++) {
+ new_root->pbl_vbase[i].pa_low =
+ cpu_to_le32((u32)leaf_pbl);
+ new_root->pbl_vbase[i].pa_high =
+ cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+ leaf_pbl += 256;
+ }
+ } else {
+ for (i = 3; i >= 0; i--) {
+ j = i * 16;
+ root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
+ leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
+ (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
+ << 32);
+ for (k = 1; k < 16; k++) {
+ leaf_pbl += 256;
+ root_vpbl->pbl_vbase[j + k].pa_low =
+ cpu_to_le32((u32)leaf_pbl);
+ root_vpbl->pbl_vbase[j + k].pa_high =
+ cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+ }
+ }
+ }
+
+ return 1;
+}
+
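root_256() repacks a two-level PBL tree so the root points at 256-byte sub-blocks of the 4K leaves: each leaf is sliced into sixteen 256-byte pieces, and every root entry stores a sliced address as two 32-bit words. A minimal sketch of the packing arithmetic (endianness conversion via cpu_to_le32() omitted; the address is invented):

#include <stdint.h>
#include <stdio.h>

struct pbl_entry { uint32_t pa_low, pa_high; };

int main(void)
{
        uint64_t leaf = 0x1ffff0000ULL;  /* pretend DMA address of a 4K leaf */
        struct pbl_entry root[16];
        int i;

        for (i = 0; i < 16; i++) {
                root[i].pa_low  = (uint32_t)leaf;
                root[i].pa_high = (uint32_t)(leaf >> 32);
                leaf += 256;             /* next 256-byte sub-PBL */
        }
        printf("entry 1: high=0x%x low=0x%x\n",
               root[1].pa_high, root[1].pa_low);
        return 0;
}
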
/**
* nes_reg_mr
*/
static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
- dma_addr_t single_buffer, u16 pbl_count, u16 residual_page_count,
- int acc, u64 *iova_start)
+ dma_addr_t single_buffer, u16 pbl_count_4k,
+ u16 residual_page_count_4k, int acc, u64 *iova_start,
+ u16 *actual_pbl_cnt, u8 *used_4k_pbls)
{
struct nes_hw_cqp_wqe *cqp_wqe;
struct nes_cqp_request *cqp_request;
unsigned long flags;
int ret;
struct nes_adapter *nesadapter = nesdev->nesadapter;
- /* int count; */
+ uint pg_cnt = 0;
+ u16 pbl_count_256;
+ u16 pbl_count = 0;
+ u8 use_256_pbls = 0;
+ u8 use_4k_pbls = 0;
+ u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
+ struct nes_root_vpbl new_root = {0, 0, 0};
u32 opcode = 0;
u16 major_code;
@@ -1913,41 +1970,70 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
cqp_request->waiting = 1;
cqp_wqe = &cqp_request->cqp_wqe;
- spin_lock_irqsave(&nesadapter->pbl_lock, flags);
- /* track PBL resources */
- if (pbl_count != 0) {
- if (pbl_count > 1) {
- /* Two level PBL */
- if ((pbl_count+1) > nesadapter->free_4kpbl) {
- nes_debug(NES_DBG_MR, "Out of 4KB Pbls for two level request.\n");
- spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
- nes_free_cqp_request(nesdev, cqp_request);
- return -ENOMEM;
- } else {
- nesadapter->free_4kpbl -= pbl_count+1;
- }
- } else if (residual_page_count > 32) {
- if (pbl_count > nesadapter->free_4kpbl) {
- nes_debug(NES_DBG_MR, "Out of 4KB Pbls.\n");
- spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
- nes_free_cqp_request(nesdev, cqp_request);
- return -ENOMEM;
- } else {
- nesadapter->free_4kpbl -= pbl_count;
+ if (pbl_count_4k) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+ pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
+ pbl_count_256 = (pg_cnt + 31) / 32;
+ if (pg_cnt <= 32) {
+ if (pbl_count_256 <= nesadapter->free_256pbl)
+ use_256_pbls = 1;
+ else if (pbl_count_4k <= nesadapter->free_4kpbl)
+ use_4k_pbls = 1;
+ } else if (pg_cnt <= 2048) {
+ if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
+ (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
+ use_4k_pbls = 1;
+ } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
+ use_256_pbls = 1;
+ use_two_level = 1;
+ } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+ use_4k_pbls = 1;
}
} else {
- if (pbl_count > nesadapter->free_256pbl) {
- nes_debug(NES_DBG_MR, "Out of 256B Pbls.\n");
- spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
- nes_free_cqp_request(nesdev, cqp_request);
- return -ENOMEM;
- } else {
- nesadapter->free_256pbl -= pbl_count;
- }
+ if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
+ use_4k_pbls = 1;
}
+
+ if (use_256_pbls) {
+ pbl_count = pbl_count_256;
+ nesadapter->free_256pbl -= pbl_count + use_two_level;
+ } else if (use_4k_pbls) {
+ pbl_count = pbl_count_4k;
+ nesadapter->free_4kpbl -= pbl_count + use_two_level;
+ } else {
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ nes_debug(NES_DBG_MR, "Out of Pbls\n");
+ nes_free_cqp_request(nesdev, cqp_request);
+ return -ENOMEM;
+ }
+
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
}
- spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ if (use_256_pbls && use_two_level) {
+ if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k, pbl_count_256) == 1) {
+ if (new_root.pbl_pbase != 0)
+ root_vpbl = &new_root;
+ } else {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl += pbl_count_256 + use_two_level;
+ use_256_pbls = 0;
+
+ if (pbl_count_4k == 1)
+ use_two_level = 0;
+ pbl_count = pbl_count_4k;
+
+ if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+ nesadapter->free_4kpbl -= pbl_count + use_two_level;
+ use_4k_pbls = 1;
+ }
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+ if (use_4k_pbls == 0)
+ return -ENOMEM;
+ }
+ }
opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
@@ -1976,10 +2062,9 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
} else {
set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
- set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX,
- (((pbl_count - 1) * 4096) + (residual_page_count*8)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
- if ((pbl_count > 1) || (residual_page_count > 32))
+ if (use_4k_pbls)
cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
}
barrier();
@@ -1996,13 +2081,25 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
major_code = cqp_request->major_code;
nes_put_cqp_request(nesdev, cqp_request);
+ if ((!ret || major_code) && pbl_count != 0) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (use_256_pbls)
+ nesadapter->free_256pbl += pbl_count + use_two_level;
+ else if (use_4k_pbls)
+ nesadapter->free_4kpbl += pbl_count + use_two_level;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+ if (new_root.pbl_pbase)
+ pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
+ new_root.pbl_pbase);
+
if (!ret)
return -ETIME;
else if (major_code)
return -EIO;
- else
- return 0;
+ *actual_pbl_cnt = pbl_count + use_two_level;
+ *used_4k_pbls = use_4k_pbls;
return 0;
}
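
The new sizing logic above is pure arithmetic: a 4K PBL holds 512 eight-byte leaf entries and a 256-byte PBL holds 32, so pg_cnt and pbl_count_256 fall out directly, and the pool is chosen from how many pages the region spans and how full each free pool is. A standalone sketch of that decision (plain C; the pool numbers are invented for illustration):

#include <stdio.h>

int main(void)
{
        unsigned pbl_count_4k = 3, residual_4k = 100;
        unsigned free_256pbl = 40, free_4kpbl = 10, max_4kpbl = 16;
        unsigned use_two_level = pbl_count_4k > 1;
        unsigned pg_cnt = (pbl_count_4k - 1) * 512 + residual_4k;
        unsigned pbl_count_256 = (pg_cnt + 31) / 32;

        if (pg_cnt <= 32)
                printf("single 256B PBL is enough\n");
        else if (pg_cnt <= 2048) {
                /* prefer 4K PBLs only while that pool is more than half full */
                if (pbl_count_4k + use_two_level <= free_4kpbl &&
                    free_4kpbl > max_4kpbl / 2)
                        printf("use %u 4K PBLs\n", pbl_count_4k + use_two_level);
                else if (pbl_count_256 + 1 <= free_256pbl)
                        printf("use %u 256B PBLs (two level)\n",
                               pbl_count_256 + 1);
        } else
                printf("large region: 4K PBLs only\n");
        return 0;
}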
@@ -2167,18 +2264,14 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
pbl_count = root_pbl_index;
}
ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
- buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start);
+ buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+ &nesmr->pbls_used, &nesmr->pbl_4k);
if (ret == 0) {
nesmr->ibmr.rkey = stag;
nesmr->ibmr.lkey = stag;
nesmr->mode = IWNES_MEMREG_TYPE_MEM;
ibmr = &nesmr->ibmr;
- nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
- nesmr->pbls_used = pbl_count;
- if (pbl_count > 1) {
- nesmr->pbls_used++;
- }
} else {
kfree(nesmr);
ibmr = ERR_PTR(-ENOMEM);
@@ -2456,8 +2549,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
stag, (unsigned int)iova_start,
(unsigned int)region_length, stag_index,
(unsigned long long)region->length, pbl_count);
- ret = nes_reg_mr( nesdev, nespd, stag, region->length, &root_vpbl,
- first_dma_addr, pbl_count, (u16)cur_pbl_index, acc, &iova_start);
+ ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
+ first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
+ &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
nes_debug(NES_DBG_MR, "ret=%d\n", ret);
@@ -2466,11 +2560,6 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
nesmr->ibmr.lkey = stag;
nesmr->mode = IWNES_MEMREG_TYPE_MEM;
ibmr = &nesmr->ibmr;
- nesmr->pbl_4k = ((pbl_count > 1) || (cur_pbl_index > 32)) ? 1 : 0;
- nesmr->pbls_used = pbl_count;
- if (pbl_count > 1) {
- nesmr->pbls_used++;
- }
} else {
ib_umem_release(region);
kfree(nesmr);
@@ -2609,24 +2698,6 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
cqp_request->waiting = 1;
cqp_wqe = &cqp_request->cqp_wqe;
- spin_lock_irqsave(&nesadapter->pbl_lock, flags);
- if (nesmr->pbls_used != 0) {
- if (nesmr->pbl_4k) {
- nesadapter->free_4kpbl += nesmr->pbls_used;
- if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
- printk(KERN_ERR PFX "free 4KB PBLs(%u) has exceeded the max(%u)\n",
- nesadapter->free_4kpbl, nesadapter->max_4kpbl);
- }
- } else {
- nesadapter->free_256pbl += nesmr->pbls_used;
- if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
- printk(KERN_ERR PFX "free 256B PBLs(%u) has exceeded the max(%u)\n",
- nesadapter->free_256pbl, nesadapter->max_256pbl);
- }
- }
- }
-
- spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
@@ -2644,11 +2715,6 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
" CQP Major:Minor codes = 0x%04X:0x%04X\n",
ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
- nes_free_resource(nesadapter, nesadapter->allocated_mrs,
- (ib_mr->rkey & 0x0fffff00) >> 8);
-
- kfree(nesmr);
-
major_code = cqp_request->major_code;
minor_code = cqp_request->minor_code;
@@ -2664,8 +2730,33 @@ static int nes_dereg_mr(struct ib_mr *ib_mr)
" to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
major_code, minor_code, ib_mr, ib_mr->rkey);
return -EIO;
- } else
- return 0;
+ }
+
+ if (nesmr->pbls_used != 0) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (nesmr->pbl_4k) {
+ nesadapter->free_4kpbl += nesmr->pbls_used;
+ if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
+ printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
+ "exceeded the max(%u)\n",
+ nesadapter->free_4kpbl,
+ nesadapter->max_4kpbl);
+ } else {
+ nesadapter->free_256pbl += nesmr->pbls_used;
+ if (nesadapter->free_256pbl > nesadapter->max_256pbl)
+ printk(KERN_ERR PFX "free 256B PBLs(%u) has "
+ "exceeded the max(%u)\n",
+ nesadapter->free_256pbl,
+ nesadapter->max_256pbl);
+ }
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ (ib_mr->rkey & 0x0fffff00) >> 8);
+
+ kfree(nesmr);
+
+ return 0;
}
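
Both nes_dealloc_fmr() and nes_dereg_mr() now release driver bookkeeping only after the deallocate command succeeds, instead of optimistically before issuing it, so a failed destroy no longer credits PBLs back or frees the MR structure while hardware may still reference them. A stubbed sketch of the ordering (invented names, no real CQP interaction):

#include <stdio.h>

static int destroy_stag(void) { return 0; /* pretend hardware said OK */ }

int main(void)
{
        unsigned free_4kpbl = 5, pbls_used = 2;

        if (destroy_stag() != 0) {
                printf("destroy failed: keep resources accounted\n");
                return 1;
        }
        free_4kpbl += pbls_used;  /* return pool credits only on success */
        printf("free_4kpbl now %u\n", free_4kpbl);
        return 0;
}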
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
index ae0ca9bc83b..5e48f67fbe8 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 - 2008 NetEffect, Inc. All rights reserved.
+ * Copyright (c) 2006 - 2009 Intel-NE, Inc. All rights reserved.
* Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index ca837b0a889..421a6640c9b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -660,8 +660,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
path = __path_find(dev, phdr->hwaddr + 4);
if (!path || !path->valid) {
- if (!path)
+ int new_path = 0;
+
+ if (!path) {
path = path_rec_create(dev, phdr->hwaddr + 4);
+ new_path = 1;
+ }
if (path) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof *phdr);
@@ -669,7 +673,8 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
if (!path->query && path_rec_start(dev, path)) {
spin_unlock_irqrestore(&priv->lock, flags);
- path_free(dev, path);
+ if (new_path)
+ path_free(dev, path);
return;
} else
__path_add(dev, path);
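
The ipoib change adds an ownership flag so the error path frees only a path entry that this call itself created; an entry found via __path_find() belongs to the path table, and freeing it there was a use-after-free waiting to happen. A userspace sketch of the flag pattern (stubs throughout):

#include <stdio.h>
#include <stdlib.h>

struct path { int valid; };

static struct path *path_find(void) { return NULL; }       /* nothing cached */
static int path_rec_start_fails(void) { return 1; }        /* pretend failure */

int main(void)
{
        int new_path = 0;
        struct path *p = path_find();

        if (!p) {
                p = calloc(1, sizeof(*p));  /* path_rec_create() analog */
                if (!p)
                        return 1;
                new_path = 1;
        }
        if (path_rec_start_fails()) {
                /* only free what this call allocated; a table-owned
                 * path must not be destroyed from the error path */
                if (new_path)
                        free(p);
                printf("query failed, freed=%d\n", new_path);
                return 0;
        }
        printf("query started\n");
        return 0;
}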
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 319b188145b..ea9e1556e0d 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -401,13 +401,6 @@ static void iser_route_handler(struct rdma_cm_id *cma_id)
if (ret)
goto failure;
- iser_dbg("path.mtu is %d setting it to %d\n",
- cma_id->route.path_rec->mtu, IB_MTU_1024);
-
- /* we must set the MTU to 1024 as this is what the target is assuming */
- if (cma_id->route.path_rec->mtu > IB_MTU_1024)
- cma_id->route.path_rec->mtu = IB_MTU_1024;
-
memset(&conn_param, 0, sizeof conn_param);
conn_param.responder_resources = 4;
conn_param.initiator_depth = 1;
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index a7a97bf998f..21040a0d81f 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_MLX4_CORE) += mlx4_core.o
mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
- mr.o pd.o port.o profile.o qp.o reset.o srq.o
+ mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o
obj-$(CONFIG_MLX4_EN) += mlx4_en.o
diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c
index f094ee00c41..aa9674b7f19 100644
--- a/drivers/net/mlx4/catas.c
+++ b/drivers/net/mlx4/catas.c
@@ -42,7 +42,6 @@ enum {
static DEFINE_SPINLOCK(catas_lock);
static LIST_HEAD(catas_list);
-static struct workqueue_struct *catas_wq;
static struct work_struct catas_work;
static int internal_err_reset = 1;
@@ -77,7 +76,7 @@ static void poll_catas(unsigned long dev_ptr)
list_add(&priv->catas_err.list, &catas_list);
spin_unlock(&catas_lock);
- queue_work(catas_wq, &catas_work);
+ queue_work(mlx4_wq, &catas_work);
}
} else
mod_timer(&priv->catas_err.timer,
@@ -146,18 +145,7 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev)
spin_unlock_irq(&catas_lock);
}
-int __init mlx4_catas_init(void)
+void __init mlx4_catas_init(void)
{
INIT_WORK(&catas_work, catas_reset);
-
- catas_wq = create_singlethread_workqueue("mlx4_err");
- if (!catas_wq)
- return -ENOMEM;
-
- return 0;
-}
-
-void mlx4_catas_cleanup(void)
-{
- destroy_workqueue(catas_wq);
}
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index 2c19bff7cba..8830dcb92ec 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -163,6 +163,7 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
int cqn;
int eqes_found = 0;
int set_ci = 0;
+ int port;
while ((eqe = next_eqe_sw(eq))) {
/*
@@ -203,11 +204,16 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
break;
case MLX4_EVENT_TYPE_PORT_CHANGE:
- mlx4_dispatch_event(dev,
- eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_ACTIVE ?
- MLX4_DEV_EVENT_PORT_UP :
- MLX4_DEV_EVENT_PORT_DOWN,
- be32_to_cpu(eqe->event.port_change.port) >> 28);
+ port = be32_to_cpu(eqe->event.port_change.port) >> 28;
+ if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) {
+ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
+ port);
+ mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+ } else {
+ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP,
+ port);
+ mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+ }
break;
case MLX4_EVENT_TYPE_CQ_ERROR:
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 6ef2490d5c3..a66f5b2fd28 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -51,6 +51,8 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(DRV_VERSION);
+struct workqueue_struct *mlx4_wq;
+
#ifdef CONFIG_MLX4_DEBUG
int mlx4_debug_level = 0;
@@ -98,24 +100,23 @@ module_param_named(use_prio, use_prio, bool, 0444);
MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
"(0/1, default 0)");
-static int mlx4_check_port_params(struct mlx4_dev *dev,
- enum mlx4_port_type *port_type)
+int mlx4_check_port_params(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_type)
{
int i;
for (i = 0; i < dev->caps.num_ports - 1; i++) {
- if (port_type[i] != port_type[i+1] &&
- !(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
- mlx4_err(dev, "Only same port types supported "
- "on this HCA, aborting.\n");
- return -EINVAL;
+ if (port_type[i] != port_type[i + 1]) {
+ if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+ mlx4_err(dev, "Only same port types supported "
+ "on this HCA, aborting.\n");
+ return -EINVAL;
+ }
+ if (port_type[i] == MLX4_PORT_TYPE_ETH &&
+ port_type[i + 1] == MLX4_PORT_TYPE_IB)
+ return -EINVAL;
}
}
- if ((port_type[0] == MLX4_PORT_TYPE_ETH) &&
- (port_type[1] == MLX4_PORT_TYPE_IB)) {
- mlx4_err(dev, "eth-ib configuration is not supported.\n");
- return -EINVAL;
- }
for (i = 0; i < dev->caps.num_ports; i++) {
if (!(port_type[i] & dev->caps.supported_type[i+1])) {
@@ -225,6 +226,9 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
else
dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+ dev->caps.possible_type[i] = dev->caps.port_type[i];
+ mlx4_priv(dev)->sense.sense_allowed[i] =
+ dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO;
if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
dev->caps.log_num_macs = dev_cap->log_max_macs[i];
@@ -263,14 +267,16 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
* Change the port configuration of the device.
* Every user of this function must hold the port mutex.
*/
-static int mlx4_change_port_types(struct mlx4_dev *dev,
- enum mlx4_port_type *port_types)
+int mlx4_change_port_types(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_types)
{
int err = 0;
int change = 0;
int port;
for (port = 0; port < dev->caps.num_ports; port++) {
+ /* Change the port type only if the new type is different
+ * from the current, and not set to Auto */
if (port_types[port] != dev->caps.port_type[port + 1]) {
change = 1;
dev->caps.port_type[port + 1] = port_types[port];
@@ -302,10 +308,17 @@ static ssize_t show_port_type(struct device *dev,
struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
port_attr);
struct mlx4_dev *mdev = info->dev;
+ char type[8];
+
+ sprintf(type, "%s",
+ (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ?
+ "ib" : "eth");
+ if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO)
+ sprintf(buf, "auto (%s)\n", type);
+ else
+ sprintf(buf, "%s\n", type);
- return sprintf(buf, "%s\n",
- mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB ?
- "ib" : "eth");
+ return strlen(buf);
}
static ssize_t set_port_type(struct device *dev,
@@ -317,6 +330,7 @@ static ssize_t set_port_type(struct device *dev,
struct mlx4_dev *mdev = info->dev;
struct mlx4_priv *priv = mlx4_priv(mdev);
enum mlx4_port_type types[MLX4_MAX_PORTS];
+ enum mlx4_port_type new_types[MLX4_MAX_PORTS];
int i;
int err = 0;
@@ -324,26 +338,56 @@ static ssize_t set_port_type(struct device *dev,
info->tmp_type = MLX4_PORT_TYPE_IB;
else if (!strcmp(buf, "eth\n"))
info->tmp_type = MLX4_PORT_TYPE_ETH;
+ else if (!strcmp(buf, "auto\n"))
+ info->tmp_type = MLX4_PORT_TYPE_AUTO;
else {
mlx4_err(mdev, "%s is not supported port type\n", buf);
return -EINVAL;
}
+ mlx4_stop_sense(mdev);
mutex_lock(&priv->port_mutex);
- for (i = 0; i < mdev->caps.num_ports; i++)
+ /* Possible type is always the one that was delivered */
+ mdev->caps.possible_type[info->port] = info->tmp_type;
+
+ for (i = 0; i < mdev->caps.num_ports; i++) {
types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
- mdev->caps.port_type[i+1];
+ mdev->caps.possible_type[i+1];
+ if (types[i] == MLX4_PORT_TYPE_AUTO)
+ types[i] = mdev->caps.port_type[i+1];
+ }
- err = mlx4_check_port_params(mdev, types);
+ if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+ for (i = 1; i <= mdev->caps.num_ports; i++) {
+ if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+ mdev->caps.possible_type[i] = mdev->caps.port_type[i];
+ err = -EINVAL;
+ }
+ }
+ }
+ if (err) {
+ mlx4_err(mdev, "Auto sensing is not supported on this HCA. "
+ "Set only 'eth' or 'ib' for both ports "
+ "(should be the same)\n");
+ goto out;
+ }
+
+ mlx4_do_sense_ports(mdev, new_types, types);
+
+ err = mlx4_check_port_params(mdev, new_types);
if (err)
goto out;
- for (i = 1; i <= mdev->caps.num_ports; i++)
- priv->port[i].tmp_type = 0;
+ /* We are about to apply the changes after the configuration
+ * was verified, no need to remember the temporary types
+ * any more */
+ for (i = 0; i < mdev->caps.num_ports; i++)
+ priv->port[i + 1].tmp_type = 0;
- err = mlx4_change_port_types(mdev, types);
+ err = mlx4_change_port_types(mdev, new_types);
out:
+ mlx4_start_sense(mdev);
mutex_unlock(&priv->port_mutex);
return err ? err : count;
}
@@ -1117,6 +1161,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
if (err)
goto err_port;
+ mlx4_sense_init(dev);
+ mlx4_start_sense(dev);
+
pci_set_drvdata(pdev, dev);
return 0;
@@ -1182,6 +1229,7 @@ static void mlx4_remove_one(struct pci_dev *pdev)
int p;
if (dev) {
+ mlx4_stop_sense(dev);
mlx4_unregister_device(dev);
for (p = 1; p <= dev->caps.num_ports; p++) {
@@ -1230,6 +1278,8 @@ static struct pci_device_id mlx4_pci_table[] = {
{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
{ 0, }
};
@@ -1264,9 +1314,11 @@ static int __init mlx4_init(void)
if (mlx4_verify_params())
return -EINVAL;
- ret = mlx4_catas_init();
- if (ret)
- return ret;
+ mlx4_catas_init();
+
+ mlx4_wq = create_singlethread_workqueue("mlx4");
+ if (!mlx4_wq)
+ return -ENOMEM;
ret = pci_register_driver(&mlx4_driver);
return ret < 0 ? ret : 0;
@@ -1275,7 +1327,7 @@ static int __init mlx4_init(void)
static void __exit mlx4_cleanup(void)
{
pci_unregister_driver(&mlx4_driver);
- mlx4_catas_cleanup();
+ destroy_workqueue(mlx4_wq);
}
module_init(mlx4_init);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index e0213bad61c..5bd79c2b184 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -40,6 +40,7 @@
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/timer.h>
+#include <linux/workqueue.h>
#include <linux/mlx4/device.h>
#include <linux/mlx4/driver.h>
@@ -276,6 +277,13 @@ struct mlx4_port_info {
struct mlx4_vlan_table vlan_table;
};
+struct mlx4_sense {
+ struct mlx4_dev *dev;
+ u8 do_sense_port[MLX4_MAX_PORTS + 1];
+ u8 sense_allowed[MLX4_MAX_PORTS + 1];
+ struct delayed_work sense_poll;
+};
+
struct mlx4_priv {
struct mlx4_dev dev;
@@ -305,6 +313,7 @@ struct mlx4_priv {
struct mlx4_uar driver_uar;
void __iomem *kar;
struct mlx4_port_info port[MLX4_MAX_PORTS + 1];
+ struct mlx4_sense sense;
struct mutex port_mutex;
};
@@ -313,6 +322,10 @@ static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
return container_of(dev, struct mlx4_priv, dev);
}
+#define MLX4_SENSE_RANGE (HZ * 3)
+
+extern struct workqueue_struct *mlx4_wq;
+
u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
@@ -346,8 +359,7 @@ void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
void mlx4_start_catas_poll(struct mlx4_dev *dev);
void mlx4_stop_catas_poll(struct mlx4_dev *dev);
-int mlx4_catas_init(void);
-void mlx4_catas_cleanup(void);
+void mlx4_catas_init(void);
int mlx4_restart_one(struct pci_dev *pdev);
int mlx4_register_device(struct mlx4_dev *dev);
void mlx4_unregister_device(struct mlx4_dev *dev);
@@ -379,6 +391,17 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
void mlx4_handle_catas_err(struct mlx4_dev *dev);
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+ enum mlx4_port_type *stype,
+ enum mlx4_port_type *defaults);
+void mlx4_start_sense(struct mlx4_dev *dev);
+void mlx4_stop_sense(struct mlx4_dev *dev);
+void mlx4_sense_init(struct mlx4_dev *dev);
+int mlx4_check_port_params(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_type);
+int mlx4_change_port_types(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_types);
+
void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c
index 0a057e5dc63..7cce3342ef8 100644
--- a/drivers/net/mlx4/port.c
+++ b/drivers/net/mlx4/port.c
@@ -298,20 +298,17 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
{
struct mlx4_cmd_mailbox *mailbox;
int err;
- u8 is_eth = dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
mailbox = mlx4_alloc_cmd_mailbox(dev);
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
memset(mailbox->buf, 0, 256);
- if (is_eth) {
- ((u8 *) mailbox->buf)[3] = 6;
- ((__be16 *) mailbox->buf)[4] = cpu_to_be16(1 << 15);
- ((__be16 *) mailbox->buf)[6] = cpu_to_be16(1 << 15);
- } else
- ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
- err = mlx4_cmd(dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+ if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+ return 0;
+
+ ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
+ err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
MLX4_CMD_TIME_CLASS_B);
mlx4_free_cmd_mailbox(dev, mailbox);
diff --git a/drivers/net/mlx4/sense.c b/drivers/net/mlx4/sense.c
new file mode 100644
index 00000000000..6d5089ecb5a
--- /dev/null
+++ b/drivers/net/mlx4/sense.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+ enum mlx4_port_type *type)
+{
+ u64 out_param;
+ int err = 0;
+
+ err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
+ MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B);
+ if (err) {
+ mlx4_err(dev, "Sense command failed for port: %d\n", port);
+ return err;
+ }
+
+ if (out_param > 2) {
+ mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param);
+ return -EINVAL;
+ }
+
+ *type = out_param;
+ return 0;
+}
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+ enum mlx4_port_type *stype,
+ enum mlx4_port_type *defaults)
+{
+ struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+ int err;
+ int i;
+
+ for (i = 1; i <= dev->caps.num_ports; i++) {
+ stype[i - 1] = 0;
+ if (sense->do_sense_port[i] && sense->sense_allowed[i] &&
+ dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+ err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]);
+ if (err)
+ stype[i - 1] = defaults[i - 1];
+ } else
+ stype[i - 1] = defaults[i - 1];
+ }
+
+ /*
+ * Adjust port configuration:
+ * If port 1 sensed nothing and port 2 is IB, set both as IB
+ * If port 2 sensed nothing and port 1 is Eth, set both as Eth
+ */
+ if (stype[0] == MLX4_PORT_TYPE_ETH) {
+ for (i = 1; i < dev->caps.num_ports; i++)
+ stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_ETH;
+ }
+ if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) {
+ for (i = 0; i < dev->caps.num_ports - 1; i++)
+ stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_IB;
+ }
+
+ /*
+ * If sensed nothing, remain in current configuration.
+ */
+ for (i = 0; i < dev->caps.num_ports; i++)
+ stype[i] = stype[i] ? stype[i] : defaults[i];
+
+}
+
+static void mlx4_sense_port(struct work_struct *work)
+{
+ struct delayed_work *delay = container_of(work, struct delayed_work, work);
+ struct mlx4_sense *sense = container_of(delay, struct mlx4_sense,
+ sense_poll);
+ struct mlx4_dev *dev = sense->dev;
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ enum mlx4_port_type stype[MLX4_MAX_PORTS];
+
+ mutex_lock(&priv->port_mutex);
+ mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]);
+
+ if (mlx4_check_port_params(dev, stype))
+ goto sense_again;
+
+ if (mlx4_change_port_types(dev, stype))
+ mlx4_err(dev, "Failed to change port_types\n");
+
+sense_again:
+ mutex_unlock(&priv->port_mutex);
+ queue_delayed_work(mlx4_wq, &sense->sense_poll,
+ round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_start_sense(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_sense *sense = &priv->sense;
+
+ if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP))
+ return;
+
+ queue_delayed_work(mlx4_wq, &sense->sense_poll,
+ round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_stop_sense(struct mlx4_dev *dev)
+{
+ cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll);
+}
+
+void mlx4_sense_init(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_sense *sense = &priv->sense;
+ int port;
+
+ sense->dev = dev;
+ for (port = 1; port <= dev->caps.num_ports; port++)
+ sense->do_sense_port[port] = 1;
+
+ INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port);
+}
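
After sensing, mlx4_do_sense_ports() normalizes the result: Ethernet on port 1 pulls any undecided port to Ethernet, IB on the last port pulls undecided ports to IB, and whatever is still undecided keeps its previous type. A runnable reduction of just those rules (enum values chosen to match IB=1/ETH=2, with 0 meaning nothing sensed):

#include <stdio.h>

enum ptype { NONE = 0, IB = 1, ETH = 2 };

int main(void)
{
        enum ptype stype[2] = { ETH, NONE };   /* port 1 Eth, port 2 nothing */
        enum ptype defaults[2] = { ETH, IB };
        int i, n = 2;

        if (stype[0] == ETH)
                for (i = 1; i < n; i++)
                        stype[i] = stype[i] ? stype[i] : ETH;
        if (stype[n - 1] == IB)
                for (i = 0; i < n - 1; i++)
                        stype[i] = stype[i] ? stype[i] : IB;
        for (i = 0; i < n; i++)
                stype[i] = stype[i] ? stype[i] : defaults[i];

        printf("port1=%d port2=%d (2 == Eth)\n", stype[0], stype[1]);
        return 0;
}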
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 325c10ff6a2..55f64af072a 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -940,21 +940,6 @@ config WDT
To compile this driver as a module, choose M here: the
module will be called wdt.
-config WDT_501
- bool "WDT501 features"
- depends on WDT
- help
- Saying Y here and creating a character special file /dev/temperature
- with major number 10 and minor number 131 ("man mknod") will give
- you a thermometer inside your computer: reading from
- /dev/temperature yields one byte, the temperature in degrees
- Fahrenheit. This works only if you have a WDT501P watchdog board
- installed.
-
- If you want to enable the Fan Tachometer on the WDT501P, then you
- can do this via the tachometer parameter. Only do this if you have a
- fan tachometer actually set up.
-
#
# PCI-based Watchdog Cards
#
diff --git a/drivers/watchdog/acquirewdt.c b/drivers/watchdog/acquirewdt.c
index 3e57aa4d643..4d18c874d96 100644
--- a/drivers/watchdog/acquirewdt.c
+++ b/drivers/watchdog/acquirewdt.c
@@ -1,7 +1,7 @@
/*
* Acquire Single Board Computer Watchdog Timer driver
*
- * Based on wdt.c. Original copyright messages:
+ * Based on wdt.c. Original copyright messages:
*
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
@@ -17,9 +17,9 @@
*
* (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk>
*
- * 14-Dec-2001 Matt Domsch <Matt_Domsch@dell.com>
- * Added nowayout module option to override CONFIG_WATCHDOG_NOWAYOUT
- * Can't add timeout - driver doesn't allow changing value
+ * 14-Dec-2001 Matt Domsch <Matt_Domsch@dell.com>
+ * Added nowayout module option to override CONFIG_WATCHDOG_NOWAYOUT
+ * Can't add timeout - driver doesn't allow changing value
*/
/*
diff --git a/drivers/watchdog/advantechwdt.c b/drivers/watchdog/advantechwdt.c
index a1d7856ea6e..824d076a5cd 100644
--- a/drivers/watchdog/advantechwdt.c
+++ b/drivers/watchdog/advantechwdt.c
@@ -138,7 +138,9 @@ static long advwdt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
int __user *p = argp;
static struct watchdog_info ident = {
- .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
+ .options = WDIOF_KEEPALIVEPING |
+ WDIOF_SETTIMEOUT |
+ WDIOF_MAGICCLOSE,
.firmware_version = 1,
.identity = WATCHDOG_NAME,
};
@@ -259,7 +261,8 @@ static int __devinit advwdt_probe(struct platform_device *dev)
goto unreg_stop;
}
- /* Check that the heartbeat value is within it's range ; if not reset to the default */
+ /* Check that the heartbeat value is within its range;
+ * if not, reset to the default */
if (advwdt_set_heartbeat(timeout)) {
advwdt_set_heartbeat(WATCHDOG_TIMEOUT);
printk(KERN_INFO PFX
diff --git a/drivers/watchdog/alim1535_wdt.c b/drivers/watchdog/alim1535_wdt.c
index 2a7690ecf97..937a80fb61e 100644
--- a/drivers/watchdog/alim1535_wdt.c
+++ b/drivers/watchdog/alim1535_wdt.c
@@ -60,7 +60,7 @@ static void ali_start(void)
pci_read_config_dword(ali_pci, 0xCC, &val);
val &= ~0x3F; /* Mask count */
- val |= (1<<25) | ali_timeout_bits;
+ val |= (1 << 25) | ali_timeout_bits;
pci_write_config_dword(ali_pci, 0xCC, val);
spin_unlock(&ali_lock);
@@ -79,8 +79,8 @@ static void ali_stop(void)
spin_lock(&ali_lock);
pci_read_config_dword(ali_pci, 0xCC, &val);
- val &= ~0x3F; /* Mask count to zero (disabled) */
- val &= ~(1<<25);/* and for safety mask the reset enable */
+ val &= ~0x3F; /* Mask count to zero (disabled) */
+ val &= ~(1 << 25); /* and for safety mask the reset enable */
pci_write_config_dword(ali_pci, 0xCC, val);
spin_unlock(&ali_lock);
@@ -89,7 +89,7 @@ static void ali_stop(void)
/*
* ali_keepalive - send a keepalive to the watchdog
*
- * Send a keepalive to the timer (actually we restart the timer).
+ * Send a keepalive to the timer (actually we restart the timer).
*/
static void ali_keepalive(void)
@@ -109,11 +109,11 @@ static int ali_settimer(int t)
if (t < 0)
return -EINVAL;
else if (t < 60)
- ali_timeout_bits = t|(1<<6);
+ ali_timeout_bits = t|(1 << 6);
else if (t < 3600)
- ali_timeout_bits = (t/60)|(1<<7);
+ ali_timeout_bits = (t / 60)|(1 << 7);
else if (t < 18000)
- ali_timeout_bits = (t/300)|(1<<6)|(1<<7);
+ ali_timeout_bits = (t / 300)|(1 << 6)|(1 << 7);
else
return -EINVAL;
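
The ali_settimer() encoding above packs both the unit and the count into one register value: bit 6 selects one-second units, bit 7 one-minute units, bits 6 and 7 together five-minute units, with the count in the low six bits. A standalone check of the math:

#include <stdio.h>

static int encode(int t)
{
        if (t < 0)
                return -1;
        else if (t < 60)
                return t | (1 << 6);                     /* 1s units */
        else if (t < 3600)
                return (t / 60) | (1 << 7);              /* 1min units */
        else if (t < 18000)
                return (t / 300) | (1 << 6) | (1 << 7);  /* 5min units */
        return -1;
}

int main(void)
{
        printf("30s  -> 0x%x\n", encode(30));    /* 0x5e: 30 | bit6 */
        printf("120s -> 0x%x\n", encode(120));   /* 0x82: 2  | bit7 */
        printf("1h   -> 0x%x\n", encode(3600));  /* 0xcc: 12 | bit6|bit7 */
        return 0;
}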
@@ -138,7 +138,7 @@ static int ali_settimer(int t)
*/
static ssize_t ali_write(struct file *file, const char __user *data,
- size_t len, loff_t *ppos)
+ size_t len, loff_t *ppos)
{
/* See if we got the magic character 'V' and reload the timer */
if (len) {
@@ -348,9 +348,9 @@ static int __init ali_find_watchdog(void)
/* Timer bits */
wdog &= ~0x3F;
/* Issued events */
- wdog &= ~((1<<27)|(1<<26)|(1<<25)|(1<<24));
+ wdog &= ~((1 << 27)|(1 << 26)|(1 << 25)|(1 << 24));
/* No monitor bits */
- wdog &= ~((1<<16)|(1<<13)|(1<<12)|(1<<11)|(1<<10)|(1<<9));
+ wdog &= ~((1 << 16)|(1 << 13)|(1 << 12)|(1 << 11)|(1 << 10)|(1 << 9));
pci_write_config_dword(pdev, 0xCC, wdog);
diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c
index a045ef86943..90f98df5f10 100644
--- a/drivers/watchdog/alim7101_wdt.c
+++ b/drivers/watchdog/alim7101_wdt.c
@@ -355,7 +355,8 @@ static int __init alim7101_wdt_init(void)
alim7101_pmu = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M7101,
NULL);
if (!alim7101_pmu) {
- printk(KERN_INFO PFX "ALi M7101 PMU not present - WDT not set\n");
+ printk(KERN_INFO PFX
+ "ALi M7101 PMU not present - WDT not set\n");
return -EBUSY;
}
@@ -399,7 +400,8 @@ static int __init alim7101_wdt_init(void)
rc = misc_register(&wdt_miscdev);
if (rc) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (err=%d)\n",
wdt_miscdev.minor, rc);
goto err_out_reboot;
}
diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index a56ac84381b..435b0573fb0 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -201,7 +201,7 @@ static long at91_wdt_ioctl(struct file *file,
* Pat the watchdog whenever device is written to.
*/
static ssize_t at91_wdt_write(struct file *file, const char *data, size_t len,
- loff_t *ppos)
+ loff_t *ppos)
{
if (!len)
return 0;
diff --git a/drivers/watchdog/cpwd.c b/drivers/watchdog/cpwd.c
index 084dfe9cecf..41070e4771a 100644
--- a/drivers/watchdog/cpwd.c
+++ b/drivers/watchdog/cpwd.c
@@ -1,13 +1,13 @@
/* cpwd.c - driver implementation for hardware watchdog
* timers found on Sun Microsystems CP1400 and CP1500 boards.
*
- * This device supports both the generic Linux watchdog
+ * This device supports both the generic Linux watchdog
* interface and Solaris-compatible ioctls as best it is
* able.
*
* NOTE: CP1400 systems appear to have a defective intr_mask
* register on the PLD, preventing the disabling of
- * timer interrupts. We use a timer to periodically
+ * timer interrupts. We use a timer to periodically
* reset 'stopped' watchdogs on affected platforms.
*
* Copyright (c) 2000 Eric Brower (ebrower@usa.net)
@@ -28,10 +28,9 @@
#include <linux/io.h>
#include <linux/of.h>
#include <linux/of_device.h>
+#include <linux/uaccess.h>
#include <asm/irq.h>
-#include <asm/uaccess.h>
-
#include <asm/watchdog.h>
#define DRIVER_NAME "cpwd"
@@ -43,8 +42,8 @@
#define WD_BLIMIT 0xFFFF
#define WD0_MINOR 212
-#define WD1_MINOR 213
-#define WD2_MINOR 214
+#define WD1_MINOR 213
+#define WD2_MINOR 214
/* Internal driver definitions. */
#define WD0_ID 0
@@ -91,16 +90,16 @@ struct cpwd {
static struct cpwd *cpwd_device;
-/* Sun uses Altera PLD EPF8820ATC144-4
+/* Sun uses Altera PLD EPF8820ATC144-4
* providing three hardware watchdogs:
*
- * 1) RIC - sends an interrupt when triggered
- * 2) XIR - asserts XIR_B_RESET when triggered, resets CPU
- * 3) POR - asserts POR_B_RESET when triggered, resets CPU, backplane, board
+ * 1) RIC - sends an interrupt when triggered
+ * 2) XIR - asserts XIR_B_RESET when triggered, resets CPU
+ * 3) POR - asserts POR_B_RESET when triggered, resets CPU, backplane, board
*
*** Timer register block definition (struct wd_timer_regblk)
*
- * dcntr and limit registers (halfword access):
+ * dcntr and limit registers (halfword access):
* -------------------
* | 15 | ...| 1 | 0 |
* -------------------
@@ -108,7 +107,8 @@ static struct cpwd *cpwd_device;
* -------------------
* dcntr - Current 16-bit downcounter value.
* When downcounter reaches '0' watchdog expires.
- * Reading this register resets downcounter with 'limit' value.
+ * Reading this register resets downcounter with
+ * 'limit' value.
* limit - 16-bit countdown value in 1/10th second increments.
* Writing this register begins countdown with input value.
* Reading from this register does not affect counter.
@@ -158,11 +158,11 @@ static int wd0_timeout = 0;
static int wd1_timeout = 0;
static int wd2_timeout = 0;
-module_param (wd0_timeout, int, 0);
+module_param(wd0_timeout, int, 0);
MODULE_PARM_DESC(wd0_timeout, "Default watchdog0 timeout in 1/10secs");
-module_param (wd1_timeout, int, 0);
+module_param(wd1_timeout, int, 0);
MODULE_PARM_DESC(wd1_timeout, "Default watchdog1 timeout in 1/10secs");
-module_param (wd2_timeout, int, 0);
+module_param(wd2_timeout, int, 0);
MODULE_PARM_DESC(wd2_timeout, "Default watchdog2 timeout in 1/10secs");
MODULE_AUTHOR("Eric Brower <ebrower@usa.net>");
@@ -201,9 +201,9 @@ static u8 cpwd_readb(void __iomem *addr)
static void cpwd_toggleintr(struct cpwd *p, int index, int enable)
{
unsigned char curregs = cpwd_readb(p->regs + PLD_IMASK);
- unsigned char setregs =
- (index == -1) ?
- (WD0_INTR_MASK | WD1_INTR_MASK | WD2_INTR_MASK) :
+ unsigned char setregs =
+ (index == -1) ?
+ (WD0_INTR_MASK | WD1_INTR_MASK | WD2_INTR_MASK) :
(p->devs[index].intr_mask);
if (enable == WD_INTR_ON)
@@ -303,24 +303,24 @@ static int cpwd_getstatus(struct cpwd *p, int index)
unsigned char ret = WD_STOPPED;
/* determine STOPPED */
- if (!stat)
+ if (!stat)
return ret;
/* determine EXPIRED vs FREERUN vs RUNNING */
else if (WD_S_EXPIRED & stat) {
ret = WD_EXPIRED;
- } else if(WD_S_RUNNING & stat) {
+ } else if (WD_S_RUNNING & stat) {
if (intr & p->devs[index].intr_mask) {
ret = WD_FREERUN;
} else {
/* Fudge WD_EXPIRED status for defective CP1400--
- * IF timer is running
- * AND brokenstop is set
+ * IF timer is running
+ * AND brokenstop is set
* AND an interrupt has been serviced
* we are WD_EXPIRED.
*
- * IF timer is running
- * AND brokenstop is set
+ * IF timer is running
+ * AND brokenstop is set
* AND no interrupt has been serviced
* we are WD_FREERUN.
*/
@@ -329,7 +329,8 @@ static int cpwd_getstatus(struct cpwd *p, int index)
if (p->devs[index].runstatus & WD_STAT_SVCD) {
ret = WD_EXPIRED;
} else {
- /* we could as well pretend we are expired */
+ /* we could as well pretend
+ * we are expired */
ret = WD_FREERUN;
}
} else {
@@ -342,7 +343,7 @@ static int cpwd_getstatus(struct cpwd *p, int index)
if (p->devs[index].runstatus & WD_STAT_SVCD)
ret |= WD_SERVICED;
- return(ret);
+ return ret;
}
static irqreturn_t cpwd_interrupt(int irq, void *dev_id)
@@ -367,22 +368,22 @@ static int cpwd_open(struct inode *inode, struct file *f)
struct cpwd *p = cpwd_device;
lock_kernel();
- switch(iminor(inode)) {
- case WD0_MINOR:
- case WD1_MINOR:
- case WD2_MINOR:
- break;
+ switch (iminor(inode)) {
+ case WD0_MINOR:
+ case WD1_MINOR:
+ case WD2_MINOR:
+ break;
- default:
- unlock_kernel();
- return -ENODEV;
+ default:
+ unlock_kernel();
+ return -ENODEV;
}
/* Register IRQ on first open of device */
if (!p->initialized) {
- if (request_irq(p->irq, &cpwd_interrupt,
+ if (request_irq(p->irq, &cpwd_interrupt,
IRQF_SHARED, DRIVER_NAME, p)) {
- printk(KERN_ERR PFX "Cannot register IRQ %d\n",
+ printk(KERN_ERR PFX "Cannot register IRQ %d\n",
p->irq);
unlock_kernel();
return -EBUSY;
@@ -400,8 +401,7 @@ static int cpwd_release(struct inode *inode, struct file *file)
return 0;
}
-static int cpwd_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
+static long cpwd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
static struct watchdog_info info = {
.options = WDIOF_SETTIMEOUT,
@@ -409,6 +409,7 @@ static int cpwd_ioctl(struct inode *inode, struct file *file,
.identity = DRIVER_NAME,
};
void __user *argp = (void __user *)arg;
+ struct inode *inode = file->f_path.dentry->d_inode;
int index = iminor(inode) - WD0_MINOR;
struct cpwd *p = cpwd_device;
int setopt = 0;
@@ -442,7 +443,7 @@ static int cpwd_ioctl(struct inode *inode, struct file *file,
cpwd_starttimer(p, index);
} else {
return -EINVAL;
- }
+ }
break;
/* Solaris-compatible IOCTLs */
@@ -458,7 +459,7 @@ static int cpwd_ioctl(struct inode *inode, struct file *file,
case WIOCSTOP:
if (p->enabled)
- return(-EINVAL);
+ return -EINVAL;
cpwd_stoptimer(p, index);
break;
@@ -481,7 +482,7 @@ static long cpwd_compat_ioctl(struct file *file, unsigned int cmd,
case WIOCSTOP:
case WIOCGSTAT:
lock_kernel();
- rval = cpwd_ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
+ rval = cpwd_ioctl(file, cmd, arg);
unlock_kernel();
break;
@@ -493,7 +494,7 @@ static long cpwd_compat_ioctl(struct file *file, unsigned int cmd,
return rval;
}
-static ssize_t cpwd_write(struct file *file, const char __user *buf,
+static ssize_t cpwd_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct inode *inode = file->f_path.dentry->d_inode;
@@ -508,20 +509,20 @@ static ssize_t cpwd_write(struct file *file, const char __user *buf,
return 0;
}
-static ssize_t cpwd_read(struct file * file, char __user *buffer,
+static ssize_t cpwd_read(struct file *file, char __user *buffer,
size_t count, loff_t *ppos)
{
return -EINVAL;
}
static const struct file_operations cpwd_fops = {
- .owner = THIS_MODULE,
- .ioctl = cpwd_ioctl,
- .compat_ioctl = cpwd_compat_ioctl,
- .open = cpwd_open,
- .write = cpwd_write,
- .read = cpwd_read,
- .release = cpwd_release,
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = cpwd_ioctl,
+ .compat_ioctl = cpwd_compat_ioctl,
+ .open = cpwd_open,
+ .write = cpwd_write,
+ .read = cpwd_read,
+ .release = cpwd_release,
};
static int __devinit cpwd_probe(struct of_device *op,
diff --git a/drivers/watchdog/davinci_wdt.c b/drivers/watchdog/davinci_wdt.c
index 2e136028673..c51d0b0ea0c 100644
--- a/drivers/watchdog/davinci_wdt.c
+++ b/drivers/watchdog/davinci_wdt.c
@@ -24,7 +24,7 @@
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/io.h>
-#include <mach/hardware.h>
+#include <linux/device.h>
#define MODULE_NAME "DAVINCI-WDT: "
@@ -75,9 +75,9 @@ static void wdt_service(void)
spin_lock(&io_lock);
/* put watchdog in service state */
- davinci_writel(WDKEY_SEQ0, wdt_base + WDTCR);
+ iowrite32(WDKEY_SEQ0, wdt_base + WDTCR);
/* put watchdog in active state */
- davinci_writel(WDKEY_SEQ1, wdt_base + WDTCR);
+ iowrite32(WDKEY_SEQ1, wdt_base + WDTCR);
spin_unlock(&io_lock);
}
@@ -90,29 +90,29 @@ static void wdt_enable(void)
spin_lock(&io_lock);
/* disable, internal clock source */
- davinci_writel(0, wdt_base + TCR);
+ iowrite32(0, wdt_base + TCR);
/* reset timer, set mode to 64-bit watchdog, and unreset */
- davinci_writel(0, wdt_base + TGCR);
+ iowrite32(0, wdt_base + TGCR);
tgcr = TIMMODE_64BIT_WDOG | TIM12RS_UNRESET | TIM34RS_UNRESET;
- davinci_writel(tgcr, wdt_base + TGCR);
+ iowrite32(tgcr, wdt_base + TGCR);
/* clear counter regs */
- davinci_writel(0, wdt_base + TIM12);
- davinci_writel(0, wdt_base + TIM34);
+ iowrite32(0, wdt_base + TIM12);
+ iowrite32(0, wdt_base + TIM34);
/* set timeout period */
timer_margin = (((u64)heartbeat * CLOCK_TICK_RATE) & 0xffffffff);
- davinci_writel(timer_margin, wdt_base + PRD12);
+ iowrite32(timer_margin, wdt_base + PRD12);
timer_margin = (((u64)heartbeat * CLOCK_TICK_RATE) >> 32);
- davinci_writel(timer_margin, wdt_base + PRD34);
+ iowrite32(timer_margin, wdt_base + PRD34);
/* enable run continuously */
- davinci_writel(ENAMODE12_PERIODIC, wdt_base + TCR);
+ iowrite32(ENAMODE12_PERIODIC, wdt_base + TCR);
/* Once the WDT is in pre-active state write to
* TIM12, TIM34, PRD12, PRD34, TCR, TGCR, WDTCR are
* write protected (except for the WDKEY field)
*/
/* put watchdog in pre-active state */
- davinci_writel(WDKEY_SEQ0 | WDEN, wdt_base + WDTCR);
+ iowrite32(WDKEY_SEQ0 | WDEN, wdt_base + WDTCR);
/* put watchdog in active state */
- davinci_writel(WDKEY_SEQ1 | WDEN, wdt_base + WDTCR);
+ iowrite32(WDKEY_SEQ1 | WDEN, wdt_base + WDTCR);
spin_unlock(&io_lock);
}
@@ -197,17 +197,16 @@ static int davinci_wdt_probe(struct platform_device *pdev)
{
int ret = 0, size;
struct resource *res;
+ struct device *dev = &pdev->dev;
if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT)
heartbeat = DEFAULT_HEARTBEAT;
- printk(KERN_INFO MODULE_NAME
- "DaVinci Watchdog Timer: heartbeat %d sec\n", heartbeat);
+ dev_info(dev, "heartbeat %d sec\n", heartbeat);
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (res == NULL) {
- printk(KERN_INFO MODULE_NAME
- "failed to get memory region resource\n");
+ dev_err(dev, "failed to get memory region resource\n");
return -ENOENT;
}
@@ -215,20 +214,26 @@ static int davinci_wdt_probe(struct platform_device *pdev)
wdt_mem = request_mem_region(res->start, size, pdev->name);
if (wdt_mem == NULL) {
- printk(KERN_INFO MODULE_NAME "failed to get memory region\n");
+ dev_err(dev, "failed to get memory region\n");
return -ENOENT;
}
- wdt_base = (void __iomem *)(res->start);
+
+ wdt_base = ioremap(res->start, size);
+ if (!wdt_base) {
+ dev_err(dev, "failed to map memory region\n");
+ return -ENOMEM;
+ }
ret = misc_register(&davinci_wdt_miscdev);
if (ret < 0) {
- printk(KERN_ERR MODULE_NAME "cannot register misc device\n");
+ dev_err(dev, "cannot register misc device\n");
release_resource(wdt_mem);
kfree(wdt_mem);
} else {
set_bit(WDT_DEVICE_INITED, &wdt_status);
}
+ iounmap(wdt_base);
return ret;
}
diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c
index a171fc6ae1c..9add3541fb4 100644
--- a/drivers/watchdog/eurotechwdt.c
+++ b/drivers/watchdog/eurotechwdt.c
@@ -8,19 +8,19 @@
* Based on wdt.c.
* Original copyright messages:
*
- * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
+ * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*
- * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
- * warranty for any of this software. This material is provided
- * "AS-IS" and at no charge.
+ * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide
+ * warranty for any of this software. This material is provided
+ * "AS-IS" and at no charge.
*
- * (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk>*
+ * (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk>*
*/
/* Changelog:
@@ -37,7 +37,7 @@
* add expect_close support
*
* 2002.05.30 - Joel Becker <joel.becker@oracle.com>
- * Added Matt Domsch's nowayout module option.
+ * Added Matt Domsch's nowayout module option.
*/
/*
@@ -151,7 +151,7 @@ static void eurwdt_activate_timer(void)
if (irq == 0)
printk(KERN_INFO ": interrupt disabled\n");
- eurwdt_write_reg(WDT_TIMER_CFG, irq<<4);
+ eurwdt_write_reg(WDT_TIMER_CFG, irq << 4);
eurwdt_write_reg(WDT_UNIT_SEL, WDT_UNIT_SECS); /* we use seconds */
eurwdt_set_timeout(0); /* the default timeout */
diff --git a/drivers/watchdog/geodewdt.c b/drivers/watchdog/geodewdt.c
index 6799a6de66f..9acf0015a1e 100644
--- a/drivers/watchdog/geodewdt.c
+++ b/drivers/watchdog/geodewdt.c
@@ -34,11 +34,15 @@
static int timeout = WATCHDOG_TIMEOUT;
module_param(timeout, int, 0);
-MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. 1<= timeout <=131, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ".");
+MODULE_PARM_DESC(timeout,
+ "Watchdog timeout in seconds. 1<= timeout <=131, default="
+ __MODULE_STRING(WATCHDOG_TIMEOUT) ".");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+MODULE_PARM_DESC(nowayout,
+ "Watchdog cannot be stopped once started (default="
+ __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
static struct platform_device *geodewdt_platform_device;
static unsigned long wdt_flags;
@@ -269,7 +273,8 @@ static int __init geodewdt_init(void)
if (ret)
return ret;
- geodewdt_platform_device = platform_device_register_simple(DRV_NAME, -1, NULL, 0);
+ geodewdt_platform_device = platform_device_register_simple(DRV_NAME,
+ -1, NULL, 0);
if (IS_ERR(geodewdt_platform_device)) {
ret = PTR_ERR(geodewdt_platform_device);
goto err;
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 763c1ea5dce..6cf155d6b35 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -47,6 +47,7 @@
#define PCI_BIOS32_PARAGRAPH_LEN 16
#define PCI_ROM_BASE1 0x000F0000
#define ROM_SIZE 0x10000
+#define HPWDT_VERSION "1.01"
struct bios32_service_dir {
u32 signature;
@@ -130,17 +131,14 @@ static void *cru_rom_addr;
static struct cmn_registers cmn_regs;
static struct pci_device_id hpwdt_devices[] = {
- {
- .vendor = PCI_VENDOR_ID_COMPAQ,
- .device = 0xB203,
- .subvendor = PCI_ANY_ID,
- .subdevice = PCI_ANY_ID,
- },
+ { PCI_DEVICE(PCI_VENDOR_ID_COMPAQ, 0xB203) },
+ { PCI_DEVICE(PCI_VENDOR_ID_HP, 0x3306) },
{0}, /* terminate list */
};
MODULE_DEVICE_TABLE(pci, hpwdt_devices);
-extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs, unsigned long *pRomEntry);
+extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs,
+ unsigned long *pRomEntry);
#ifndef CONFIG_X86_64
/* --32 Bit Bios------------------------------------------------------------ */
@@ -605,7 +603,7 @@ static long hpwdt_ioctl(struct file *file, unsigned int cmd,
/*
* Kernel interfaces
*/
-static struct file_operations hpwdt_fops = {
+static const struct file_operations hpwdt_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.write = hpwdt_write,
@@ -704,10 +702,11 @@ static int __devinit hpwdt_init_one(struct pci_dev *dev,
}
printk(KERN_INFO
- "hp Watchdog Timer Driver: 1.00"
+ "hp Watchdog Timer Driver: %s"
", timer margin: %d seconds (nowayout=%d)"
", allow kernel dump: %s (default = 0/OFF).\n",
- soft_margin, nowayout, (allow_kdump == 0) ? "OFF" : "ON");
+ HPWDT_VERSION, soft_margin, nowayout,
+ (allow_kdump == 0) ? "OFF" : "ON");
return 0;
@@ -757,6 +756,7 @@ static int __init hpwdt_init(void)
MODULE_AUTHOR("Tom Mingarelli");
MODULE_DESCRIPTION("hp watchdog driver");
MODULE_LICENSE("GPL");
+MODULE_VERSION(HPWDT_VERSION);
MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
module_param(soft_margin, int, 0);
diff --git a/drivers/watchdog/i6300esb.c b/drivers/watchdog/i6300esb.c
index 74f951c18b9..2dbe83570d6 100644
--- a/drivers/watchdog/i6300esb.c
+++ b/drivers/watchdog/i6300esb.c
@@ -13,7 +13,7 @@
*
* The timer is implemented in the following I/O controller hubs:
* (See the intel documentation on http://developer.intel.com.)
- * 6300ESB chip : document number 300641-003
+ * 6300ESB chip : document number 300641-004
*
* 2004YYZZ Ross Biro
* Initial version 0.01
@@ -34,7 +34,7 @@
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/watchdog.h>
-#include <linux/reboot.h>
+#include <linux/platform_device.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/ioport.h>
@@ -42,7 +42,7 @@
#include <linux/io.h>
/* Module and version information */
-#define ESB_VERSION "0.03"
+#define ESB_VERSION "0.04"
#define ESB_MODULE_NAME "i6300ESB timer"
#define ESB_DRIVER_NAME ESB_MODULE_NAME ", v" ESB_VERSION
#define PFX ESB_MODULE_NAME ": "
@@ -81,6 +81,7 @@ static unsigned long timer_alive;
static struct pci_dev *esb_pci;
static unsigned short triggered; /* The status of the watchdog upon boot */
static char esb_expect_close;
+static struct platform_device *esb_platform_device;
/* module parameters */
/* 30 sec default heartbeat (1 < heartbeat < 2*1023) */
@@ -114,13 +115,18 @@ static inline void esb_unlock_registers(void)
writeb(ESB_UNLOCK2, ESB_RELOAD_REG);
}
-static void esb_timer_start(void)
+static int esb_timer_start(void)
{
u8 val;
+ spin_lock(&esb_lock);
+ esb_unlock_registers();
+ writew(ESB_WDT_RELOAD, ESB_RELOAD_REG);
/* Enable or Enable + Lock? */
val = 0x02 | (nowayout ? 0x01 : 0x00);
pci_write_config_byte(esb_pci, ESB_LOCK_REG, val);
+ spin_unlock(&esb_lock);
+ return 0;
}
static int esb_timer_stop(void)
@@ -207,7 +213,6 @@ static int esb_open(struct inode *inode, struct file *file)
return -EBUSY;
/* Reload and activate timer */
- esb_timer_keepalive();
esb_timer_start();
return nonseekable_open(inode, file);
@@ -240,7 +245,8 @@ static ssize_t esb_write(struct file *file, const char __user *data,
* five months ago... */
esb_expect_close = 0;
- /* scan to see whether or not we got the magic character */
+ /* scan to see whether or not we got the
+ * magic character */
for (i = 0; i != len; i++) {
char c;
if (get_user(c, data + i))
@@ -292,7 +298,6 @@ static long esb_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
if (new_options & WDIOS_ENABLECARD) {
- esb_timer_keepalive();
esb_timer_start();
retval = 0;
}
@@ -319,19 +324,6 @@ static long esb_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
/*
- * Notify system
- */
-
-static int esb_notify_sys(struct notifier_block *this,
- unsigned long code, void *unused)
-{
- if (code == SYS_DOWN || code == SYS_HALT)
- esb_timer_stop(); /* Turn the WDT off */
-
- return NOTIFY_DONE;
-}
-
-/*
* Kernel Interfaces
*/
@@ -350,10 +342,6 @@ static struct miscdevice esb_miscdev = {
.fops = &esb_fops,
};
-static struct notifier_block esb_notifier = {
- .notifier_call = esb_notify_sys,
-};
-
/*
* Data for PCI driver interface
*
@@ -372,7 +360,7 @@ MODULE_DEVICE_TABLE(pci, esb_pci_tbl);
* Init & exit routines
*/
-static unsigned char __init esb_getdevice(void)
+static unsigned char __devinit esb_getdevice(void)
{
u8 val1;
unsigned short val2;
@@ -443,7 +431,7 @@ err_devput:
return 0;
}
-static int __init watchdog_init(void)
+static int __devinit esb_probe(struct platform_device *dev)
{
int ret;
@@ -459,19 +447,13 @@ static int __init watchdog_init(void)
"heartbeat value must be 1<heartbeat<2046, using %d\n",
heartbeat);
}
- ret = register_reboot_notifier(&esb_notifier);
- if (ret != 0) {
- printk(KERN_ERR PFX
- "cannot register reboot notifier (err=%d)\n", ret);
- goto err_unmap;
- }
ret = misc_register(&esb_miscdev);
if (ret != 0) {
printk(KERN_ERR PFX
"cannot register miscdev on minor=%d (err=%d)\n",
WATCHDOG_MINOR, ret);
- goto err_notifier;
+ goto err_unmap;
}
esb_timer_stop();
printk(KERN_INFO PFX
@@ -479,8 +461,6 @@ static int __init watchdog_init(void)
BASEADDR, heartbeat, nowayout);
return 0;
-err_notifier:
- unregister_reboot_notifier(&esb_notifier);
err_unmap:
iounmap(BASEADDR);
/* err_release: */
@@ -492,7 +472,7 @@ err_unmap:
return ret;
}
-static void __exit watchdog_cleanup(void)
+static int __devexit esb_remove(struct platform_device *dev)
{
/* Stop the timer before we leave */
if (!nowayout)
@@ -500,11 +480,58 @@ static void __exit watchdog_cleanup(void)
/* Deregister */
misc_deregister(&esb_miscdev);
- unregister_reboot_notifier(&esb_notifier);
iounmap(BASEADDR);
pci_release_region(esb_pci, 0);
pci_disable_device(esb_pci);
pci_dev_put(esb_pci);
+ return 0;
+}
+
+static void esb_shutdown(struct platform_device *dev)
+{
+ esb_timer_stop();
+}
+
+static struct platform_driver esb_platform_driver = {
+ .probe = esb_probe,
+ .remove = __devexit_p(esb_remove),
+ .shutdown = esb_shutdown,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = ESB_MODULE_NAME,
+ },
+};
+
+static int __init watchdog_init(void)
+{
+ int err;
+
+ printk(KERN_INFO PFX "Intel 6300ESB WatchDog Timer Driver v%s\n",
+ ESB_VERSION);
+
+ err = platform_driver_register(&esb_platform_driver);
+ if (err)
+ return err;
+
+ esb_platform_device = platform_device_register_simple(ESB_MODULE_NAME,
+ -1, NULL, 0);
+ if (IS_ERR(esb_platform_device)) {
+ err = PTR_ERR(esb_platform_device);
+ goto unreg_platform_driver;
+ }
+
+ return 0;
+
+unreg_platform_driver:
+ platform_driver_unregister(&esb_platform_driver);
+ return err;
+}
+
+static void __exit watchdog_cleanup(void)
+{
+ platform_device_unregister(esb_platform_device);
+ platform_driver_unregister(&esb_platform_driver);
+ printk(KERN_INFO PFX "Watchdog Module Unloaded.\n");
}
module_init(watchdog_init);
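Aside (illustrative only, not part of the patch): the i6300esb conversion above is the standard notifier-to-platform-driver migration: the reboot notifier goes away and a .shutdown hook stops the hardware instead. A minimal sketch of the register/unwind sequence, with hypothetical mywdt_* names:

	#include <linux/module.h>
	#include <linux/platform_device.h>

	static struct platform_device *mywdt_pdev;

	static int __devinit mywdt_probe(struct platform_device *dev) { return 0; }
	static int __devexit mywdt_remove(struct platform_device *dev) { return 0; }
	static void mywdt_shutdown(struct platform_device *dev)
	{
		/* stop the watchdog here; replaces the SYS_DOWN/SYS_HALT notifier */
	}

	static struct platform_driver mywdt_driver = {
		.probe    = mywdt_probe,
		.remove   = __devexit_p(mywdt_remove),
		.shutdown = mywdt_shutdown,
		.driver   = { .owner = THIS_MODULE, .name = "mywdt" },
	};

	static int __init mywdt_init(void)
	{
		int err = platform_driver_register(&mywdt_driver);
		if (err)
			return err;
		mywdt_pdev = platform_device_register_simple("mywdt", -1, NULL, 0);
		if (IS_ERR(mywdt_pdev)) {
			platform_driver_unregister(&mywdt_driver);	/* unwind */
			return PTR_ERR(mywdt_pdev);
		}
		return 0;
	}

	static void __exit mywdt_exit(void)
	{
		platform_device_unregister(mywdt_pdev);
		platform_driver_unregister(&mywdt_driver);
	}
	module_init(mywdt_init);
	module_exit(mywdt_exit);
	MODULE_LICENSE("GPL");

Note the exit path unregisters in reverse order of registration, exactly as watchdog_cleanup() does above.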
diff --git a/drivers/watchdog/iTCO_vendor_support.c b/drivers/watchdog/iTCO_vendor_support.c
index d8264ad0be4..d3c0f6de552 100644
--- a/drivers/watchdog/iTCO_vendor_support.c
+++ b/drivers/watchdog/iTCO_vendor_support.c
@@ -47,7 +47,8 @@
static int vendorsupport;
module_param(vendorsupport, int, 0);
-MODULE_PARM_DESC(vendorsupport, "iTCO vendor specific support mode, default=0 (none), 1=SuperMicro Pent3, 2=SuperMicro Pent4+");
+MODULE_PARM_DESC(vendorsupport, "iTCO vendor specific support mode, default="
+ "0 (none), 1=SuperMicro Pent3, 2=SuperMicro Pent4+");
/*
* Vendor Specific Support
@@ -305,7 +306,8 @@ static void __exit iTCO_vendor_exit_module(void)
module_init(iTCO_vendor_init_module);
module_exit(iTCO_vendor_exit_module);
-MODULE_AUTHOR("Wim Van Sebroeck <wim@iguana.be>, R. Seretny <lkpatches@paypc.com>");
+MODULE_AUTHOR("Wim Van Sebroeck <wim@iguana.be>, "
+ "R. Seretny <lkpatches@paypc.com>");
MODULE_DESCRIPTION("Intel TCO Vendor Specific WatchDog Timer Driver Support");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 352334947ea..648250b998c 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -273,7 +273,9 @@ static struct platform_device *iTCO_wdt_platform_device;
#define WATCHDOG_HEARTBEAT 30 /* 30 sec default heartbeat */
static int heartbeat = WATCHDOG_HEARTBEAT; /* in seconds */
module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. "
+ "(2<heartbeat<39 (TCO v1) or 613 (TCO v2), default="
+ __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
@@ -346,7 +348,8 @@ static int iTCO_wdt_start(void)
/* disable chipset's NO_REBOOT bit */
if (iTCO_wdt_unset_NO_REBOOT_bit()) {
spin_unlock(&iTCO_wdt_private.io_lock);
- printk(KERN_ERR PFX "failed to reset NO_REBOOT flag, reboot disabled by hardware\n");
+ printk(KERN_ERR PFX "failed to reset NO_REBOOT flag, "
+ "reboot disabled by hardware\n");
return -EIO;
}
@@ -669,7 +672,8 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
/* Check chipset's NO_REBOOT bit */
if (iTCO_wdt_unset_NO_REBOOT_bit() && iTCO_vendor_check_noreboot_on()) {
- printk(KERN_ERR PFX "failed to reset NO_REBOOT flag, reboot disabled by hardware\n");
+ printk(KERN_ERR PFX "failed to reset NO_REBOOT flag, "
+ "reboot disabled by hardware\n");
ret = -ENODEV; /* Cannot reset NO_REBOOT bit */
goto out;
}
@@ -716,8 +720,9 @@ static int __devinit iTCO_wdt_init(struct pci_dev *pdev,
if not reset to the default */
if (iTCO_wdt_set_heartbeat(heartbeat)) {
iTCO_wdt_set_heartbeat(WATCHDOG_HEARTBEAT);
- printk(KERN_INFO PFX "heartbeat value must be 2 < heartbeat < 39 (TCO v1) or 613 (TCO v2), using %d\n",
- heartbeat);
+ printk(KERN_INFO PFX
+ "heartbeat value must be 2 < heartbeat < 39 (TCO v1) "
+ "or 613 (TCO v2), using %d\n", heartbeat);
}
ret = misc_register(&iTCO_wdt_miscdev);
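Aside (illustrative only, not part of the patch): the MODULE_PARM_DESC splits in the iTCO hunks rely on two C features: adjacent string literals concatenate at compile time, and __MODULE_STRING() stringifies a numeric macro so it can join that concatenation. A self-contained sketch:

	#include <linux/module.h>
	#include <linux/moduleparam.h>

	#define WATCHDOG_HEARTBEAT 30

	static int heartbeat = WATCHDOG_HEARTBEAT;
	module_param(heartbeat, int, 0);
	/* The literals below fuse into one string at compile time;
	 * __MODULE_STRING(WATCHDOG_HEARTBEAT) expands to "30", so the
	 * description the user sees ends with "default=30)". */
	MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds "
		"(default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");

Splitting at literal boundaries changes only the source layout, never the string reported by modinfo.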
diff --git a/drivers/watchdog/it87_wdt.c b/drivers/watchdog/it87_wdt.c
index afb8af397a9..cc133c531d0 100644
--- a/drivers/watchdog/it87_wdt.c
+++ b/drivers/watchdog/it87_wdt.c
@@ -188,8 +188,8 @@ static inline int superio_inb(int reg)
static inline void superio_outb(int val, int reg)
{
- outb(reg, REG);
- outb(val, VAL);
+ outb(reg, REG);
+ outb(val, VAL);
}
static inline int superio_inw(int reg)
@@ -204,10 +204,10 @@ static inline int superio_inw(int reg)
static inline void superio_outw(int val, int reg)
{
- outb(reg++, REG);
- outb(val >> 8, VAL);
- outb(reg, REG);
- outb(val, VAL);
+ outb(reg++, REG);
+ outb(val >> 8, VAL);
+ outb(reg, REG);
+ outb(val, VAL);
}
/* watchdog timer handling */
diff --git a/drivers/watchdog/mpc5200_wdt.c b/drivers/watchdog/mpc5200_wdt.c
index db91892558f..465fe36adad 100644
--- a/drivers/watchdog/mpc5200_wdt.c
+++ b/drivers/watchdog/mpc5200_wdt.c
@@ -9,8 +9,8 @@
#include <asm/mpc52xx.h>
-#define GPT_MODE_WDT (1<<15)
-#define GPT_MODE_CE (1<<12)
+#define GPT_MODE_WDT (1 << 15)
+#define GPT_MODE_CE (1 << 12)
#define GPT_MODE_MS_TIMER (0x4)
diff --git a/drivers/watchdog/mpcore_wdt.c b/drivers/watchdog/mpcore_wdt.c
index 1130ad697ce..1512ab8b175 100644
--- a/drivers/watchdog/mpcore_wdt.c
+++ b/drivers/watchdog/mpcore_wdt.c
@@ -5,7 +5,7 @@
*
* Based on the SoftDog driver:
* (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
- * All Rights Reserved.
+ * All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c
index 3acce623f20..539b6f6ba7f 100644
--- a/drivers/watchdog/mtx-1_wdt.c
+++ b/drivers/watchdog/mtx-1_wdt.c
@@ -5,7 +5,7 @@
* All Rights Reserved.
* http://www.4g-systems.biz
*
- * (C) Copyright 2007 OpenWrt.org, Florian Fainelli <florian@openwrt.org>
+ * (C) Copyright 2007 OpenWrt.org, Florian Fainelli <florian@openwrt.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/drivers/watchdog/orion5x_wdt.c b/drivers/watchdog/orion5x_wdt.c
index b64ae1a1783..e81441f103d 100644
--- a/drivers/watchdog/orion5x_wdt.c
+++ b/drivers/watchdog/orion5x_wdt.c
@@ -16,11 +16,13 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
+#include <linux/platform_device.h>
#include <linux/watchdog.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/spinlock.h>
+#include <plat/orion5x_wdt.h>
/*
* Watchdog timer block registers.
@@ -29,13 +31,14 @@
#define WDT_EN 0x0010
#define WDT_VAL (TIMER_VIRT_BASE + 0x0024)
-#define ORION5X_TCLK 166666667
-#define WDT_MAX_DURATION (0xffffffff / ORION5X_TCLK)
+#define WDT_MAX_CYCLE_COUNT 0xffffffff
#define WDT_IN_USE 0
#define WDT_OK_TO_CLOSE 1
static int nowayout = WATCHDOG_NOWAYOUT;
-static int heartbeat = WDT_MAX_DURATION; /* (seconds) */
+static int heartbeat = -1; /* module parameter (seconds) */
+static unsigned int wdt_max_duration; /* (seconds) */
+static unsigned int wdt_tclk;
static unsigned long wdt_status;
static spinlock_t wdt_lock;
@@ -46,7 +49,7 @@ static void wdt_enable(void)
spin_lock(&wdt_lock);
/* Set watchdog duration */
- writel(ORION5X_TCLK * heartbeat, WDT_VAL);
+ writel(wdt_tclk * heartbeat, WDT_VAL);
/* Clear watchdog timer interrupt */
reg = readl(BRIDGE_CAUSE);
@@ -88,7 +91,7 @@ static void wdt_disable(void)
static int orion5x_wdt_get_timeleft(int *time_left)
{
spin_lock(&wdt_lock);
- *time_left = readl(WDT_VAL) / ORION5X_TCLK;
+ *time_left = readl(WDT_VAL) / wdt_tclk;
spin_unlock(&wdt_lock);
return 0;
}
@@ -158,7 +161,7 @@ static long orion5x_wdt_ioctl(struct file *file, unsigned int cmd,
if (ret)
break;
- if (time <= 0 || time > WDT_MAX_DURATION) {
+ if (time <= 0 || time > wdt_max_duration) {
ret = -EINVAL;
break;
}
@@ -210,23 +213,69 @@ static struct miscdevice orion5x_wdt_miscdev = {
.fops = &orion5x_wdt_fops,
};
-static int __init orion5x_wdt_init(void)
+static int __devinit orion5x_wdt_probe(struct platform_device *pdev)
{
+ struct orion5x_wdt_platform_data *pdata = pdev->dev.platform_data;
int ret;
- spin_lock_init(&wdt_lock);
+ if (pdata) {
+ wdt_tclk = pdata->tclk;
+ } else {
+ printk(KERN_ERR "Orion5x Watchdog misses platform data\n");
+ return -ENODEV;
+ }
+
+ if (orion5x_wdt_miscdev.parent)
+ return -EBUSY;
+ orion5x_wdt_miscdev.parent = &pdev->dev;
+
+ wdt_max_duration = WDT_MAX_CYCLE_COUNT / wdt_tclk;
+ if (heartbeat <= 0 || heartbeat > wdt_max_duration)
+ heartbeat = wdt_max_duration;
ret = misc_register(&orion5x_wdt_miscdev);
- if (ret == 0)
- printk("Orion5x Watchdog Timer: heartbeat %d sec\n",
- heartbeat);
+ if (ret)
+ return ret;
+
+ printk(KERN_INFO "Orion5x Watchdog Timer: Initial timeout %d sec%s\n",
+ heartbeat, nowayout ? ", nowayout" : "");
+ return 0;
+}
+
+static int __devexit orion5x_wdt_remove(struct platform_device *pdev)
+{
+ int ret;
+
+ if (test_bit(WDT_IN_USE, &wdt_status)) {
+ wdt_disable();
+ clear_bit(WDT_IN_USE, &wdt_status);
+ }
+
+ ret = misc_deregister(&orion5x_wdt_miscdev);
+ if (!ret)
+ orion5x_wdt_miscdev.parent = NULL;
return ret;
}
+static struct platform_driver orion5x_wdt_driver = {
+ .probe = orion5x_wdt_probe,
+ .remove = __devexit_p(orion5x_wdt_remove),
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "orion5x_wdt",
+ },
+};
+
+static int __init orion5x_wdt_init(void)
+{
+ spin_lock_init(&wdt_lock);
+ return platform_driver_register(&orion5x_wdt_driver);
+}
+
static void __exit orion5x_wdt_exit(void)
{
- misc_deregister(&orion5x_wdt_miscdev);
+ platform_driver_unregister(&orion5x_wdt_driver);
}
module_init(orion5x_wdt_init);
@@ -236,8 +285,7 @@ MODULE_AUTHOR("Sylver Bruneau <sylver.bruneau@googlemail.com>");
MODULE_DESCRIPTION("Orion5x Processor Watchdog");
module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds (default is "
- __MODULE_STRING(WDT_MAX_DURATION) ")");
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds");
module_param(nowayout, int, 0);
MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started");
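Aside (illustrative only, not part of the patch): the orion5x_wdt rework above stops hard-coding ORION5X_TCLK and reads the timer clock from platform data at probe time. A sketch of how board code might supply it, assuming from the pdata->tclk use above that struct orion5x_wdt_platform_data carries a single tclk field:

	#include <linux/platform_device.h>
	#include <plat/orion5x_wdt.h>

	static struct orion5x_wdt_platform_data wdt_data = {
		.tclk = 166666667,	/* clock feeding the WDT counter, in Hz */
	};

	static struct platform_device wdt_device = {
		.name = "orion5x_wdt",
		.id   = -1,
		.dev  = {
			.platform_data = &wdt_data,
		},
	};

	/* board init would call: platform_device_register(&wdt_device); */

With the clock supplied per board, the driver derives wdt_max_duration at probe time instead of baking in a compile-time constant.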
diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c
index 484c215e9f3..1a2b916e3f8 100644
--- a/drivers/watchdog/pc87413_wdt.c
+++ b/drivers/watchdog/pc87413_wdt.c
@@ -536,7 +536,8 @@ static int __init pc87413_init(void)
ret = misc_register(&pc87413_miscdev);
if (ret != 0) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (err=%d)\n",
WATCHDOG_MINOR, ret);
unregister_reboot_notifier(&pc87413_notifier);
return ret;
@@ -574,7 +575,8 @@ static void __exit pc87413_exit(void)
module_init(pc87413_init);
module_exit(pc87413_exit);
-MODULE_AUTHOR("Sven Anders <anders@anduras.de>, Marcus Junker <junker@anduras.de>,");
+MODULE_AUTHOR("Sven Anders <anders@anduras.de>, "
+ "Marcus Junker <junker@anduras.de>,");
MODULE_DESCRIPTION("PC87413 WDT driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c
index 9e1331a3b21..aa9512321f3 100644
--- a/drivers/watchdog/pcwd.c
+++ b/drivers/watchdog/pcwd.c
@@ -24,25 +24,25 @@
* version reporting. Added read routine for temperature.
* Removed some extra defines, added an autodetect Revision
* routine.
- * 961006 Revised some documentation, fixed some cosmetic bugs. Made
- * drivers to panic the system if it's overheating at bootup.
+ * 961006 Revised some documentation, fixed some cosmetic bugs. Made
+ * drivers to panic the system if it's overheating at bootup.
* 961118 Changed some verbiage on some of the output, tidied up
* code bits, and added compatibility to 2.1.x.
- * 970912 Enabled board on open and disable on close.
+ * 970912 Enabled board on open and disable on close.
* 971107 Took account of recent VFS changes (broke read).
- * 971210 Disable board on initialisation in case board already ticking.
- * 971222 Changed open/close for temperature handling
- * Michael Meskes <meskes@debian.org>.
- * 980112 Used minor numbers from include/linux/miscdevice.h
- * 990403 Clear reset status after reading control status register in
- * pcwd_showprevstate(). [Marc Boucher <marc@mbsi.ca>]
+ * 971210 Disable board on initialisation in case board already ticking.
+ * 971222 Changed open/close for temperature handling
+ * Michael Meskes <meskes@debian.org>.
+ * 980112 Used minor numbers from include/linux/miscdevice.h
+ * 990403 Clear reset status after reading control status register in
+ * pcwd_showprevstate(). [Marc Boucher <marc@mbsi.ca>]
* 990605 Made changes to code to support Firmware 1.22a, added
* fairly useless proc entry.
* 990610 removed said useless proc code for the merge <alan>
* 000403 Removed last traces of proc code. <davej>
* 011214 Added nowayout module option to override
* CONFIG_WATCHDOG_NOWAYOUT <Matt_Domsch@dell.com>
- * Added timeout module option to override default
+ * Added timeout module option to override default
*/
/*
@@ -76,8 +76,7 @@
#define WATCHDOG_DRIVER_NAME "ISA-PC Watchdog"
#define WATCHDOG_NAME "pcwd"
#define PFX WATCHDOG_NAME ": "
-#define DRIVER_VERSION WATCHDOG_DRIVER_NAME " driver, v" WATCHDOG_VERSION " (" WATCHDOG_DATE ")\n"
-#define WD_VER WATCHDOG_VERSION " (" WATCHDOG_DATE ")"
+#define DRIVER_VERSION WATCHDOG_DRIVER_NAME " driver, v" WATCHDOG_VERSION "\n"
/*
* It should be noted that PCWD_REVISION_B was removed because A and B
@@ -200,7 +199,9 @@ MODULE_PARM_DESC(debug,
#define WATCHDOG_HEARTBEAT 0
static int heartbeat = WATCHDOG_HEARTBEAT;
module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (2 <= heartbeat <= 7200 or 0=delay-time from dip-switches, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. "
+ "(2 <= heartbeat <= 7200 or 0=delay-time from dip-switches, default="
+ __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
@@ -239,7 +240,8 @@ static int send_isa_command(int cmd)
}
if (debug >= DEBUG)
- printk(KERN_DEBUG PFX "received following data for cmd=0x%02x: port0=0x%02x last_port0=0x%02x\n",
+ printk(KERN_DEBUG PFX "received following data for "
+ "cmd=0x%02x: port0=0x%02x last_port0=0x%02x\n",
cmd, port0, last_port0);
return port0;
@@ -339,10 +341,12 @@ static void pcwd_show_card_info(void)
pcwd_private.io_addr);
else if (pcwd_private.revision == PCWD_REVISION_C) {
pcwd_get_firmware();
- printk(KERN_INFO PFX "ISA-PC Watchdog (REV.C) detected at port 0x%04x (Firmware version: %s)\n",
+ printk(KERN_INFO PFX "ISA-PC Watchdog (REV.C) detected at port "
+ "0x%04x (Firmware version: %s)\n",
pcwd_private.io_addr, pcwd_private.fw_ver_str);
option_switches = pcwd_get_option_switches();
- printk(KERN_INFO PFX "Option switches (0x%02x): Temperature Reset Enable=%s, Power On Delay=%s\n",
+ printk(KERN_INFO PFX "Option switches (0x%02x): "
+ "Temperature Reset Enable=%s, Power On Delay=%s\n",
option_switches,
((option_switches & 0x10) ? "ON" : "OFF"),
((option_switches & 0x08) ? "ON" : "OFF"));
@@ -358,7 +362,8 @@ static void pcwd_show_card_info(void)
printk(KERN_INFO PFX "Temperature Option Detected\n");
if (pcwd_private.boot_status & WDIOF_CARDRESET)
- printk(KERN_INFO PFX "Previous reboot was caused by the card\n");
+ printk(KERN_INFO PFX
+ "Previous reboot was caused by the card\n");
if (pcwd_private.boot_status & WDIOF_OVERHEAT) {
printk(KERN_EMERG PFX
@@ -871,7 +876,7 @@ static int __devinit pcwd_isa_probe(struct device *dev, unsigned int id)
cards_found++;
if (cards_found == 1)
printk(KERN_INFO PFX "v%s Ken Hollis (kenji@bitgate.com)\n",
- WD_VER);
+ WATCHDOG_VERSION);
if (cards_found > 1) {
printk(KERN_ERR PFX "This driver only supports 1 device\n");
@@ -1026,7 +1031,8 @@ static void __exit pcwd_cleanup_module(void)
module_init(pcwd_init_module);
module_exit(pcwd_cleanup_module);
-MODULE_AUTHOR("Ken Hollis <kenji@bitgate.com>, Wim Van Sebroeck <wim@iguana.be>");
+MODULE_AUTHOR("Ken Hollis <kenji@bitgate.com>, "
+ "Wim Van Sebroeck <wim@iguana.be>");
MODULE_DESCRIPTION("Berkshire ISA-PC Watchdog driver");
MODULE_VERSION(WATCHDOG_VERSION);
MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/pcwd_pci.c b/drivers/watchdog/pcwd_pci.c
index 5d76422c402..698f51bff1b 100644
--- a/drivers/watchdog/pcwd_pci.c
+++ b/drivers/watchdog/pcwd_pci.c
@@ -24,7 +24,8 @@
* A bells and whistles driver is available from:
* http://www.kernel.org/pub/linux/kernel/people/wim/pcwd/pcwd_pci/
*
- * More info available at http://www.berkprod.com/ or http://www.pcwatchdog.com/
+ * More info available at
+ * http://www.berkprod.com/ or http://www.pcwatchdog.com/
*/
/*
@@ -51,11 +52,10 @@
/* Module and version information */
#define WATCHDOG_VERSION "1.03"
-#define WATCHDOG_DATE "21 Jan 2007"
#define WATCHDOG_DRIVER_NAME "PCI-PC Watchdog"
#define WATCHDOG_NAME "pcwd_pci"
#define PFX WATCHDOG_NAME ": "
-#define DRIVER_VERSION WATCHDOG_DRIVER_NAME " driver, v" WATCHDOG_VERSION " (" WATCHDOG_DATE ")\n"
+#define DRIVER_VERSION WATCHDOG_DRIVER_NAME " driver, v" WATCHDOG_VERSION "\n"
/* Stuff for the PCI ID's */
#ifndef PCI_VENDOR_ID_QUICKLOGIC
@@ -76,7 +76,8 @@
#define WD_PCI_TTRP 0x04 /* Temperature Trip status */
#define WD_PCI_RL2A 0x08 /* Relay 2 Active */
#define WD_PCI_RL1A 0x10 /* Relay 1 Active */
-#define WD_PCI_R2DS 0x40 /* Relay 2 Disable Temperature-trip/reset */
+#define WD_PCI_R2DS 0x40 /* Relay 2 Disable Temperature-trip /
+ reset */
#define WD_PCI_RLY2 0x80 /* Activate Relay 2 on the board */
/* Port 2 : Control Status #2 */
#define WD_PCI_WDIS 0x10 /* Watchdog Disable */
@@ -114,12 +115,18 @@ static int cards_found;
static int temp_panic;
static unsigned long is_active;
static char expect_release;
-static struct { /* this is private data for each PCI-PC watchdog card */
- int supports_temp; /* Wether or not the card has a temperature device */
- int boot_status; /* The card's boot status */
- unsigned long io_addr; /* The cards I/O address */
- spinlock_t io_lock; /* the lock for io operations */
- struct pci_dev *pdev; /* the PCI-device */
+/* this is private data for each PCI-PC watchdog card */
+static struct {
+ /* Whether or not the card has a temperature device */
+ int supports_temp;
+ /* The card's boot status */
+ int boot_status;
+ /* The cards I/O address */
+ unsigned long io_addr;
+ /* the lock for io operations */
+ spinlock_t io_lock;
+ /* the PCI-device */
+ struct pci_dev *pdev;
} pcipcwd_private;
/* module parameters */
@@ -130,14 +137,18 @@ static int debug = QUIET;
module_param(debug, int, 0);
MODULE_PARM_DESC(debug, "Debug level: 0=Quiet, 1=Verbose, 2=Debug (default=0)");
-#define WATCHDOG_HEARTBEAT 0 /* default heartbeat = delay-time from dip-switches */
+#define WATCHDOG_HEARTBEAT 0 /* default heartbeat =
+ delay-time from dip-switches */
static int heartbeat = WATCHDOG_HEARTBEAT;
module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536 or 0=delay-time from dip-switches, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. "
+ "(0<heartbeat<65536 or 0=delay-time from dip-switches, default="
+ __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+ __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
/*
* Internal functions
@@ -148,8 +159,8 @@ static int send_command(int cmd, int *msb, int *lsb)
int got_response, count;
if (debug >= DEBUG)
- printk(KERN_DEBUG PFX "sending following data cmd=0x%02x msb=0x%02x lsb=0x%02x\n",
- cmd, *msb, *lsb);
+ printk(KERN_DEBUG PFX "sending following data "
+ "cmd=0x%02x msb=0x%02x lsb=0x%02x\n", cmd, *msb, *lsb);
spin_lock(&pcipcwd_private.io_lock);
/* If a command requires data it should be written first.
@@ -166,17 +177,20 @@ static int send_command(int cmd, int *msb, int *lsb)
* the WRSP bit in port 2 and give it a max. timeout of
* PCI_COMMAND_TIMEOUT to process */
got_response = inb_p(pcipcwd_private.io_addr + 2) & WD_PCI_WRSP;
- for (count = 0; (count < PCI_COMMAND_TIMEOUT) && (!got_response); count++) {
+ for (count = 0; (count < PCI_COMMAND_TIMEOUT) && (!got_response);
+ count++) {
mdelay(1);
got_response = inb_p(pcipcwd_private.io_addr + 2) & WD_PCI_WRSP;
}
if (debug >= DEBUG) {
if (got_response) {
- printk(KERN_DEBUG PFX "time to process command was: %d ms\n",
+ printk(KERN_DEBUG PFX
+ "time to process command was: %d ms\n",
count);
} else {
- printk(KERN_DEBUG PFX "card did not respond on command!\n");
+ printk(KERN_DEBUG PFX
+ "card did not respond on command!\n");
}
}
@@ -189,7 +203,8 @@ static int send_command(int cmd, int *msb, int *lsb)
inb_p(pcipcwd_private.io_addr + 6);
if (debug >= DEBUG)
- printk(KERN_DEBUG PFX "received following data for cmd=0x%02x: msb=0x%02x lsb=0x%02x\n",
+ printk(KERN_DEBUG PFX "received following data for "
+ "cmd=0x%02x: msb=0x%02x lsb=0x%02x\n",
cmd, *msb, *lsb);
}
@@ -218,7 +233,8 @@ static void pcipcwd_show_card_info(void)
char fw_ver_str[20]; /* The cards firmware version */
int option_switches;
- got_fw_rev = send_command(CMD_GET_FIRMWARE_VERSION, &fw_rev_major, &fw_rev_minor);
+ got_fw_rev = send_command(CMD_GET_FIRMWARE_VERSION, &fw_rev_major,
+ &fw_rev_minor);
if (got_fw_rev)
sprintf(fw_ver_str, "%u.%02u", fw_rev_major, fw_rev_minor);
else
@@ -227,23 +243,27 @@ static void pcipcwd_show_card_info(void)
/* Get switch settings */
option_switches = pcipcwd_get_option_switches();
- printk(KERN_INFO PFX "Found card at port 0x%04x (Firmware: %s) %s temp option\n",
+ printk(KERN_INFO PFX "Found card at port "
+ "0x%04x (Firmware: %s) %s temp option\n",
(int) pcipcwd_private.io_addr, fw_ver_str,
(pcipcwd_private.supports_temp ? "with" : "without"));
- printk(KERN_INFO PFX "Option switches (0x%02x): Temperature Reset Enable=%s, Power On Delay=%s\n",
+ printk(KERN_INFO PFX "Option switches (0x%02x): "
+ "Temperature Reset Enable=%s, Power On Delay=%s\n",
option_switches,
((option_switches & 0x10) ? "ON" : "OFF"),
((option_switches & 0x08) ? "ON" : "OFF"));
if (pcipcwd_private.boot_status & WDIOF_CARDRESET)
- printk(KERN_INFO PFX "Previous reset was caused by the Watchdog card\n");
+ printk(KERN_INFO PFX
+ "Previous reset was caused by the Watchdog card\n");
if (pcipcwd_private.boot_status & WDIOF_OVERHEAT)
printk(KERN_INFO PFX "Card sensed a CPU Overheat\n");
if (pcipcwd_private.boot_status == 0)
- printk(KERN_INFO PFX "No previous trip detected - Cold boot or reset\n");
+ printk(KERN_INFO PFX
+ "No previous trip detected - Cold boot or reset\n");
}
static int pcipcwd_start(void)
@@ -283,7 +303,8 @@ static int pcipcwd_stop(void)
spin_unlock(&pcipcwd_private.io_lock);
if (!(stat_reg & WD_PCI_WDIS)) {
- printk(KERN_ERR PFX "Card did not acknowledge disable attempt\n");
+ printk(KERN_ERR PFX
+ "Card did not acknowledge disable attempt\n");
return -1;
}
@@ -364,7 +385,8 @@ static int pcipcwd_clear_status(void)
}
/* clear trip status & LED and keep mode of relay 2 */
- outb_p((control_status & WD_PCI_R2DS) | WD_PCI_WTRP, pcipcwd_private.io_addr + 1);
+ outb_p((control_status & WD_PCI_R2DS) | WD_PCI_WTRP,
+ pcipcwd_private.io_addr + 1);
/* clear reset counter */
msb = 0;
@@ -437,7 +459,8 @@ static ssize_t pcipcwd_write(struct file *file, const char __user *data,
* five months ago... */
expect_release = 0;
- /* scan to see whether or not we got the magic character */
+ /* scan to see whether or not we got the
+ * magic character */
for (i = 0; i != len; i++) {
char c;
if (get_user(c, data + i))
@@ -531,7 +554,7 @@ static long pcipcwd_ioctl(struct file *file, unsigned int cmd,
return -EFAULT;
if (pcipcwd_set_heartbeat(new_heartbeat))
- return -EINVAL;
+ return -EINVAL;
pcipcwd_keepalive();
/* Fall */
@@ -560,7 +583,8 @@ static int pcipcwd_open(struct inode *inode, struct file *file)
/* /dev/watchdog can only be opened once */
if (test_and_set_bit(0, &is_active)) {
if (debug >= VERBOSE)
- printk(KERN_ERR PFX "Attempt to open already opened device.\n");
+ printk(KERN_ERR PFX
+ "Attempt to open already opened device.\n");
return -EBUSY;
}
@@ -578,7 +602,8 @@ static int pcipcwd_release(struct inode *inode, struct file *file)
if (expect_release == 42) {
pcipcwd_stop();
} else {
- printk(KERN_CRIT PFX "Unexpected close, not stopping watchdog!\n");
+ printk(KERN_CRIT PFX
+ "Unexpected close, not stopping watchdog!\n");
pcipcwd_keepalive();
}
expect_release = 0;
@@ -621,7 +646,8 @@ static int pcipcwd_temp_release(struct inode *inode, struct file *file)
* Notify system
*/
-static int pcipcwd_notify_sys(struct notifier_block *this, unsigned long code, void *unused)
+static int pcipcwd_notify_sys(struct notifier_block *this, unsigned long code,
+ void *unused)
{
if (code == SYS_DOWN || code == SYS_HALT)
pcipcwd_stop(); /* Turn the WDT off */
@@ -722,34 +748,38 @@ static int __devinit pcipcwd_card_init(struct pci_dev *pdev,
/* If heartbeat = 0 then we use the heartbeat from the dip-switches */
if (heartbeat == 0)
- heartbeat = heartbeat_tbl[(pcipcwd_get_option_switches() & 0x07)];
+ heartbeat =
+ heartbeat_tbl[(pcipcwd_get_option_switches() & 0x07)];
- /* Check that the heartbeat value is within it's range ; if not reset to the default */
+ /* Check that the heartbeat value is within its range;
+ * if not reset to the default */
if (pcipcwd_set_heartbeat(heartbeat)) {
pcipcwd_set_heartbeat(WATCHDOG_HEARTBEAT);
- printk(KERN_INFO PFX "heartbeat value must be 0<heartbeat<65536, using %d\n",
+ printk(KERN_INFO PFX
+ "heartbeat value must be 0<heartbeat<65536, using %d\n",
WATCHDOG_HEARTBEAT);
}
ret = register_reboot_notifier(&pcipcwd_notifier);
if (ret != 0) {
- printk(KERN_ERR PFX "cannot register reboot notifier (err=%d)\n",
- ret);
+ printk(KERN_ERR PFX
+ "cannot register reboot notifier (err=%d)\n", ret);
goto err_out_release_region;
}
if (pcipcwd_private.supports_temp) {
ret = misc_register(&pcipcwd_temp_miscdev);
if (ret != 0) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
- TEMP_MINOR, ret);
+ printk(KERN_ERR PFX "cannot register miscdev on "
+ "minor=%d (err=%d)\n", TEMP_MINOR, ret);
goto err_out_unregister_reboot;
}
}
ret = misc_register(&pcipcwd_miscdev);
if (ret != 0) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (err=%d)\n",
WATCHDOG_MINOR, ret);
goto err_out_misc_deregister;
}
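Aside (illustrative only, not part of the patch): pcipcwd's send_command() polls the WRSP bit with a bounded mdelay() loop, as the reflowed hunk above shows. The idiom, distilled, with hypothetical port and bit names:

	#include <linux/types.h>
	#include <linux/delay.h>
	#include <linux/io.h>

	#define CMD_TIMEOUT_MS	150

	/* Poll the status port until rsp_bit appears or the timeout
	 * expires; returns 1 if the card answered, 0 otherwise. */
	static int wait_for_response(unsigned long io_addr, u8 rsp_bit)
	{
		int count;

		for (count = 0; count < CMD_TIMEOUT_MS; count++) {
			if (inb_p(io_addr + 2) & rsp_bit)
				return 1;
			mdelay(1);	/* one millisecond per iteration */
		}
		return 0;
	}

Busy-waiting is tolerable here because the card answers within a few milliseconds and the command path is not hot.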
diff --git a/drivers/watchdog/pcwd_usb.c b/drivers/watchdog/pcwd_usb.c
index afb089695da..052fe451851 100644
--- a/drivers/watchdog/pcwd_usb.c
+++ b/drivers/watchdog/pcwd_usb.c
@@ -51,12 +51,11 @@
/* Use our own dbg macro */
#undef dbg
-#define dbg(format, arg...) do { if (debug) printk(KERN_DEBUG PFX format "\n" , ## arg); } while (0)
-
+#define dbg(format, arg...) \
+ do { if (debug) printk(KERN_DEBUG PFX format "\n" , ## arg); } while (0)
/* Module and Version Information */
#define DRIVER_VERSION "1.02"
-#define DRIVER_DATE "21 Jan 2007"
#define DRIVER_AUTHOR "Wim Van Sebroeck <wim@iguana.be>"
#define DRIVER_DESC "Berkshire USB-PC Watchdog driver"
#define DRIVER_LICENSE "GPL"
@@ -73,14 +72,18 @@ MODULE_ALIAS_MISCDEV(TEMP_MINOR);
module_param(debug, int, 0);
MODULE_PARM_DESC(debug, "Debug enabled or not");
-#define WATCHDOG_HEARTBEAT 0 /* default heartbeat = delay-time from dip-switches */
+#define WATCHDOG_HEARTBEAT 0 /* default heartbeat =
+ delay-time from dip-switches */
static int heartbeat = WATCHDOG_HEARTBEAT;
module_param(heartbeat, int, 0);
-MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536 or 0=delay-time from dip-switches, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. "
+ "(0<heartbeat<65536 or 0=delay-time from dip-switches, default="
+ __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+ __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
/* The vendor and product id's for the USB-PC Watchdog card */
#define USB_PCWD_VENDOR_ID 0x0c98
@@ -91,20 +94,21 @@ static struct usb_device_id usb_pcwd_table[] = {
{ USB_DEVICE(USB_PCWD_VENDOR_ID, USB_PCWD_PRODUCT_ID) },
{ } /* Terminating entry */
};
-MODULE_DEVICE_TABLE (usb, usb_pcwd_table);
+MODULE_DEVICE_TABLE(usb, usb_pcwd_table);
/* according to documentation max. time to process a command for the USB
* watchdog card is 100 or 200 ms, so we give it 250 ms to do it's job */
#define USB_COMMAND_TIMEOUT 250
/* Watchdog's internal commands */
-#define CMD_READ_TEMP 0x02 /* Read Temperature; Re-trigger Watchdog */
+#define CMD_READ_TEMP 0x02 /* Read Temperature;
+ Re-trigger Watchdog */
#define CMD_TRIGGER CMD_READ_TEMP
#define CMD_GET_STATUS 0x04 /* Get Status Information */
#define CMD_GET_FIRMWARE_VERSION 0x08 /* Get Firmware Version */
#define CMD_GET_DIP_SWITCH_SETTINGS 0x0c /* Get Dip Switch Settings */
#define CMD_READ_WATCHDOG_TIMEOUT 0x18 /* Read Current Watchdog Time */
-#define CMD_WRITE_WATCHDOG_TIMEOUT 0x19 /* Write Current Watchdog Time */
+#define CMD_WRITE_WATCHDOG_TIMEOUT 0x19 /* Write Current WatchdogTime */
#define CMD_ENABLE_WATCHDOG 0x30 /* Enable / Disable Watchdog */
#define CMD_DISABLE_WATCHDOG CMD_ENABLE_WATCHDOG
@@ -129,23 +133,36 @@ static char expect_release;
/* Structure to hold all of our device specific stuff */
struct usb_pcwd_private {
- struct usb_device *udev; /* save off the usb device pointer */
- struct usb_interface *interface; /* the interface for this device */
-
- unsigned int interface_number; /* the interface number used for cmd's */
-
- unsigned char *intr_buffer; /* the buffer to intr data */
- dma_addr_t intr_dma; /* the dma address for the intr buffer */
- size_t intr_size; /* the size of the intr buffer */
- struct urb *intr_urb; /* the urb used for the intr pipe */
-
- unsigned char cmd_command; /* The command that is reported back */
- unsigned char cmd_data_msb; /* The data MSB that is reported back */
- unsigned char cmd_data_lsb; /* The data LSB that is reported back */
- atomic_t cmd_received; /* true if we received a report after a command */
-
- int exists; /* Wether or not the device exists */
- struct mutex mtx; /* locks this structure */
+ /* save off the usb device pointer */
+ struct usb_device *udev;
+ /* the interface for this device */
+ struct usb_interface *interface;
+
+ /* the interface number used for cmd's */
+ unsigned int interface_number;
+
+ /* the buffer to intr data */
+ unsigned char *intr_buffer;
+ /* the dma address for the intr buffer */
+ dma_addr_t intr_dma;
+ /* the size of the intr buffer */
+ size_t intr_size;
+ /* the urb used for the intr pipe */
+ struct urb *intr_urb;
+
+ /* The command that is reported back */
+ unsigned char cmd_command;
+ /* The data MSB that is reported back */
+ unsigned char cmd_data_msb;
+ /* The data LSB that is reported back */
+ unsigned char cmd_data_lsb;
+ /* true if we received a report after a command */
+ atomic_t cmd_received;
+
+ /* Whether or not the device exists */
+ int exists;
+ /* locks this structure */
+ struct mutex mtx;
};
static struct usb_pcwd_private *usb_pcwd_device;
@@ -153,7 +170,8 @@ static struct usb_pcwd_private *usb_pcwd_device;
static DEFINE_MUTEX(disconnect_mutex);
/* local function prototypes */
-static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_device_id *id);
+static int usb_pcwd_probe(struct usb_interface *interface,
+ const struct usb_device_id *id);
static void usb_pcwd_disconnect(struct usb_interface *interface);
/* usb specific object needed to register this driver with the usb subsystem */
@@ -167,7 +185,8 @@ static struct usb_driver usb_pcwd_driver = {
static void usb_pcwd_intr_done(struct urb *urb)
{
- struct usb_pcwd_private *usb_pcwd = (struct usb_pcwd_private *)urb->context;
+ struct usb_pcwd_private *usb_pcwd =
+ (struct usb_pcwd_private *)urb->context;
unsigned char *data = usb_pcwd->intr_buffer;
int retval;
@@ -178,11 +197,13 @@ static void usb_pcwd_intr_done(struct urb *urb)
case -ENOENT:
case -ESHUTDOWN:
/* this urb is terminated, clean up */
- dbg("%s - urb shutting down with status: %d", __func__, urb->status);
+ dbg("%s - urb shutting down with status: %d", __func__,
+ urb->status);
return;
/* -EPIPE: should clear the halt */
default: /* error */
- dbg("%s - nonzero urb status received: %d", __func__, urb->status);
+ dbg("%s - nonzero urb status received: %d", __func__,
+ urb->status);
goto resubmit;
}
@@ -199,22 +220,23 @@ static void usb_pcwd_intr_done(struct urb *urb)
resubmit:
retval = usb_submit_urb(urb, GFP_ATOMIC);
if (retval)
- printk(KERN_ERR PFX "can't resubmit intr, usb_submit_urb failed with result %d\n",
- retval);
+ printk(KERN_ERR PFX "can't resubmit intr, "
+ "usb_submit_urb failed with result %d\n", retval);
}
-static int usb_pcwd_send_command(struct usb_pcwd_private *usb_pcwd, unsigned char cmd,
- unsigned char *msb, unsigned char *lsb)
+static int usb_pcwd_send_command(struct usb_pcwd_private *usb_pcwd,
+ unsigned char cmd, unsigned char *msb, unsigned char *lsb)
{
int got_response, count;
unsigned char buf[6];
- /* We will not send any commands if the USB PCWD device does not exist */
+ /* We will not send any commands if the USB PCWD device does
+ * not exist */
if ((!usb_pcwd) || (!usb_pcwd->exists))
return -1;
- /* The USB PC Watchdog uses a 6 byte report format. The board currently uses
- * only 3 of the six bytes of the report. */
+ /* The USB PC Watchdog uses a 6 byte report format.
+ * The board currently uses only 3 of the six bytes of the report. */
buf[0] = cmd; /* Byte 0 = CMD */
buf[1] = *msb; /* Byte 1 = Data MSB */
buf[2] = *lsb; /* Byte 2 = Data LSB */
@@ -229,12 +251,14 @@ static int usb_pcwd_send_command(struct usb_pcwd_private *usb_pcwd, unsigned cha
HID_REQ_SET_REPORT, HID_DT_REPORT,
0x0200, usb_pcwd->interface_number, buf, sizeof(buf),
USB_COMMAND_TIMEOUT) != sizeof(buf)) {
- dbg("usb_pcwd_send_command: error in usb_control_msg for cmd 0x%x 0x%x 0x%x\n", cmd, *msb, *lsb);
+ dbg("usb_pcwd_send_command: error in usb_control_msg for "
+ "cmd 0x%x 0x%x 0x%x\n", cmd, *msb, *lsb);
}
/* wait till the usb card processed the command,
* with a max. timeout of USB_COMMAND_TIMEOUT */
got_response = 0;
- for (count = 0; (count < USB_COMMAND_TIMEOUT) && (!got_response); count++) {
+ for (count = 0; (count < USB_COMMAND_TIMEOUT) && (!got_response);
+ count++) {
mdelay(1);
if (atomic_read(&usb_pcwd->cmd_received))
got_response = 1;
@@ -256,10 +280,12 @@ static int usb_pcwd_start(struct usb_pcwd_private *usb_pcwd)
int retval;
/* Enable Watchdog */
- retval = usb_pcwd_send_command(usb_pcwd, CMD_ENABLE_WATCHDOG, &msb, &lsb);
+ retval = usb_pcwd_send_command(usb_pcwd, CMD_ENABLE_WATCHDOG,
+ &msb, &lsb);
if ((retval == 0) || (lsb == 0)) {
- printk(KERN_ERR PFX "Card did not acknowledge enable attempt\n");
+ printk(KERN_ERR PFX
+ "Card did not acknowledge enable attempt\n");
return -1;
}
@@ -273,10 +299,12 @@ static int usb_pcwd_stop(struct usb_pcwd_private *usb_pcwd)
int retval;
/* Disable Watchdog */
- retval = usb_pcwd_send_command(usb_pcwd, CMD_DISABLE_WATCHDOG, &msb, &lsb);
+ retval = usb_pcwd_send_command(usb_pcwd, CMD_DISABLE_WATCHDOG,
+ &msb, &lsb);
if ((retval == 0) || (lsb != 0)) {
- printk(KERN_ERR PFX "Card did not acknowledge disable attempt\n");
+ printk(KERN_ERR PFX
+ "Card did not acknowledge disable attempt\n");
return -1;
}
@@ -308,7 +336,8 @@ static int usb_pcwd_set_heartbeat(struct usb_pcwd_private *usb_pcwd, int t)
return 0;
}
-static int usb_pcwd_get_temperature(struct usb_pcwd_private *usb_pcwd, int *temperature)
+static int usb_pcwd_get_temperature(struct usb_pcwd_private *usb_pcwd,
+ int *temperature)
{
unsigned char msb, lsb;
@@ -323,7 +352,8 @@ static int usb_pcwd_get_temperature(struct usb_pcwd_private *usb_pcwd, int *temp
return 0;
}
-static int usb_pcwd_get_timeleft(struct usb_pcwd_private *usb_pcwd, int *time_left)
+static int usb_pcwd_get_timeleft(struct usb_pcwd_private *usb_pcwd,
+ int *time_left)
{
unsigned char msb, lsb;
@@ -341,7 +371,7 @@ static int usb_pcwd_get_timeleft(struct usb_pcwd_private *usb_pcwd, int *time_le
*/
static ssize_t usb_pcwd_write(struct file *file, const char __user *data,
- size_t len, loff_t *ppos)
+ size_t len, loff_t *ppos)
{
/* See if we got the magic character 'V' and reload the timer */
if (len) {
@@ -352,7 +382,8 @@ static ssize_t usb_pcwd_write(struct file *file, const char __user *data,
* five months ago... */
expect_release = 0;
- /* scan to see whether or not we got the magic character */
+ /* scan to see whether or not we got the
+ * magic character */
for (i = 0; i != len; i++) {
char c;
if (get_user(c, data + i))
@@ -431,7 +462,7 @@ static long usb_pcwd_ioctl(struct file *file, unsigned int cmd,
return -EFAULT;
if (usb_pcwd_set_heartbeat(usb_pcwd_device, new_heartbeat))
- return -EINVAL;
+ return -EINVAL;
usb_pcwd_keepalive(usb_pcwd_device);
/* Fall */
@@ -475,7 +506,8 @@ static int usb_pcwd_release(struct inode *inode, struct file *file)
if (expect_release == 42) {
usb_pcwd_stop(usb_pcwd_device);
} else {
- printk(KERN_CRIT PFX "Unexpected close, not stopping watchdog!\n");
+ printk(KERN_CRIT PFX
+ "Unexpected close, not stopping watchdog!\n");
usb_pcwd_keepalive(usb_pcwd_device);
}
expect_release = 0;
@@ -515,7 +547,8 @@ static int usb_pcwd_temperature_release(struct inode *inode, struct file *file)
* Notify system
*/
-static int usb_pcwd_notify_sys(struct notifier_block *this, unsigned long code, void *unused)
+static int usb_pcwd_notify_sys(struct notifier_block *this, unsigned long code,
+ void *unused)
{
if (code == SYS_DOWN || code == SYS_HALT)
usb_pcwd_stop(usb_pcwd_device); /* Turn the WDT off */
@@ -578,7 +611,8 @@ static inline void usb_pcwd_delete(struct usb_pcwd_private *usb_pcwd)
* Called by the usb core when a new device is connected that it thinks
* this driver might be interested in.
*/
-static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_device_id *id)
+static int usb_pcwd_probe(struct usb_interface *interface,
+ const struct usb_device_id *id)
{
struct usb_device *udev = interface_to_usbdev(interface);
struct usb_host_interface *iface_desc;
@@ -602,16 +636,15 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
/* check out that we have a HID device */
if (!(iface_desc->desc.bInterfaceClass == USB_CLASS_HID)) {
- printk(KERN_ERR PFX "The device isn't a Human Interface Device\n");
+ printk(KERN_ERR PFX
+ "The device isn't a Human Interface Device\n");
return -ENODEV;
}
/* check out the endpoint: it has to be Interrupt & IN */
endpoint = &iface_desc->endpoint[0].desc;
- if (!((endpoint->bEndpointAddress & USB_DIR_IN) &&
- ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK)
- == USB_ENDPOINT_XFER_INT))) {
+ if (!usb_endpoint_is_int_in(endpoint)) {
/* we didn't find a Interrupt endpoint with direction IN */
printk(KERN_ERR PFX "Couldn't find an INTR & IN endpoint\n");
return -ENODEV;
@@ -634,10 +667,12 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
usb_pcwd->udev = udev;
usb_pcwd->interface = interface;
usb_pcwd->interface_number = iface_desc->desc.bInterfaceNumber;
- usb_pcwd->intr_size = (le16_to_cpu(endpoint->wMaxPacketSize) > 8 ? le16_to_cpu(endpoint->wMaxPacketSize) : 8);
+ usb_pcwd->intr_size = (le16_to_cpu(endpoint->wMaxPacketSize) > 8 ?
+ le16_to_cpu(endpoint->wMaxPacketSize) : 8);
/* set up the memory buffer's */
- usb_pcwd->intr_buffer = usb_buffer_alloc(udev, usb_pcwd->intr_size, GFP_ATOMIC, &usb_pcwd->intr_dma);
+ usb_pcwd->intr_buffer = usb_buffer_alloc(udev, usb_pcwd->intr_size,
+ GFP_ATOMIC, &usb_pcwd->intr_dma);
if (!usb_pcwd->intr_buffer) {
printk(KERN_ERR PFX "Out of memory\n");
goto error;
@@ -671,7 +706,8 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
usb_pcwd_stop(usb_pcwd);
/* Get the Firmware Version */
- got_fw_rev = usb_pcwd_send_command(usb_pcwd, CMD_GET_FIRMWARE_VERSION, &fw_rev_major, &fw_rev_minor);
+ got_fw_rev = usb_pcwd_send_command(usb_pcwd, CMD_GET_FIRMWARE_VERSION,
+ &fw_rev_major, &fw_rev_minor);
if (got_fw_rev)
sprintf(fw_ver_str, "%u.%02u", fw_rev_major, fw_rev_minor);
else
@@ -681,9 +717,11 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
fw_ver_str);
/* Get switch settings */
- usb_pcwd_send_command(usb_pcwd, CMD_GET_DIP_SWITCH_SETTINGS, &dummy, &option_switches);
+ usb_pcwd_send_command(usb_pcwd, CMD_GET_DIP_SWITCH_SETTINGS, &dummy,
+ &option_switches);
- printk(KERN_INFO PFX "Option switches (0x%02x): Temperature Reset Enable=%s, Power On Delay=%s\n",
+ printk(KERN_INFO PFX "Option switches (0x%02x): "
+ "Temperature Reset Enable=%s, Power On Delay=%s\n",
option_switches,
((option_switches & 0x10) ? "ON" : "OFF"),
((option_switches & 0x08) ? "ON" : "OFF"));
@@ -692,30 +730,35 @@ static int usb_pcwd_probe(struct usb_interface *interface, const struct usb_devi
if (heartbeat == 0)
heartbeat = heartbeat_tbl[(option_switches & 0x07)];
- /* Check that the heartbeat value is within it's range ; if not reset to the default */
+ /* Check that the heartbeat value is within its range;
+ * if not reset to the default */
if (usb_pcwd_set_heartbeat(usb_pcwd, heartbeat)) {
usb_pcwd_set_heartbeat(usb_pcwd, WATCHDOG_HEARTBEAT);
- printk(KERN_INFO PFX "heartbeat value must be 0<heartbeat<65536, using %d\n",
+ printk(KERN_INFO PFX
+ "heartbeat value must be 0<heartbeat<65536, using %d\n",
WATCHDOG_HEARTBEAT);
}
retval = register_reboot_notifier(&usb_pcwd_notifier);
if (retval != 0) {
- printk(KERN_ERR PFX "cannot register reboot notifier (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register reboot notifier (err=%d)\n",
retval);
goto error;
}
retval = misc_register(&usb_pcwd_temperature_miscdev);
if (retval != 0) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (err=%d)\n",
TEMP_MINOR, retval);
goto err_out_unregister_reboot;
}
retval = misc_register(&usb_pcwd_miscdev);
if (retval != 0) {
- printk(KERN_ERR PFX "cannot register miscdev on minor=%d (err=%d)\n",
+ printk(KERN_ERR PFX
+ "cannot register miscdev on minor=%d (err=%d)\n",
WATCHDOG_MINOR, retval);
goto err_out_misc_deregister;
}
@@ -801,7 +844,7 @@ static int __init usb_pcwd_init(void)
return result;
}
- printk(KERN_INFO PFX DRIVER_DESC " v" DRIVER_VERSION " (" DRIVER_DATE ")\n");
+ printk(KERN_INFO PFX DRIVER_DESC " v" DRIVER_VERSION "\n");
return 0;
}
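Aside (illustrative only, not part of the patch): the pcwd_usb probe hunk above swaps an open-coded endpoint test for usb_endpoint_is_int_in(). Judging from the removed lines, the helper performs the same two checks, direction IN and interrupt transfer type:

	#include <linux/usb.h>

	/* Equivalent of the removed open-coded test. */
	static int is_int_in(const struct usb_endpoint_descriptor *epd)
	{
		return (epd->bEndpointAddress & USB_DIR_IN) &&
		       ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) ==
			USB_ENDPOINT_XFER_INT);
	}

Using the helper keeps endpoint validation consistent across drivers and removes a line-wrapping headache.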
diff --git a/drivers/watchdog/pnx4008_wdt.c b/drivers/watchdog/pnx4008_wdt.c
index 6d9f3d4a998..64135195f82 100644
--- a/drivers/watchdog/pnx4008_wdt.c
+++ b/drivers/watchdog/pnx4008_wdt.c
@@ -54,22 +54,22 @@
/* WDTIM_CTRL bit definitions */
#define COUNT_ENAB 1
-#define RESET_COUNT (1<<1)
-#define DEBUG_EN (1<<2)
+#define RESET_COUNT (1 << 1)
+#define DEBUG_EN (1 << 2)
/* WDTIM_MCTRL bit definitions */
#define MR0_INT 1
#undef RESET_COUNT0
-#define RESET_COUNT0 (1<<2)
-#define STOP_COUNT0 (1<<2)
-#define M_RES1 (1<<3)
-#define M_RES2 (1<<4)
-#define RESFRC1 (1<<5)
-#define RESFRC2 (1<<6)
+#define RESET_COUNT0 (1 << 2)
+#define STOP_COUNT0 (1 << 2)
+#define M_RES1 (1 << 3)
+#define M_RES2 (1 << 4)
+#define RESFRC1 (1 << 5)
+#define RESFRC2 (1 << 6)
/* WDTIM_EMR bit definitions */
#define EXT_MATCH0 1
-#define MATCH_OUTPUT_HIGH (2<<4) /*a MATCH_CTRL setting */
+#define MATCH_OUTPUT_HIGH (2 << 4) /*a MATCH_CTRL setting */
/* WDTIM_RES bit definitions */
#define WDOG_RESET 1 /* read only */
diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c
index f3553fa40b1..f6cccc9df02 100644
--- a/drivers/watchdog/rc32434_wdt.c
+++ b/drivers/watchdog/rc32434_wdt.c
@@ -17,27 +17,29 @@
*
*/
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/watchdog.h>
-#include <linux/reboot.h>
-#include <linux/smp_lock.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/uaccess.h>
-
-#include <asm/bootinfo.h>
-#include <asm/time.h>
-#include <asm/mach-rc32434/integ.h>
-
-#define VERSION "0.4"
+#include <linux/module.h> /* For module specific items */
+#include <linux/moduleparam.h> /* For new moduleparam's */
+#include <linux/types.h> /* For standard types (like size_t) */
+#include <linux/errno.h> /* For the -ENODEV/... values */
+#include <linux/kernel.h> /* For printk/panic/... */
+#include <linux/fs.h> /* For file operations */
+#include <linux/miscdevice.h> /* For MODULE_ALIAS_MISCDEV
+ (WATCHDOG_MINOR) */
+#include <linux/watchdog.h> /* For the watchdog specific items */
+#include <linux/init.h> /* For __init/__exit/... */
+#include <linux/platform_device.h> /* For platform_driver framework */
+#include <linux/spinlock.h> /* For spin_lock/spin_unlock/... */
+#include <linux/uaccess.h> /* For copy_to_user/put_user/... */
+
+#include <asm/mach-rc32434/integ.h> /* For the Watchdog registers */
+
+#define PFX KBUILD_MODNAME ": "
+
+#define VERSION "1.0"
static struct {
unsigned long inuse;
+ spinlock_t io_lock;
} rc32434_wdt_device;
static struct integ __iomem *wdt_reg;
@@ -58,6 +60,9 @@ extern unsigned int idt_cpu_freq;
#define WATCHDOG_TIMEOUT 20
static int timeout = WATCHDOG_TIMEOUT;
+module_param(timeout, int, 0);
+MODULE_PARM_DESC(timeout, "Watchdog timeout value, in seconds (default="
+ __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
@@ -68,10 +73,29 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
#define SET_BITS(addr, or, nand) \
writel((readl(&addr) | or) & ~nand, &addr)
+static int rc32434_wdt_set(int new_timeout)
+{
+ int max_to = WTCOMP2SEC((u32)-1);
+
+ if (new_timeout < 0 || new_timeout > max_to) {
+ printk(KERN_ERR PFX "timeout value must be between 0 and %d",
+ max_to);
+ return -EINVAL;
+ }
+ timeout = new_timeout;
+ spin_lock(&rc32434_wdt_device.io_lock);
+ writel(SEC2WTCOMP(timeout), &wdt_reg->wtcompare);
+ spin_unlock(&rc32434_wdt_device.io_lock);
+
+ return 0;
+}
+
static void rc32434_wdt_start(void)
{
u32 or, nand;
+ spin_lock(&rc32434_wdt_device.io_lock);
+
/* zero the counter before enabling */
writel(0, &wdt_reg->wtcount);
@@ -85,38 +109,35 @@ static void rc32434_wdt_start(void)
SET_BITS(wdt_reg->errcs, or, nand);
+ /* set the timeout (either default or based on module param) */
+ rc32434_wdt_set(timeout);
+
/* reset WTC timeout bit and enable WDT */
nand = 1 << RC32434_WTC_TO;
or = 1 << RC32434_WTC_EN;
SET_BITS(wdt_reg->wtc, or, nand);
+
+ spin_unlock(&rc32434_wdt_device.io_lock);
+ printk(KERN_INFO PFX "Started watchdog timer.\n");
}
static void rc32434_wdt_stop(void)
{
+ spin_lock(&rc32434_wdt_device.io_lock);
+
/* Disable WDT */
SET_BITS(wdt_reg->wtc, 0, 1 << RC32434_WTC_EN);
-}
-
-static int rc32434_wdt_set(int new_timeout)
-{
- int max_to = WTCOMP2SEC((u32)-1);
-
- if (new_timeout < 0 || new_timeout > max_to) {
- printk(KERN_ERR KBUILD_MODNAME
- ": timeout value must be between 0 and %d",
- max_to);
- return -EINVAL;
- }
- timeout = new_timeout;
- writel(SEC2WTCOMP(timeout), &wdt_reg->wtcompare);
- return 0;
+ spin_unlock(&rc32434_wdt_device.io_lock);
+ printk(KERN_INFO PFX "Stopped watchdog timer.\n");
}
static void rc32434_wdt_ping(void)
{
+ spin_lock(&rc32434_wdt_device.io_lock);
writel(0, &wdt_reg->wtcount);
+ spin_unlock(&rc32434_wdt_device.io_lock);
}
static int rc32434_wdt_open(struct inode *inode, struct file *file)
@@ -137,11 +158,10 @@ static int rc32434_wdt_release(struct inode *inode, struct file *file)
{
if (expect_close == 42) {
rc32434_wdt_stop();
- printk(KERN_INFO KBUILD_MODNAME ": disabling watchdog timer\n");
module_put(THIS_MODULE);
} else {
- printk(KERN_CRIT KBUILD_MODNAME
- ": device closed unexpectedly. WDT will not stop !\n");
+ printk(KERN_CRIT PFX
+ "device closed unexpectedly. WDT will not stop!\n");
rc32434_wdt_ping();
}
clear_bit(0, &rc32434_wdt_device.inuse);
@@ -185,8 +205,9 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
.identity = "RC32434_WDT Watchdog",
};
switch (cmd) {
- case WDIOC_KEEPALIVE:
- rc32434_wdt_ping();
+ case WDIOC_GETSUPPORT:
+ if (copy_to_user(argp, &ident, sizeof(ident)))
+ return -EFAULT;
break;
case WDIOC_GETSTATUS:
case WDIOC_GETBOOTSTATUS:
@@ -194,10 +215,6 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
if (copy_to_user(argp, &value, sizeof(int)))
return -EFAULT;
break;
- case WDIOC_GETSUPPORT:
- if (copy_to_user(argp, &ident, sizeof(ident)))
- return -EFAULT;
- break;
case WDIOC_SETOPTIONS:
if (copy_from_user(&value, argp, sizeof(int)))
return -EFAULT;
@@ -212,6 +229,9 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
return -EINVAL;
}
break;
+ case WDIOC_KEEPALIVE:
+ rc32434_wdt_ping();
+ break;
case WDIOC_SETTIMEOUT:
if (copy_from_user(&new_timeout, argp, sizeof(int)))
return -EFAULT;
@@ -227,7 +247,7 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd,
return 0;
}
-static struct file_operations rc32434_wdt_fops = {
+static const struct file_operations rc32434_wdt_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.write = rc32434_wdt_write,
@@ -242,8 +262,8 @@ static struct miscdevice rc32434_wdt_miscdev = {
.fops = &rc32434_wdt_fops,
};
-static char banner[] __devinitdata = KERN_INFO KBUILD_MODNAME
- ": Watchdog Timer version " VERSION ", timer margin: %d sec\n";
+static char banner[] __devinitdata = KERN_INFO PFX
+ "Watchdog Timer version " VERSION ", timer margin: %d sec\n";
static int __devinit rc32434_wdt_probe(struct platform_device *pdev)
{
@@ -252,22 +272,33 @@ static int __devinit rc32434_wdt_probe(struct platform_device *pdev)
r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rb532_wdt_res");
if (!r) {
- printk(KERN_ERR KBUILD_MODNAME
- "failed to retrieve resources\n");
+ printk(KERN_ERR PFX "failed to retrieve resources\n");
return -ENODEV;
}
wdt_reg = ioremap_nocache(r->start, r->end - r->start);
if (!wdt_reg) {
- printk(KERN_ERR KBUILD_MODNAME
- "failed to remap I/O resources\n");
+ printk(KERN_ERR PFX "failed to remap I/O resources\n");
return -ENXIO;
}
+ spin_lock_init(&rc32434_wdt_device.io_lock);
+
+ /* Make sure the watchdog is not running */
+ rc32434_wdt_stop();
+
+ /* Check that the heartbeat value is within its range;
+ * if not reset to the default */
+ if (rc32434_wdt_set(timeout)) {
+ rc32434_wdt_set(WATCHDOG_TIMEOUT);
+ printk(KERN_INFO PFX
+ "timeout value must be between 0 and %d\n",
+ WTCOMP2SEC((u32)-1));
+ }
+
ret = misc_register(&rc32434_wdt_miscdev);
if (ret < 0) {
- printk(KERN_ERR KBUILD_MODNAME
- "failed to register watchdog device\n");
+ printk(KERN_ERR PFX "failed to register watchdog device\n");
goto unmap;
}
@@ -287,22 +318,28 @@ static int __devexit rc32434_wdt_remove(struct platform_device *pdev)
return 0;
}
-static struct platform_driver rc32434_wdt = {
- .probe = rc32434_wdt_probe,
- .remove = __devexit_p(rc32434_wdt_remove),
- .driver = {
- .name = "rc32434_wdt",
+static void rc32434_wdt_shutdown(struct platform_device *pdev)
+{
+ rc32434_wdt_stop();
+}
+
+static struct platform_driver rc32434_wdt_driver = {
+ .probe = rc32434_wdt_probe,
+ .remove = __devexit_p(rc32434_wdt_remove),
+ .shutdown = rc32434_wdt_shutdown,
+ .driver = {
+ .name = "rc32434_wdt",
}
};
static int __init rc32434_wdt_init(void)
{
- return platform_driver_register(&rc32434_wdt);
+ return platform_driver_register(&rc32434_wdt_driver);
}
static void __exit rc32434_wdt_exit(void)
{
- platform_driver_unregister(&rc32434_wdt);
+ platform_driver_unregister(&rc32434_wdt_driver);
}
module_init(rc32434_wdt_init);
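Aside (illustrative only, not part of the patch): the rc32434 rework above adds io_lock so that start, stop, ping and timeout updates serialize their register writes. The core pattern, reduced to one operation, with hypothetical names:

	#include <linux/spinlock.h>
	#include <linux/io.h>

	static spinlock_t io_lock;		/* spin_lock_init(&io_lock) in probe */
	static void __iomem *wtcount_reg;	/* mapped counter register */

	/* Keepalive: zero the counter under the lock so the write cannot
	 * interleave with a concurrent timeout reprogramming. */
	static void wdt_ping(void)
	{
		spin_lock(&io_lock);
		writel(0, wtcount_reg);
		spin_unlock(&io_lock);
	}

The same lock brackets the multi-register sequences in start/stop, which is where an unserialized read-modify-write would actually corrupt state.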
diff --git a/drivers/watchdog/riowd.c b/drivers/watchdog/riowd.c
index 09cb1833ea2..1e8f02f440e 100644
--- a/drivers/watchdog/riowd.c
+++ b/drivers/watchdog/riowd.c
@@ -14,9 +14,8 @@
#include <linux/watchdog.h>
#include <linux/of.h>
#include <linux/of_device.h>
-
-#include <asm/io.h>
-#include <asm/uaccess.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
/* RIO uses the NatSemi Super I/O power management logical device
@@ -86,8 +85,7 @@ static int riowd_release(struct inode *inode, struct file *filp)
return 0;
}
-static int riowd_ioctl(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
+static long riowd_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
static struct watchdog_info info = {
.options = WDIOF_SETTIMEOUT,
@@ -147,7 +145,8 @@ static int riowd_ioctl(struct inode *inode, struct file *filp,
return 0;
}
-static ssize_t riowd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static ssize_t riowd_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
{
struct riowd *p = riowd_device;
@@ -160,12 +159,12 @@ static ssize_t riowd_write(struct file *file, const char __user *buf, size_t cou
}
static const struct file_operations riowd_fops = {
- .owner = THIS_MODULE,
- .llseek = no_llseek,
- .ioctl = riowd_ioctl,
- .open = riowd_open,
- .write = riowd_write,
- .release = riowd_release,
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .unlocked_ioctl = riowd_ioctl,
+ .open = riowd_open,
+ .write = riowd_write,
+ .release = riowd_release,
};
static struct miscdevice riowd_miscdev = {
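Aside (illustrative only, not part of the patch): riowd's switch from .ioctl to .unlocked_ioctl changes the handler signature as below; the unlocked variant runs without the big kernel lock, so any serialization it needs is the driver's own job:

	/* old .ioctl handler, called with the BKL held: */
	static int old_ioctl(struct inode *inode, struct file *filp,
			     unsigned int cmd, unsigned long arg);

	/* new .unlocked_ioctl handler: no BKL, no inode arg, returns long: */
	static long new_ioctl(struct file *filp, unsigned int cmd,
			      unsigned long arg);

A driver converting over must audit the handler for state that previously relied on the BKL for mutual exclusion.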
diff --git a/drivers/watchdog/sa1100_wdt.c b/drivers/watchdog/sa1100_wdt.c
index e19b4579471..5bd782f2783 100644
--- a/drivers/watchdog/sa1100_wdt.c
+++ b/drivers/watchdog/sa1100_wdt.c
@@ -1,8 +1,8 @@
/*
* Watchdog driver for the SA11x0/PXA2xx
*
- * (c) Copyright 2000 Oleg Drokin <green@crimea.edu>
- * Based on SoftDog driver by Alan Cox <alan@lxorguk.ukuu.org.uk>
+ * (c) Copyright 2000 Oleg Drokin <green@crimea.edu>
+ * Based on SoftDog driver by Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -15,7 +15,7 @@
*
* (c) Copyright 2000 Oleg Drokin <green@crimea.edu>
*
- * 27/11/2000 Initial release
+ * 27/11/2000 Initial release
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c
index 3266daaaecf..d1c390c7155 100644
--- a/drivers/watchdog/sbc60xxwdt.c
+++ b/drivers/watchdog/sbc60xxwdt.c
@@ -1,7 +1,7 @@
/*
* 60xx Single Board Computer Watchdog Timer driver for Linux 2.2.x
*
- * Based on acquirewdt.c by Alan Cox.
+ * Based on acquirewdt.c by Alan Cox.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/drivers/watchdog/sbc8360.c b/drivers/watchdog/sbc8360.c
index ae74f6bcfa2..b6e6799ec45 100644
--- a/drivers/watchdog/sbc8360.c
+++ b/drivers/watchdog/sbc8360.c
@@ -4,12 +4,12 @@
* (c) Copyright 2005 Webcon, Inc.
*
* Based on ib700wdt.c, which is based on advantechwdt.c which is based
- * on acquirewdt.c which is based on wdt.c.
+ * on acquirewdt.c which is based on wdt.c.
*
* (c) Copyright 2001 Charles Howes <chowes@vsol.net>
*
- * Based on advantechwdt.c which is based on acquirewdt.c which
- * is based on wdt.c.
+ * Based on advantechwdt.c which is based on acquirewdt.c which
+ * is based on wdt.c.
*
* (c) Copyright 2000-2001 Marek Michalkiewicz <marekm@linux.org.pl>
*
@@ -30,9 +30,9 @@
*
* (c) Copyright 1995 Alan Cox <alan@lxorguk.ukuu.org.uk>
*
- * 14-Dec-2001 Matt Domsch <Matt_Domsch@dell.com>
- * Added nowayout module option to override CONFIG_WATCHDOG_NOWAYOUT
- * Added timeout module option to override default
+ * 14-Dec-2001 Matt Domsch <Matt_Domsch@dell.com>
+ * Added nowayout module option to override CONFIG_WATCHDOG_NOWAYOUT
+ * Added timeout module option to override default
*
*/
diff --git a/drivers/watchdog/sbc_epx_c3.c b/drivers/watchdog/sbc_epx_c3.c
index 06553debc7b..e467ddcf796 100644
--- a/drivers/watchdog/sbc_epx_c3.c
+++ b/drivers/watchdog/sbc_epx_c3.c
@@ -35,7 +35,8 @@ static int epx_c3_alive;
static int nowayout = WATCHDOG_NOWAYOUT;
module_param(nowayout, int, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+ __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
#define EPXC3_WATCHDOG_CTL_REG 0x1ee /* write 1 to enable, 0 to disable */
#define EPXC3_WATCHDOG_PET_REG 0x1ef /* write anything to pet once enabled */
diff --git a/drivers/watchdog/sc1200wdt.c b/drivers/watchdog/sc1200wdt.c
index 23da3ccd832..b5e19c1820a 100644
--- a/drivers/watchdog/sc1200wdt.c
+++ b/drivers/watchdog/sc1200wdt.c
@@ -71,7 +71,7 @@
#define UART2_IRQ 0x04 /* Serial1 */
/* 5 -7 are reserved */
-static char banner[] __initdata = KERN_INFO PFX SC1200_MODULE_VER;
+static char banner[] __initdata = PFX SC1200_MODULE_VER;
static int timeout = 1;
static int io = -1;
static int io_len = 2; /* for non plug and play */
@@ -392,7 +392,7 @@ static int __init sc1200wdt_init(void)
{
int ret;
- printk("%s\n", banner);
+ printk(KERN_INFO "%s\n", banner);
#if defined CONFIG_PNP
if (isapnp) {
@@ -477,6 +477,7 @@ module_init(sc1200wdt_init);
module_exit(sc1200wdt_exit);
MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("Driver for National Semiconductor PC87307/PC97307 watchdog component");
+MODULE_DESCRIPTION(
+ "Driver for National Semiconductor PC87307/PC97307 watchdog component");
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c
index a2b6c1067ec..52b63f2f0da 100644
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -1,8 +1,8 @@
/*
* AMD Elan SC520 processor Watchdog Timer driver
*
- * Based on acquirewdt.c by Alan Cox,
- * and sbc60xxwdt.c by Jakob Oestergaard <jakob@unthought.net>
+ * Based on acquirewdt.c by Alan Cox,
+ * and sbc60xxwdt.c by Jakob Oestergaard <jakob@unthought.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -11,7 +11,7 @@
*
* The authors do NOT admit liability nor provide warranty for
* any of this software. This material is provided "AS-IS" in
- * the hope that it may be useful for others.
+ * the hope that it may be useful for others.
*
* (c) Copyright 2001 Scott Jennings <linuxdrivers@oro.net>
* 9/27 - 2001 [Initial release]
@@ -438,6 +438,7 @@ module_init(sc520_wdt_init);
module_exit(sc520_wdt_unload);
MODULE_AUTHOR("Scott and Bill Jennings");
-MODULE_DESCRIPTION("Driver for watchdog timer in AMD \"Elan\" SC520 uProcessor");
+MODULE_DESCRIPTION(
+ "Driver for watchdog timer in AMD \"Elan\" SC520 uProcessor");
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/drivers/watchdog/smsc37b787_wdt.c b/drivers/watchdog/smsc37b787_wdt.c
index 2e56cad77d1..8a1f0bc3e27 100644
--- a/drivers/watchdog/smsc37b787_wdt.c
+++ b/drivers/watchdog/smsc37b787_wdt.c
@@ -2,7 +2,7 @@
* SMsC 37B787 Watchdog Timer driver for Linux 2.6.x.x
*
* Based on acquirewdt.c by Alan Cox <alan@lxorguk.ukuu.org.uk>
- * and some other existing drivers
+ * and some other existing drivers
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -11,7 +11,7 @@
*
* The authors do NOT admit liability nor provide warranty for
* any of this software. This material is provided "AS-IS" in
- * the hope that it may be useful for others.
+ * the hope that it may be useful for others.
*
* (C) Copyright 2003-2006 Sven Anders <anders@anduras.de>
*
@@ -22,19 +22,19 @@
*
* Theory of operation:
*
- * A Watchdog Timer (WDT) is a hardware circuit that can
- * reset the computer system in case of a software fault.
- * You probably knew that already.
+ * A Watchdog Timer (WDT) is a hardware circuit that can
+ * reset the computer system in case of a software fault.
+ * You probably knew that already.
*
- * Usually a userspace daemon will notify the kernel WDT driver
- * via the /dev/watchdog special device file that userspace is
- * still alive, at regular intervals. When such a notification
- * occurs, the driver will usually tell the hardware watchdog
- * that everything is in order, and that the watchdog should wait
- * for yet another little while to reset the system.
- * If userspace fails (RAM error, kernel bug, whatever), the
- * notifications cease to occur, and the hardware watchdog will
- * reset the system (causing a reboot) after the timeout occurs.
+ * Usually a userspace daemon will notify the kernel WDT driver
+ * via the /dev/watchdog special device file that userspace is
+ * still alive, at regular intervals. When such a notification
+ * occurs, the driver will usually tell the hardware watchdog
+ * that everything is in order, and that the watchdog should wait
+ * for yet another little while to reset the system.
+ * If userspace fails (RAM error, kernel bug, whatever), the
+ * notifications cease to occur, and the hardware watchdog will
+ * reset the system (causing a reboot) after the timeout occurs.
*
* Create device with:
* mknod /dev/watchdog c 10 130
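A minimal userspace daemon matching the description above could look like
the sketch below (illustrative only: the device path is the standard one,
the interval is an assumption, and error handling is trimmed):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int timeout = 60;	/* assumed heartbeat, in seconds */
	int fd = open("/dev/watchdog", O_WRONLY);

	if (fd < 0)
		return 1;
	ioctl(fd, WDIOC_SETTIMEOUT, &timeout);	/* if the card supports it */
	for (;;) {
		write(fd, "\0", 1);	/* tell the WDT we are still alive */
		sleep(timeout / 2);
	}
}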
@@ -485,7 +485,7 @@ static long wb_smsc_wdt_ioctl(struct file *file,
case WDIOC_GETTIMEOUT:
new_timeout = timeout;
if (unit == UNIT_MINUTE)
- new_timeout *= 60;
+ new_timeout *= 60;
return put_user(new_timeout, uarg.i);
default:
return -ENOTTY;
diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
index 7204f966211..ebcc9cea5e9 100644
--- a/drivers/watchdog/softdog.c
+++ b/drivers/watchdog/softdog.c
@@ -1,7 +1,8 @@
/*
* SoftDog 0.07: A Software Watchdog Device
*
- * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>, All Rights Reserved.
+ * (c) Copyright 1996 Alan Cox <alan@lxorguk.ukuu.org.uk>,
+ * All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -32,7 +33,7 @@
* Added WDIOC_GETTIMEOUT and WDIOC_SETTIMOUT.
*
* 20020530 Joel Becker <joel.becker@oracle.com>
- * Added Matt Domsch's nowayout module option.
+ * Added Matt Domsch's nowayout module option.
*/
#include <linux/module.h>
diff --git a/drivers/watchdog/w83697hf_wdt.c b/drivers/watchdog/w83697hf_wdt.c
index 3c7aa412b1f..a9c7f352fcb 100644
--- a/drivers/watchdog/w83697hf_wdt.c
+++ b/drivers/watchdog/w83697hf_wdt.c
@@ -462,6 +462,7 @@ module_init(wdt_init);
module_exit(wdt_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Marcus Junker <junker@anduras.de>, Samuel Tardieu <sam@rfc1149.net>");
+MODULE_AUTHOR("Marcus Junker <junker@anduras.de>, "
+ "Samuel Tardieu <sam@rfc1149.net>");
MODULE_DESCRIPTION("w83697hf/hg WDT driver");
MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/drivers/watchdog/w83697ug_wdt.c b/drivers/watchdog/w83697ug_wdt.c
index ada8ad82d99..883b5f79673 100644
--- a/drivers/watchdog/w83697ug_wdt.c
+++ b/drivers/watchdog/w83697ug_wdt.c
@@ -2,7 +2,7 @@
* w83697ug/uf WDT driver
*
* (c) Copyright 2008 Flemming Fransen <ff@nrvissing.net>
- * reused original code to supoprt w83697ug/uf.
+ * reused original code to support w83697ug/uf.
*
* Based on w83627hf_wdt.c which is based on advantechwdt.c
* which is based on wdt.c.
@@ -79,7 +79,7 @@ MODULE_PARM_DESC(nowayout,
(same as EFER) */
#define WDT_EFDR (WDT_EFIR+1) /* Extended Function Data Register */
-static void w83697ug_select_wd_register(void)
+static int w83697ug_select_wd_register(void)
{
unsigned char c;
unsigned char version;
@@ -102,7 +102,7 @@ static void w83697ug_select_wd_register(void)
} else {
printk(KERN_ERR PFX "No W83697UG/UF could be found\n");
- return;
+ return -ENODEV;
}
outb_p(0x07, WDT_EFER); /* point to logical device number reg */
@@ -110,6 +110,8 @@ static void w83697ug_select_wd_register(void)
outb_p(0x30, WDT_EFER); /* select CR30 */
c = inb_p(WDT_EFDR);
outb_p(c | 0x01, WDT_EFDR); /* set bit 0 to activate GPIO2 */
+
+ return 0;
}
static void w83697ug_unselect_wd_register(void)
@@ -117,11 +119,14 @@ static void w83697ug_unselect_wd_register(void)
outb_p(0xAA, WDT_EFER); /* Leave extended function mode */
}
-static void w83697ug_init(void)
+static int w83697ug_init(void)
{
+ int ret;
unsigned char t;
- w83697ug_select_wd_register();
+ ret = w83697ug_select_wd_register();
+ if (ret != 0)
+ return ret;
outb_p(0xF6, WDT_EFER); /* Select CRF6 */
t = inb_p(WDT_EFDR); /* read CRF6 */
@@ -137,13 +142,15 @@ static void w83697ug_init(void)
outb_p(t, WDT_EFDR); /* Write back to CRF5 */
w83697ug_unselect_wd_register();
+ return 0;
}
static void wdt_ctrl(int timeout)
{
spin_lock(&io_lock);
- w83697ug_select_wd_register();
+	if (w83697ug_select_wd_register() < 0) {
+		spin_unlock(&io_lock);
+		return;
+	}
outb_p(0xF4, WDT_EFER); /* Select CRF4 */
outb_p(timeout, WDT_EFDR); /* Write Timeout counter to CRF4 */
@@ -347,7 +354,9 @@ static int __init wdt_init(void)
goto out;
}
- w83697ug_init();
+ ret = w83697ug_init();
+ if (ret != 0)
+ goto unreg_regions;
ret = register_reboot_notifier(&wdt_notifier);
if (ret != 0) {
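The unreg_regions label lies outside the hunk above; for reference,
init-time unwinding in these drivers usually takes the following shape
(a sketch, all identifiers hypothetical):

static int __init example_wdt_init(void)
{
	int ret;

	if (!request_region(example_io, 2, "example_wdt"))
		return -EBUSY;

	ret = example_hw_init();	/* hypothetical, may fail */
	if (ret != 0)
		goto unreg_regions;

	ret = misc_register(&example_miscdev);
	if (ret != 0)
		goto unreg_regions;

	return 0;

unreg_regions:
	release_region(example_io, 2);
	return ret;
}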
diff --git a/drivers/watchdog/w83977f_wdt.c b/drivers/watchdog/w83977f_wdt.c
index 2525da5080c..0560182a1d0 100644
--- a/drivers/watchdog/w83977f_wdt.c
+++ b/drivers/watchdog/w83977f_wdt.c
@@ -426,7 +426,7 @@ static long wdt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return -EFAULT;
if (wdt_set_timeout(new_timeout))
- return -EINVAL;
+ return -EINVAL;
wdt_keepalive();
/* Fall */
diff --git a/drivers/watchdog/wd501p.h b/drivers/watchdog/wd501p.h
index db34853c28a..0e3a497d562 100644
--- a/drivers/watchdog/wd501p.h
+++ b/drivers/watchdog/wd501p.h
@@ -11,9 +11,9 @@
*
* http://www.cymru.net
*
- * This driver is provided under the GNU General Public License, incorporated
- * herein by reference. The driver is provided without warranty or
- * support.
+ * This driver is provided under the GNU General Public License,
+ * incorporated herein by reference. The driver is provided without
+ * warranty or support.
*
* Release 0.04.
*
@@ -39,13 +39,13 @@
/* programmable outputs: */
#define WDT_PROGOUT (io+15) /* wr=enable, rd=disable */
- /* FAN 501 500 */
-#define WDC_SR_WCCR 1 /* Active low */ /* X X X */
-#define WDC_SR_TGOOD 2 /* X X - */
-#define WDC_SR_ISOI0 4 /* X X X */
-#define WDC_SR_ISII1 8 /* X X X */
-#define WDC_SR_FANGOOD 16 /* X - - */
-#define WDC_SR_PSUOVER 32 /* Active low */ /* X X - */
-#define WDC_SR_PSUUNDR 64 /* Active low */ /* X X - */
-#define WDC_SR_IRQ 128 /* Active low */ /* X X X */
+ /* FAN 501 500 */
+#define WDC_SR_WCCR 1 /* Active low */ /* X X X */
+#define WDC_SR_TGOOD 2 /* X X - */
+#define WDC_SR_ISOI0 4 /* X X X */
+#define WDC_SR_ISII1 8 /* X X X */
+#define WDC_SR_FANGOOD 16 /* X - - */
+#define WDC_SR_PSUOVER 32 /* Active low */ /* X X - */
+#define WDC_SR_PSUUNDR 64 /* Active low */ /* X X - */
+#define WDC_SR_IRQ 128 /* Active low */ /* X X X */
diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c
index eddb9187e7b..3bbefe9a263 100644
--- a/drivers/watchdog/wdt.c
+++ b/drivers/watchdog/wdt.c
@@ -1,5 +1,5 @@
/*
- * Industrial Computer Source WDT500/501 driver
+ * Industrial Computer Source WDT501 driver
*
* (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>,
* All Rights Reserved.
@@ -82,14 +82,16 @@ MODULE_PARM_DESC(io, "WDT io port (default=0x240)");
module_param(irq, int, 0);
MODULE_PARM_DESC(irq, "WDT irq (default=11)");
-#ifdef CONFIG_WDT_501
/* Support for the Fan Tachometer on the WDT501-P */
static int tachometer;
-
module_param(tachometer, int, 0);
MODULE_PARM_DESC(tachometer,
"WDT501-P Fan Tachometer support (0=disable, default=0)");
-#endif /* CONFIG_WDT_501 */
+
+static int type = 500;
+module_param(type, int, 0);
+MODULE_PARM_DESC(type,
+ "WDT501-P Card type (500 or 501 , default=500)");
/*
* Programming support
@@ -158,7 +160,7 @@ static int wdt_stop(void)
* reloading the cascade counter.
*/
-static int wdt_ping(void)
+static void wdt_ping(void)
{
unsigned long flags;
spin_lock_irqsave(&wdt_lock, flags);
@@ -169,7 +171,6 @@ static int wdt_ping(void)
wdt_ctr_load(1, wd_heartbeat); /* Heartbeat */
outb_p(0, WDT_DC); /* Enable watchdog */
spin_unlock_irqrestore(&wdt_lock, flags);
- return 0;
}
/**
@@ -193,7 +194,6 @@ static int wdt_set_heartbeat(int t)
/**
* wdt_get_status:
- * @status: the new status.
*
* Extract the status information from a WDT watchdog device. There are
* several board variants so we have to know which bits are valid. Some
@@ -202,36 +202,35 @@ static int wdt_set_heartbeat(int t)
* we then map the bits onto the status ioctl flags.
*/
-static int wdt_get_status(int *status)
+static int wdt_get_status(void)
{
unsigned char new_status;
+ int status = 0;
unsigned long flags;
spin_lock_irqsave(&wdt_lock, flags);
new_status = inb_p(WDT_SR);
spin_unlock_irqrestore(&wdt_lock, flags);
- *status = 0;
if (new_status & WDC_SR_ISOI0)
- *status |= WDIOF_EXTERN1;
+ status |= WDIOF_EXTERN1;
if (new_status & WDC_SR_ISII1)
- *status |= WDIOF_EXTERN2;
-#ifdef CONFIG_WDT_501
- if (!(new_status & WDC_SR_TGOOD))
- *status |= WDIOF_OVERHEAT;
- if (!(new_status & WDC_SR_PSUOVER))
- *status |= WDIOF_POWEROVER;
- if (!(new_status & WDC_SR_PSUUNDR))
- *status |= WDIOF_POWERUNDER;
- if (tachometer) {
- if (!(new_status & WDC_SR_FANGOOD))
- *status |= WDIOF_FANFAULT;
+ status |= WDIOF_EXTERN2;
+ if (type == 501) {
+ if (!(new_status & WDC_SR_TGOOD))
+ status |= WDIOF_OVERHEAT;
+ if (!(new_status & WDC_SR_PSUOVER))
+ status |= WDIOF_POWEROVER;
+ if (!(new_status & WDC_SR_PSUUNDR))
+ status |= WDIOF_POWERUNDER;
+ if (tachometer) {
+ if (!(new_status & WDC_SR_FANGOOD))
+ status |= WDIOF_FANFAULT;
+ }
}
-#endif /* CONFIG_WDT_501 */
- return 0;
+ return status;
}
-#ifdef CONFIG_WDT_501
/**
* wdt_get_temperature:
*
@@ -239,7 +238,7 @@ static int wdt_get_status(int *status)
* fahrenheit. It was designed by an imperial measurement luddite.
*/
-static int wdt_get_temperature(int *temperature)
+static int wdt_get_temperature(void)
{
unsigned short c;
unsigned long flags;
@@ -247,10 +246,18 @@ static int wdt_get_temperature(int *temperature)
spin_lock_irqsave(&wdt_lock, flags);
c = inb_p(WDT_RT);
spin_unlock_irqrestore(&wdt_lock, flags);
- *temperature = (c * 11 / 15) + 7;
- return 0;
+ return (c * 11 / 15) + 7;
+}
+
+static void wdt_decode_501(int status)
+{
+ if (!(status & WDC_SR_TGOOD))
+ printk(KERN_CRIT "Overheat alarm.(%d)\n", inb_p(WDT_RT));
+ if (!(status & WDC_SR_PSUOVER))
+ printk(KERN_CRIT "PSU over voltage.\n");
+ if (!(status & WDC_SR_PSUUNDR))
+ printk(KERN_CRIT "PSU under voltage.\n");
}
-#endif /* CONFIG_WDT_501 */
/**
* wdt_interrupt:
@@ -275,18 +282,13 @@ static irqreturn_t wdt_interrupt(int irq, void *dev_id)
printk(KERN_CRIT "WDT status %d\n", status);
-#ifdef CONFIG_WDT_501
- if (!(status & WDC_SR_TGOOD))
- printk(KERN_CRIT "Overheat alarm.(%d)\n", inb_p(WDT_RT));
- if (!(status & WDC_SR_PSUOVER))
- printk(KERN_CRIT "PSU over voltage.\n");
- if (!(status & WDC_SR_PSUUNDR))
- printk(KERN_CRIT "PSU under voltage.\n");
- if (tachometer) {
- if (!(status & WDC_SR_FANGOOD))
- printk(KERN_CRIT "Possible fan fault.\n");
+ if (type == 501) {
+ wdt_decode_501(status);
+ if (tachometer) {
+ if (!(status & WDC_SR_FANGOOD))
+ printk(KERN_CRIT "Possible fan fault.\n");
+ }
}
-#endif /* CONFIG_WDT_501 */
if (!(status & WDC_SR_WCCR)) {
#ifdef SOFTWARE_REBOOT
#ifdef ONLY_TESTING
@@ -366,17 +368,18 @@ static long wdt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
/* Add options according to the card we have */
ident.options |= (WDIOF_EXTERN1|WDIOF_EXTERN2);
-#ifdef CONFIG_WDT_501
- ident.options |= (WDIOF_OVERHEAT|WDIOF_POWERUNDER|WDIOF_POWEROVER);
- if (tachometer)
- ident.options |= WDIOF_FANFAULT;
-#endif /* CONFIG_WDT_501 */
+ if (type == 501) {
+ ident.options |= (WDIOF_OVERHEAT|WDIOF_POWERUNDER|
+ WDIOF_POWEROVER);
+ if (tachometer)
+ ident.options |= WDIOF_FANFAULT;
+ }
switch (cmd) {
case WDIOC_GETSUPPORT:
return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
case WDIOC_GETSTATUS:
- wdt_get_status(&status);
+ status = wdt_get_status();
return put_user(status, p);
case WDIOC_GETBOOTSTATUS:
return put_user(0, p);
@@ -446,7 +449,6 @@ static int wdt_release(struct inode *inode, struct file *file)
return 0;
}
-#ifdef CONFIG_WDT_501
/**
* wdt_temp_read:
* @file: file handle to the watchdog board
@@ -461,10 +463,7 @@ static int wdt_release(struct inode *inode, struct file *file)
static ssize_t wdt_temp_read(struct file *file, char __user *buf,
size_t count, loff_t *ptr)
{
- int temperature;
-
- if (wdt_get_temperature(&temperature))
- return -EFAULT;
+ int temperature = wdt_get_temperature();
if (copy_to_user(buf, &temperature, 1))
return -EFAULT;
@@ -497,7 +496,6 @@ static int wdt_temp_release(struct inode *inode, struct file *file)
{
return 0;
}
-#endif /* CONFIG_WDT_501 */
/**
* notify_sys:
@@ -539,7 +537,6 @@ static struct miscdevice wdt_miscdev = {
.fops = &wdt_fops,
};
-#ifdef CONFIG_WDT_501
static const struct file_operations wdt_temp_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
@@ -553,7 +550,6 @@ static struct miscdevice temp_miscdev = {
.name = "temperature",
.fops = &wdt_temp_fops,
};
-#endif /* CONFIG_WDT_501 */
/*
* The WDT card needs to learn about soft shutdowns in order to
@@ -577,9 +573,8 @@ static struct notifier_block wdt_notifier = {
static void __exit wdt_exit(void)
{
misc_deregister(&wdt_miscdev);
-#ifdef CONFIG_WDT_501
- misc_deregister(&temp_miscdev);
-#endif /* CONFIG_WDT_501 */
+ if (type == 501)
+ misc_deregister(&temp_miscdev);
unregister_reboot_notifier(&wdt_notifier);
free_irq(irq, NULL);
release_region(io, 8);
@@ -597,12 +592,17 @@ static int __init wdt_init(void)
{
int ret;
+ if (type != 500 && type != 501) {
+ printk(KERN_ERR "wdt: unknown card type '%d'.\n", type);
+ return -ENODEV;
+ }
+
	/* Check that the heartbeat value is within its range;
	   if not, reset to the default */
if (wdt_set_heartbeat(heartbeat)) {
wdt_set_heartbeat(WD_TIMO);
- printk(KERN_INFO "wdt: heartbeat value must be 0 < heartbeat < 65536, using %d\n",
- WD_TIMO);
+ printk(KERN_INFO "wdt: heartbeat value must be "
+ "0 < heartbeat < 65536, using %d\n", WD_TIMO);
}
if (!request_region(io, 8, "wdt501p")) {
@@ -625,15 +625,14 @@ static int __init wdt_init(void)
goto outirq;
}
-#ifdef CONFIG_WDT_501
- ret = misc_register(&temp_miscdev);
- if (ret) {
- printk(KERN_ERR
- "wdt: cannot register miscdev on minor=%d (err=%d)\n",
- TEMP_MINOR, ret);
- goto outrbt;
+ if (type == 501) {
+ ret = misc_register(&temp_miscdev);
+ if (ret) {
+ printk(KERN_ERR "wdt: cannot register miscdev "
+ "on minor=%d (err=%d)\n", TEMP_MINOR, ret);
+ goto outrbt;
+ }
}
-#endif /* CONFIG_WDT_501 */
ret = misc_register(&wdt_miscdev);
if (ret) {
@@ -643,28 +642,25 @@ static int __init wdt_init(void)
goto outmisc;
}
- ret = 0;
- printk(KERN_INFO "WDT500/501-P driver 0.10 at 0x%04x (Interrupt %d). heartbeat=%d sec (nowayout=%d)\n",
+ printk(KERN_INFO "WDT500/501-P driver 0.10 "
+ "at 0x%04x (Interrupt %d). heartbeat=%d sec (nowayout=%d)\n",
io, irq, heartbeat, nowayout);
-#ifdef CONFIG_WDT_501
- printk(KERN_INFO "wdt: Fan Tachometer is %s\n",
+ if (type == 501)
+ printk(KERN_INFO "wdt: Fan Tachometer is %s\n",
(tachometer ? "Enabled" : "Disabled"));
-#endif /* CONFIG_WDT_501 */
-
-out:
- return ret;
+ return 0;
outmisc:
-#ifdef CONFIG_WDT_501
- misc_deregister(&temp_miscdev);
+ if (type == 501)
+ misc_deregister(&temp_miscdev);
outrbt:
-#endif /* CONFIG_WDT_501 */
unregister_reboot_notifier(&wdt_notifier);
outirq:
free_irq(irq, NULL);
outreg:
release_region(io, 8);
- goto out;
+out:
+ return ret;
}
module_init(wdt_init);
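With the compile-time CONFIG_WDT_501 choice replaced by the type module
parameter, card selection now happens at load time. An illustrative
invocation using the parameters shown above (values are the documented
defaults plus a 501 card with fan tachometer):

	modprobe wdt io=0x240 irq=11 type=501 tachometer=1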
diff --git a/drivers/watchdog/wdt977.c b/drivers/watchdog/wdt977.c
index 60e28d49ff5..90ef70eb47d 100644
--- a/drivers/watchdog/wdt977.c
+++ b/drivers/watchdog/wdt977.c
@@ -401,7 +401,7 @@ static long wdt977_ioctl(struct file *file, unsigned int cmd,
return -EFAULT;
if (wdt977_set_timeout(new_timeout))
- return -EINVAL;
+ return -EINVAL;
wdt977_keepalive();
/* Fall */
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 851388fafc7..65984006192 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -6,7 +6,16 @@ the server to treat subsequent connections, especially those that
are authenticated as guest, as reconnections, invalidating the earlier
user's smb session. This fix allows cifs to mount multiple times to the
same server with different userids without risking invalidating earlier
-established security contexts.
+established security contexts. fsync now sends an SMB Flush operation
+to better ensure that we wait for the server to write all of the data
+to server disk (not just write it over the network). Add a new mount
+parameter to allow the user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if the server
+fails to support it properly, as with Samba server versions prior to 3.3.2).
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount).
Version 1.56
------------
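The flush-on-fsync behaviour described above is the default; the opt-out is
the nostrictsync mount option, used along these lines (server and share
names illustrative):

	mount -t cifs //server/share /mnt/cifs -o username=user,nostrictsync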
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 341a98965bd..6994a0f54f0 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -118,6 +118,18 @@ config CIFS_DEBUG2
option can be turned off unless you are debugging
cifs problems. If unsure, say N.
+config CIFS_DFS_UPCALL
+ bool "DFS feature support"
+ depends on CIFS && KEYS
+ help
+ Distributed File System (DFS) support is used to access shares
+ transparently in an enterprise name space, even if the share
+ moves to a different server. This feature also enables
+ an upcall mechanism for CIFS which contacts userspace helper
+ utilities to provide server name resolution (host names to
+ IP addresses) which is needed for implicit mounts of DFS junction
+ points. If unsure, say N.
+
config CIFS_EXPERIMENTAL
bool "CIFS Experimental Features (EXPERIMENTAL)"
depends on CIFS && EXPERIMENTAL
@@ -131,12 +143,3 @@ config CIFS_EXPERIMENTAL
(which is disabled by default). See the file fs/cifs/README
for more details. If unsure, say N.
-config CIFS_DFS_UPCALL
- bool "DFS feature support (EXPERIMENTAL)"
- depends on CIFS_EXPERIMENTAL
- depends on KEYS
- help
- Enables an upcall mechanism for CIFS which contacts userspace
- helper utilities to provide server name resolution (host names to
- IP addresses) which is needed for implicit mounts of DFS junction
- points. If unsure, say N.
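The upcall mechanism named in this help text is dispatched through
request-key(8). The conventional /etc/request-key.conf entries for the
cifs.upcall helper look like the following (the helper path varies by
distribution):

	create	cifs.spnego	*	*	/usr/sbin/cifs.upcall %k
	create	dns_resolver	*	*	/usr/sbin/cifs.upcall %k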
diff --git a/fs/cifs/README b/fs/cifs/README
index da4515e3be2..07434181623 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -472,6 +472,19 @@ A partial list of the supported mount options follows:
even if the cifs server would support posix advisory locks.
"forcemand" is accepted as a shorter form of this mount
option.
+ nostrictsync	If this mount option is set, when an application calls
+		fsync, the cifs client does not send an SMB Flush request
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write. Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		slightly delaying the flush of the data to disk on the server),
+		turning on this option may improve performance for
+		applications that fsync too often, at a small risk of losing
+		data if the server crashes. If this mount option is not set,
+		cifs will by default send an SMB flush request (and wait for
+		a response) on every fsync call.
nodfs Disable DFS (global name space support) even if the
server claims to support it. This can help work around
a problem with parsing of DFS paths with Samba server
@@ -692,13 +705,14 @@ require this helper. Note that NTLMv2 security (which does not require the
cifs.upcall helper program), instead of using Kerberos, is sufficient for
some use cases.
-Enabling DFS support (used to access shares transparently in an MS-DFS
-global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
-addition, DFS support for target shares which are specified as UNC
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
names which begin with host names (rather than IP addresses) requires
a user space helper (such as cifs.upcall) to be present in order to
translate host names to IP addresses, and the user space helper must also
-be configured in the file /etc/request-key.conf
+be configured in the file /etc/request-key.conf. Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
To use cifs Kerberos and DFS support, the Linux keyutils package should be
installed and something like the following lines should be added to the
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 490e34bbf27..877e4d9a115 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -340,6 +340,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nWrites: %d Bytes: %lld",
atomic_read(&tcon->num_writes),
(long long)(tcon->bytes_written));
+ seq_printf(m, "\nFlushes: %d",
+ atomic_read(&tcon->num_flushes));
seq_printf(m, "\nLocks: %d HardLinks: %d "
"Symlinks: %d",
atomic_read(&tcon->num_locks),
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 85c0a74d034..5fdbf8a1447 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -104,9 +104,9 @@ static char *cifs_get_share_name(const char *node_name)
/**
- * compose_mount_options - creates mount options for refferral
+ * cifs_compose_mount_options - creates mount options for referral
* @sb_mountdata: parent/root DFS mount options (template)
- * @dentry: point where we are going to mount
+ * @fullpath: full path in UNC format
* @ref: server's referral
* @devname: pointer for saving device name
*
@@ -116,8 +116,8 @@ static char *cifs_get_share_name(const char *node_name)
* Returns: pointer to new mount options or ERR_PTR.
* Caller is responsible for freeing the returned value if it is not an error.
*/
-static char *compose_mount_options(const char *sb_mountdata,
- struct dentry *dentry,
+char *cifs_compose_mount_options(const char *sb_mountdata,
+ const char *fullpath,
const struct dfs_info3_param *ref,
char **devname)
{
@@ -128,7 +128,6 @@ static char *compose_mount_options(const char *sb_mountdata,
char *srvIP = NULL;
char sep = ',';
int off, noff;
- char *fullpath;
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
@@ -202,17 +201,6 @@ static char *compose_mount_options(const char *sb_mountdata,
goto compose_mount_options_err;
}
- /*
- * this function gives us a path with a double backslash prefix. We
- * require a single backslash for DFS. Temporarily increment fullpath
- * to put it in the proper form and decrement before freeing it.
- */
- fullpath = build_path_from_dentry(dentry);
- if (!fullpath) {
- rc = -ENOMEM;
- goto compose_mount_options_err;
- }
- ++fullpath;
tkn_e = strchr(tkn_e + 1, '\\');
if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
strncat(mountdata, &sep, 1);
@@ -221,8 +209,6 @@ static char *compose_mount_options(const char *sb_mountdata,
strcat(mountdata, tkn_e + 1);
strcat(mountdata, fullpath + ref->path_consumed);
}
- --fullpath;
- kfree(fullpath);
/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -245,10 +231,20 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
struct vfsmount *mnt;
char *mountdata;
char *devname = NULL;
+ char *fullpath;
cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
- mountdata = compose_mount_options(cifs_sb->mountdata,
- dentry, ref, &devname);
+ /*
+ * this function gives us a path with a double backslash prefix. We
+ * require a single backslash for DFS.
+ */
+ fullpath = build_path_from_dentry(dentry);
+ if (!fullpath)
+ return ERR_PTR(-ENOMEM);
+
+ mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
+ fullpath + 1, ref, &devname);
+ kfree(fullpath);
if (IS_ERR(mountdata))
return (struct vfsmount *)mountdata;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c4c306f7b06..4797787c6a4 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -32,6 +32,7 @@
#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
+#define CIFS_MOUNT_NOSSYNC 0x4000 /* don't do slow SMBflush on every sync */
struct cifs_sb_info {
struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e004f6db5fc..9fbf4dff5da 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -254,6 +254,7 @@ struct cifsTconInfo {
atomic_t num_smbs_sent;
atomic_t num_writes;
atomic_t num_reads;
+ atomic_t num_flushes;
atomic_t num_oplock_brks;
atomic_t num_opens;
atomic_t num_closes;
@@ -298,6 +299,7 @@ struct cifsTconInfo {
bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
for this mount even if server would support */
bool local_lease:1; /* check leases (only) on local system not remote */
+ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool need_reconnect:1; /* connection reset, tid now invalid */
/* BB add field for back pointer to sb struct(s)? */
};
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b4e2e9f0ee3..b370489c8da 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
/*
* fs/cifs/cifspdu.h
*
- * Copyright (c) International Business Machines Corp., 2002,2008
+ * Copyright (c) International Business Machines Corp., 2002,2009
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
#define _CIFSPDU_H
#include <net/sock.h>
+#include "smbfsctl.h"
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define LANMAN_PROT 0
@@ -34,15 +35,15 @@
#define POSIX_PROT (CIFS_PROT+1)
#define BAD_PROT 0xFFFF
-/* SMB command codes */
-/*
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+/* SMB command codes:
+ * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
* (ie which include no useful data other than the SMB error code itself).
- * Knowing this helps avoid response buffer allocations and copy in some cases
+ * This can allow us to avoid response buffer allocations and copies in some cases
*/
#define SMB_COM_CREATE_DIRECTORY 0x00 /* trivial response */
#define SMB_COM_DELETE_DIRECTORY 0x01 /* trivial response */
#define SMB_COM_CLOSE 0x04 /* triv req/rsp, timestamp ignored */
+#define SMB_COM_FLUSH 0x05 /* triv req/rsp */
#define SMB_COM_DELETE 0x06 /* trivial response */
#define SMB_COM_RENAME 0x07 /* trivial response */
#define SMB_COM_QUERY_INFORMATION 0x08 /* aka getattr */
@@ -790,6 +791,12 @@ typedef struct smb_com_close_rsp {
__u16 ByteCount; /* bct = 0 */
} __attribute__((packed)) CLOSE_RSP;
+typedef struct smb_com_flush_req {
+ struct smb_hdr hdr; /* wct = 1 */
+ __u16 FileID;
+ __u16 ByteCount; /* 0 */
+} __attribute__((packed)) FLUSH_REQ;
+
typedef struct smb_com_findclose_req {
struct smb_hdr hdr; /* wct = 1 */
__u16 FileID;
@@ -1924,19 +1931,19 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
#define DFS_TYPE_ROOT 0x0001
/* Referral Entry Flags */
-#define DFS_NAME_LIST_REF 0x0200
+#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
+#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
-typedef struct dfs_referral_level_3 {
- __le16 VersionNumber;
+typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
+ __le16 VersionNumber; /* must be 3 or 4 */
__le16 Size;
__le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
- __le16 ReferralEntryFlags; /* 0x0200 bit set only for domain
- or DC referral responce */
+ __le16 ReferralEntryFlags;
__le32 TimeToLive;
__le16 DfsPathOffset;
__le16 DfsAlternatePathOffset;
__le16 NetworkAddressOffset; /* offset of the link target */
- __le16 ServiceSiteGuid;
+ __u8 ServiceSiteGuid[16]; /* MBZ, ignored */
} __attribute__((packed)) REFERRAL3;
typedef struct smb_com_transaction_get_dfs_refer_rsp {
@@ -1946,48 +1953,15 @@ typedef struct smb_com_transaction_get_dfs_refer_rsp {
__u8 Pad;
__le16 PathConsumed;
__le16 NumberOfReferrals;
- __le16 DFSFlags;
- __u16 Pad2;
+ __le32 DFSFlags;
REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */
/* followed by the strings pointed to by the referral structures */
} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
/* DFS Flags */
-#define DFSREF_REFERRAL_SERVER 0x0001
-#define DFSREF_STORAGE_SERVER 0x0002
-
-/* IOCTL information */
-/*
- * List of ioctl function codes that look to be of interest to remote clients
- * like this one. Need to do some experimentation to make sure they all work
- * remotely. Some of the following, such as the encryption/compression ones
- * would be invoked from tools via a specialized hook into the VFS rather
- * than via the standard vfs entry points
- */
-#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
-#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
-#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
-#define FSCTL_LOCK_VOLUME 0x00090018
-#define FSCTL_UNLOCK_VOLUME 0x0009001C
-#define FSCTL_GET_COMPRESSION 0x0009003C
-#define FSCTL_SET_COMPRESSION 0x0009C040
-#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
-#define FSCTL_FILESYS_GET_STATISTICS 0x00090090
-#define FSCTL_SET_REPARSE_POINT 0x000900A4
-#define FSCTL_GET_REPARSE_POINT 0x000900A8
-#define FSCTL_DELETE_REPARSE_POINT 0x000900AC
-#define FSCTL_SET_SPARSE 0x000900C4
-#define FSCTL_SET_ZERO_DATA 0x000900C8
-#define FSCTL_SET_ENCRYPTION 0x000900D7
-#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB
-#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF
-#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3
-#define FSCTL_SIS_COPYFILE 0x00090100
-#define FSCTL_SIS_LINK_FILES 0x0009C104
-
-#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
-#define IO_REPARSE_TAG_HSM 0xC0000004
-#define IO_REPARSE_TAG_SIS 0x80000007
+#define DFSREF_REFERRAL_SERVER 0x00000001 /* all targets are DFS roots */
+#define DFSREF_STORAGE_SERVER 0x00000002 /* no further ref requests needed */
+#define DFSREF_TARGET_FAILBACK 0x00000004 /* only for DFS referral version 4 */
/*
************************************************************************
@@ -2508,8 +2482,6 @@ struct data_blob {
6) Use nanosecond timestamps throughout all time fields if
corresponding attribute flag is set
7) sendfile - handle based copy
- 8) Direct i/o
- 9) Misc fcntls?
what about fixing 64 bit alignment
@@ -2628,7 +2600,5 @@ typedef struct file_chattr_info {
__le64 mode; /* list of actual attribute bits on this inode */
} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes
(chattr, chflags) level 0x206 */
-
-#endif
-
+#endif /* POSIX */
#endif /* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 083dfc57c7a..4167716d32f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -44,6 +44,9 @@ extern void _FreeXid(unsigned int);
extern char *build_path_from_dentry(struct dentry *);
extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
+extern char *cifs_compose_mount_options(const char *sb_mountdata,
+ const char *fullpath, const struct dfs_info3_param *ref,
+ char **devname);
/* extern void renew_parental_timestamps(struct dentry *direntry);*/
extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
struct smb_hdr * /* input */ ,
@@ -92,6 +95,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern int cifs_posix_open(char *full_path, struct inode **pinode,
+ struct super_block *sb, int mode, int oflags,
+ int *poplock, __u16 *pnetfid, int xid);
extern void posix_fill_in_inode(struct inode *tmp_inode,
FILE_UNIX_BASIC_INFO *pData, int isNewInode);
extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
@@ -281,6 +287,9 @@ extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
const int smb_file_id);
+extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+ const int smb_file_id);
+
extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
const int netfid, unsigned int count,
const __u64 lseek, unsigned int *nbytes, char **buf,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 939e2f76b95..bc09c998631 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1934,6 +1934,27 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
}
int
+CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+{
+ int rc = 0;
+ FLUSH_REQ *pSMB = NULL;
+ cFYI(1, ("In CIFSSMBFlush"));
+
+ rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
+ if (rc)
+ return rc;
+
+ pSMB->FileID = (__u16) smb_file_id;
+ pSMB->ByteCount = 0;
+ rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ cifs_stats_inc(&tcon->num_flushes);
+ if (rc)
+ cERROR(1, ("Send error in Flush = %d", rc));
+
+ return rc;
+}
+
+int
CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
const char *fromName, const char *toName,
const struct nls_table *nls_codepage, int remap)
@@ -2356,8 +2377,10 @@ winCreateHardLinkRetry:
PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
- pSMB->OldFileName[name_len] = 0; /* pad */
- pSMB->OldFileName[name_len + 1] = 0x04;
+
+ /* protocol specifies ASCII buffer format (0x04) for unicode */
+ pSMB->OldFileName[name_len] = 0x04;
+ pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
name_len2 =
cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
toName, PATH_MAX, nls_codepage, remap);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index da0f4ffa061..0de3b5615a2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -95,6 +95,7 @@ struct smb_vol {
bool local_lease:1; /* check leases only on local system, not remote */
bool noblocksnd:1;
bool noautotune:1;
+ bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
unsigned int rsize;
unsigned int wsize;
unsigned int sockopt;
@@ -1274,6 +1275,10 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->intr = 0;
} else if (strnicmp(data, "intr", 4) == 0) {
vol->intr = 1;
+ } else if (strnicmp(data, "nostrictsync", 12) == 0) {
+ vol->nostrictsync = 1;
+ } else if (strnicmp(data, "strictsync", 10) == 0) {
+ vol->nostrictsync = 0;
} else if (strnicmp(data, "serverino", 7) == 0) {
vol->server_ino = 1;
} else if (strnicmp(data, "noserverino", 9) == 0) {
@@ -2160,6 +2165,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
if (pvolume_info->nobrl)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+ if (pvolume_info->nostrictsync)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
if (pvolume_info->mand_lock)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
if (pvolume_info->cifs_acl)
@@ -3667,7 +3674,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
BCC(smb_buffer_response)) {
kfree(tcon->nativeFileSystem);
tcon->nativeFileSystem =
- kzalloc(length + 2, GFP_KERNEL);
+ kzalloc(2*(length + 1), GFP_KERNEL);
if (tcon->nativeFileSystem)
cifs_strfromUCS_le(
tcon->nativeFileSystem,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 89fb7283265..f9b6f68be97 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,7 +129,7 @@ cifs_bp_rename_retry:
return full_path;
}
-static int cifs_posix_open(char *full_path, struct inode **pinode,
+int cifs_posix_open(char *full_path, struct inode **pinode,
struct super_block *sb, int mode, int oflags,
int *poplock, __u16 *pnetfid, int xid)
{
@@ -187,7 +187,9 @@ static int cifs_posix_open(char *full_path, struct inode **pinode,
if (!pinode)
goto posix_open_ret; /* caller does not need info */
- *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+ if (*pinode == NULL)
+ *pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+ /* else an inode was passed in. Update its info, don't create one */
/* We do not need to close the file if new_inode fails since
the caller will retry qpathinfo as long as inode is null */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 12bb656fbe7..81747acca4c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,8 +78,36 @@ static inline int cifs_convert_flags(unsigned int flags)
return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
FILE_READ_DATA);
+}
+static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+{
+ fmode_t posix_flags = 0;
+ if ((flags & O_ACCMODE) == O_RDONLY)
+ posix_flags = FMODE_READ;
+ else if ((flags & O_ACCMODE) == O_WRONLY)
+ posix_flags = FMODE_WRITE;
+ else if ((flags & O_ACCMODE) == O_RDWR) {
+		/* GENERIC_ALL is too much permission to request; it
+		   can cause unnecessary access denied errors on create */
+ /* return GENERIC_ALL; */
+ posix_flags = FMODE_READ | FMODE_WRITE;
+ }
+ /* can not map O_CREAT or O_EXCL or O_TRUNC flags when
+ reopening a file. They had their effect on the original open */
+ if (flags & O_APPEND)
+ posix_flags |= (fmode_t)O_APPEND;
+ if (flags & O_SYNC)
+ posix_flags |= (fmode_t)O_SYNC;
+ if (flags & O_DIRECTORY)
+ posix_flags |= (fmode_t)O_DIRECTORY;
+ if (flags & O_NOFOLLOW)
+ posix_flags |= (fmode_t)O_NOFOLLOW;
+ if (flags & O_DIRECT)
+ posix_flags |= (fmode_t)O_DIRECT;
+
+ return posix_flags;
}
static inline int cifs_get_disposition(unsigned int flags)
@@ -97,6 +125,80 @@ static inline int cifs_get_disposition(unsigned int flags)
}
/* all arguments to this function must be checked for validity in caller */
+static inline int cifs_posix_open_inode_helper(struct inode *inode,
+ struct file *file, struct cifsInodeInfo *pCifsInode,
+ struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+/* struct timespec temp; */ /* BB REMOVEME BB */
+
+ file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+ if (file->private_data == NULL)
+ return -ENOMEM;
+ pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
+ write_lock(&GlobalSMBSeslock);
+ list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
+
+ pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+ if (pCifsInode == NULL) {
+ write_unlock(&GlobalSMBSeslock);
+ return -EINVAL;
+ }
+
+	/* put handles we can read with at the front of the list
+	   so that we do not have to walk it searching for one
+	   in write_begin */
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
+ list_add_tail(&pCifsFile->flist,
+ &pCifsInode->openFileList);
+ } else {
+ list_add(&pCifsFile->flist,
+ &pCifsInode->openFileList);
+ }
+
+ if (pCifsInode->clientCanCacheRead) {
+ /* we have the inode open somewhere else
+ no need to discard cache data */
+ goto psx_client_can_cache;
+ }
+
+ /* BB FIXME need to fix this check to move it earlier into posix_open
+	   BB FIX following section BB FIXME */
+
+ /* if not oplocked, invalidate inode pages if mtime or file
+ size changed */
+/* temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+ if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
+ (file->f_path.dentry->d_inode->i_size ==
+ (loff_t)le64_to_cpu(buf->EndOfFile))) {
+ cFYI(1, ("inode unchanged on server"));
+ } else {
+ if (file->f_path.dentry->d_inode->i_mapping) {
+ rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+ if (rc != 0)
+ CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+ }
+ cFYI(1, ("invalidating remote inode since open detected it "
+ "changed"));
+ invalidate_remote_inode(file->f_path.dentry->d_inode);
+ } */
+
+psx_client_can_cache:
+ if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+ pCifsInode->clientCanCacheAll = true;
+ pCifsInode->clientCanCacheRead = true;
+ cFYI(1, ("Exclusive Oplock granted on inode %p",
+ file->f_path.dentry->d_inode));
+ } else if ((oplock & 0xF) == OPLOCK_READ)
+ pCifsInode->clientCanCacheRead = true;
+
+ /* will have to change the unlock if we reenable the
+	   filemap_fdatawrite (which does not seem necessary) */
+ write_unlock(&GlobalSMBSeslock);
+ return 0;
+}
+
+/* all arguments to this function must be checked for validity in caller */
static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
@@ -167,7 +269,7 @@ int cifs_open(struct inode *inode, struct file *file)
int rc = -EACCES;
int xid, oplock;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
+ struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
struct cifsInodeInfo *pCifsInode;
struct list_head *tmp;
@@ -180,7 +282,7 @@ int cifs_open(struct inode *inode, struct file *file)
xid = GetXid();
cifs_sb = CIFS_SB(inode->i_sb);
- pTcon = cifs_sb->tcon;
+ tcon = cifs_sb->tcon;
if (file->f_flags & O_CREAT) {
/* search inode for this file and fill in file->private_data */
@@ -220,6 +322,45 @@ int cifs_open(struct inode *inode, struct file *file)
cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
inode, file->f_flags, full_path));
+
+ if (oplockEnabled)
+ oplock = REQ_OPLOCK;
+ else
+ oplock = 0;
+
+ if (!tcon->broken_posix_open && tcon->unix_ext &&
+ (tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+ /* can not refresh inode info since size could be stale */
+ rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+ cifs_sb->mnt_file_mode /* ignored */,
+ oflags, &oplock, &netfid, xid);
+ if (rc == 0) {
+ cFYI(1, ("posix open succeeded"));
+ /* no need for special case handling of setting mode
+ on read only files needed here */
+
+ cifs_posix_open_inode_helper(inode, file, pCifsInode,
+ pCifsFile, oplock, netfid);
+ goto out;
+ } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+ if (tcon->ses->serverNOS)
+ cERROR(1, ("server %s of type %s returned"
+ " unexpected error on SMB posix open"
+ ", disabling posix open support."
+ " Check if server update available.",
+ tcon->ses->serverName,
+ tcon->ses->serverNOS));
+ tcon->broken_posix_open = true;
+ } else if ((rc != -EIO) && (rc != -EREMOTE) &&
+ (rc != -EOPNOTSUPP)) /* path not found or net err */
+ goto out;
+ /* else fallthrough to retry open the old way on network i/o
+ or DFS errors */
+ }
+
desiredAccess = cifs_convert_flags(file->f_flags);
/*********************************************************************
@@ -248,11 +389,6 @@ int cifs_open(struct inode *inode, struct file *file)
disposition = cifs_get_disposition(file->f_flags);
- if (oplockEnabled)
- oplock = REQ_OPLOCK;
- else
- oplock = 0;
-
/* BB pass O_SYNC flag through on file attributes .. BB */
/* Also refresh inode by passing in file_info buf returned by SMBOpen
@@ -269,7 +405,7 @@ int cifs_open(struct inode *inode, struct file *file)
}
if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
- rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+ rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -278,7 +414,7 @@ int cifs_open(struct inode *inode, struct file *file)
if (rc == -EIO) {
/* Old server, try legacy style OpenX */
- rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+ rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -295,12 +431,12 @@ int cifs_open(struct inode *inode, struct file *file)
}
pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
write_lock(&GlobalSMBSeslock);
- list_add(&pCifsFile->tlist, &pTcon->openFileList);
+ list_add(&pCifsFile->tlist, &tcon->openFileList);
pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
if (pCifsInode) {
rc = cifs_open_inode_helper(inode, file, pCifsInode,
- pCifsFile, pTcon,
+ pCifsFile, tcon,
&oplock, buf, full_path, xid);
} else {
write_unlock(&GlobalSMBSeslock);
@@ -309,7 +445,7 @@ int cifs_open(struct inode *inode, struct file *file)
if (oplock & CIFS_CREATE_ACTION) {
/* time to set mode which we can not set earlier due to
problems creating new read-only files */
- if (pTcon->unix_ext) {
+ if (tcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = inode->i_mode,
.uid = NO_CHANGE_64,
@@ -319,7 +455,7 @@ int cifs_open(struct inode *inode, struct file *file)
.mtime = NO_CHANGE_64,
.device = 0,
};
- CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+ CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -349,7 +485,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
int rc = -EACCES;
int xid, oplock;
struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
+ struct cifsTconInfo *tcon;
struct cifsFileInfo *pCifsFile;
struct cifsInodeInfo *pCifsInode;
struct inode *inode;
@@ -387,7 +523,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
}
cifs_sb = CIFS_SB(inode->i_sb);
- pTcon = cifs_sb->tcon;
+ tcon = cifs_sb->tcon;
/* can not grab rename sem here because various ops, including
those that already have the rename sem can end up causing writepage
@@ -404,20 +540,37 @@ reopen_error_exit:
cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
inode, file->f_flags, full_path));
- desiredAccess = cifs_convert_flags(file->f_flags);
if (oplockEnabled)
oplock = REQ_OPLOCK;
else
oplock = 0;
+ if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+ (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+ /* can not refresh inode info since size could be stale */
+ rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+ cifs_sb->mnt_file_mode /* ignored */,
+ oflags, &oplock, &netfid, xid);
+ if (rc == 0) {
+ cFYI(1, ("posix reopen succeeded"));
+ goto reopen_success;
+ }
+ /* fallthrough to retry open the old way on errors, especially
+ in the reconnect path it is important to retry hard */
+ }
+
+ desiredAccess = cifs_convert_flags(file->f_flags);
+
/* Can not refresh inode by passing in file_info buf to be returned
by SMBOpen and then calling get_inode_info with returned buf
since file might have write behind data that needs to be flushed
and server version of file size can be stale. If we knew for sure
that inode was not dirty locally we could do this */
- rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
+ rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
CREATE_NOT_DIR, &netfid, &oplock, NULL,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -426,6 +579,7 @@ reopen_error_exit:
cFYI(1, ("cifs_open returned 0x%x", rc));
cFYI(1, ("oplock: %d", oplock));
} else {
+reopen_success:
pCifsFile->netfid = netfid;
pCifsFile->invalidHandle = false;
up(&pCifsFile->fh_sem);
@@ -439,7 +593,7 @@ reopen_error_exit:
go to server to get inode info */
pCifsInode->clientCanCacheAll = false;
pCifsInode->clientCanCacheRead = false;
- if (pTcon->unix_ext)
+ if (tcon->unix_ext)
rc = cifs_get_inode_info_unix(&inode,
full_path, inode->i_sb, xid);
else
@@ -467,7 +621,6 @@ reopen_error_exit:
cifs_relock_file(pCifsFile);
}
}
-
kfree(full_path);
FreeXid(xid);
return rc;
@@ -1523,6 +1676,9 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
int xid;
int rc = 0;
+ struct cifsTconInfo *tcon;
+ struct cifsFileInfo *smbfile =
+ (struct cifsFileInfo *)file->private_data;
struct inode *inode = file->f_path.dentry->d_inode;
xid = GetXid();
@@ -1534,7 +1690,12 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (rc == 0) {
rc = CIFS_I(inode)->write_behind_rc;
CIFS_I(inode)->write_behind_rc = 0;
+ tcon = CIFS_SB(inode->i_sb)->tcon;
+ if (!rc && tcon && smbfile &&
+ !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+ rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
}
+
FreeXid(xid);
return rc;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4690a360c85..a8797cc6080 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -763,6 +763,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
struct cifsTconInfo *pTcon = cifs_sb->tcon;
FILE_BASIC_INFO info_buf;
+ if (attrs == NULL)
+ return -EINVAL;
+
if (attrs->ia_valid & ATTR_ATIME) {
set_time = true;
info_buf.LastAccessTime =
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
new file mode 100644
index 00000000000..7056b891e08
--- /dev/null
+++ b/fs/cifs/smbfsctl.h
@@ -0,0 +1,84 @@
+/*
+ * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ * Copyright (c) International Business Machines Corp., 2002,2009
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like the cifs or SMB2 clients. There is probably
+ * a slightly larger set of fsctls that the NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely. Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ */
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
+#define FSCTL_LOCK_VOLUME 0x00090018
+#define FSCTL_UNLOCK_VOLUME 0x0009001C
+#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */
+/* Verify the next FSCTL number; we previously had it as 0x00090090 */
+#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C
+#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */
+#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
+#define FSCTL_SIS_LINK_FILES 0x0009C104
+#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
+/* Oddly, the number for this op is not sequential with the previous op */
+#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+
+#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
+#define IO_REPARSE_TAG_HSM 0xC0000004
+#define IO_REPARSE_TAG_SIS 0x80000007
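
The FSCTL values above follow the Windows CTL_CODE packing: device type in bits 31..16 (0x0009 is FILE_DEVICE_FILE_SYSTEM), required access in bits 15..14, function number in bits 13..2, and transfer method in bits 1..0. A minimal decoder to illustrate the layout (not part of this patch; plain userspace C):

    /* Sketch: decode an FSCTL using the CTL_CODE bit layout. */
    #include <stdio.h>

    static void decompose_fsctl(unsigned int code)
    {
            unsigned int device   = code >> 16;         /* 0x0009 = FILE_DEVICE_FILE_SYSTEM */
            unsigned int access   = (code >> 14) & 0x3; /* 0 = FILE_ANY_ACCESS */
            unsigned int function = (code >> 2) & 0xfff;
            unsigned int method   = code & 0x3;         /* 0 = METHOD_BUFFERED */

            printf("dev=0x%04x access=%u func=0x%03x method=%u\n",
                   device, access, function, method);
    }

    int main(void)
    {
            decompose_fsctl(0x000900C4); /* FSCTL_SET_SPARSE: dev 0x9, func 0x31 */
            return 0;
    }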
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5fa453b49a6..05e5c2e5c0d 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1435,6 +1435,10 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
return 0;
}
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
/*
* Note that we always start a transaction even if we're not journalling
* data. This is to preserve ordering: any hole instantiation within
@@ -1505,6 +1509,15 @@ static int ext3_ordered_writepage(struct page *page,
if (ext3_journal_current_handle())
goto out_fail;
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+	} else if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL,
+				      buffer_unmapped)) {
+		/*
+		 * All buffers are mapped: pass NULL instead of get_block so
+		 * that we catch bugs if any of them weren't really mapped.
+		 */
+		return block_write_full_page(page, NULL, wbc);
+ }
+ page_bufs = page_buffers(page);
+
handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
@@ -1512,11 +1525,6 @@ static int ext3_ordered_writepage(struct page *page,
goto out_fail;
}
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
- }
- page_bufs = page_buffers(page);
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bget_one);
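
The reordered hunk above only pays for a journal handle when it has to: walk_page_buffers() returns the first nonzero value produced by its callback, so with buffer_unmapped() as the callback a zero result means every buffer on the page is already mapped and block_write_full_page() can be used directly. A simplified sketch of that contract (illustrative, not the kernel's exact implementation):

    /* Sketch: first-nonzero accumulation, as assumed by the hunk above. */
    struct buffer_head;                     /* opaque in this sketch */
    typedef void handle_t;

    static int walk_buffers(handle_t *handle, struct buffer_head **bufs,
                            int n, int (*fn)(handle_t *, struct buffer_head *))
    {
            int i, err, ret = 0;

            for (i = 0; i < n; i++) {
                    err = fn(handle, bufs[i]);
                    if (!ret)
                            ret = err;      /* keep the first nonzero result */
            }
            return ret;
    }

    /*
     * With fn == buffer_unmapped, a return of 0 means "all buffers mapped",
     * so ext3_ordered_writepage() can skip ext3_journal_start() entirely.
     */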
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0424326f167..311a073afe8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -48,7 +48,10 @@ struct kvm_irq_level {
* For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
* For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
*/
- __u32 irq;
+ union {
+ __u32 irq;
+ __s32 status;
+ };
__u32 level;
};
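
With the anonymous union, userspace still sets .irq on entry, and the new KVM_IRQ_LINE_STATUS ioctl (added further down in this header) reports the delivery outcome back through .status. A hedged userspace sketch, assuming an open VM file descriptor vm_fd:

    /* Sketch: raise a line and read back the injection status. */
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <stdio.h>

    static void pulse_irq(int vm_fd, unsigned int gsi)
    {
            struct kvm_irq_level irq_level = { .irq = gsi, .level = 1 };

            if (ioctl(vm_fd, KVM_IRQ_LINE_STATUS, &irq_level) == 0)
                    /* <0: ignored/masked, 0: coalesced, >0: CPUs reached */
                    printf("gsi %u: status %d\n", gsi, irq_level.status);
    }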
@@ -126,6 +129,7 @@ struct kvm_run {
__u64 data_offset; /* relative to kvm_run start */
} io;
struct {
+ struct kvm_debug_exit_arch arch;
} debug;
/* KVM_EXIT_MMIO */
struct {
@@ -217,21 +221,6 @@ struct kvm_interrupt {
__u32 irq;
};
-struct kvm_breakpoint {
- __u32 enabled;
- __u32 padding;
- __u64 address;
-};
-
-/* for KVM_DEBUG_GUEST */
-struct kvm_debug_guest {
- /* int */
- __u32 enabled;
- __u32 pad;
- struct kvm_breakpoint breakpoints[4];
- __u32 singlestep;
-};
-
/* for KVM_GET_DIRTY_LOG */
struct kvm_dirty_log {
__u32 slot;
@@ -292,6 +281,17 @@ struct kvm_s390_interrupt {
__u64 parm64;
};
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE 0x00000001
+#define KVM_GUESTDBG_SINGLESTEP 0x00000002
+
+struct kvm_guest_debug {
+ __u32 control;
+ __u32 pad;
+ struct kvm_guest_debug_arch arch;
+};
+
#define KVM_TRC_SHIFT 16
/*
* kvm trace categories
@@ -396,6 +396,57 @@ struct kvm_trace_rec {
#ifdef __KVM_HAVE_USER_NMI
#define KVM_CAP_USER_NMI 22
#endif
+#ifdef __KVM_HAVE_GUEST_DEBUG
+#define KVM_CAP_SET_GUEST_DEBUG 23
+#endif
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_REINJECT_CONTROL 24
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+#define KVM_CAP_IRQ_ROUTING 25
+#endif
+#define KVM_CAP_IRQ_INJECT_STATUS 26
+#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#define KVM_CAP_DEVICE_DEASSIGNMENT 27
+#endif
+
+#ifdef KVM_CAP_IRQ_ROUTING
+
+struct kvm_irq_routing_irqchip {
+ __u32 irqchip;
+ __u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+ __u32 address_lo;
+ __u32 address_hi;
+ __u32 data;
+ __u32 pad;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+
+struct kvm_irq_routing_entry {
+ __u32 gsi;
+ __u32 type;
+ __u32 flags;
+ __u32 pad;
+ union {
+ struct kvm_irq_routing_irqchip irqchip;
+ struct kvm_irq_routing_msi msi;
+ __u32 pad[8];
+ } u;
+};
+
+struct kvm_irq_routing {
+ __u32 nr;
+ __u32 flags;
+ struct kvm_irq_routing_entry entries[0];
+};
+
+#endif
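
Because kvm_irq_routing ends in a flexible array member, userspace allocates the header and the entries as one buffer before calling KVM_SET_GSI_ROUTING (defined below). A minimal sketch installing a single MSI route; the route_msi helper is illustrative:

    /* Sketch: build a one-entry MSI routing table (assumes <linux/kvm.h>). */
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <stdlib.h>

    static int route_msi(int vm_fd, __u32 gsi, __u32 addr_lo, __u32 addr_hi,
                         __u32 data)
    {
            struct kvm_irq_routing *table;
            int r;

            table = calloc(1, sizeof(*table) + sizeof(table->entries[0]));
            if (!table)
                    return -1;

            table->nr = 1;
            table->entries[0].gsi = gsi;
            table->entries[0].type = KVM_IRQ_ROUTING_MSI;
            table->entries[0].u.msi.address_lo = addr_lo;
            table->entries[0].u.msi.address_hi = addr_hi;
            table->entries[0].u.msi.data = data;

            r = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
            free(table);
            return r;
    }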
/*
* ioctls for VM fds
@@ -421,14 +472,19 @@ struct kvm_trace_rec {
#define KVM_CREATE_PIT _IO(KVMIO, 0x64)
#define KVM_GET_PIT _IOWR(KVMIO, 0x65, struct kvm_pit_state)
#define KVM_SET_PIT _IOR(KVMIO, 0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS _IOWR(KVMIO, 0x67, struct kvm_irq_level)
#define KVM_REGISTER_COALESCED_MMIO \
_IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone)
#define KVM_UNREGISTER_COALESCED_MMIO \
_IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone)
#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
+#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
+ struct kvm_assigned_pci_dev)
/*
* ioctls for vcpu fds
@@ -440,7 +496,8 @@ struct kvm_trace_rec {
#define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs)
#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
-#define KVM_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
+#define KVM_DEBUG_GUEST __KVM_DEPRECATED_DEBUG_GUEST
#define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs)
#define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs)
#define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid)
@@ -469,6 +526,29 @@ struct kvm_trace_rec {
#define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state)
/* Available with KVM_CAP_NMI */
#define KVM_NMI _IO(KVMIO, 0x9a)
+/* Available with KVM_CAP_SET_GUEST_DEBUG */
+#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
+
+/*
+ * Deprecated interfaces
+ */
+struct kvm_breakpoint {
+ __u32 enabled;
+ __u32 padding;
+ __u64 address;
+};
+
+struct kvm_debug_guest {
+ __u32 enabled;
+ __u32 pad;
+ struct kvm_breakpoint breakpoints[4];
+ __u32 singlestep;
+};
+
+#define __KVM_DEPRECATED_DEBUG_GUEST _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *)
+#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *)
#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
@@ -522,6 +602,7 @@ struct kvm_assigned_irq {
#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0)
+#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 << 0)
#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bf6f703642f..894a56e365e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -37,6 +37,7 @@
#define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
+#define KVM_REQ_KVMCLOCK_UPDATE 8
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
@@ -73,7 +74,7 @@ struct kvm_vcpu {
struct kvm_run *run;
int guest_mode;
unsigned long requests;
- struct kvm_guest_debug guest_debug;
+ unsigned long guest_debug;
int fpu_active;
int guest_fpu_loaded;
wait_queue_head_t wq;
@@ -107,6 +108,20 @@ struct kvm_memory_slot {
int user_alloc;
};
+struct kvm_kernel_irq_routing_entry {
+ u32 gsi;
+ int (*set)(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int level);
+ union {
+ struct {
+ unsigned irqchip;
+ unsigned pin;
+ } irqchip;
+ struct msi_msg msi;
+ };
+ struct list_head link;
+};
+
struct kvm {
struct mutex lock; /* protects the vcpus array and APIC accesses */
spinlock_t mmu_lock;
@@ -127,6 +142,11 @@ struct kvm {
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
+ struct hlist_head mask_notifier_list;
+#endif
+
#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
@@ -237,7 +257,6 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
int user_alloc);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
-void kvm_arch_destroy_vm(struct kvm *kvm);
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
@@ -255,8 +274,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state);
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state);
-int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
- struct kvm_debug_guest *dbg);
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
int kvm_arch_init(void *opaque);
@@ -310,7 +329,6 @@ struct kvm_assigned_dev_kernel {
int host_irq;
bool host_irq_disabled;
int guest_irq;
- struct msi_msg guest_msi;
#define KVM_ASSIGNED_DEV_GUEST_INTX (1 << 0)
#define KVM_ASSIGNED_DEV_GUEST_MSI (1 << 1)
#define KVM_ASSIGNED_DEV_HOST_INTX (1 << 8)
@@ -321,8 +339,21 @@ struct kvm_assigned_dev_kernel {
struct pci_dev *dev;
struct kvm *kvm;
};
-void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
+
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
@@ -464,4 +495,21 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
}
#endif
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+
+#define KVM_MAX_IRQ_ROUTES 1024
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *entries,
+ unsigned nr,
+ unsigned flags);
+void kvm_free_irq_routing(struct kvm *kvm);
+
+#else
+
+static inline void kvm_free_irq_routing(struct kvm *kvm) {}
+
+#endif
+
#endif
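
The new mask notifier hooks declared above let in-kernel devices react when the guest masks or unmasks an IOAPIC pin (the i8254 PIT uses this for reinjection decisions). A hedged kernel-context sketch; the example_* names are illustrative:

    /* Sketch: embedding and registering a mask notifier (names illustrative). */
    static void example_mask_cb(struct kvm_irq_mask_notifier *kimn, bool masked)
    {
            /* e.g. the PIT would pause tick reinjection while the pin is masked */
    }

    static struct kvm_irq_mask_notifier example_kimn = {
            .func = example_mask_cb,
    };

    static void example_attach(struct kvm *kvm)
    {
            kvm_register_irq_mask_notifier(kvm, 0, &example_kimn); /* pin 0 */
    }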
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 9b6f395c962..2b8318c83e5 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -40,17 +40,4 @@ typedef unsigned long hfn_t;
typedef hfn_t pfn_t;
-struct kvm_pio_request {
- unsigned long count;
- int cur_count;
- struct page *guest_pages[2];
- unsigned guest_page_offset;
- int in;
- int port;
- int size;
- int string;
- int down;
- int rep;
-};
-
#endif /* __KVM_TYPES_H__ */
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index cf9c679ab38..0f82293a82e 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -55,6 +55,7 @@ enum {
MLX4_CMD_CLOSE_PORT = 0xa,
MLX4_CMD_QUERY_HCA = 0xb,
MLX4_CMD_QUERY_PORT = 0x43,
+ MLX4_CMD_SENSE_PORT = 0x4d,
MLX4_CMD_SET_PORT = 0xc,
MLX4_CMD_ACCESS_DDR = 0x2e,
MLX4_CMD_MAP_ICM = 0xffa,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 8f659cc2996..3aff8a6a389 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -155,8 +155,9 @@ enum mlx4_qp_region {
};
enum mlx4_port_type {
- MLX4_PORT_TYPE_IB = 1 << 0,
- MLX4_PORT_TYPE_ETH = 1 << 1,
+ MLX4_PORT_TYPE_IB = 1,
+ MLX4_PORT_TYPE_ETH = 2,
+ MLX4_PORT_TYPE_AUTO = 3
};
enum mlx4_special_vlan_idx {
@@ -237,6 +238,7 @@ struct mlx4_caps {
enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
u8 supported_type[MLX4_MAX_PORTS + 1];
u32 port_mask;
+ enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1];
};
struct mlx4_buf_list {
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index ec7c6d99ed3..93885830430 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -314,12 +314,12 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
*/
void ib_destroy_cm_id(struct ib_cm_id *cm_id);
-#define IB_SERVICE_ID_AGN_MASK __constant_cpu_to_be64(0xFF00000000000000ULL)
-#define IB_CM_ASSIGN_SERVICE_ID __constant_cpu_to_be64(0x0200000000000000ULL)
-#define IB_CMA_SERVICE_ID __constant_cpu_to_be64(0x0000000001000000ULL)
-#define IB_CMA_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFF000000ULL)
-#define IB_SDP_SERVICE_ID __constant_cpu_to_be64(0x0000000000010000ULL)
-#define IB_SDP_SERVICE_ID_MASK __constant_cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
struct ib_cm_compare_data {
u8 data[IB_CM_COMPARE_SIZE];
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 5f6c40fffcf..d3b9401b77b 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -107,7 +107,7 @@
#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127
#define IB_QP0 0
-#define IB_QP1 __constant_htonl(1)
+#define IB_QP1 cpu_to_be32(1)
#define IB_QP1_QKEY 0x80010000
#define IB_QP_SET_QKEY 0x80000000
@@ -290,7 +290,7 @@ static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
*/
static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
{
- rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF1) |
+ rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) |
(flags & 0x7);
}
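
The 0xF1 -> 0xF8 change is a bit-layout fix: rmpp_rtime_flags keeps the response time in bits 7..3 and the RMPP flags in bits 2..0, so preserving rtime requires masking with 0xF8; the old 0xF1 mask dropped rtime bit 3 and leaked stale flag bit 0. A tiny userspace illustration of the layout:

    /* rmpp_rtime_flags layout: [7:3] resptime, [2:0] flags. */
    #include <stdio.h>

    int main(void)
    {
            unsigned char reg = 0;

            reg = (reg & 0x07) | (0x1f << 3); /* set resptime, as ib_set_rmpp_resptime() does */
            reg = (reg & 0xF8) | (0x5 & 0x7); /* corrected flag update keeps resptime intact */
            printf("0x%02x\n", reg);          /* prints 0xfd */
            return 0;
    }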
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
index aaca0878668..98b9086d769 100644
--- a/include/rdma/ib_smi.h
+++ b/include/rdma/ib_smi.h
@@ -63,25 +63,25 @@ struct ib_smp {
u8 return_path[IB_SMP_MAX_PATH_HOPS];
} __attribute__ ((packed));
-#define IB_SMP_DIRECTION __constant_htons(0x8000)
+#define IB_SMP_DIRECTION cpu_to_be16(0x8000)
/* Subnet management attributes */
-#define IB_SMP_ATTR_NOTICE __constant_htons(0x0002)
-#define IB_SMP_ATTR_NODE_DESC __constant_htons(0x0010)
-#define IB_SMP_ATTR_NODE_INFO __constant_htons(0x0011)
-#define IB_SMP_ATTR_SWITCH_INFO __constant_htons(0x0012)
-#define IB_SMP_ATTR_GUID_INFO __constant_htons(0x0014)
-#define IB_SMP_ATTR_PORT_INFO __constant_htons(0x0015)
-#define IB_SMP_ATTR_PKEY_TABLE __constant_htons(0x0016)
-#define IB_SMP_ATTR_SL_TO_VL_TABLE __constant_htons(0x0017)
-#define IB_SMP_ATTR_VL_ARB_TABLE __constant_htons(0x0018)
-#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE __constant_htons(0x0019)
-#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE __constant_htons(0x001A)
-#define IB_SMP_ATTR_MCAST_FORWARD_TABLE __constant_htons(0x001B)
-#define IB_SMP_ATTR_SM_INFO __constant_htons(0x0020)
-#define IB_SMP_ATTR_VENDOR_DIAG __constant_htons(0x0030)
-#define IB_SMP_ATTR_LED_INFO __constant_htons(0x0031)
-#define IB_SMP_ATTR_VENDOR_MASK __constant_htons(0xFF00)
+#define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002)
+#define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010)
+#define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012)
+#define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014)
+#define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B)
+#define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030)
+#define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00)
struct ib_port_info {
__be64 mkey;
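
The plain cpu_to_be16()/cpu_to_be64() forms are safe here because they fold to compile-time constants when given constant arguments, so the converted attribute macros remain usable in case labels and static initializers just as the __constant_* variants were. A small kernel-context sketch:

    /* Sketch: the converted macros still work as compile-time constants. */
    static const __be16 default_attr = IB_SMP_ATTR_PORT_INFO;

    static int is_port_info(__be16 attr)
    {
            return attr == cpu_to_be16(0x0015); /* same value as the macro */
    }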
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 23b81cf242a..c3b99def9cb 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -83,24 +83,28 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
return result;
}
-static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
{
union ioapic_redir_entry *pent;
+ int injected = -1;
pent = &ioapic->redirtbl[idx];
if (!pent->fields.mask) {
- int injected = ioapic_deliver(ioapic, idx);
+ injected = ioapic_deliver(ioapic, idx);
if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
pent->fields.remote_irr = 1;
}
if (!pent->fields.trig_mode)
ioapic->irr &= ~(1 << idx);
+
+ return injected;
}
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
+ bool mask_before, mask_after;
switch (ioapic->ioregsel) {
case IOAPIC_REG_VERSION:
@@ -120,6 +124,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
return;
+ mask_before = ioapic->redirtbl[index].fields.mask;
if (ioapic->ioregsel & 1) {
ioapic->redirtbl[index].bits &= 0xffffffff;
ioapic->redirtbl[index].bits |= (u64) val << 32;
@@ -128,6 +133,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
ioapic->redirtbl[index].bits |= (u32) val;
ioapic->redirtbl[index].fields.remote_irr = 0;
}
+ mask_after = ioapic->redirtbl[index].fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
if (ioapic->irr & (1 << index))
ioapic_service(ioapic, index);
break;
@@ -202,7 +210,7 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
u32 deliver_bitmask;
struct kvm_vcpu *vcpu;
- int vcpu_id, r = 0;
+ int vcpu_id, r = -1;
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x\n",
@@ -242,7 +250,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
deliver_bitmask &= ~(1 << vcpu_id);
vcpu = ioapic->kvm->vcpus[vcpu_id];
if (vcpu) {
- r = ioapic_inj_irq(ioapic, vcpu, vector,
+ if (r < 0)
+ r = 0;
+ r += ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
}
}
@@ -253,8 +263,10 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
continue;
deliver_bitmask &= ~(1 << vcpu_id);
vcpu = ioapic->kvm->vcpus[vcpu_id];
- if (vcpu)
+ if (vcpu) {
ioapic_inj_nmi(vcpu);
+ r = 1;
+ }
else
ioapic_debug("NMI to vcpu %d failed\n",
vcpu->vcpu_id);
@@ -268,11 +280,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
return r;
}
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
{
u32 old_irr = ioapic->irr;
u32 mask = 1 << irq;
union ioapic_redir_entry entry;
+ int ret = 1;
if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
entry = ioapic->redirtbl[irq];
@@ -283,25 +296,26 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
ioapic->irr |= mask;
if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
|| !entry.fields.remote_irr)
- ioapic_service(ioapic, irq);
+ ret = ioapic_service(ioapic, irq);
}
}
+ return ret;
}
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin,
int trigger_mode)
{
union ioapic_redir_entry *ent;
- ent = &ioapic->redirtbl[gsi];
+ ent = &ioapic->redirtbl[pin];
- kvm_notify_acked_irq(ioapic->kvm, gsi);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
if (trigger_mode == IOAPIC_LEVEL_TRIG) {
ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
- ioapic_service(ioapic, gsi);
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin)))
+ ioapic_service(ioapic, pin);
}
}
@@ -426,3 +440,4 @@ int kvm_ioapic_init(struct kvm *kvm)
kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
return 0;
}
+
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 49c9581d258..a34bd5e6436 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -83,7 +83,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
unsigned long bitmap);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
-void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
u8 dest_mode);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index aa5d1e5c497..864ac5483ba 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -20,35 +20,132 @@
*/
#include <linux/kvm_host.h>
+
+#include <asm/msidef.h>
+
#include "irq.h"
#include "ioapic.h"
-/* This should be called with the kvm->lock mutex held */
-void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int level)
+{
+#ifdef CONFIG_X86
+ return kvm_pic_set_irq(pic_irqchip(kvm), e->irqchip.pin, level);
+#else
+ return -1;
+#endif
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int level)
+{
+ return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
+}
+
+static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int level)
+{
+ int vcpu_id, r = -1;
+ struct kvm_vcpu *vcpu;
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK)
+ >> MSI_ADDR_DEST_ID_SHIFT;
+ int vector = (e->msi.data & MSI_DATA_VECTOR_MASK)
+ >> MSI_DATA_VECTOR_SHIFT;
+ int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+ (unsigned long *)&e->msi.address_lo);
+ int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+ (unsigned long *)&e->msi.data);
+ int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
+ (unsigned long *)&e->msi.data);
+ u32 deliver_bitmask;
+
+ BUG_ON(!ioapic);
+
+ deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+ dest_id, dest_mode);
+ /* IOAPIC delivery mode value is the same as MSI here */
+ switch (delivery_mode) {
+ case IOAPIC_LOWEST_PRIORITY:
+ vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+ deliver_bitmask);
+ if (vcpu != NULL)
+ r = kvm_apic_set_irq(vcpu, vector, trig_mode);
+ else
+ printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
+ break;
+ case IOAPIC_FIXED:
+ for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+ if (!(deliver_bitmask & (1 << vcpu_id)))
+ continue;
+ deliver_bitmask &= ~(1 << vcpu_id);
+ vcpu = ioapic->kvm->vcpus[vcpu_id];
+ if (vcpu) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, vector, trig_mode);
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ return r;
+}
+
+/*
+ * This should be called with the kvm->lock mutex held.
+ * Return value:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
{
- unsigned long *irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
+ struct kvm_kernel_irq_routing_entry *e;
+ unsigned long *irq_state, sig_level;
+ int ret = -1;
+
+ if (irq < KVM_IOAPIC_NUM_PINS) {
+ irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
- /* Logical OR for level trig interrupt */
- if (level)
- set_bit(irq_source_id, irq_state);
- else
- clear_bit(irq_source_id, irq_state);
+ /* Logical OR for level trig interrupt */
+ if (level)
+ set_bit(irq_source_id, irq_state);
+ else
+ clear_bit(irq_source_id, irq_state);
+ sig_level = !!(*irq_state);
+ } else /* Deal with MSI/MSI-X */
+ sig_level = 1;
/* Not possible to detect if the guest uses the PIC or the
* IOAPIC. So set the bit in both. The guest will ignore
* writes to the unused one.
*/
- kvm_ioapic_set_irq(kvm->arch.vioapic, irq, !!(*irq_state));
-#ifdef CONFIG_X86
- kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state));
-#endif
+ list_for_each_entry(e, &kvm->irq_routing, link)
+ if (e->gsi == irq) {
+ int r = e->set(e, kvm, sig_level);
+ if (r < 0)
+ continue;
+
+ ret = r + ((ret < 0) ? 0 : ret);
+ }
+ return ret;
}
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
+ struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
+ unsigned gsi = pin;
+
+ list_for_each_entry(e, &kvm->irq_routing, link)
+ if (e->irqchip.irqchip == irqchip &&
+ e->irqchip.pin == pin) {
+ gsi = e->gsi;
+ break;
+ }
hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
if (kian->gsi == gsi)
@@ -99,3 +196,177 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ kimn->irq = irq;
+ hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ hlist_del(&kimn->link);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
+ if (kimn->irq == irq)
+ kimn->func(kimn, mask);
+}
+
+static void __kvm_free_irq_routing(struct list_head *irq_routing)
+{
+ struct kvm_kernel_irq_routing_entry *e, *n;
+
+ list_for_each_entry_safe(e, n, irq_routing, link)
+ kfree(e);
+}
+
+void kvm_free_irq_routing(struct kvm *kvm)
+{
+ __kvm_free_irq_routing(&kvm->irq_routing);
+}
+
+static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+ int delta;
+
+ e->gsi = ue->gsi;
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ delta = 0;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ e->set = kvm_set_pic_irq;
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->set = kvm_set_pic_irq;
+ delta = 8;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ goto out;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin + delta;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ break;
+ default:
+ goto out;
+ }
+ r = 0;
+out:
+ return r;
+}
+
+
+int kvm_set_irq_routing(struct kvm *kvm,
+ const struct kvm_irq_routing_entry *ue,
+ unsigned nr,
+ unsigned flags)
+{
+ struct list_head irq_list = LIST_HEAD_INIT(irq_list);
+ struct list_head tmp = LIST_HEAD_INIT(tmp);
+ struct kvm_kernel_irq_routing_entry *e = NULL;
+ unsigned i;
+ int r;
+
+ for (i = 0; i < nr; ++i) {
+ r = -EINVAL;
+ if (ue->gsi >= KVM_MAX_IRQ_ROUTES)
+ goto out;
+ if (ue->flags)
+ goto out;
+ r = -ENOMEM;
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (!e)
+ goto out;
+ r = setup_routing_entry(e, ue);
+ if (r)
+ goto out;
+ ++ue;
+ list_add(&e->link, &irq_list);
+ e = NULL;
+ }
+
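+	/*
+	 * Swap the new table in under kvm->lock; the displaced old entries
+	 * land on irq_list and are freed below, outside the lock.
+	 */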
+ mutex_lock(&kvm->lock);
+ list_splice(&kvm->irq_routing, &tmp);
+ INIT_LIST_HEAD(&kvm->irq_routing);
+ list_splice(&irq_list, &kvm->irq_routing);
+ INIT_LIST_HEAD(&irq_list);
+ list_splice(&tmp, &irq_list);
+ mutex_unlock(&kvm->lock);
+
+ r = 0;
+
+out:
+ kfree(e);
+ __kvm_free_irq_routing(&irq_list);
+ return r;
+}
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#ifdef CONFIG_X86
+# define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip.irqchip = SELECT_PIC(irq), .u.irqchip.pin = (irq) % 8 }
+# define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+#else
+# define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq)
+#endif
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+#ifdef CONFIG_IA64
+ ROUTING_ENTRY1(24), ROUTING_ENTRY1(25),
+ ROUTING_ENTRY1(26), ROUTING_ENTRY1(27),
+ ROUTING_ENTRY1(28), ROUTING_ENTRY1(29),
+ ROUTING_ENTRY1(30), ROUTING_ENTRY1(31),
+ ROUTING_ENTRY1(32), ROUTING_ENTRY1(33),
+ ROUTING_ENTRY1(34), ROUTING_ENTRY1(35),
+ ROUTING_ENTRY1(36), ROUTING_ENTRY1(37),
+ ROUTING_ENTRY1(38), ROUTING_ENTRY1(39),
+ ROUTING_ENTRY1(40), ROUTING_ENTRY1(41),
+ ROUTING_ENTRY1(42), ROUTING_ENTRY1(43),
+ ROUTING_ENTRY1(44), ROUTING_ENTRY1(45),
+ ROUTING_ENTRY1(46), ROUTING_ENTRY1(47),
+#endif
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
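
Callers of kvm_set_irq() can act on the documented return convention, for example to decide whether a timer tick was coalesced and should be reinjected. A hedged kernel-context sketch; report_irq_status() is illustrative:

    /* Sketch: interpreting kvm_set_irq()'s return value. */
    static void report_irq_status(struct kvm *kvm, int source_id, int irq)
    {
            int r = kvm_set_irq(kvm, source_id, irq, 1);

            if (r < 0)
                    printk(KERN_DEBUG "irq %d ignored (masked)\n", irq);
            else if (r == 0)
                    printk(KERN_DEBUG "irq %d coalesced\n", irq);
            else
                    printk(KERN_DEBUG "irq %d delivered to %d cpus\n", irq, r);
    }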
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29a667ce35b..605697e9c4d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,10 +47,6 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#ifdef CONFIG_X86
-#include <asm/msidef.h>
-#endif
-
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
#endif
@@ -85,57 +81,6 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
static bool kvm_rebooting;
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
-
-#ifdef CONFIG_X86
-static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev)
-{
- int vcpu_id;
- struct kvm_vcpu *vcpu;
- struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm);
- int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK)
- >> MSI_ADDR_DEST_ID_SHIFT;
- int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK)
- >> MSI_DATA_VECTOR_SHIFT;
- int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
- (unsigned long *)&dev->guest_msi.address_lo);
- int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
- (unsigned long *)&dev->guest_msi.data);
- int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
- (unsigned long *)&dev->guest_msi.data);
- u32 deliver_bitmask;
-
- BUG_ON(!ioapic);
-
- deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
- dest_id, dest_mode);
- /* IOAPIC delivery mode value is the same as MSI here */
- switch (delivery_mode) {
- case IOAPIC_LOWEST_PRIORITY:
- vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
- deliver_bitmask);
- if (vcpu != NULL)
- kvm_apic_set_irq(vcpu, vector, trig_mode);
- else
- printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
- break;
- case IOAPIC_FIXED:
- for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
- if (!(deliver_bitmask & (1 << vcpu_id)))
- continue;
- deliver_bitmask &= ~(1 << vcpu_id);
- vcpu = ioapic->kvm->vcpus[vcpu_id];
- if (vcpu)
- kvm_apic_set_irq(vcpu, vector, trig_mode);
- }
- break;
- default:
- printk(KERN_INFO "kvm: unsupported MSI delivery mode\n");
- }
-}
-#else
-static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {}
-#endif
-
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
int assigned_dev_id)
{
@@ -162,13 +107,10 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
* finer-grained lock, update this
*/
mutex_lock(&assigned_dev->kvm->lock);
- if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX)
- kvm_set_irq(assigned_dev->kvm,
- assigned_dev->irq_source_id,
- assigned_dev->guest_irq, 1);
- else if (assigned_dev->irq_requested_type &
- KVM_ASSIGNED_DEV_GUEST_MSI) {
- assigned_device_msi_dispatch(assigned_dev);
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+
+ if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) {
enable_irq(assigned_dev->host_irq);
assigned_dev->host_irq_disabled = false;
}
@@ -331,18 +273,24 @@ static int assigned_device_update_msi(struct kvm *kvm,
{
int r;
+ adev->guest_irq = airq->guest_irq;
if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
/* x86 don't care upper address of guest msi message addr */
adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
- adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
- adev->guest_msi.data = airq->guest_msi.data;
adev->ack_notifier.gsi = -1;
} else if (msi2intx) {
adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
- adev->guest_irq = airq->guest_irq;
adev->ack_notifier.gsi = airq->guest_irq;
+ } else {
+ /*
+	 * The guest asked to disable device MSI; disable MSI and fall
+	 * back to INTx by default. Note that this path is only taken in
+	 * the non-msi2intx case.
+ */
+ assigned_device_update_intx(kvm, adev, airq);
+ return 0;
}
if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
@@ -379,6 +327,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
{
int r = 0;
struct kvm_assigned_dev_kernel *match;
+ u32 current_flags = 0, changed_flags;
mutex_lock(&kvm->lock);
@@ -416,8 +365,13 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
}
}
- if ((!msi2intx &&
- (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) ||
+ if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
+ (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
+ current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;
+
+ changed_flags = assigned_irq->flags ^ current_flags;
+
+ if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
(msi2intx && match->dev->msi_enabled)) {
#ifdef CONFIG_X86
r = assigned_device_update_msi(kvm, match, assigned_irq);
@@ -563,7 +517,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
goto out;
}
- if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+ if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
kvm_deassign_device(kvm, match);
kvm_free_assigned_device(kvm, match);
@@ -581,8 +535,10 @@ static inline int valid_vcpu(int n)
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
- if (pfn_valid(pfn))
- return PageReserved(pfn_to_page(pfn));
+ if (pfn_valid(pfn)) {
+ struct page *page = compound_head(pfn_to_page(pfn));
+ return PageReserved(page);
+ }
return true;
}
@@ -828,6 +784,10 @@ static struct kvm *kvm_create_vm(void)
if (IS_ERR(kvm))
goto out;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ INIT_LIST_HEAD(&kvm->irq_routing);
+ INIT_HLIST_HEAD(&kvm->mask_notifier_list);
+#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
@@ -909,6 +869,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
+ kvm_free_irq_routing(kvm);
kvm_io_bus_destroy(&kvm->pio_bus);
kvm_io_bus_destroy(&kvm->mmio_bus);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
@@ -1755,13 +1716,13 @@ out_free2:
r = 0;
break;
}
- case KVM_DEBUG_GUEST: {
- struct kvm_debug_guest dbg;
+ case KVM_SET_GUEST_DEBUG: {
+ struct kvm_guest_debug dbg;
r = -EFAULT;
if (copy_from_user(&dbg, argp, sizeof dbg))
goto out;
- r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
+ r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
if (r)
goto out;
r = 0;
@@ -1929,6 +1890,36 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
#endif
+#ifdef KVM_CAP_IRQ_ROUTING
+ case KVM_SET_GSI_ROUTING: {
+ struct kvm_irq_routing routing;
+ struct kvm_irq_routing __user *urouting;
+ struct kvm_irq_routing_entry *entries;
+
+ r = -EFAULT;
+ if (copy_from_user(&routing, argp, sizeof(routing)))
+ goto out;
+ r = -EINVAL;
+ if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+ goto out;
+ if (routing.flags)
+ goto out;
+ r = -ENOMEM;
+ entries = vmalloc(routing.nr * sizeof(*entries));
+ if (!entries)
+ goto out;
+ r = -EFAULT;
+ urouting = argp;
+ if (copy_from_user(entries, urouting->entries,
+ routing.nr * sizeof(*entries)))
+ goto out_free_irq_routing;
+ r = kvm_set_irq_routing(kvm, entries, routing.nr,
+ routing.flags);
+ out_free_irq_routing:
+ vfree(entries);
+ break;
+ }
+#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
@@ -1995,6 +1986,10 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
case KVM_CAP_USER_MEMORY:
case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
return 1;
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+ case KVM_CAP_IRQ_ROUTING:
+ return KVM_MAX_IRQ_ROUTES;
+#endif
default:
break;
}