From ca60dfbb69afb549e33527cbf676e4daf8febfb5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Tue, 24 Jun 2008 17:02:38 +0800
Subject: KVM: VMX: Rename misnamed msr bits

MSR_IA32_FEATURE_LOCKED is just a bit in fact, which shouldn't be prefixed with
MSR_.  So is MSR_IA32_FEATURE_VMXON_ENABLED.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 18 +++++++++---------
 arch/x86/kvm/vmx.h |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b56..81dac721ec3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1031,9 +1031,9 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    == MSR_IA32_FEATURE_CONTROL_LOCKED;
+	return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	    == IA32_FEATURE_CONTROL_LOCKED_BIT;
 	/* locked but not enabled */
 }
 
@@ -1045,14 +1045,14 @@ static void hardware_enable(void *garbage)
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-	    != (MSR_IA32_FEATURE_CONTROL_LOCKED |
-		MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+	if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		    IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	    != (IA32_FEATURE_CONTROL_LOCKED_BIT |
+		IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       MSR_IA32_FEATURE_CONTROL_LOCKED |
-		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+		       IA32_FEATURE_CONTROL_LOCKED_BIT |
+		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT);
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 17e25995b65..86059f439cb 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,8 +331,8 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
+#define IA32_FEATURE_CONTROL_LOCKED_BIT		0x1
+#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT	0x4
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
-- 
cgit v1.2.3


From 5fdbf9765b7ba6a45100851154768de703d51e76 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 27 Jun 2008 14:58:02 -0300
Subject: KVM: x86: accessors for guest registers

As suggested by Avi, introduce accessors to read/write guest registers.
This simplifies the ->cache_regs/->decache_regs interface, and improves
register caching which is important for VMX, where the cost of
vmcs_read/vmcs_write is significant.

[avi: fix warnings]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/kvm_cache_regs.h |  32 +++++
 arch/x86/kvm/lapic.c          |   4 +-
 arch/x86/kvm/svm.c            |  56 ++++-----
 arch/x86/kvm/vmx.c            | 103 ++++++++--------
 arch/x86/kvm/x86.c            | 268 ++++++++++++++++++++++--------------------
 arch/x86/kvm/x86_emulate.c    |  19 +--
 6 files changed, 254 insertions(+), 228 deletions(-)
 create mode 100644 arch/x86/kvm/kvm_cache_regs.h

(limited to 'arch')

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 00000000000..1ff819dce7d
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+					      enum kvm_reg reg)
+{
+	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+		kvm_x86_ops->cache_reg(vcpu, reg);
+
+	return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+				      enum kvm_reg reg,
+				      unsigned long val)
+{
+	vcpu->arch.regs[reg] = val;
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+	return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f6..9fde0ac2426 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
+#include "kvm_cache_regs.h"
 #include "irq.h"
 
 #define PRId64 "d"
@@ -558,8 +559,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_run *run = vcpu->run;
 
 	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
-	kvm_x86_ops->cache_regs(vcpu);
-	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778..54b0bf33e21 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
 #include "kvm_svm.h"
 #include "irq.h"
 #include "mmu.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -236,13 +237,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
 	}
-	if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
-		printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-		       __func__,
-		       svm->vmcb->save.rip,
-		       svm->next_rip);
+	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 
-	vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+	kvm_rip_write(vcpu, svm->next_rip);
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
 	vcpu->arch.interrupt_window_open = 1;
@@ -581,6 +580,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->dr7 = 0x400;
 	save->rflags = 2;
 	save->rip = 0x0000fff0;
+	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
 	/*
 	 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -615,10 +615,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 	init_vmcb(svm);
 
 	if (vcpu->vcpu_id != 0) {
-		svm->vmcb->save.rip = 0;
+		kvm_rip_write(vcpu, 0);
 		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
 		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
 	}
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
 
 	return 0;
 }
@@ -721,23 +723,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 	rdtscll(vcpu->arch.host_tsc);
 }
 
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-	vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	return to_svm(vcpu)->vmcb->save.rflags;
@@ -1139,14 +1124,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 1;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
 	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 3;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 	kvm_emulate_hypercall(&svm->vcpu);
 	return 1;
@@ -1178,7 +1163,7 @@ static int task_switch_interception(struct vcpu_svm *svm,
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	svm->next_rip = svm->vmcb->save.rip + 2;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 	kvm_emulate_cpuid(&svm->vcpu);
 	return 1;
 }
@@ -1273,9 +1258,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 		KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
 			    (u32)(data >> 32), handler);
 
-		svm->vmcb->save.rax = data & 0xffffffff;
+		svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
 		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
-		svm->next_rip = svm->vmcb->save.rip + 2;
+		svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 		skip_emulated_instruction(&svm->vcpu);
 	}
 	return 1;
@@ -1359,13 +1344,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	u64 data = (svm->vmcb->save.rax & -1u)
+	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
 		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
 	KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
 		    handler);
 
-	svm->next_rip = svm->vmcb->save.rip + 2;
+	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 	if (svm_set_msr(&svm->vcpu, ecx, data))
 		kvm_inject_gp(&svm->vcpu, 0);
 	else
@@ -1723,6 +1708,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
@@ -1858,6 +1847,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		load_db_regs(svm->host_db_regs);
 
 	vcpu->arch.cr2 = svm->vmcb->save.cr2;
+	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
 	write_dr6(svm->host_dr6);
 	write_dr7(svm->host_dr7);
@@ -1977,8 +1969,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_gdt = svm_set_gdt,
 	.get_dr = svm_get_dr,
 	.set_dr = svm_set_dr,
-	.cache_regs = svm_cache_regs,
-	.decache_regs = svm_decache_regs,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81dac721ec3..5a3a0326c27 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include "kvm_cache_regs.h"
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -715,9 +716,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	unsigned long rip;
 	u32 interruptibility;
 
-	rip = vmcs_readl(GUEST_RIP);
+	rip = kvm_rip_read(vcpu);
 	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-	vmcs_writel(GUEST_RIP, rip);
+	kvm_rip_write(vcpu, rip);
 
 	/*
 	 * We emulated an instruction, so temporary interrupt blocking
@@ -947,24 +948,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	return ret;
 }
 
-/*
- * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-	vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs.  Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
-	vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-	vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+	switch (reg) {
+	case VCPU_REGS_RSP:
+		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+		break;
+	case VCPU_REGS_RIP:
+		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+		break;
+	default:
+		break;
+	}
 }
 
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -2019,6 +2015,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	u64 msr;
 	int ret;
 
+	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
 	down_read(&vcpu->kvm->slots_lock);
 	if (!init_rmode(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
@@ -2072,10 +2069,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
 	if (vmx->vcpu.vcpu_id == 0)
-		vmcs_writel(GUEST_RIP, 0xfff0);
+		kvm_rip_write(vcpu, 0xfff0);
 	else
-		vmcs_writel(GUEST_RIP, 0);
-	vmcs_writel(GUEST_RSP, 0);
+		kvm_rip_write(vcpu, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
 	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
 	vmcs_writel(GUEST_DR7, 0x400);
@@ -2139,11 +2136,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
-		vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-		vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 		return;
 	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2288,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	error_code = 0;
-	rip = vmcs_readl(GUEST_RIP);
+	rip = kvm_rip_read(vcpu);
 	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
@@ -2386,27 +2383,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	reg = (exit_qualification >> 8) & 15;
 	switch ((exit_qualification >> 4) & 3) {
 	case 0: /* mov to cr */
-		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
-			    (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
+		KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
+			    (u32)kvm_register_read(vcpu, reg),
+			    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
+			    handler);
 		switch (cr) {
 		case 0:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 4:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
+			kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			if (irqchip_in_kernel(vcpu->kvm))
 				return 1;
@@ -2415,7 +2410,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		};
 		break;
 	case 2: /* clts */
-		vcpu_load_rsp_rip(vcpu);
 		vmx_fpu_deactivate(vcpu);
 		vcpu->arch.cr0 &= ~X86_CR0_TS;
 		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2420,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	case 1: /*mov from cr*/
 		switch (cr) {
 		case 3:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->arch.regs[reg] = vcpu->arch.cr3;
-			vcpu_put_rsp_rip(vcpu);
+			kvm_register_write(vcpu, reg, vcpu->arch.cr3);
 			KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
-				    (u32)vcpu->arch.regs[reg],
-				    (u32)((u64)vcpu->arch.regs[reg] >> 32),
+				    (u32)kvm_register_read(vcpu, reg),
+				    (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
 				    handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 8:
-			vcpu_load_rsp_rip(vcpu);
-			vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
-			vcpu_put_rsp_rip(vcpu);
+			kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
 			KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
-				    (u32)vcpu->arch.regs[reg], handler);
+				    (u32)kvm_register_read(vcpu, reg), handler);
 			skip_emulated_instruction(vcpu);
 			return 1;
 		}
@@ -2472,7 +2462,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	dr = exit_qualification & 7;
 	reg = (exit_qualification >> 8) & 15;
-	vcpu_load_rsp_rip(vcpu);
 	if (exit_qualification & 16) {
 		/* mov from dr */
 		switch (dr) {
@@ -2485,12 +2474,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		default:
 			val = 0;
 		}
-		vcpu->arch.regs[reg] = val;
+		kvm_register_write(vcpu, reg, val);
 		KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
 	} else {
 		/* mov to dr */
 	}
-	vcpu_put_rsp_rip(vcpu);
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
@@ -2735,8 +2723,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
-		    (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
+		    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
@@ -2922,9 +2910,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 {
 	vmx->rmode.irq.pending = 0;
-	if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+	if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
 		return;
-	vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+	kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
 	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
 		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
 		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2941,6 +2929,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
 	/*
 	 * Loading guest fpu may have cleared host cr0.ts
 	 */
@@ -3061,6 +3054,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 	      );
 
+	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+	vcpu->arch.regs_dirty = 0;
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
@@ -3224,8 +3220,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
-	.cache_regs = vcpu_load_rsp_rip,
-	.decache_regs = vcpu_put_rsp_rip,
+	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc6aeb..2f0696bc7d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -19,6 +19,7 @@
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -61,6 +62,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				    struct kvm_cpuid_entry2 __user *entries);
 
 struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -2080,7 +2082,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 {
 	u8 opcodes[4];
-	unsigned long rip = vcpu->arch.rip;
+	unsigned long rip = kvm_rip_read(vcpu);
 	unsigned long rip_linear;
 
 	if (!printk_ratelimit())
@@ -2102,6 +2104,14 @@ static struct x86_emulate_ops emulate_ops = {
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
 };
 
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+	kvm_register_read(vcpu, VCPU_REGS_RAX);
+	kvm_register_read(vcpu, VCPU_REGS_RSP);
+	kvm_register_read(vcpu, VCPU_REGS_RIP);
+	vcpu->arch.regs_dirty = ~0;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			struct kvm_run *run,
 			unsigned long cr2,
@@ -2112,7 +2122,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	struct decode_cache *c;
 
 	vcpu->arch.mmio_fault_cr2 = cr2;
-	kvm_x86_ops->cache_regs(vcpu);
+	/*
+	 * TODO: fix x86_emulate.c to use guest_read/write_register
+	 * instead of direct ->regs accesses, can save hundred cycles
+	 * on Intel for instructions that don't read/change RSP, for
+	 * for example.
+	 */
+	cache_all_regs(vcpu);
 
 	vcpu->mmio_is_write = 0;
 	vcpu->arch.pio.string = 0;
@@ -2172,7 +2188,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_x86_ops->decache_regs(vcpu);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
 	if (vcpu->mmio_is_write) {
@@ -2225,20 +2240,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
 	struct kvm_pio_request *io = &vcpu->arch.pio;
 	long delta;
 	int r;
-
-	kvm_x86_ops->cache_regs(vcpu);
+	unsigned long val;
 
 	if (!io->string) {
-		if (io->in)
-			memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
-			       io->size);
+		if (io->in) {
+			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+			memcpy(&val, vcpu->arch.pio_data, io->size);
+			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+		}
 	} else {
 		if (io->in) {
 			r = pio_copy_data(vcpu);
-			if (r) {
-				kvm_x86_ops->cache_regs(vcpu);
+			if (r)
 				return r;
-			}
 		}
 
 		delta = 1;
@@ -2248,19 +2262,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
 			 * The size of the register should really depend on
 			 * current address size.
 			 */
-			vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+			val -= delta;
+			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
 		}
 		if (io->down)
 			delta = -delta;
 		delta *= io->size;
-		if (io->in)
-			vcpu->arch.regs[VCPU_REGS_RDI] += delta;
-		else
-			vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+		if (io->in) {
+			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+			val += delta;
+			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+		} else {
+			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+			val += delta;
+			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+		}
 	}
 
-	kvm_x86_ops->decache_regs(vcpu);
-
 	io->count -= io->cur_count;
 	io->cur_count = 0;
 
@@ -2313,6 +2332,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		  int size, unsigned port)
 {
 	struct kvm_io_device *pio_dev;
+	unsigned long val;
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2353,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
 			    handler);
 
-	kvm_x86_ops->cache_regs(vcpu);
-	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	memcpy(vcpu->arch.pio_data, &val, 4);
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
@@ -2519,13 +2539,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	unsigned long nr, a0, a1, a2, a3, ret;
 	int r = 1;
 
-	kvm_x86_ops->cache_regs(vcpu);
-
-	nr = vcpu->arch.regs[VCPU_REGS_RAX];
-	a0 = vcpu->arch.regs[VCPU_REGS_RBX];
-	a1 = vcpu->arch.regs[VCPU_REGS_RCX];
-	a2 = vcpu->arch.regs[VCPU_REGS_RDX];
-	a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
 
 	KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
 
@@ -2548,8 +2566,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		ret = -KVM_ENOSYS;
 		break;
 	}
-	vcpu->arch.regs[VCPU_REGS_RAX] = ret;
-	kvm_x86_ops->decache_regs(vcpu);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
 	++vcpu->stat.hypercalls;
 	return r;
 }
@@ -2559,6 +2576,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 {
 	char instruction[3];
 	int ret = 0;
+	unsigned long rip = kvm_rip_read(vcpu);
 
 
 	/*
@@ -2568,9 +2586,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	 */
 	kvm_mmu_zap_all(vcpu->kvm);
 
-	kvm_x86_ops->cache_regs(vcpu);
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
-	if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+	if (emulator_write_emulated(rip, instruction, 3, vcpu)
 	    != X86EMUL_CONTINUE)
 		ret = -EFAULT;
 
@@ -2700,13 +2717,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	u32 function, index;
 	struct kvm_cpuid_entry2 *e, *best;
 
-	kvm_x86_ops->cache_regs(vcpu);
-	function = vcpu->arch.regs[VCPU_REGS_RAX];
-	index = vcpu->arch.regs[VCPU_REGS_RCX];
-	vcpu->arch.regs[VCPU_REGS_RAX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RBX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RCX] = 0;
-	vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
 	best = NULL;
 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
 		e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2740,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 				best = e;
 	}
 	if (best) {
-		vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
-		vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
-		vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
-		vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
 	}
-	kvm_x86_ops->decache_regs(vcpu);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 	KVMTRACE_5D(CPUID, vcpu, function,
-		    (u32)vcpu->arch.regs[VCPU_REGS_RAX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RBX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RCX],
-		    (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
+		    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -2917,8 +2932,8 @@ again:
 	 * Profile KVM exit RIPs:
 	 */
 	if (unlikely(prof_on == KVM_PROFILING)) {
-		kvm_x86_ops->cache_regs(vcpu);
-		profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+		unsigned long rip = kvm_rip_read(vcpu);
+		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
 	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2999,11 +3014,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 #endif
-	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
-		kvm_x86_ops->cache_regs(vcpu);
-		vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
-		kvm_x86_ops->decache_regs(vcpu);
-	}
+	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+		kvm_register_write(vcpu, VCPU_REGS_RAX,
+				     kvm_run->hypercall.ret);
 
 	r = __vcpu_run(vcpu, kvm_run);
 
@@ -3019,28 +3032,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	vcpu_load(vcpu);
 
-	kvm_x86_ops->cache_regs(vcpu);
-
-	regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
-	regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
-	regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
-	regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
-	regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
-	regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
 #ifdef CONFIG_X86_64
-	regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
-	regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
-	regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
-	regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
-	regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
-	regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
-	regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
-	regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
 #endif
 
-	regs->rip = vcpu->arch.rip;
+	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_x86_ops->get_rflags(vcpu);
 
 	/*
@@ -3058,29 +3069,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	vcpu_load(vcpu);
 
-	vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
-	vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
-	vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
-	vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
-	vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
-	vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
 #ifdef CONFIG_X86_64
-	vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
-	vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
-	vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
-	vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
-	vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
-	vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
-	vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
-	vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+
 #endif
 
-	vcpu->arch.rip = regs->rip;
+	kvm_rip_write(vcpu, regs->rip);
 	kvm_x86_ops->set_rflags(vcpu, regs->rflags);
 
-	kvm_x86_ops->decache_regs(vcpu);
 
 	vcpu->arch.exception.pending = false;
 
@@ -3316,17 +3327,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 				struct tss_segment_32 *tss)
 {
 	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = vcpu->arch.rip;
+	tss->eip = kvm_rip_read(vcpu);
 	tss->eflags = kvm_x86_ops->get_rflags(vcpu);
-	tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
-	tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-	tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
-	tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
-	tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
-	tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
-	tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
-	tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
-
+	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
 	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3352,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 {
 	kvm_set_cr3(vcpu, tss->cr3);
 
-	vcpu->arch.rip = tss->eip;
+	kvm_rip_write(vcpu, tss->eip);
 	kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
 
-	vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
-	vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
-	vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
-	vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
-	vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
-	vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
 
 	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
 		return 1;
@@ -3380,16 +3390,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 				struct tss_segment_16 *tss)
 {
-	tss->ip = vcpu->arch.rip;
+	tss->ip = kvm_rip_read(vcpu);
 	tss->flag = kvm_x86_ops->get_rflags(vcpu);
-	tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
-	tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
-	tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
-	tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
-	tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
-	tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
-	tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
-	tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
 
 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3412,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
 				 struct tss_segment_16 *tss)
 {
-	vcpu->arch.rip = tss->ip;
+	kvm_rip_write(vcpu, tss->ip);
 	kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
-	vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
-	vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
-	vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
-	vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
-	vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
-	vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
-	vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
-	vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
 
 	if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
 		return 1;
@@ -3534,7 +3544,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 	}
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	kvm_x86_ops->cache_regs(vcpu);
 
 	if (nseg_desc.type & 8)
 		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3568,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 	tr_seg.type = 11;
 	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
 out:
-	kvm_x86_ops->decache_regs(vcpu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b..d5da7f14d53 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
 #define DPRINTF(_f, _a ...) printf(_f , ## _a)
 #else
 #include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include <linux/module.h>
@@ -839,7 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 
 	memset(c, 0, sizeof(struct decode_cache));
-	c->eip = ctxt->vcpu->arch.rip;
+	c->eip = kvm_rip_read(ctxt->vcpu);
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1267,7 +1268,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
 		if (c->regs[VCPU_REGS_RCX] == 0) {
-			ctxt->vcpu->arch.rip = c->eip;
+			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
 		/* The second termination condition only applies for REPE
@@ -1281,17 +1282,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				(c->b == 0xae) || (c->b == 0xaf)) {
 			if ((c->rep_prefix == REPE_PREFIX) &&
 				((ctxt->eflags & EFLG_ZF) == 0)) {
-					ctxt->vcpu->arch.rip = c->eip;
+					kvm_rip_write(ctxt->vcpu, c->eip);
 					goto done;
 			}
 			if ((c->rep_prefix == REPNE_PREFIX) &&
 				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-				ctxt->vcpu->arch.rip = c->eip;
+				kvm_rip_write(ctxt->vcpu, c->eip);
 				goto done;
 			}
 		}
 		c->regs[VCPU_REGS_RCX]--;
-		c->eip = ctxt->vcpu->arch.rip;
+		c->eip = kvm_rip_read(ctxt->vcpu);
 	}
 
 	if (c->src.type == OP_MEM) {
@@ -1768,7 +1769,7 @@ writeback:
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-	ctxt->vcpu->arch.rip = c->eip;
+	kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
 	if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1794,7 @@ twobyte_insn:
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -1889,7 +1890,7 @@ twobyte_insn:
 		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
 		if (rc) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -1899,7 +1900,7 @@ twobyte_insn:
 		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
 		if (rc) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->vcpu->arch.rip;
+			c->eip = kvm_rip_read(ctxt->vcpu);
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
-- 
cgit v1.2.3


From 867767a365ee74a3adcfaba27075eefb66b14bfd Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Fri, 27 Jun 2008 15:55:02 +0300
Subject: KVM: Introduce kvm_set_irq to inject interrupts in guests

This function injects an interrupt into the guest given the kvm struct,
the (guest) irq number and the interrupt level.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/irq.c | 11 +++++++++++
 arch/x86/kvm/irq.h |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f66..0d9e55275af 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -100,3 +100,14 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 	__kvm_migrate_apic_timer(vcpu);
 	__kvm_migrate_pit_timer(vcpu);
 }
+
+/* This should be called with the kvm->lock mutex held */
+void kvm_set_irq(struct kvm *kvm, int irq, int level)
+{
+	/* Not possible to detect if the guest uses the PIC or the
+	 * IOAPIC.  So set the bit in both. The guest will ignore
+	 * writes to the unused one.
+	 */
+	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
+	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48b..07ff2aef0c1 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -82,6 +82,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
+void kvm_set_irq(struct kvm *kvm, int irq, int level);
+
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From 31aa2b44afd5e73365221b1de66f6081e4616f33 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 11 Jul 2008 17:59:46 +0300
Subject: KVM: MMU: Separate the code for unlinking a shadow page from its
 parents

Place into own function, in preparation for further cleanups.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22..81016a3a6fd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -991,11 +991,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 			kvm->vcpus[i]->arch.last_pte_updated = NULL;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	++kvm->stat.mmu_shadow_zapped;
 	while (sp->multimapped || sp->parent_pte) {
 		if (!sp->multimapped)
 			parent_pte = sp->parent_pte;
@@ -1010,7 +1009,13 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_mmu_put_page(sp, parent_pte);
 		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
 	}
+}
+
+static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
+	kvm_mmu_unlink_parents(kvm, sp);
 	if (!sp->root_count) {
 		if (!sp->role.metaphysical && !sp->role.invalid)
 			unaccount_shadowed(kvm, sp->gfn);
-- 
cgit v1.2.3


From 5b5c6a5a60801effb559e787a947885d9850a7da Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 11 Jul 2008 18:07:26 +0300
Subject: KVM: MMU: Simplify kvm_mmu_zap_page()

The twisty maze of conditionals can be reduced.

[joerg: fix tlb flushing]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 81016a3a6fd..c3afbfe6b0c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -955,7 +955,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				rmap_remove(kvm, &pt[i]);
 			pt[i] = shadow_trap_nonpresent_pte;
 		}
-		kvm_flush_remote_tlbs(kvm);
 		return;
 	}
 
@@ -974,7 +973,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 	}
-	kvm_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1016,18 +1014,16 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
+	kvm_flush_remote_tlbs(kvm);
+	if (!sp->role.invalid && !sp->role.metaphysical)
+		unaccount_shadowed(kvm, sp->gfn);
 	if (!sp->root_count) {
-		if (!sp->role.metaphysical && !sp->role.invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
-		int invalid = sp->role.invalid;
-		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		sp->role.invalid = 1;
+		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
-		if (!sp->role.metaphysical && !invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
 }
@@ -1842,7 +1838,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.metaphysical)
+		if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
 			continue;
 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-- 
cgit v1.2.3


From cf393f75661f4b17451377b353833eb5502a9688 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 1 Jul 2008 16:20:21 +0300
Subject: KVM: Move NMI IRET fault processing to new vmx_complete_interrupts()

Currently most interrupt exit processing is handled on the entry path,
which is confusing.  Move the NMI IRET fault processing to a new function,
vmx_complete_interrupts(), which is called on the vmexit path.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5a3a0326c27..2adfacb0033 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2817,6 +2817,27 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
 		enable_irq_window(vcpu);
 }
 
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+	u32 exit_intr_info;
+	bool unblock_nmi;
+	u8 vector;
+
+	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	if (cpu_has_virtual_nmis()) {
+		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
+		 * a guest IRET fault.
+		 */
+		if (unblock_nmi && vector != DF_VECTOR)
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+	}
+}
+
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2873,23 +2894,12 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		return;
 	}
 	if (cpu_has_virtual_nmis()) {
-		/*
-		 * SDM 3: 25.7.1.2
-		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
-		 * a guest IRET fault.
-		 */
-		if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
-		    (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
-			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-				vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
-				GUEST_INTR_STATE_NMI);
-		else if (vcpu->arch.nmi_pending) {
+		if (vcpu->arch.nmi_pending) {
 			if (vmx_nmi_enabled(vcpu))
 				vmx_inject_nmi(vcpu);
 			enable_intr_window(vcpu);
 			return;
 		}
-
 	}
 	if (!kvm_cpu_has_interrupt(vcpu))
 		return;
@@ -3076,6 +3086,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");
 	}
+
+	vmx_complete_interrupts(vmx);
 }
 
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 668f612fa0d8d4120ec5dc0725d7e1ca3152a954 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 2 Jul 2008 09:28:55 +0300
Subject: KVM: VMX: Move nmi injection failure processing to vm exit path

Instead of processing nmi injection failure in the vm entry path, move
it to the vm exit path (vm_complete_interrupts()).  This separates nmi
injection from nmi post-processing, and moves the nmi state from the VT
state into vcpu state (new variable nmi_injected specifying an injection
in progress).

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2adfacb0033..ce13b53d21c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2151,7 +2151,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-	vcpu->arch.nmi_pending = 0;
 }
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2820,8 +2819,11 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
+	u32 idt_vectoring_info;
 	bool unblock_nmi;
 	u8 vector;
+	int type;
+	bool idtv_info_valid;
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
@@ -2836,18 +2838,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
 	}
+
+	idt_vectoring_info = vmx->idt_vectoring_info;
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+	if (vmx->vcpu.arch.nmi_injected) {
+		/*
+		 * SDM 3: 25.7.1.2
+		 * Clear bit "block by NMI" before VM entry if a NMI delivery
+		 * faulted.
+		 */
+		if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
+			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+					GUEST_INTR_STATE_NMI);
+		else
+			vmx->vcpu.arch.nmi_injected = false;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field, intr_info_field, exit_intr_info_field;
+	u32 idtv_info_field, intr_info_field;
 	int vector;
 
 	update_tpr_threshold(vcpu);
 
 	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-	exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
 	idtv_info_field = vmx->idt_vectoring_info;
 	if (intr_info_field & INTR_INFO_VALID_MASK) {
 		if (idtv_info_field & INTR_INFO_VALID_MASK) {
@@ -2871,17 +2889,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 
 		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
 
-		/*
-		 * SDM 3: 25.7.1.2
-		 * Clear bit "block by NMI" before VM entry if a NMI delivery
-		 * faulted.
-		 */
-		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-		    == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
-			vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-				vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-				~GUEST_INTR_STATE_NMI);
-
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
 				& ~INTR_INFO_RESVD_BITS_MASK);
 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
@@ -2894,9 +2901,17 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		return;
 	}
 	if (cpu_has_virtual_nmis()) {
-		if (vcpu->arch.nmi_pending) {
-			if (vmx_nmi_enabled(vcpu))
-				vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+			if (vmx_nmi_enabled(vcpu)) {
+				vcpu->arch.nmi_pending = false;
+				vcpu->arch.nmi_injected = true;
+			} else {
+				enable_intr_window(vcpu);
+				return;
+			}
+		}
+		if (vcpu->arch.nmi_injected) {
+			vmx_inject_nmi(vcpu);
 			enable_intr_window(vcpu);
 			return;
 		}
-- 
cgit v1.2.3


From 26eef70c3e8c76e73dff2579c792fc7355f8a291 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 14:59:22 +0300
Subject: KVM: Clear exception queue before emulating an instruction

If we're emulating an instruction, either it will succeed, in which case
any previously queued exception will be spurious, or we will requeue the
same exception.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c |  2 ++
 arch/x86/kvm/x86.h | 11 +++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 arch/x86/kvm/x86.h

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f0696bc7d2..5620df2685d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -20,6 +20,7 @@
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/clocksource.h>
 #include <linux/kvm.h>
@@ -2121,6 +2122,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	int r;
 	struct decode_cache *c;
 
+	kvm_clear_exception_queue(vcpu);
 	vcpu->arch.mmio_fault_cr2 = cr2;
 	/*
 	 * TODO: fix x86_emulate.c to use guest_read/write_register
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 00000000000..c666649c4bb
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,11 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.exception.pending = false;
+}
+
+#endif
-- 
cgit v1.2.3


From 35920a356957eea9fd1f9da043f93469e8d72eab Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 14:50:12 +0300
Subject: KVM: VMX: Fix pending exception processing

The vmx code assumes that IDT-Vectoring can only be set when an exception
is injected due to the exception in question.  That's not true, however:
if the exception is injected correctly, and later another exception occurs
but its delivery is blocked due to a fault, then we will incorrectly assume
the first exception was not delivered.

Fix by unconditionally dequeuing the pending exception, and requeuing it
(or the second exception) if we see it in the IDT-Vectoring field.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ce13b53d21c..ed4fe8e72ad 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -744,9 +745,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	return false;
 }
 
 /*
@@ -2824,6 +2823,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
+	u32 error;
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
@@ -2855,6 +2855,15 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		else
 			vmx->vcpu.arch.nmi_injected = false;
 	}
+	kvm_clear_exception_queue(&vmx->vcpu);
+	if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			kvm_queue_exception_e(&vmx->vcpu, vector, error);
+		} else
+			kvm_queue_exception(&vmx->vcpu, vector);
+		vmx->idt_vectoring_info = 0;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 937a7eaef9f08342958d17055a350982b7bd92cb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 15:17:01 +0300
Subject: KVM: Add a pending interrupt queue

Similar to the exception queue, this hold interrupts that have been
accepted by the virtual processor core but not yet injected.

Not yet used.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c666649c4bb..6a4be78a738 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,4 +8,15 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.exception.pending = false;
 }
 
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+{
+	vcpu->arch.interrupt.pending = true;
+	vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.interrupt.pending = false;
+}
+
 #endif
-- 
cgit v1.2.3


From f7d9238f5dc9306fd3bf1653c2939eef72f94d06 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 3 Jul 2008 16:14:28 +0300
Subject: KVM: VMX: Move interrupt post-processing to vmx_complete_interrupts()

Instead of looking at failed injections in the vm entry path, move
processing to the exit path in vmx_complete_interrupts().  This simplifes
the logic and removes any state that is hidden in vmx registers.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 77 ++++++++++++++----------------------------------------
 1 file changed, 20 insertions(+), 57 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed4fe8e72ad..e8fed9b8756 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1002,17 +1002,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field;
-
-	idtv_info_field = vmx->idt_vectoring_info;
-	if (idtv_info_field & INTR_INFO_VALID_MASK) {
-		if (is_external_interrupt(idtv_info_field))
-			return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-		else
-			printk(KERN_DEBUG "pending exception: not handled yet\n");
-	}
-	return -1;
+	if (!vcpu->arch.interrupt.pending)
+		return -1;
+	return vcpu->arch.interrupt.nr;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -2293,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
-		if (vect_info & VECTORING_INFO_VALID_MASK)
+		if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
@@ -2864,51 +2856,20 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			kvm_queue_exception(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
 	}
+	kvm_clear_interrupt_queue(&vmx->vcpu);
+	if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+		kvm_queue_interrupt(&vmx->vcpu, vector);
+		vmx->idt_vectoring_info = 0;
+	}
 }
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 idtv_info_field, intr_info_field;
-	int vector;
+	u32 intr_info_field;
 
 	update_tpr_threshold(vcpu);
 
 	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-	idtv_info_field = vmx->idt_vectoring_info;
-	if (intr_info_field & INTR_INFO_VALID_MASK) {
-		if (idtv_info_field & INTR_INFO_VALID_MASK) {
-			/* TODO: fault when IDT_Vectoring */
-			if (printk_ratelimit())
-				printk(KERN_ERR "Fault when IDT_Vectoring\n");
-		}
-		enable_intr_window(vcpu);
-		return;
-	}
-	if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
-		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-		    == INTR_TYPE_EXT_INTR
-		    && vcpu->arch.rmode.active) {
-			u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
-			vmx_inject_irq(vcpu, vect);
-			enable_intr_window(vcpu);
-			return;
-		}
-
-		KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
-
-		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
-				& ~INTR_INFO_RESVD_BITS_MASK);
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
-		if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
-			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-				vmcs_read32(IDT_VECTORING_ERROR_CODE));
-		enable_intr_window(vcpu);
-		return;
-	}
 	if (cpu_has_virtual_nmis()) {
 		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 			if (vmx_nmi_enabled(vcpu)) {
@@ -2925,14 +2886,16 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			return;
 		}
 	}
-	if (!kvm_cpu_has_interrupt(vcpu))
-		return;
-	if (vmx_irq_enabled(vcpu)) {
-		vector = kvm_cpu_get_interrupt(vcpu);
-		vmx_inject_irq(vcpu, vector);
-		kvm_timer_intr_post(vcpu, vector);
-	} else
-		enable_irq_window(vcpu);
+	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
+		if (vmx_irq_enabled(vcpu))
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+		else
+			enable_irq_window(vcpu);
+	}
+	if (vcpu->arch.interrupt.pending) {
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 60bd83a125030878665684f353c7d36fd54f09fd Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sat, 12 Jul 2008 16:02:08 +0300
Subject: KVM: VMX: Remove redundant check in handle_rmode_exception

Since checking for vcpu->arch.rmode.active is already done whenever we
call handle_rmode_exception(), checking it inside the function is redundant.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8fed9b8756..9591ca73191 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2224,9 +2224,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 				  int vec, u32 err_code)
 {
-	if (!vcpu->arch.rmode.active)
-		return 0;
-
 	/*
 	 * Instruction with address size override prefix opcode 0x67
 	 * Cause the #SS fault with 0 error code in VM86 mode.
-- 
cgit v1.2.3


From 7edd0ce05892831c77fa4cebe24a6056d33336d5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 7 Jul 2008 14:45:39 +0300
Subject: KVM: Consolidate PIC isr clearing into a function

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8259.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa4..55e179ad98e 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,11 @@
 
 #include <linux/kvm_host.h>
 
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+	s->isr &= ~(1 << irq);
+}
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -141,11 +146,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
  */
 static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
+	s->isr |= 1 << irq;
 	if (s->auto_eoi) {
 		if (s->rotate_on_auto_eoi)
 			s->priority_add = (irq + 1) & 7;
-	} else
-		s->isr |= (1 << irq);
+		pic_clear_isr(s, irq);
+	}
 	/*
 	 * We don't clear a level sensitive interrupt here
 	 */
@@ -243,7 +249,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				priority = get_priority(s, s->isr);
 				if (priority != 8) {
 					irq = (priority + s->priority_add) & 7;
-					s->isr &= ~(1 << irq);
+					pic_clear_isr(s, irq);
 					if (cmd == 5)
 						s->priority_add = (irq + 1) & 7;
 					pic_update_irq(s->pics_state);
@@ -251,7 +257,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				break;
 			case 3:
 				irq = val & 7;
-				s->isr &= ~(1 << irq);
+				pic_clear_isr(s, irq);
 				pic_update_irq(s->pics_state);
 				break;
 			case 6:
@@ -260,8 +266,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 				break;
 			case 7:
 				irq = val & 7;
-				s->isr &= ~(1 << irq);
 				s->priority_add = (irq + 1) & 7;
+				pic_clear_isr(s, irq);
 				pic_update_irq(s->pics_state);
 				break;
 			default:
@@ -303,7 +309,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
 			s->pics_state->pics[0].irr &= ~(1 << 2);
 		}
 		s->irr &= ~(1 << ret);
-		s->isr &= ~(1 << ret);
+		pic_clear_isr(s, ret);
 		if (addr1 >> 7 || ret != 2)
 			pic_update_irq(s->pics_state);
 	} else {
-- 
cgit v1.2.3


From 19bd8afdc4e6fbb47e4841f8d771f8cb29916d9f Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sun, 13 Jul 2008 13:40:55 +0200
Subject: KVM: Consolidate XX_VECTOR defines

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 4 ----
 arch/x86/kvm/vmx.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 54b0bf33e21..743c98d135c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,10 +36,6 @@ MODULE_LICENSE("GPL");
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
 #define DR7_GD_MASK (1 << 13)
 #define DR6_BD_MASK (1 << 13)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9591ca73191..5c9dac567a7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -470,7 +470,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	if (!vcpu->fpu_active)
 		eb |= 1u << NM_VECTOR;
 	if (vcpu->guest_debug.enabled)
-		eb |= 1u << 1;
+		eb |= 1u << DB_VECTOR;
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
 	if (vm_need_ept())
-- 
cgit v1.2.3


From 77ab6db0a1c403387b403e9351ab3f5ae1df83e6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Mon, 14 Jul 2008 12:28:51 +0200
Subject: KVM: VMX: Reinject real mode exception

As we execute real mode guests in VM86 mode, exception have to be
reinjected appropriately when the guest triggered them. For this purpose
the patch adopts the real-mode injection pattern used in vmx_inject_irq
to vmx_queue_exception, additionally taking care that the IP is set
correctly for #BP exceptions. Furthermore it extends
handle_rmode_exception to reinject all those exceptions that can be
raised in real mode.

This fixes the execution of himem.exe from FreeDOS and also makes its
debug.com work properly.

Note that guest debugging in real mode is broken now. This has to be
fixed by the scheduled debugging infrastructure rework (will be done
once base patches for QEMU have been accepted).

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5c9dac567a7..2879880076d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -735,12 +735,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (has_error_code)
+		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+
+	if (vcpu->arch.rmode.active) {
+		vmx->rmode.irq.pending = true;
+		vmx->rmode.irq.vector = nr;
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+		if (nr == BP_VECTOR)
+			vmx->rmode.irq.rip++;
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     nr | INTR_TYPE_SOFT_INTR
+			     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+			     | INTR_INFO_VALID_MASK);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		return;
+	}
+
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 		     nr | INTR_TYPE_EXCEPTION
 		     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
 		     | INTR_INFO_VALID_MASK);
-	if (has_error_code)
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 }
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
@@ -2231,6 +2249,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
 		if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
 			return 1;
+	/*
+	 * Forward all other exceptions that are valid in real mode.
+	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+	 *        the required debugging infrastructure rework.
+	 */
+	switch (vec) {
+	case DE_VECTOR:
+	case DB_VECTOR:
+	case BP_VECTOR:
+	case OF_VECTOR:
+	case BR_VECTOR:
+	case UD_VECTOR:
+	case DF_VECTOR:
+	case SS_VECTOR:
+	case GP_VECTOR:
+	case MF_VECTOR:
+		kvm_queue_exception(vcpu, vec);
+		return 1;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From c801949ddf0a51074937f488a52072825ed50174 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Jul 2008 14:44:59 +0300
Subject: KVM: VMX: Unify register save/restore across 32 and 64 bit hosts

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 90 ++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 54 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2879880076d..fc8996b1043 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2955,6 +2955,14 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 		| vmx->rmode.irq.vector;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2972,26 +2980,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	asm(
 		/* Store host registers */
-#ifdef CONFIG_X86_64
-		"push %%rdx; push %%rbp;"
-		"push %%rcx \n\t"
-#else
-		"push %%edx; push %%ebp;"
-		"push %%ecx \n\t"
-#endif
+		"push %%"R"dx; push %%"R"bp;"
+		"push %%"R"cx \n\t"
 		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
 		/* Check if vmlaunch of vmresume is needed */
 		"cmpl $0, %c[launched](%0) \n\t"
 		/* Load guest registers.  Don't clobber flags. */
+		"mov %c[cr2](%0), %%"R"ax \n\t"
+		"mov %%"R"ax, %%cr2 \n\t"
+		"mov %c[rax](%0), %%"R"ax \n\t"
+		"mov %c[rbx](%0), %%"R"bx \n\t"
+		"mov %c[rdx](%0), %%"R"dx \n\t"
+		"mov %c[rsi](%0), %%"R"si \n\t"
+		"mov %c[rdi](%0), %%"R"di \n\t"
+		"mov %c[rbp](%0), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-		"mov %c[cr2](%0), %%rax \n\t"
-		"mov %%rax, %%cr2 \n\t"
-		"mov %c[rax](%0), %%rax \n\t"
-		"mov %c[rbx](%0), %%rbx \n\t"
-		"mov %c[rdx](%0), %%rdx \n\t"
-		"mov %c[rsi](%0), %%rsi \n\t"
-		"mov %c[rdi](%0), %%rdi \n\t"
-		"mov %c[rbp](%0), %%rbp \n\t"
 		"mov %c[r8](%0),  %%r8  \n\t"
 		"mov %c[r9](%0),  %%r9  \n\t"
 		"mov %c[r10](%0), %%r10 \n\t"
@@ -3000,18 +3003,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %c[r13](%0), %%r13 \n\t"
 		"mov %c[r14](%0), %%r14 \n\t"
 		"mov %c[r15](%0), %%r15 \n\t"
-		"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
-		"mov %c[cr2](%0), %%eax \n\t"
-		"mov %%eax,   %%cr2 \n\t"
-		"mov %c[rax](%0), %%eax \n\t"
-		"mov %c[rbx](%0), %%ebx \n\t"
-		"mov %c[rdx](%0), %%edx \n\t"
-		"mov %c[rsi](%0), %%esi \n\t"
-		"mov %c[rdi](%0), %%edi \n\t"
-		"mov %c[rbp](%0), %%ebp \n\t"
-		"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
 #endif
+		"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
 		/* Enter guest mode */
 		"jne .Llaunched \n\t"
 		__ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -3019,15 +3013,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
 		".Lkvm_vmx_return: "
 		/* Save guest registers, load host registers, keep flags */
+		"xchg %0,     (%%"R"sp) \n\t"
+		"mov %%"R"ax, %c[rax](%0) \n\t"
+		"mov %%"R"bx, %c[rbx](%0) \n\t"
+		"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+		"mov %%"R"dx, %c[rdx](%0) \n\t"
+		"mov %%"R"si, %c[rsi](%0) \n\t"
+		"mov %%"R"di, %c[rdi](%0) \n\t"
+		"mov %%"R"bp, %c[rbp](%0) \n\t"
 #ifdef CONFIG_X86_64
-		"xchg %0,     (%%rsp) \n\t"
-		"mov %%rax, %c[rax](%0) \n\t"
-		"mov %%rbx, %c[rbx](%0) \n\t"
-		"pushq (%%rsp); popq %c[rcx](%0) \n\t"
-		"mov %%rdx, %c[rdx](%0) \n\t"
-		"mov %%rsi, %c[rsi](%0) \n\t"
-		"mov %%rdi, %c[rdi](%0) \n\t"
-		"mov %%rbp, %c[rbp](%0) \n\t"
 		"mov %%r8,  %c[r8](%0) \n\t"
 		"mov %%r9,  %c[r9](%0) \n\t"
 		"mov %%r10, %c[r10](%0) \n\t"
@@ -3036,24 +3030,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%r13, %c[r13](%0) \n\t"
 		"mov %%r14, %c[r14](%0) \n\t"
 		"mov %%r15, %c[r15](%0) \n\t"
-		"mov %%cr2, %%rax   \n\t"
-		"mov %%rax, %c[cr2](%0) \n\t"
-
-		"pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
-#else
-		"xchg %0, (%%esp) \n\t"
-		"mov %%eax, %c[rax](%0) \n\t"
-		"mov %%ebx, %c[rbx](%0) \n\t"
-		"pushl (%%esp); popl %c[rcx](%0) \n\t"
-		"mov %%edx, %c[rdx](%0) \n\t"
-		"mov %%esi, %c[rsi](%0) \n\t"
-		"mov %%edi, %c[rdi](%0) \n\t"
-		"mov %%ebp, %c[rbp](%0) \n\t"
-		"mov %%cr2, %%eax  \n\t"
-		"mov %%eax, %c[cr2](%0) \n\t"
-
-		"pop %%ebp; pop %%ebp; pop %%edx \n\t"
 #endif
+		"mov %%cr2, %%"R"ax   \n\t"
+		"mov %%"R"ax, %c[cr2](%0) \n\t"
+
+		"pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
 		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -3077,11 +3058,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
 	      : "cc", "memory"
+		, R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
-		, "rbx", "rdi", "rsi"
 		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
-		, "ebx", "edi", "rsi"
 #endif
 	      );
 
@@ -3111,6 +3090,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmx_complete_interrupts(vmx);
 }
 
+#undef R
+#undef Q
+
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-- 
cgit v1.2.3


From 80e31d4f61f69d0e480ae092cda0e590d6a30aeb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 14 Jul 2008 14:44:59 +0300
Subject: KVM: SVM: Unify register save/restore across 32 and 64 bit hosts

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 78 ++++++++++++++++++------------------------------------
 1 file changed, 26 insertions(+), 52 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 743c98d135c..179c2e00d0f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1697,6 +1697,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1735,19 +1741,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	local_irq_enable();
 
 	asm volatile (
+		"push %%"R"bp; \n\t"
+		"mov %c[rbx](%[svm]), %%"R"bx \n\t"
+		"mov %c[rcx](%[svm]), %%"R"cx \n\t"
+		"mov %c[rdx](%[svm]), %%"R"dx \n\t"
+		"mov %c[rsi](%[svm]), %%"R"si \n\t"
+		"mov %c[rdi](%[svm]), %%"R"di \n\t"
+		"mov %c[rbp](%[svm]), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-		"push %%rbp; \n\t"
-#else
-		"push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-		"mov %c[rbx](%[svm]), %%rbx \n\t"
-		"mov %c[rcx](%[svm]), %%rcx \n\t"
-		"mov %c[rdx](%[svm]), %%rdx \n\t"
-		"mov %c[rsi](%[svm]), %%rsi \n\t"
-		"mov %c[rdi](%[svm]), %%rdi \n\t"
-		"mov %c[rbp](%[svm]), %%rbp \n\t"
 		"mov %c[r8](%[svm]),  %%r8  \n\t"
 		"mov %c[r9](%[svm]),  %%r9  \n\t"
 		"mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1756,41 +1757,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %c[r13](%[svm]), %%r13 \n\t"
 		"mov %c[r14](%[svm]), %%r14 \n\t"
 		"mov %c[r15](%[svm]), %%r15 \n\t"
-#else
-		"mov %c[rbx](%[svm]), %%ebx \n\t"
-		"mov %c[rcx](%[svm]), %%ecx \n\t"
-		"mov %c[rdx](%[svm]), %%edx \n\t"
-		"mov %c[rsi](%[svm]), %%esi \n\t"
-		"mov %c[rdi](%[svm]), %%edi \n\t"
-		"mov %c[rbp](%[svm]), %%ebp \n\t"
 #endif
 
-#ifdef CONFIG_X86_64
 		/* Enter guest mode */
-		"push %%rax \n\t"
-		"mov %c[vmcb](%[svm]), %%rax \n\t"
+		"push %%"R"ax \n\t"
+		"mov %c[vmcb](%[svm]), %%"R"ax \n\t"
 		__ex(SVM_VMLOAD) "\n\t"
 		__ex(SVM_VMRUN) "\n\t"
 		__ex(SVM_VMSAVE) "\n\t"
-		"pop %%rax \n\t"
-#else
-		/* Enter guest mode */
-		"push %%eax \n\t"
-		"mov %c[vmcb](%[svm]), %%eax \n\t"
-		__ex(SVM_VMLOAD) "\n\t"
-		__ex(SVM_VMRUN) "\n\t"
-		__ex(SVM_VMSAVE) "\n\t"
-		"pop %%eax \n\t"
-#endif
+		"pop %%"R"ax \n\t"
 
 		/* Save guest registers, load host registers */
+		"mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+		"mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+		"mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+		"mov %%"R"si, %c[rsi](%[svm]) \n\t"
+		"mov %%"R"di, %c[rdi](%[svm]) \n\t"
+		"mov %%"R"bp, %c[rbp](%[svm]) \n\t"
 #ifdef CONFIG_X86_64
-		"mov %%rbx, %c[rbx](%[svm]) \n\t"
-		"mov %%rcx, %c[rcx](%[svm]) \n\t"
-		"mov %%rdx, %c[rdx](%[svm]) \n\t"
-		"mov %%rsi, %c[rsi](%[svm]) \n\t"
-		"mov %%rdi, %c[rdi](%[svm]) \n\t"
-		"mov %%rbp, %c[rbp](%[svm]) \n\t"
 		"mov %%r8,  %c[r8](%[svm]) \n\t"
 		"mov %%r9,  %c[r9](%[svm]) \n\t"
 		"mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1799,18 +1783,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		"mov %%r13, %c[r13](%[svm]) \n\t"
 		"mov %%r14, %c[r14](%[svm]) \n\t"
 		"mov %%r15, %c[r15](%[svm]) \n\t"
-
-		"pop  %%rbp; \n\t"
-#else
-		"mov %%ebx, %c[rbx](%[svm]) \n\t"
-		"mov %%ecx, %c[rcx](%[svm]) \n\t"
-		"mov %%edx, %c[rdx](%[svm]) \n\t"
-		"mov %%esi, %c[rsi](%[svm]) \n\t"
-		"mov %%edi, %c[rdi](%[svm]) \n\t"
-		"mov %%ebp, %c[rbp](%[svm]) \n\t"
-
-		"pop  %%ebp; \n\t"
 #endif
+		"pop %%"R"bp"
 		:
 		: [svm]"a"(svm),
 		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1831,11 +1805,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
 		: "cc", "memory"
+		, R"bx", R"cx", R"dx", R"si", R"di"
 #ifdef CONFIG_X86_64
-		, "rbx", "rcx", "rdx", "rsi", "rdi"
 		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
-		, "ebx", "ecx", "edx" , "esi", "edi"
 #endif
 		);
 
@@ -1867,6 +1839,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	svm->next_rip = 0;
 }
 
+#undef R
+
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-- 
cgit v1.2.3


From 12f67556023389a6be929a56617142a8e8ab20fe Mon Sep 17 00:00:00 2001
From: Jerone Young <jyoung5@us.ibm.com>
Date: Mon, 14 Jul 2008 14:00:02 +0200
Subject: KVM: ppc: enable KVM_TRACE building for powerpc

This patch enables KVM_TRACE to build for PowerPC arch. This means just
adding sections to Kconfig and Makefile.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/kvm/Kconfig  | 11 +++++++++++
 arch/powerpc/kvm/Makefile |  6 ++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 6b076010213..53aaa66b25e 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
 	  Provides host support for KVM on Book E PowerPC processors. Currently
 	  this works on 440 processors only.
 
+config KVM_TRACE
+	bool "KVM trace support"
+	depends on KVM && MARKERS && SYSFS
+	select RELAY
+	select DEBUG_FS
+	default n
+	---help---
+	  This option allows reading a trace of kvm-related events through
+	  relayfs.  Note the ABI is not considered stable and will be
+	  modified in future updates.
+
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 04e3449e1f4..2a5d4397ac4 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -4,9 +4,11 @@
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
-kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
+common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+
+kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
-- 
cgit v1.2.3


From 31711f2294b38d8334efaf7dbac6da4781fd151e Mon Sep 17 00:00:00 2001
From: Jerone Young <jyoung5@us.ibm.com>
Date: Mon, 14 Jul 2008 14:00:03 +0200
Subject: KVM: ppc: adds trace points for ppc tlb activity

This patch adds trace points to track powerpc TLB activities using the
KVM_TRACE infrastructure.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/kvm/44x_tlb.c | 15 ++++++++++++++-
 arch/powerpc/kvm/emulate.c |  4 ++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5a5602da509..a207d16b9db 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -19,6 +19,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
 #include <asm/mmu-44x.h>
@@ -175,6 +176,10 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
+
+	KVMTRACE_5D(STLB_WRITE, vcpu, victim,
+			stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
+			handler);
 }
 
 void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -204,6 +209,9 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		KVMTRACE_5D(STLB_INVAL, vcpu, i,
+				stlbe->tid, stlbe->word0, stlbe->word1,
+				stlbe->word2, handler);
 	}
 	up_write(&current->mm->mmap_sem);
 }
@@ -217,8 +225,13 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 	/* XXX Replace loop with fancy data structures. */
 	down_write(&current->mm->mmap_sem);
 	for (i = 0; i <= tlb_44x_hwater; i++) {
+		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+
 		kvmppc_44x_shadow_release(vcpu, i);
-		vcpu->arch.shadow_tlb[i].word0 = 0;
+		stlbe->word0 = 0;
+		KVMTRACE_5D(STLB_INVAL, vcpu, i,
+				stlbe->tid, stlbe->word0, stlbe->word1,
+				stlbe->word2, handler);
 	}
 	up_write(&current->mm->mmap_sem);
 }
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 8c605d0a548..4a3e274bac1 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
 		kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
 	}
 
+	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
+			tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
+			handler);
+
 	return EMULATE_DONE;
 }
 
-- 
cgit v1.2.3


From 3b4bd7969f7b61a1ab455bff084ee4f0a2411055 Mon Sep 17 00:00:00 2001
From: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Date: Mon, 14 Jul 2008 14:00:04 +0200
Subject: KVM: ppc: trace powerpc instruction emulation

This patch adds a trace point for the instruction emulation on embedded powerpc
utilizing the KVM_TRACE interface.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/kvm/emulate.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4a3e274bac1..c3ed63b2221 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -769,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 	}
 
+	KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
+
 	if (advance)
 		vcpu->arch.pc += 4; /* Advance past emulated instruction. */
 
-- 
cgit v1.2.3


From 313dbd49dc239205b96da79fba09f7637cf84f3c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 17 Jul 2008 18:04:30 +0300
Subject: KVM: VMX: Avoid vmwrite(HOST_RSP) when possible

Usually HOST_RSP retains its value across guest entries.  Take advantage
of this and avoid a vmwrite() when this is so.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fc8996b1043..ddb49e34697 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -58,6 +58,7 @@ struct vmcs {
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
+	unsigned long         host_rsp;
 	int                   launched;
 	u8                    fail;
 	u32                   idt_vectoring_info;
@@ -2982,7 +2983,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
 		"push %%"R"cx \n\t"
+		"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+		"je 1f \n\t"
+		"mov %%"R"sp, %c[host_rsp](%0) \n\t"
 		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+		"1: \n\t"
 		/* Check if vmlaunch of vmresume is needed */
 		"cmpl $0, %c[launched](%0) \n\t"
 		/* Load guest registers.  Don't clobber flags. */
@@ -3039,6 +3044,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
 		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
+		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
 		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
 		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
-- 
cgit v1.2.3


From b5e2fec0ebc3fcaff954092bb69444a67a904c0a Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 22 Jul 2008 08:00:45 +0200
Subject: KVM: Ignore DEBUGCTL MSRs with no effect

Netware writes to DEBUGCTL and reads from the DEBUGCTL and LAST*IP MSRs
without further checks and is really confused to receive a #GP during that.
To make it happy we should just make them stubs, which is exactly what SVM
already does.

Writes to DEBUGCTL that are vendor-specific are resembled to behave as if the
virtual CPU does not know them.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5620df2685d..94a216562f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -665,6 +665,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
 			__func__, data);
 		break;
+	case MSR_IA32_DEBUGCTLMSR:
+		if (!data) {
+			/* We support the non-activated case already */
+			break;
+		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+			/* Values other than LBR and BTF are vendor-specific,
+			   thus reserved and should throw a #GP */
+			return 1;
+		}
+		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+			__func__, data);
+		break;
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_UCODE_WRITE:
 		break;
@@ -757,6 +769,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+16:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
+	case MSR_IA32_DEBUGCTLMSR:
+	case MSR_IA32_LASTBRANCHFROMIP:
+	case MSR_IA32_LASTBRANCHTOIP:
+	case MSR_IA32_LASTINTFROMIP:
+	case MSR_IA32_LASTINTTOIP:
 		data = 0;
 		break;
 	case MSR_MTRRcap:
-- 
cgit v1.2.3


From 6a0ab738ef42d87951b3980f61b1f4cbb14d4171 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:49 -0500
Subject: KVM: ppc: guest breakpoint support

Allow host userspace to program hardware debug registers to set breakpoints
inside guests.

Signed-off-by: Jerone Young <jyoung5@us.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/include/asm/kvm_host.h |  5 +++
 arch/powerpc/kvm/booke_guest.c      | 15 +++++++
 arch/powerpc/kvm/booke_interrupts.S | 11 ++++-
 arch/powerpc/kvm/powerpc.c          | 84 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 113 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2655e2a4831..23bad40b0ea 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -86,6 +86,11 @@ struct kvm_vcpu_arch {
 
 	u32 host_stack;
 	u32 host_pid;
+	u32 host_dbcr0;
+	u32 host_dbcr1;
+	u32 host_dbcr2;
+	u32 host_iac[4];
+	u32 host_msr;
 
 	u64 fpr[32];
 	u32 gpr[32];
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 9c8ad850c6e..3cca079975e 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 
+	case BOOKE_INTERRUPT_DEBUG: {
+		u32 dbsr;
+
+		vcpu->arch.pc = mfspr(SPRN_CSRR0);
+
+		/* clear IAC events in DBSR register */
+		dbsr = mfspr(SPRN_DBSR);
+		dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
+		mtspr(SPRN_DBSR, dbsr);
+
+		run->exit_reason = KVM_EXIT_DEBUG;
+		r = RESUME_HOST;
+		break;
+	}
+
 	default:
 		printk(KERN_EMERG "exit_nr %d\n", exit_nr);
 		BUG();
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 3b653b5309b..8eaba2613ff 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -42,7 +42,8 @@
 #define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
 
 #define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
-                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))
 
 #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
                         (1<<BOOKE_INTERRUPT_DTLB_MISS))
@@ -431,6 +432,14 @@ lightweight_exit:
 	oris	r3, r3, KVMPPC_MSR_MASK@h
 	ori	r3, r3, KVMPPC_MSR_MASK@l
 	mtsrr1	r3
+
+	/* Clear any debug events which occurred since we disabled MSR[DE].
+	 * XXX This gives us a 3-instruction window in which a breakpoint
+	 * intended for guest context could fire in the host instead. */
+	lis	r3, 0xffff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_DBSR, r3
+
 	lwz	r3, VCPU_GPR(r3)(r4)
 	lwz	r4, VCPU_GPR(r4)(r4)
 	rfi
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 53826a5f6c0..b75607180dd 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -239,18 +239,100 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+/* Note: clearing MSR[DE] just means that the debug interrupt will not be
+ * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
+ * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
+ * will be delivered as an "imprecise debug event" (which is indicated by
+ * DBSR[IDE].
+ */
+static void kvmppc_disable_debug_interrupts(void)
+{
+	mtmsr(mfmsr() & ~MSR_DE);
+}
+
+static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
+{
+	kvmppc_disable_debug_interrupts();
+
+	mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
+	mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
+	mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
+	mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
+	mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
+	mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
+	mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
+	mtmsr(vcpu->arch.host_msr);
+}
+
+static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
+{
+	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+	u32 dbcr0 = 0;
+
+	vcpu->arch.host_msr = mfmsr();
+	kvmppc_disable_debug_interrupts();
+
+	/* Save host debug register state. */
+	vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
+	vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
+	vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
+	vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
+	vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
+	vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
+	vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
+
+	/* set registers up for guest */
+
+	if (dbg->bp[0]) {
+		mtspr(SPRN_IAC1, dbg->bp[0]);
+		dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
+	}
+	if (dbg->bp[1]) {
+		mtspr(SPRN_IAC2, dbg->bp[1]);
+		dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
+	}
+	if (dbg->bp[2]) {
+		mtspr(SPRN_IAC3, dbg->bp[2]);
+		dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
+	}
+	if (dbg->bp[3]) {
+		mtspr(SPRN_IAC4, dbg->bp[3]);
+		dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
+	}
+
+	mtspr(SPRN_DBCR0, dbcr0);
+	mtspr(SPRN_DBCR1, 0);
+	mtspr(SPRN_DBCR2, 0);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	if (vcpu->guest_debug.enabled)
+		kvmppc_load_guest_debug_registers(vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->guest_debug.enabled)
+		kvmppc_restore_host_debug_state(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
                                     struct kvm_debug_guest *dbg)
 {
-	return -ENOTSUPP;
+	int i;
+
+	vcpu->guest_debug.enabled = dbg->enabled;
+	if (vcpu->guest_debug.enabled) {
+		for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
+			if (dbg->breakpoints[i].enabled)
+				vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+			else
+				vcpu->guest_debug.bp[i] = 0;
+		}
+	}
+
+	return 0;
 }
 
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From 20754c2495a791b5b429c0da63394c86ade978e7 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:51 -0500
Subject: KVM: ppc: Stop saving host TLB state

We're saving the host TLB state to memory on every exit, but never using it.
Originally I had thought that we'd want to restore host TLB for heavyweight
exits, but that could actually hurt when context switching to an unrelated host
process (i.e. not qemu).

Since this decreases the performance penalty of all exits, this patch improves
guest boot time by about 15%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/include/asm/kvm_host.h |  2 --
 arch/powerpc/kernel/asm-offsets.c   |  1 -
 arch/powerpc/kvm/booke_interrupts.S | 17 +++--------------
 3 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 23bad40b0ea..dc3a7562bae 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -81,8 +81,6 @@ struct kvm_vcpu_arch {
 	struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
-	/* Copy of the host's TLB. */
-	struct tlbe host_tlb[PPC44x_TLB_SIZE];
 
 	u32 host_stack;
 	u32 host_pid;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 92768d3006f..59406495395 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -356,7 +356,6 @@ int main(void)
 
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-	DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
 	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8eaba2613ff..3e88dfa1dbe 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -342,26 +342,15 @@ lightweight_exit:
 	andc	r6, r5, r6
 	mtmsr	r6
 
-	/* Save the host's non-pinned TLB mappings, and load the guest mappings
-	 * over them. Leave the host's "pinned" kernel mappings in place. */
-	/* XXX optimization: use generation count to avoid swapping unmodified
-	 * entries. */
+	/* Load the guest mappings, leaving the host's "pinned" kernel mappings
+	 * in place. */
+	/* XXX optimization: load only modified guest entries. */
 	mfspr	r10, SPRN_MMUCR			/* Save host MMUCR. */
 	lis	r8, tlb_44x_hwater@ha
 	lwz	r8, tlb_44x_hwater@l(r8)
-	addi	r3, r4, VCPU_HOST_TLB - 4
 	addi	r9, r4, VCPU_SHADOW_TLB - 4
 	li	r6, 0
 1:
-	/* Save host entry. */
-	tlbre	r7, r6, PPC44x_TLB_PAGEID
-	mfspr	r5, SPRN_MMUCR
-	stwu	r5, 4(r3)
-	stwu	r7, 4(r3)
-	tlbre	r7, r6, PPC44x_TLB_XLAT
-	stwu	r7, 4(r3)
-	tlbre	r7, r6, PPC44x_TLB_ATTRIB
-	stwu	r7, 4(r3)
 	/* Load guest entry. */
 	lwzu	r7, 4(r9)
 	mtspr	SPRN_MMUCR, r7
-- 
cgit v1.2.3


From 83aae4a8098eb8a40a2e9dab3714354182143b4f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:52 -0500
Subject: KVM: ppc: Write only modified shadow entries into the TLB on exit

Track which TLB entries need to be written, instead of overwriting everything
below the high water mark. Typically only a single guest TLB entry will be
modified in a single exit.

Guest boot time performance improvement: about 15%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/include/asm/kvm_host.h |  3 +++
 arch/powerpc/include/asm/kvm_ppc.h  |  3 +++
 arch/powerpc/kernel/asm-offsets.c   |  1 +
 arch/powerpc/kvm/44x_tlb.c          |  9 ++++++-
 arch/powerpc/kvm/booke_interrupts.S | 51 ++++++++++++++++++++++++-------------
 arch/powerpc/kvm/powerpc.c          | 15 +++++++++++
 6 files changed, 64 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index dc3a7562bae..4338b03da8f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -82,6 +82,9 @@ struct kvm_vcpu_arch {
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
 
+	/* Track which TLB entries we've modified in the current exit. */
+	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+
 	u32 host_stack;
 	u32 host_pid;
 	u32 host_dbcr0;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a8b06879226..8e7e4295990 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -65,6 +65,9 @@ extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 
+/* XXX Book E specific */
+extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
+
 extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
 
 static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 59406495395..1631d670b9e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -357,6 +357,7 @@ int main(void)
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
+	DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index a207d16b9db..06a5fcfc4d3 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -125,6 +125,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
 	}
 }
 
+void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
+{
+    vcpu->arch.shadow_tlb_mod[i] = 1;
+}
+
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
  * the shadow TLB. */
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
@@ -172,10 +177,10 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	 * use host large pages in the future. */
 	stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
 	               | PPC44x_TLB_4K;
-
 	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
+	kvmppc_tlbe_set_modified(vcpu, victim);
 
 	KVMTRACE_5D(STLB_WRITE, vcpu, victim,
 			stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
@@ -209,6 +214,7 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		kvmppc_tlbe_set_modified(vcpu, i);
 		KVMTRACE_5D(STLB_INVAL, vcpu, i,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
@@ -229,6 +235,7 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 
 		kvmppc_44x_shadow_release(vcpu, i);
 		stlbe->word0 = 0;
+		kvmppc_tlbe_set_modified(vcpu, i);
 		KVMTRACE_5D(STLB_INVAL, vcpu, i,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 3e88dfa1dbe..564ea32ecba 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -335,7 +335,7 @@ lightweight_exit:
 	lwz	r3, VCPU_PID(r4)
 	mtspr	SPRN_PID, r3
 
-	/* Prevent all TLB updates. */
+	/* Prevent all asynchronous TLB updates. */
 	mfmsr	r5
 	lis	r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
 	ori	r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
@@ -344,28 +344,45 @@ lightweight_exit:
 
 	/* Load the guest mappings, leaving the host's "pinned" kernel mappings
 	 * in place. */
-	/* XXX optimization: load only modified guest entries. */
 	mfspr	r10, SPRN_MMUCR			/* Save host MMUCR. */
-	lis	r8, tlb_44x_hwater@ha
-	lwz	r8, tlb_44x_hwater@l(r8)
-	addi	r9, r4, VCPU_SHADOW_TLB - 4
-	li	r6, 0
+	li	r5, PPC44x_TLB_SIZE
+	lis	r5, tlb_44x_hwater@ha
+	lwz	r5, tlb_44x_hwater@l(r5)
+	mtctr	r5
+	addi	r9, r4, VCPU_SHADOW_TLB
+	addi	r5, r4, VCPU_SHADOW_MOD
+	li	r3, 0
 1:
+	lbzx	r7, r3, r5
+	cmpwi	r7, 0
+	beq	3f
+
 	/* Load guest entry. */
-	lwzu	r7, 4(r9)
+	mulli	r11, r3, TLBE_BYTES
+	add	r11, r11, r9
+	lwz	r7, 0(r11)
 	mtspr	SPRN_MMUCR, r7
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_PAGEID
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_XLAT
-	lwzu	r7, 4(r9)
-	tlbwe	r7, r6, PPC44x_TLB_ATTRIB
-	/* Increment index. */
-	addi	r6, r6, 1
-	cmpw	r6, r8
-	blt	1b
+	lwz	r7, 4(r11)
+	tlbwe	r7, r3, PPC44x_TLB_PAGEID
+	lwz	r7, 8(r11)
+	tlbwe	r7, r3, PPC44x_TLB_XLAT
+	lwz	r7, 12(r11)
+	tlbwe	r7, r3, PPC44x_TLB_ATTRIB
+3:
+	addi	r3, r3, 1                       /* Increment index. */
+	bdnz	1b
+
 	mtspr	SPRN_MMUCR, r10			/* Restore host MMUCR. */
 
+	/* Clear bitmap of modified TLB entries */
+	li	r5, PPC44x_TLB_SIZE>>2
+	mtctr	r5
+	addi	r5, r4, VCPU_SHADOW_MOD - 4
+	li	r6, 0
+1:
+	stwu	r6, 4(r5)
+	bdnz	1b
+
 	iccci	0, 0 /* XXX hack */
 
 	/* Load some guest volatiles. */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b75607180dd..90a6fc422b2 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,7 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include <asm/tlbflush.h>
 
 
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
@@ -307,14 +308,28 @@ static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	int i;
+
 	if (vcpu->guest_debug.enabled)
 		kvmppc_load_guest_debug_registers(vcpu);
+
+	/* Mark every guest entry in the shadow TLB entry modified, so that they
+	 * will all be reloaded on the next vcpu run (instead of being
+	 * demand-faulted). */
+	for (i = 0; i <= tlb_44x_hwater; i++)
+		kvmppc_tlbe_set_modified(vcpu, i);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_debug.enabled)
 		kvmppc_restore_host_debug_state(vcpu);
+
+	/* Don't leave guest TLB entries resident when being de-scheduled. */
+	/* XXX It would be nice to differentiate between heavyweight exit and
+	 * sched_out here, since we could avoid the TLB flush for heavyweight
+	 * exits. */
+	_tlbia();
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From 49dd2c492895828a90ecdf889e7fe9cfb40a82a7 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 25 Jul 2008 13:54:53 -0500
Subject: KVM: powerpc: Map guest userspace with TID=0 mappings

When we use TID=N userspace mappings, we must ensure that kernel mappings have
been destroyed when entering userspace. Using TID=1/TID=0 for kernel/user
mappings and running userspace with PID=0 means that userspace can't access the
kernel mappings, but the kernel can directly access userspace.

The net is that we don't need to flush the TLB on privilege switches, but we do
on guest context switches (which are far more infrequent). Guest boot time
performance improvement: about 30%.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/include/asm/kvm_host.h |  4 ++++
 arch/powerpc/include/asm/kvm_ppc.h  |  9 +++++++++
 arch/powerpc/kernel/asm-offsets.c   |  2 +-
 arch/powerpc/kvm/44x_tlb.c          | 39 ++++++++++++++++++++++---------------
 arch/powerpc/kvm/booke_guest.c      |  2 ++
 arch/powerpc/kvm/booke_interrupts.S |  2 +-
 arch/powerpc/kvm/emulate.c          |  2 +-
 7 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 4338b03da8f..34b52b7180c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -129,7 +129,11 @@ struct kvm_vcpu_arch {
 	u32 ivor[16];
 	u32 ivpr;
 	u32 pir;
+
+	u32 shadow_pid;
 	u32 pid;
+	u32 swap_pid;
+
 	u32 pvr;
 	u32 ccr0;
 	u32 ccr1;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 8e7e4295990..8931ba729d2 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -64,6 +64,7 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
 extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
+extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 /* XXX Book E specific */
 extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
@@ -95,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 		kvm_vcpu_block(vcpu);
 }
 
+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+	if (vcpu->arch.pid != new_pid) {
+		vcpu->arch.pid = new_pid;
+		vcpu->arch.swap_pid = 1;
+	}
+}
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1631d670b9e..52649da344f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -369,7 +369,7 @@ int main(void)
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
 	DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
-	DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
+	DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 06a5fcfc4d3..3594bbd1f61 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -170,7 +170,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 
 	/* XXX what about AS? */
 
-	stlbe->tid = asid & 0xff;
+	stlbe->tid = !(asid & 0xff);
 
 	/* Force TS=1 for all guest mappings. */
 	/* For now we hardcode 4KB mappings, but it will be important to
@@ -190,7 +190,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                            gva_t eend, u32 asid)
 {
-	unsigned int pid = asid & 0xff;
+	unsigned int pid = !(asid & 0xff);
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
@@ -222,23 +222,30 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	up_write(&current->mm->mmap_sem);
 }
 
-/* Invalidate all mappings, so that when they fault back in they will get the
- * proper permission bits. */
+/* Invalidate all mappings on the privilege switch after PID has been changed.
+ * The guest always runs with PID=1, so we must clear the entire TLB when
+ * switching address spaces. */
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 {
 	int i;
 
-	/* XXX Replace loop with fancy data structures. */
-	down_write(&current->mm->mmap_sem);
-	for (i = 0; i <= tlb_44x_hwater; i++) {
-		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
-
-		kvmppc_44x_shadow_release(vcpu, i);
-		stlbe->word0 = 0;
-		kvmppc_tlbe_set_modified(vcpu, i);
-		KVMTRACE_5D(STLB_INVAL, vcpu, i,
-				stlbe->tid, stlbe->word0, stlbe->word1,
-				stlbe->word2, handler);
+	if (vcpu->arch.swap_pid) {
+		/* XXX Replace loop with fancy data structures. */
+		down_write(&current->mm->mmap_sem);
+		for (i = 0; i <= tlb_44x_hwater; i++) {
+			struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+
+			/* Future optimization: clear only userspace mappings. */
+			kvmppc_44x_shadow_release(vcpu, i);
+			stlbe->word0 = 0;
+			kvmppc_tlbe_set_modified(vcpu, i);
+			KVMTRACE_5D(STLB_INVAL, vcpu, i,
+			            stlbe->tid, stlbe->word0, stlbe->word1,
+			            stlbe->word2, handler);
+		}
+		up_write(&current->mm->mmap_sem);
+		vcpu->arch.swap_pid = 0;
 	}
-	up_write(&current->mm->mmap_sem);
+
+	vcpu->arch.shadow_pid = !usermode;
 }
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 3cca079975e..7b2591e26ba 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -486,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.msr = 0;
 	vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
 
+	vcpu->arch.shadow_pid = 1;
+
 	/* Eye-catching number so we know if the guest takes an interrupt
 	 * before it's programmed its own IVPR. */
 	vcpu->arch.ivpr = 0x55550000;
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 564ea32ecba..95e165baf85 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -332,7 +332,7 @@ lightweight_exit:
 
 	mfspr	r3, SPRN_PID
 	stw	r3, VCPU_HOST_PID(r4)
-	lwz	r3, VCPU_PID(r4)
+	lwz	r3, VCPU_SHADOW_PID(r4)
 	mtspr	SPRN_PID, r3
 
 	/* Prevent all asynchronous TLB updates. */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index c3ed63b2221..0fce4fbdc20 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -508,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			case SPRN_MMUCR:
 				vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
 			case SPRN_PID:
-				vcpu->arch.pid = vcpu->arch.gpr[rs]; break;
+				kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
 			case SPRN_CCR0:
 				vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
 			case SPRN_CCR1:
-- 
cgit v1.2.3


From 564f15378f04921d5749f27ec53d5e68a6d1d446 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sat, 26 Jul 2008 17:00:59 -0300
Subject: KVM: Add irq ack notifier list

This can be used by kvm subsystems that are interested in when
interrupts are acked, for example time drift compensation.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/irq.c | 22 ++++++++++++++++++++++
 arch/x86/kvm/irq.h |  5 +++++
 2 files changed, 27 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 0d9e55275af..90911958d85 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -111,3 +111,25 @@ void kvm_set_irq(struct kvm *kvm, int irq, int level)
 	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
 	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
 }
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+	struct kvm_irq_ack_notifier *kian;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
+		if (kian->gsi == gsi)
+			kian->irq_acked(kian);
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian)
+{
+	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian)
+{
+	hlist_del(&kian->link);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 07ff2aef0c1..95fe718e3ab 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -83,6 +83,11 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+				   struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+				     struct kvm_irq_ack_notifier *kian);
 
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From f52447261bc8c21dfd4635196e32d2da1352f589 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sat, 26 Jul 2008 17:01:00 -0300
Subject: KVM: irq ack notification

Based on a patch from: Ben-Ami Yassour <benami@il.ibm.com>
which was based on a patch from: Amit Shah <amit.shah@qumranet.com>

Notify IRQ acking on PIC/APIC emulation. The previous patch missed two things:

- Edge triggered interrupts on IOAPIC
- PIC reset with IRR/ISR set should be equivalent to ack (LAPIC probably
needs something similar).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: Amit Shah <amit.shah@qumranet.com>
CC: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8259.c | 12 +++++++++++-
 arch/x86/kvm/irq.c   |  2 +-
 arch/x86/kvm/irq.h   |  3 ++-
 arch/x86/kvm/lapic.c |  7 +++++--
 4 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 55e179ad98e..de704995b81 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -159,9 +159,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 		s->irr &= ~(1 << irq);
 }
 
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
 {
 	int irq, irq2, intno;
+	struct kvm_pic *s = pic_irqchip(kvm);
 
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
@@ -187,12 +188,21 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
+	kvm_notify_acked_irq(kvm, irq);
 
 	return intno;
 }
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
+	int irq;
+	struct kvm *kvm = s->pics_state->irq_request_opaque;
+
+	for (irq = 0; irq < PIC_NUM_PINS; irq++) {
+		if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) ||
+		    s->isr & (1 << irq)))
+			kvm_notify_acked_irq(kvm, irq);
+	}
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 90911958d85..3c508afaa28 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);
 			s->output = 0;		/* PIC */
-			vector = kvm_pic_read_irq(s);
+			vector = kvm_pic_read_irq(v->kvm);
 		}
 	}
 	return vector;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 95fe718e3ab..479a3d2d561 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,11 +63,12 @@ struct kvm_pic {
 	void *irq_request_opaque;
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
+	void (*ack_notifier)(void *opaque, int irq);
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
 void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9fde0ac2426..be94f93a73f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -439,7 +439,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
-
+	int trigger_mode;
 	/*
 	 * Not every write EOI will has corresponding ISR,
 	 * one example is when Kernel check timer on setup_IO_APIC
@@ -451,7 +451,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
 	apic_update_ppr(apic);
 
 	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+		trigger_mode = IOAPIC_LEVEL_TRIG;
+	else
+		trigger_mode = IOAPIC_EDGE_TRIG;
+	kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
-- 
cgit v1.2.3


From 3cf57fed216e2c1b6fdfeccb792650bab72a350a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sat, 26 Jul 2008 17:01:01 -0300
Subject: KVM: PIT: fix injection logic and count

The PIT injection logic is problematic under the following cases:

1) If there is a higher priority vector to be delivered by the time
kvm_pit_timer_intr_post is invoked ps->inject_pending won't be set.
This opens the possibility for missing many PIT event injections (say if
guest executes hlt at this point).

2) ps->inject_pending is racy with more than two vcpus. Since there's no locking
around read/dec of pt->pending, two vcpu's can inject two interrupts for a single
pt->pending count.

Fix 1 by using an irq ack notifier: only reinject when the previous irq
has been acked. Fix 2 with appropriate locking around manipulation of
pending count and irq_ack by the injection / ack paths.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 70 ++++++++++++++++++++++++++--------------------------
 arch/x86/kvm/i8254.h |  7 +++---
 arch/x86/kvm/irq.c   |  1 -
 3 files changed, 38 insertions(+), 40 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a912..7d04dd3ef85 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -207,6 +207,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
+	if (pt->period)
+		ps->channels[0].count_load_time = pt->timer.expires;
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -215,12 +217,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+	if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
 		return atomic_read(&pit->pit_state.pit_timer.pending);
-
 	return 0;
 }
 
+void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+						 irq_ack_notifier);
+	spin_lock(&ps->inject_lock);
+	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+		WARN_ON(1);
+	ps->irq_ack = 1;
+	spin_unlock(&ps->inject_lock);
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps;
@@ -255,8 +267,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
 	hrtimer_cancel(&pt->timer);
 }
 
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
+	struct kvm_kpit_timer *pt = &ps->pit_timer;
 	s64 interval;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +281,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
 	pt->period = (is_period == 0) ? 0 : interval;
 	pt->timer.function = pit_timer_fn;
 	atomic_set(&pt->pending, 0);
+	ps->irq_ack = 1;
 
 	hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
 		      HRTIMER_MODE_ABS);
@@ -302,11 +316,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(&ps->pit_timer, val, 0);
+		create_pit_timer(ps, val, 0);
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(&ps->pit_timer, val, 1);
+		create_pit_timer(ps, val, 1);
 		break;
 	default:
 		destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +534,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	mutex_unlock(&pit->pit_state.lock);
 
 	atomic_set(&pit->pit_state.pit_timer.pending, 0);
-	pit->pit_state.inject_pending = 1;
+	pit->pit_state.irq_ack = 1;
 }
 
 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
+	spin_lock_init(&pit->pit_state.inject_lock);
 
 	/* Initialize PIO device */
 	pit->dev.read = pit_ioport_read;
@@ -555,6 +570,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 	pit_state->pit = pit;
 	hrtimer_init(&pit_state->pit_timer.timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	pit_state->irq_ack_notifier.gsi = 0;
+	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
 	mutex_unlock(&pit->pit_state.lock);
 
 	kvm_pit_reset(pit);
@@ -592,37 +610,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_kpit_state *ps;
 
 	if (vcpu && pit) {
+		int inject = 0;
 		ps = &pit->pit_state;
 
-		/* Try to inject pending interrupts when:
-		 * 1. Pending exists
-		 * 2. Last interrupt was accepted or waited for too long time*/
-		if (atomic_read(&ps->pit_timer.pending) &&
-		    (ps->inject_pending ||
-		    (jiffies - ps->last_injected_time
-				>= KVM_MAX_PIT_INTR_INTERVAL))) {
-			ps->inject_pending = 0;
-			__inject_pit_timer_intr(kvm);
-			ps->last_injected_time = jiffies;
-		}
-	}
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-	struct kvm_arch *arch = &vcpu->kvm->arch;
-	struct kvm_kpit_state *ps;
-
-	if (vcpu && arch->vpit) {
-		ps = &arch->vpit->pit_state;
-		if (atomic_read(&ps->pit_timer.pending) &&
-		(((arch->vpic->pics[0].imr & 1) == 0 &&
-		  arch->vpic->pics[0].irq_base == vec) ||
-		  (arch->vioapic->redirtbl[0].fields.vector == vec &&
-		  arch->vioapic->redirtbl[0].fields.mask != 1))) {
-			ps->inject_pending = 1;
-			atomic_dec(&ps->pit_timer.pending);
-			ps->channels[0].count_load_time = ktime_get();
+		/* Try to inject pending interrupts when
+		 * last one has been acked.
+		 */
+		spin_lock(&ps->inject_lock);
+		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+			ps->irq_ack = 0;
+			inject = 1;
 		}
+		spin_unlock(&ps->inject_lock);
+		if (inject)
+			__inject_pit_timer_intr(kvm);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c..e436d4983aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
 	int irq;
 	s64 period; /* unit: ns */
 	s64 scheduled;
-	ktime_t last_update;
 	atomic_t pending;
 };
 
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	bool inject_pending; /* if inject pending interrupts */
-	unsigned long last_injected_time;
+	spinlock_t inject_lock;
+	unsigned long irq_ack;
+	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
 struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK	    0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3c508afaa28..8c1b9c5def7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
 	kvm_apic_timer_intr_post(vcpu, vec);
-	kvm_pit_timer_intr_post(vcpu, vec);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
-- 
cgit v1.2.3


From 3807f345b2c610336c17c7624a0d496a38df75a0 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Mon, 28 Jul 2008 11:47:52 -0300
Subject: x86: paravirt: factor out cpu_khz to common code

KVM intends to use paravirt code to calibrate khz. Xen
current code will do just fine. So as a first step, factor out
code to pvclock.c.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kernel/pvclock.c | 12 ++++++++++++
 arch/x86/xen/time.c       | 11 ++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325..1c54b5fb7ae 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 	return dst->version;
 }
 
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+	u64 tsc_khz = 1000000ULL << 32;
+
+	do_div(tsc_khz, src->tsc_to_system_mul);
+	if (src->tsc_shift < 0)
+		tsc_khz <<= -src->tsc_shift;
+	else
+		tsc_khz >>= src->tsc_shift;
+	return tsc_khz;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 004ba86326a..c9f7cda48ed 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
 /* Get the TSC speed from Xen */
 unsigned long xen_tsc_khz(void)
 {
-	u64 xen_khz = 1000000ULL << 32;
-	const struct pvclock_vcpu_time_info *info =
+	struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
 
-	do_div(xen_khz, info->tsc_to_system_mul);
-	if (info->tsc_shift < 0)
-		xen_khz <<= -info->tsc_shift;
-	else
-		xen_khz >>= info->tsc_shift;
-
-	return xen_khz;
+	return pvclock_tsc_khz(info);
 }
 
 cycle_t xen_clocksource_read(void)
-- 
cgit v1.2.3


From 0293615f3fb9886b6b23800c121be293bb7483e9 Mon Sep 17 00:00:00 2001
From: Glauber Costa <gcosta@redhat.com>
Date: Mon, 28 Jul 2008 11:47:53 -0300
Subject: x86: KVM guest: use paravirt function to calculate cpu khz

We're currently facing timing problems in guests that do
calibration under heavy load, and then the load vanishes.
This means we'll have a much lower lpj than we actually should,
and delays end up taking less time than they should, which is a
nasty bug.

Solution is to pass on the lpj value from host to guest, and have it
preset.

Signed-off-by: Glauber Costa <gcosta@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kernel/kvmclock.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca9..774ac499156 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+	return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	unsigned long khz;
+	u64 lpj;
+
+	src = &per_cpu(hv_clock, 0);
+	khz = pvclock_tsc_khz(src);
+
+	lpj = ((u64)khz * 1000);
+	do_div(lpj, HZ);
+	preset_lpj = lpj;
+}
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
 		pv_time_ops.get_wallclock = kvm_get_wallclock;
 		pv_time_ops.set_wallclock = kvm_set_wallclock;
 		pv_time_ops.sched_clock = kvm_clock_read;
+		pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
 #ifdef CONFIG_X86_LOCAL_APIC
 		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
 #endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_KEXEC
 		machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
+		kvm_get_preset_lpj();
 		clocksource_register(&kvm_clock);
 	}
 }
-- 
cgit v1.2.3


From 4d5c5d0fe89c921336b95f5e7e4f529a9df92f53 Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Mon, 28 Jul 2008 19:26:26 +0300
Subject: KVM: pci device assignment

Based on a patch from: Amit Shah <amit.shah@qumranet.com>

This patch adds support for handling PCI devices that are assigned to
the guest.

The device to be assigned to the guest is registered in the host kernel
and interrupt delivery is handled.  If a device is already assigned, or
the device driver for it is still loaded on the host, the device
assignment is failed by conveying a -EBUSY reply to the userspace.

Devices that share their interrupt line are not supported at the moment.

By itself, this patch will not make devices work within the guest.
The VT-d extension is required to enable the device to perform DMA.
Another alternative is PVDMA.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 243 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94a216562f1..a97157cc42a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
  * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Amit Shah    <amit.shah@qumranet.com>
+ *   Ben-Ami Yassour <benami@il.ibm.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -23,8 +27,10 @@
 #include "x86.h"
 
 #include <linux/clocksource.h>
+#include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
+#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
@@ -98,6 +104,219 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+						      int assigned_dev_id)
+{
+	struct list_head *ptr;
+	struct kvm_assigned_dev_kernel *match;
+
+	list_for_each(ptr, head) {
+		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+		if (match->assigned_dev_id == assigned_dev_id)
+			return match;
+	}
+	return NULL;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+				    interrupt_work);
+
+	/* This is taken to safely inject irq inside the guest. When
+	 * the interrupt injection (or the ioapic code) uses a
+	 * finer-grained lock, update this
+	 */
+	mutex_lock(&assigned_dev->kvm->lock);
+	kvm_set_irq(assigned_dev->kvm,
+		    assigned_dev->guest_irq, 1);
+	mutex_unlock(&assigned_dev->kvm->lock);
+	kvm_put_kvm(assigned_dev->kvm);
+}
+
+/* FIXME: Implement the OR logic needed to make shared interrupts on
+ * this line behave properly
+ */
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+	struct kvm_assigned_dev_kernel *assigned_dev =
+		(struct kvm_assigned_dev_kernel *) dev_id;
+
+	kvm_get_kvm(assigned_dev->kvm);
+	schedule_work(&assigned_dev->interrupt_work);
+	disable_irq_nosync(irq);
+	return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+	struct kvm_assigned_dev_kernel *dev;
+
+	if (kian->gsi == -1)
+		return;
+
+	dev = container_of(kian, struct kvm_assigned_dev_kernel,
+			   ack_notifier);
+	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
+	enable_irq(dev->host_irq);
+}
+
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+				   struct kvm_assigned_irq
+				   *assigned_irq)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match) {
+		mutex_unlock(&kvm->lock);
+		return -EINVAL;
+	}
+
+	if (match->irq_requested) {
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
+
+	if (irqchip_in_kernel(kvm)) {
+		if (assigned_irq->host_irq)
+			match->host_irq = assigned_irq->host_irq;
+		else
+			match->host_irq = match->dev->irq;
+		match->guest_irq = assigned_irq->guest_irq;
+		match->ack_notifier.gsi = assigned_irq->guest_irq;
+		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+
+		/* Even though this is PCI, we don't want to use shared
+		 * interrupts. Sharing host devices with guest-assigned devices
+		 * on the same interrupt line is not a happy situation: there
+		 * are going to be long delays in accepting, acking, etc.
+		 */
+		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
+				"kvm_assigned_device", (void *)match)) {
+			printk(KERN_INFO "%s: couldn't allocate irq for pv "
+			       "device\n", __func__);
+			r = -EIO;
+			goto out;
+		}
+	}
+
+	match->irq_requested = true;
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+				      struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+	struct pci_dev *dev;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (match) {
+		/* device already assigned */
+		r = -EINVAL;
+		goto out;
+	}
+
+	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+	if (match == NULL) {
+		printk(KERN_INFO "%s: Couldn't allocate memory\n",
+		       __func__);
+		r = -ENOMEM;
+		goto out;
+	}
+	dev = pci_get_bus_and_slot(assigned_dev->busnr,
+				   assigned_dev->devfn);
+	if (!dev) {
+		printk(KERN_INFO "%s: host device not found\n", __func__);
+		r = -EINVAL;
+		goto out_free;
+	}
+	if (pci_enable_device(dev)) {
+		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+		r = -EBUSY;
+		goto out_put;
+	}
+	r = pci_request_regions(dev, "kvm_assigned_device");
+	if (r) {
+		printk(KERN_INFO "%s: Could not get access to device regions\n",
+		       __func__);
+		goto out_disable;
+	}
+	match->assigned_dev_id = assigned_dev->assigned_dev_id;
+	match->host_busnr = assigned_dev->busnr;
+	match->host_devfn = assigned_dev->devfn;
+	match->dev = dev;
+
+	match->kvm = kvm;
+
+	list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+out_disable:
+	pci_disable_device(dev);
+out_put:
+	pci_dev_put(dev);
+out_free:
+	kfree(match);
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static void kvm_free_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) {
+			free_irq(assigned_dev->host_irq,
+				 (void *)assigned_dev);
+
+			kvm_unregister_irq_ack_notifier(kvm,
+							&assigned_dev->
+							ack_notifier);
+		}
+
+		if (cancel_work_sync(&assigned_dev->interrupt_work))
+			/* We had pending work. That means we will have to take
+			 * care of kvm_put_kvm.
+			 */
+			kvm_put_kvm(kvm);
+
+		pci_release_regions(assigned_dev->dev);
+		pci_disable_device(assigned_dev->dev);
+		pci_dev_put(assigned_dev->dev);
+
+		list_del(&assigned_dev->list);
+		kfree(assigned_dev);
+	}
+}
 
 unsigned long segment_base(u16 selector)
 {
@@ -1766,6 +1985,28 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_ASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
 	case KVM_GET_PIT: {
 		struct kvm_pit_state ps;
 		r = -EFAULT;
@@ -3945,6 +4186,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	return kvm;
 }
@@ -3977,6 +4219,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_free_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
-- 
cgit v1.2.3


From f0d662759a2465babdba1160749c446648c9d159 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:45 -0700
Subject: KVM: Reduce kvm stack usage in kvm_arch_vm_ioctl()

On my machine with gcc 3.4, kvm uses ~2k of stack in a few
select functions.  This is mostly because gcc fails to
notice that the different case: statements could have their
stack usage combined.  It overflows very nicely if interrupts
happen during one of these large uses.

This patch uses two methods for reducing stack usage.
1. dynamically allocate large objects instead of putting
   on the stack.
2. Use a union{} member for all of the case variables. This
   tricks gcc into combining them all into a single stack
   allocation. (There's also a comment on this)

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 72 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a97157cc42a..87d434228fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1869,6 +1869,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r = -EINVAL;
+	/*
+	 * This union makes it completely explicit to gcc-3.x
+	 * that these two variables' stack usage should be
+	 * combined, not added together.
+	 */
+	union {
+		struct kvm_pit_state ps;
+		struct kvm_memory_alias alias;
+	} u;
 
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
@@ -1900,17 +1909,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
-	case KVM_SET_MEMORY_ALIAS: {
-		struct kvm_memory_alias alias;
-
+	case KVM_SET_MEMORY_ALIAS:
 		r = -EFAULT;
-		if (copy_from_user(&alias, argp, sizeof alias))
+		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
 			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
 		if (r)
 			goto out;
 		break;
-	}
 	case KVM_CREATE_IRQCHIP:
 		r = -ENOMEM;
 		kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1952,37 +1958,51 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
+		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
+		r = -ENOMEM;
+		if (!chip)
 			goto out;
+		r = -EFAULT;
+		if (copy_from_user(chip, argp, sizeof *chip))
+			goto get_irqchip_out;
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+			goto get_irqchip_out;
+		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
 		if (r)
-			goto out;
+			goto get_irqchip_out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &chip, sizeof chip))
-			goto out;
+		if (copy_to_user(argp, chip, sizeof *chip))
+			goto get_irqchip_out;
 		r = 0;
+	get_irqchip_out:
+		kfree(chip);
+		if (r)
+			goto out;
 		break;
 	}
 	case KVM_SET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip chip;
+		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-		r = -EFAULT;
-		if (copy_from_user(&chip, argp, sizeof chip))
+		r = -ENOMEM;
+		if (!chip)
 			goto out;
+		r = -EFAULT;
+		if (copy_from_user(chip, argp, sizeof *chip))
+			goto set_irqchip_out;
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
-			goto out;
-		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+			goto set_irqchip_out;
+		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 		if (r)
-			goto out;
+			goto set_irqchip_out;
 		r = 0;
+	set_irqchip_out:
+		kfree(chip);
+		if (r)
+			goto out;
 		break;
 	}
 	case KVM_ASSIGN_PCI_DEVICE: {
@@ -2008,31 +2028,29 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_PIT: {
-		struct kvm_pit_state ps;
 		r = -EFAULT;
-		if (copy_from_user(&ps, argp, sizeof ps))
+		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
 			goto out;
 		r = -ENXIO;
 		if (!kvm->arch.vpit)
 			goto out;
-		r = kvm_vm_ioctl_get_pit(kvm, &ps);
+		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &ps, sizeof ps))
+		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_PIT: {
-		struct kvm_pit_state ps;
 		r = -EFAULT;
-		if (copy_from_user(&ps, argp, sizeof ps))
+		if (copy_from_user(&u.ps, argp, sizeof u.ps))
 			goto out;
 		r = -ENXIO;
 		if (!kvm->arch.vpit)
 			goto out;
-		r = kvm_vm_ioctl_set_pit(kvm, &ps);
+		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
 		if (r)
 			goto out;
 		r = 0;
-- 
cgit v1.2.3


From b772ff362ec6b821c8a5227a3355e263f917bfad Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:47 -0700
Subject: KVM: Reduce stack usage in kvm_arch_vcpu_ioctl()

[sheng: fix KVM_GET_LAPIC using wrong size]

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d434228fe..f1b0223c408 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1542,28 +1542,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r;
+	struct kvm_lapic_state *lapic = NULL;
 
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
-		struct kvm_lapic_state lapic;
+		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
-		memset(&lapic, 0, sizeof lapic);
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+		r = -ENOMEM;
+		if (!lapic)
+			goto out;
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, &lapic, sizeof lapic))
+		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 		r = 0;
 		break;
 	}
 	case KVM_SET_LAPIC: {
-		struct kvm_lapic_state lapic;
-
+		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!lapic)
+			goto out;
 		r = -EFAULT;
-		if (copy_from_user(&lapic, argp, sizeof lapic))
+		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
 		if (r)
 			goto out;
 		r = 0;
@@ -1661,6 +1666,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
+	if (lapic)
+		kfree(lapic);
 	return r;
 }
 
-- 
cgit v1.2.3


From 6ad18fba05228fb1d47cdbc0339fe8b3fca1ca26 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Mon, 11 Aug 2008 10:01:49 -0700
Subject: KVM: Reduce stack usage in kvm_pv_mmu_op()

We're in a hot path.  We can't use kmalloc() because
it might impact performance.  So, we just stick the buffer that
we need into the kvm_vcpu_arch structure.  This is used very
often, so it is not really a waste.

We also have to move the buffer structure's definition to the
arch-specific x86 kvm header.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c3afbfe6b0c..171bcea1be2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,13 +135,6 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
-
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
@@ -2292,18 +2285,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret)
 {
 	int r;
-	struct kvm_pv_mmu_op_buffer buffer;
+	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
 
-	buffer.ptr = buffer.buf;
-	buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
-	buffer.processed = 0;
+	buffer->ptr = buffer->buf;
+	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+	buffer->processed = 0;
 
-	r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
 	if (r)
 		goto out;
 
-	while (buffer.len) {
-		r = kvm_pv_mmu_op_one(vcpu, &buffer);
+	while (buffer->len) {
+		r = kvm_pv_mmu_op_one(vcpu, buffer);
 		if (r < 0)
 			goto out;
 		if (r == 0)
@@ -2312,7 +2305,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 
 	r = 1;
 out:
-	*ret = buffer.processed;
+	*ret = buffer->processed;
 	return r;
 }
 
-- 
cgit v1.2.3


From 464d17c8b747deb77d1bf8c14cc4f28aab2a4952 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Wed, 13 Aug 2008 14:10:33 +0800
Subject: KVM: VMX: Clean up magic number 0x66 in init_rmode_tss

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ddb49e34697..229e2d06fa2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1732,7 +1732,8 @@ static int init_rmode_tss(struct kvm *kvm)
 	if (r < 0)
 		goto out;
 	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+	r = kvm_write_guest_page(kvm, fn++, &data,
+			TSS_IOPB_BASE_OFFSET, sizeof(u16));
 	if (r < 0)
 		goto out;
 	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
-- 
cgit v1.2.3


From 29415c37f043d1d54dcf356601d738ff6633b72b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 1 Aug 2008 20:09:13 -0300
Subject: KVM: set debug registers after "schedulable" section

The vcpu thread can be preempted after the guest_debug_pre() callback,
resulting in invalid debug registers on the new vcpu.

Move it inside the non-preemptable section.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1b0223c408..4a033757a19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3113,10 +3113,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	down_read(&vcpu->kvm->slots_lock);
 	vapic_enter(vcpu);
 
-preempted:
-	if (vcpu->guest_debug.enabled)
-		kvm_x86_ops->guest_debug_pre(vcpu);
-
 again:
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3170,6 +3166,9 @@ again:
 		goto out;
 	}
 
+	if (vcpu->guest_debug.enabled)
+		kvm_x86_ops->guest_debug_pre(vcpu);
+
 	vcpu->guest_mode = 1;
 	/*
 	 * Make sure that guest_mode assignment won't happen after
@@ -3244,7 +3243,7 @@ out:
 	if (r > 0) {
 		kvm_resched(vcpu);
 		down_read(&vcpu->kvm->slots_lock);
-		goto preempted;
+		goto again;
 	}
 
 	post_kvm_run_save(vcpu, kvm_run);
-- 
cgit v1.2.3


From ecfc79c700b02c5ad1ccae58718015caa84824be Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 14 Aug 2008 11:13:16 +0300
Subject: KVM: VMX: Use interrupt queue for !irqchip_in_kernel

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 229e2d06fa2..81db7d48ab8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2173,7 +2173,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
 	if (!vcpu->arch.irq_pending[word_index])
 		clear_bit(word_index, &vcpu->arch.irq_summary);
-	vmx_inject_irq(vcpu, irq);
+	kvm_queue_interrupt(vcpu, irq);
 }
 
 
@@ -2187,13 +2187,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
 	if (vcpu->arch.interrupt_window_open &&
-	    vcpu->arch.irq_summary &&
-	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
+	    vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
 		kvm_do_inject_irq(vcpu);
 
+	if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	if (!vcpu->arch.interrupt_window_open &&
 	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-- 
cgit v1.2.3


From 85428ac7c39ab5fff23b5d14ccb32941e9401285 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 14 Aug 2008 20:53:25 -0300
Subject: KVM: fix i8259 reset irq acking

The irq ack during pic reset has three problems:

- Ignores slave/master PIC, using gsi 0-8 for both.
- Generates an ACK even if the APIC is in control.
- Depends upon IMR being clear, which is broken if the irq was masked
at the time it was generated.

The last one causes the BIOS to hang after the first reboot of
Windows installation, since PIT interrupts stop.

[avi: fix check whether pic interrupts are seen by cpu]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8259.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index de704995b81..71e3eeeccae 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -195,13 +195,19 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq;
+	int irq, irqbase;
 	struct kvm *kvm = s->pics_state->irq_request_opaque;
+	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
 
-	for (irq = 0; irq < PIC_NUM_PINS; irq++) {
-		if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) ||
-		    s->isr & (1 << irq)))
-			kvm_notify_acked_irq(kvm, irq);
+	if (s == &s->pics_state->pics[0])
+		irqbase = 0;
+	else
+		irqbase = 8;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+			if (s->irr & (1 << irq) || s->isr & (1 << irq))
+				kvm_notify_acked_irq(kvm, irq+irqbase);
 	}
 	s->last_irr = 0;
 	s->irr = 0;
-- 
cgit v1.2.3


From dc7404cea34ef997dfe89ca94d16358e9d29c8d8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 17 Aug 2008 16:03:46 +0300
Subject: KVM: Handle spurious acks for PIT interrupts

Spurious acks can be generated, for example if the PIC is being reset.
Handle those acks gracefully rather than flooding the log with warnings.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 7d04dd3ef85..c842060c6c0 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -228,7 +228,7 @@ void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 						 irq_ack_notifier);
 	spin_lock(&ps->inject_lock);
 	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
-		WARN_ON(1);
+		atomic_inc(&ps->pit_timer.pending);
 	ps->irq_ack = 1;
 	spin_unlock(&ps->inject_lock);
 }
-- 
cgit v1.2.3


From 6762b7299aa115e11815decd1fd982d015f09615 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Wed, 13 Aug 2008 16:22:37 +0300
Subject: KVM: Device assignment: Check for privileges before assigning irq

Even though we don't share irqs at the moment, we should ensure
regular user processes don't try to allocate system resources.

We check for capability to access IO devices (CAP_SYS_RAWIO) before
we request_irq on behalf of the guest.

Noticed by Avi.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4a033757a19..fffdf4f69c5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -191,6 +191,11 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		  kvm_assigned_dev_interrupt_work_handler);
 
 	if (irqchip_in_kernel(kvm)) {
+		if (!capable(CAP_SYS_RAWIO)) {
+			return -EPERM;
+			goto out;
+		}
+
 		if (assigned_irq->host_irq)
 			match->host_irq = assigned_irq->host_irq;
 		else
-- 
cgit v1.2.3


From 648dfaa7df2d3692db4e63dcb18dccb275d9c5a7 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:38:32 +0300
Subject: KVM: VMX: Add Guest State Validity Checks

This patch adds functions to check whether guest state is VMX compliant.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81db7d48ab8..e889b768c75 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1721,6 +1721,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
 	vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_segment var;
+	u32 ar;
+
+	vmx_get_segment(vcpu, &var, seg);
+	ar = vmx_segment_access_rights(&var);
+
+	if (var.base != (var.selector << 4))
+		return false;
+	if (var.limit != 0xffff)
+		return false;
+	if (ar != 0xf3)
+		return false;
+
+	return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	unsigned int cs_rpl;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+	if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+		return false;
+	if (!cs.s)
+		return false;
+	if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+		if (cs.dpl > cs_rpl)
+			return false;
+	} else if (cs.type & AR_TYPE_CODE_MASK) {
+		if (cs.dpl != cs_rpl)
+			return false;
+	}
+	if (!cs.present)
+		return false;
+
+	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+	return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment ss;
+	unsigned int ss_rpl;
+
+	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+	ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+	if ((ss.type != 3) || (ss.type != 7))
+		return false;
+	if (!ss.s)
+		return false;
+	if (ss.dpl != ss_rpl) /* DPL != RPL */
+		return false;
+	if (!ss.present)
+		return false;
+
+	return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+	struct kvm_segment var;
+	unsigned int rpl;
+
+	vmx_get_segment(vcpu, &var, seg);
+	rpl = var.selector & SELECTOR_RPL_MASK;
+
+	if (!var.s)
+		return false;
+	if (!var.present)
+		return false;
+	if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+		if (var.dpl < rpl) /* DPL < RPL */
+			return false;
+	}
+
+	/* TODO: Add other members to kvm_segment_field to allow checking for other access
+	 * rights flags
+	 */
+	return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment tr;
+
+	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+	if (tr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+		return false;
+	if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
+		return false;
+	if (!tr.present)
+		return false;
+
+	return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment ldtr;
+
+	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+	if (ldtr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+		return false;
+	if (ldtr.type != 2)
+		return false;
+	if (!ldtr.present)
+		return false;
+
+	return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs, ss;
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+	return ((cs.selector & SELECTOR_RPL_MASK) ==
+		 (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+	/* real mode guest state checks */
+	if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+			return false;
+		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+			return false;
+	} else {
+	/* protected mode guest state checks */
+		if (!cs_ss_rpl_check(vcpu))
+			return false;
+		if (!code_segment_valid(vcpu))
+			return false;
+		if (!stack_segment_valid(vcpu))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+			return false;
+		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+			return false;
+		if (!tr_valid(vcpu))
+			return false;
+		if (!ldtr_valid(vcpu))
+			return false;
+	}
+	/* TODO:
+	 * - Add checks on RIP
+	 * - Add checks on RFLAGS
+	 */
+
+	return true;
+}
+
 static int init_rmode_tss(struct kvm *kvm)
 {
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
-- 
cgit v1.2.3


From 04fa4d32117b1a7773290fd59a97cf90cfc2a0d4 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:39:48 +0300
Subject: KVM: VMX: Add module parameter and emulation flag.

The patch adds the module parameter required to enable emulating invalid
guest state, as well as the emulation_required flag used to drive
emulation whenever needed.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e889b768c75..7c5f611e1a9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
 static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
+static int emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, 0);
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -86,6 +89,7 @@ struct vcpu_vmx {
 		} irq;
 	} rmode;
 	int vpid;
+	bool emulation_required;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From ea953ef0ca84e778187905177e2a789a1974837b Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:47:05 +0300
Subject: KVM: VMX: Add invalid guest state handler

This adds the invalid guest state handler function which invokes the x86
emulator until getting the guest to a VMX-friendly state.

[avi: leave atomic context if scheduling]
[guillaume: return to atomic context correctly]

Signed-off-by: Laurent Vivier <laurent.vivier@bull.net>
Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7c5f611e1a9..eae1f2c64f9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2892,6 +2892,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
+				struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int err;
+
+	preempt_enable();
+	local_irq_enable();
+
+	while (!guest_state_valid(vcpu)) {
+		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+		switch (err) {
+			case EMULATE_DONE:
+				break;
+			case EMULATE_DO_MMIO:
+				kvm_report_emulation_failure(vcpu, "mmio");
+				/* TODO: Handle MMIO */
+				return;
+			default:
+				kvm_report_emulation_failure(vcpu, "emulation failure");
+				return;
+		}
+
+		if (signal_pending(current))
+			break;
+		if (need_resched())
+			schedule();
+	}
+
+	local_irq_disable();
+	preempt_disable();
+
+	/* Guest state should be valid now, no more emulation should be needed */
+	vmx->emulation_required = 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
-- 
cgit v1.2.3


From a89a8fb93ba63d4ad39db97c90997c409c87ccb9 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sun, 17 Aug 2008 16:42:16 +0300
Subject: KVM: VMX: Modify mode switching and vmentry functions

This patch modifies mode switching and vmentry function in order to
drive invalid guest state emulation.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eae1f2c64f9..9840f37925a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1298,7 +1298,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 static void enter_pmode(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 0;
 
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1315,6 +1317,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
 	update_exception_bitmap(vcpu);
 
+	if (emulate_invalid_guest_state)
+		return;
+
 	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
 	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
 	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1355,7 +1360,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
 static void enter_rmode(struct kvm_vcpu *vcpu)
 {
 	unsigned long flags;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	vmx->emulation_required = 1;
 	vcpu->arch.rmode.active = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1377,6 +1384,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
 	update_exception_bitmap(vcpu);
 
+	if (emulate_invalid_guest_state)
+		goto continue_rmode;
+
 	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
 	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
 	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1392,6 +1402,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
 	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
+continue_rmode:
 	kvm_mmu_reset_context(vcpu);
 	init_rmode(vcpu->kvm);
 }
@@ -2317,6 +2328,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	ret = 0;
 
+	/* HACK: Don't enable emulation on guest boot/reset */
+	vmx->emulation_required = 0;
+
 out:
 	up_read(&vcpu->kvm->slots_lock);
 	return ret;
@@ -3190,6 +3204,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	/* Handle invalid guest state instead of entering VMX */
+	if (vmx->emulation_required && emulate_invalid_guest_state) {
+		handle_invalid_guest_state(vcpu, kvm_run);
+		return;
+	}
+
 	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
 		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
-- 
cgit v1.2.3


From 94c935a1ee7a9c134f0ebed656384186619d05cb Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Mon, 18 Aug 2008 13:11:46 +0300
Subject: KVM: SVM: Fix typo

Fix typo in as-yet unused macro definition.

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 179c2e00d0f..be86c096385 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,7 +44,7 @@ MODULE_LICENSE("GPL");
 
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
+#define SVM_FEATURE_SVML (1 << 2)
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
-- 
cgit v1.2.3


From 29c8fa32c5d1e2d26d53ad9467b3a13130014cdf Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@qumranet.com>
Date: Mon, 18 Aug 2008 15:07:05 +0300
Subject: KVM: Use kvm_set_irq to inject interrupts

... instead of using the pic and ioapic variants

Signed-off-by: Amit Shah <amit.shah@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 6 ++----
 arch/x86/kvm/x86.c   | 8 +-------
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c842060c6c0..fdaa0f00e47 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -596,10 +596,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
 	mutex_lock(&kvm->lock);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-	kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-	kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+	kvm_set_irq(kvm, 0, 1);
+	kvm_set_irq(kvm, 0, 0);
 	mutex_unlock(&kvm->lock);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fffdf4f69c5..5b3c8821b19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1956,13 +1956,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		if (irqchip_in_kernel(kvm)) {
 			mutex_lock(&kvm->lock);
-			if (irq_event.irq < 16)
-				kvm_pic_set_irq(pic_irqchip(kvm),
-					irq_event.irq,
-					irq_event.level);
-			kvm_ioapic_set_irq(kvm->arch.vioapic,
-					irq_event.irq,
-					irq_event.level);
+			kvm_set_irq(kvm, irq_event.irq, irq_event.level);
 			mutex_unlock(&kvm->lock);
 			r = 0;
 		}
-- 
cgit v1.2.3


From ee032c993edd34e0bdf64dab06a55d0e08a4eeb9 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 11 Aug 2008 16:54:20 -0700
Subject: KVM: make irq ack notifier functions static

sparse says:

arch/x86/kvm/x86.c:107:32: warning: symbol 'kvm_find_assigned_dev' was not declared. Should it be static?
arch/x86/kvm/i8254.c:225:6: warning: symbol 'kvm_pit_ack_irq' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c | 2 +-
 arch/x86/kvm/x86.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index fdaa0f00e47..4cb443026ec 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -222,7 +222,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b3c8821b19..22edd95712e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -104,7 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
 {
 	struct list_head *ptr;
-- 
cgit v1.2.3


From 26815a648e1ec2b338a63a2bc301dcf449b93e5a Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Tue, 19 Aug 2008 20:48:03 +0800
Subject: KVM: ia64: add a dummy irq ack notification

Before enabling notify_acked_irq for ia64, leave the related APIs as
nop-op first.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/irq.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 arch/ia64/kvm/irq.h

(limited to 'arch')

diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
new file mode 100644
index 00000000000..f2e6545debf
--- /dev/null
+++ b/arch/ia64/kvm/irq.h
@@ -0,0 +1,32 @@
+/*
+ * irq.h: In-kernel interrupt controller related definitions
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Authors:
+ *   Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+struct kvm;
+
+static inline void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+}
+
+#endif
-- 
cgit v1.2.3


From 5706be0dafd6f42852f85fbae292301dcad4ccec Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:07:31 +0300
Subject: KVM: VMX: Change cs reset state to be a data segment

Real mode cs is a data segment, not a code segment.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9840f37925a..6aa305ace79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2239,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	fx_init(&vmx->vcpu);
 
+	seg_setup(VCPU_SREG_CS);
 	/*
 	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
 	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
@@ -2250,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
 		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
 	}
-	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 
 	seg_setup(VCPU_SREG_DS);
 	seg_setup(VCPU_SREG_ES);
-- 
cgit v1.2.3


From a16b20da879430fdf245ed45461ed40ffef8db3c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:48:27 +0300
Subject: KVM: VMX: Change segment dpl at reset to 3

This is more emulation friendly, if not 100% correct.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6aa305ace79..71e57ae1cab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1991,7 +1991,7 @@ static void seg_setup(int seg)
 	vmcs_write16(sf->selector, 0);
 	vmcs_writel(sf->base, 0);
 	vmcs_write32(sf->limit, 0xffff);
-	vmcs_write32(sf->ar_bytes, 0x93);
+	vmcs_write32(sf->ar_bytes, 0xf3);
 }
 
 static int alloc_apic_access_page(struct kvm *kvm)
-- 
cgit v1.2.3


From f4bbd9aaaae23007e4d79536d35a30cbbb11d407 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 20 Aug 2008 15:51:42 +0300
Subject: KVM: Load real mode segments correctly

Real mode segments to not reference the GDT or LDT; they simply compute
base = selector * 16.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 22edd95712e..bfc7c332c5d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3588,11 +3588,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+	struct kvm_segment segvar = {
+		.base = selector << 4,
+		.limit = 0xffff,
+		.selector = selector,
+		.type = 3,
+		.present = 1,
+		.dpl = 3,
+		.db = 0,
+		.s = 1,
+		.l = 0,
+		.g = 0,
+		.avl = 0,
+		.unusable = 0,
+	};
+	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+	return 0;
+}
+
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 				int type_bits, int seg)
 {
 	struct kvm_segment kvm_seg;
 
+	if (!(vcpu->arch.cr0 & X86_CR0_PE))
+		return kvm_load_realmode_segment(vcpu, selector, seg);
 	if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
 		return 1;
 	kvm_seg.type |= type_bits;
-- 
cgit v1.2.3


From 41afa025878bc31c9c4e18415fba2435fe035376 Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Mon, 18 Aug 2008 21:25:01 -0400
Subject: KVM: x86 emulator: remove duplicate SrcImm

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d5da7f14d53..5d6c1444b61 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -269,7 +269,7 @@ static u16 group_table[] = {
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group3*8] =
-	DstMem | SrcImm | ModRM | SrcImm, 0,
+	DstMem | SrcImm | ModRM, 0,
 	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
-- 
cgit v1.2.3


From 6eb06cb2863a2ff5704b501f1699216180e790b5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 21 Aug 2008 17:41:39 +0300
Subject: KVM: x86 emulator: remove bad ByteOp specifier from NEG descriptor

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 5d6c1444b61..ae30435ad33 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -270,7 +270,7 @@ static u16 group_table[] = {
 	0, 0, 0, 0,
 	[Group3*8] =
 	DstMem | SrcImm | ModRM, 0,
-	DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
-- 
cgit v1.2.3


From 135f8c2b078533cc74e75f696e73d47304a61125 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 21 Aug 2008 17:49:56 +0300
Subject: KVM: MMU: Move SHADOW_PT_INDEX to mmu.c

It is not specific to the paging mode, so can be made global (and reusable).

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 2 ++
 arch/x86/kvm/paging_tmpl.h | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 171bcea1be2..51d4cd7ae4f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -135,6 +135,8 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f..ebb26a09d31 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -29,7 +29,6 @@
 	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
 	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT64_LEVEL_BITS
 	#ifdef CONFIG_X86_64
@@ -46,7 +45,6 @@
 	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
 	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 	#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 	#define PT_LEVEL_BITS PT32_LEVEL_BITS
 	#define PT_MAX_FULL_LEVELS 2
@@ -504,7 +502,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
 #undef PT_LEVEL_BITS
-- 
cgit v1.2.3


From 6e37d3dc3e358dbf907f8b96a51282966934124b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:14:17 +0300
Subject: KVM: MMU: Unify direct map 4K and large page paths

The two paths are equivalent except for one argument, which is already
available.  Merge the two codepaths.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51d4cd7ae4f..3ee856f6812 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1240,15 +1240,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		ASSERT(VALID_PAGE(table_addr));
 		table = __va(table_addr);
 
-		if (level == 1) {
+		if (level == 1 || (largepage && level == 2)) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
-			return pt_write;
-		}
-
-		if (largepage && level == 2) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
+				     0, write, 1, &pt_write, largepage,
+				     gfn, pfn, false);
 			return pt_write;
 		}
 
-- 
cgit v1.2.3


From 0be9e929e398d6da6406183a8732dbfd0937fafe Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Tue, 26 Aug 2008 08:58:53 +0800
Subject: KVM: ia64: Enable virtio driver for ia64 in Kconfig

kvm/ia64 uses the virtio drivers to optimize its I/O subsytem.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 7914e482850..8e99fed6b3f 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -46,4 +46,6 @@ config KVM_INTEL
 config KVM_TRACE
        bool
 
+source drivers/virtio/Kconfig
+
 endif # VIRTUALIZATION
-- 
cgit v1.2.3


From 6c41f428b72afe5a581b967590c12538db31d399 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Aug 2008 16:16:08 +0300
Subject: KVM: MMU: Infer shadow root level in direct_map()

In all cases the shadow root level is available in mmu.shadow_root_level,
so there is no need to pass it as a parameter.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ee856f6812..72f739aa862 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1227,11 +1227,11 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			   int largepage, gfn_t gfn, pfn_t pfn,
-			   int level)
+			int largepage, gfn_t gfn, pfn_t pfn)
 {
 	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
 	int pt_write = 0;
+	int level = vcpu->arch.mmu.shadow_root_level;
 
 	for (; ; level--) {
 		u32 index = PT64_INDEX(v, level);
@@ -1299,8 +1299,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1455,7 +1454,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+			 largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
-- 
cgit v1.2.3


From 3d000db5688c8beff6319fb9ff4b98dcac32f798 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:24:38 +0300
Subject: KVM: MMU: Add generic shadow walker

We currently walk the shadow page tables in two places: direct map (for
real mode and two dimensional paging) and paging mode shadow.  Since we
anticipate requiring a third walk (for invlpg), it makes sense to have
a generic facility for shadow walk.

This patch adds such a shadow walker, walks the page tables and calls a
method for every spte encountered.  The method can examine the spte,
modify it, or even instantiate it.  The walk can be aborted by returning
nonzero from the method.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 72f739aa862..8b95cf748b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -142,6 +142,11 @@ struct kvm_rmap_desc {
 	struct kvm_rmap_desc *more;
 };
 
+struct kvm_shadow_walk {
+	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+		     gva_t addr, u64 *spte, int level);
+};
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -935,6 +940,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static int walk_shadow(struct kvm_shadow_walk *walker,
+		       struct kvm_vcpu *vcpu, gva_t addr)
+{
+	hpa_t shadow_addr;
+	int level;
+	int r;
+	u64 *sptep;
+	unsigned index;
+
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr &= PT64_BASE_ADDR_MASK;
+		--level;
+	}
+
+	while (level >= PT_PAGE_TABLE_LEVEL) {
+		index = SHADOW_PT_INDEX(addr, level);
+		sptep = ((u64 *)__va(shadow_addr)) + index;
+		r = walker->entry(walker, vcpu, addr, sptep, level);
+		if (r)
+			return r;
+		shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+		--level;
+	}
+	return 0;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
-- 
cgit v1.2.3


From 140754bc80e1cdbf2d14cdb10d900da1f7718e7b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:28:04 +0300
Subject: KVM: MMU: Convert direct maps to use the generic shadow walker

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 93 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 38 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b95cf748b5..a1ca4ff9c11 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1260,49 +1260,66 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int largepage, gfn_t gfn, pfn_t pfn)
-{
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-	int pt_write = 0;
-	int level = vcpu->arch.mmu.shadow_root_level;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(v, level);
-		u64 *table;
+struct direct_shadow_walk {
+	struct kvm_shadow_walk walker;
+	pfn_t pfn;
+	int write;
+	int largepage;
+	int pt_write;
+};
 
-		ASSERT(VALID_PAGE(table_addr));
-		table = __va(table_addr);
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+			    struct kvm_vcpu *vcpu,
+			    gva_t addr, u64 *sptep, int level)
+{
+	struct direct_shadow_walk *walk =
+		container_of(_walk, struct direct_shadow_walk, walker);
+	struct kvm_mmu_page *sp;
+	gfn_t pseudo_gfn;
+	gfn_t gfn = addr >> PAGE_SHIFT;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+			     0, walk->write, 1, &walk->pt_write,
+			     walk->largepage, gfn, walk->pfn, false);
+		return 1;
+	}
 
-		if (level == 1 || (largepage && level == 2)) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, largepage,
-				     gfn, pfn, false);
-			return pt_write;
+	if (*sptep == shadow_trap_nonpresent_pte) {
+		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1,
+				      1, ACC_ALL, sptep);
+		if (!sp) {
+			pgprintk("nonpaging_map: ENOMEM\n");
+			kvm_release_pfn_clean(walk->pfn);
+			return -ENOMEM;
 		}
 
-		if (table[index] == shadow_trap_nonpresent_pte) {
-			struct kvm_mmu_page *new_table;
-			gfn_t pseudo_gfn;
-
-			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-						     v, level - 1,
-						     1, ACC_ALL, &table[index]);
-			if (!new_table) {
-				pgprintk("nonpaging_map: ENOMEM\n");
-				kvm_release_pfn_clean(pfn);
-				return -ENOMEM;
-			}
-
-			set_shadow_pte(&table[index],
-				       __pa(new_table->spt)
-				       | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				       | shadow_user_mask | shadow_x_mask);
-		}
-		table_addr = table[index] & PT64_BASE_ADDR_MASK;
+		set_shadow_pte(sptep,
+			       __pa(sp->spt)
+			       | PT_PRESENT_MASK | PT_WRITABLE_MASK
+			       | shadow_user_mask | shadow_x_mask);
 	}
+	return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			int largepage, gfn_t gfn, pfn_t pfn)
+{
+	int r;
+	struct direct_shadow_walk walker = {
+		.walker = { .entry = direct_map_entry, },
+		.pfn = pfn,
+		.largepage = largepage,
+		.write = write,
+		.pt_write = 0,
+	};
+
+	r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT);
+	if (r < 0)
+		return r;
+	return walker.pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
-- 
cgit v1.2.3


From abb9e0b8e33e58ac8e04e03b680c46435bc312fd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Fri, 22 Aug 2008 19:11:39 +0300
Subject: KVM: MMU: Convert the paging mode shadow walk to use the generic
 walker

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/paging_tmpl.h | 158 ++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ebb26a09d31..b7064e1e1e1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,6 +25,7 @@
 #if PTTYPE == 64
 	#define pt_element_t u64
 	#define guest_walker guest_walker64
+	#define shadow_walker shadow_walker64
 	#define FNAME(name) paging##64_##name
 	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
@@ -41,6 +42,7 @@
 #elif PTTYPE == 32
 	#define pt_element_t u32
 	#define guest_walker guest_walker32
+	#define shadow_walker shadow_walker32
 	#define FNAME(name) paging##32_##name
 	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
 	#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
@@ -71,6 +73,17 @@ struct guest_walker {
 	u32 error_code;
 };
 
+struct shadow_walker {
+	struct kvm_shadow_walk walker;
+	struct guest_walker *guest_walker;
+	int user_fault;
+	int write_fault;
+	int largepage;
+	int *ptwrite;
+	pfn_t pfn;
+	u64 *sptep;
+};
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
 	return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -272,86 +285,86 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-			 struct guest_walker *walker,
-			 int user_fault, int write_fault, int largepage,
-			 int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+				    struct kvm_vcpu *vcpu, gva_t addr,
+				    u64 *sptep, int level)
 {
-	hpa_t shadow_addr;
-	int level;
-	u64 *shadow_ent;
-	unsigned access = walker->pt_access;
-
-	if (!is_present_pte(walker->ptes[walker->level - 1]))
-		return NULL;
-
-	shadow_addr = vcpu->arch.mmu.root_hpa;
-	level = vcpu->arch.mmu.shadow_root_level;
-	if (level == PT32E_ROOT_LEVEL) {
-		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-		shadow_addr &= PT64_BASE_ADDR_MASK;
-		--level;
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
+	struct guest_walker *gw = sw->guest_walker;
+	unsigned access = gw->pt_access;
+	struct kvm_mmu_page *shadow_page;
+	u64 spte;
+	int metaphysical;
+	gfn_t table_gfn;
+	int r;
+	pt_element_t curr_pte;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+			     sw->user_fault, sw->write_fault,
+			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+			     false);
+		sw->sptep = sptep;
+		return 1;
 	}
 
-	for (; ; level--) {
-		u32 index = SHADOW_PT_INDEX(addr, level);
-		struct kvm_mmu_page *shadow_page;
-		u64 shadow_pte;
-		int metaphysical;
-		gfn_t table_gfn;
-
-		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (level == PT_PAGE_TABLE_LEVEL)
-			break;
-
-		if (largepage && level == PT_DIRECTORY_LEVEL)
-			break;
-
-		if (is_shadow_present_pte(*shadow_ent)
-		    && !is_large_pte(*shadow_ent)) {
-			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-			continue;
-		}
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+		return 0;
 
-		if (is_large_pte(*shadow_ent))
-			rmap_remove(vcpu->kvm, shadow_ent);
-
-		if (level - 1 == PT_PAGE_TABLE_LEVEL
-		    && walker->level == PT_DIRECTORY_LEVEL) {
-			metaphysical = 1;
-			if (!is_dirty_pte(walker->ptes[level - 1]))
-				access &= ~ACC_WRITE_MASK;
-			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
-		} else {
-			metaphysical = 0;
-			table_gfn = walker->table_gfn[level - 2];
-		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       metaphysical, access,
-					       shadow_ent);
-		if (!metaphysical) {
-			int r;
-			pt_element_t curr_pte;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  walker->pte_gpa[level - 2],
-						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != walker->ptes[level - 2]) {
-				kvm_release_pfn_clean(pfn);
-				return NULL;
-			}
+	if (is_large_pte(*sptep))
+		rmap_remove(vcpu->kvm, sptep);
+
+	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+		metaphysical = 1;
+		if (!is_dirty_pte(gw->ptes[level - 1]))
+			access &= ~ACC_WRITE_MASK;
+		table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+	} else {
+		metaphysical = 0;
+		table_gfn = gw->table_gfn[level - 2];
+	}
+	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+				       metaphysical, access, sptep);
+	if (!metaphysical) {
+		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+					  &curr_pte, sizeof(curr_pte));
+		if (r || curr_pte != gw->ptes[level - 2]) {
+			kvm_release_pfn_clean(sw->pfn);
+			sw->sptep = NULL;
+			return 1;
 		}
-		shadow_addr = __pa(shadow_page->spt);
-		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		set_shadow_pte(shadow_ent, shadow_pte);
 	}
 
-	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-		     user_fault, write_fault,
-		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, largepage, walker->gfn, pfn, false);
+	spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	*sptep = spte;
+	return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+			 struct guest_walker *guest_walker,
+			 int user_fault, int write_fault, int largepage,
+			 int *ptwrite, pfn_t pfn)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_walk_entry), },
+		.guest_walker = guest_walker,
+		.user_fault = user_fault,
+		.write_fault = write_fault,
+		.largepage = largepage,
+		.ptwrite = ptwrite,
+		.pfn = pfn,
+	};
+
+	if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+		return NULL;
+
+	walk_shadow(&walker.walker, vcpu, addr);
 
-	return shadow_ent;
+	return walker.sptep;
 }
 
 /*
@@ -499,6 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 
 #undef pt_element_t
 #undef guest_walker
+#undef shadow_walker
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-- 
cgit v1.2.3


From acee3c04e8208c17aad1baff99baa68d71640a19 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Tue, 26 Aug 2008 17:22:47 +0300
Subject: KVM: Allocate guest memory as MAP_PRIVATE, not MAP_SHARED

There is no reason to share internal memory slots with fork()ed instances.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfc7c332c5d..675d010995a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4296,7 +4296,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 			userspace_addr = do_mmap(NULL, 0,
 						 npages * PAGE_SIZE,
 						 PROT_READ | PROT_WRITE,
-						 MAP_SHARED | MAP_ANONYMOUS,
+						 MAP_PRIVATE | MAP_ANONYMOUS,
 						 0);
 			up_write(&current->mm->mmap_sem);
 
-- 
cgit v1.2.3


From a5e2e82b8b62acd24a44b851e6bb4fd0793ead01 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 27 Aug 2008 05:02:56 +0300
Subject: KVM: x86 emulator: Add mov r, imm instructions (opcodes 0xb0-0xbf)

The emulator only supported one instance of mov r, imm instruction
(opcode 0xb8), this adds the rest of these instructions.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ae30435ad33..66e0bd6c628 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -154,9 +154,16 @@ static u16 opcode_table[256] = {
 	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 	ByteOp | ImplicitOps | String, ImplicitOps | String,
-	/* 0xB0 - 0xBF */
-	0, 0, 0, 0, 0, 0, 0, 0,
-	DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+	/* 0xB0 - 0xB7 */
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+	/* 0xB8 - 0xBF */
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+	DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
 	/* 0xC0 - 0xC7 */
 	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 	0, ImplicitOps | Stack, 0, 0,
@@ -1660,7 +1667,7 @@ special_insn:
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
-	case 0xb8: /* mov r, imm */
+	case 0xb0 ... 0xbf: /* mov r, imm */
 		goto mov;
 	case 0xc0 ... 0xc1:
 		emulate_grp2(ctxt);
-- 
cgit v1.2.3


From bc2d429979451d69d0985c5dbdf908cace2831cc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:30:56 +0300
Subject: KVM: MMU: Account for npt/ept/realmode page faults

Now that two-dimensional paging is becoming common, account for tdp page
faults.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a1ca4ff9c11..a24da8f2ee9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1283,6 +1283,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
 			     0, walk->write, 1, &walk->pt_write,
 			     walk->largepage, gfn, walk->pfn, false);
+		++vcpu->stat.pf_fixed;
 		return 1;
 	}
 
-- 
cgit v1.2.3


From 2245a28fe2e6fdb1bdabc4dcde1ea3a5c37e2a9e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:32:24 +0300
Subject: KVM: MMU: Add locking around kvm_mmu_slot_remove_write_access()

It was generally safe due to slots_lock being held for write, but it wasn't
very nice.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a24da8f2ee9..5052acdc0a7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2097,6 +2097,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
 
+	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
 		u64 *pt;
@@ -2110,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
-- 
cgit v1.2.3


From 171d595d3b3254b9a952af8d1f6965d2e85dcbaa Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 16:40:51 +0300
Subject: KVM: MMU: Flush tlbs after clearing write permission when accessing
 dirty log

Otherwise, the cpu may allow writes to the tracked pages, and we lose
some display bits or fail to migrate correctly.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5052acdc0a7..853a2889b20 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2111,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
+	kvm_flush_remote_tlbs(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
 
-- 
cgit v1.2.3


From 3201b5d9f0f7ef392886cd76dcd2c69186d9d5cd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Wed, 27 Aug 2008 20:01:04 +0300
Subject: KVM: MMU: Fix setting the accessed bit on non-speculative sptes

The accessed bit was accidentally turned on in a random flag word, rather
than, the spte itself, which was lucky, since it used the non-EPT compatible
PT_ACCESSED_MASK.

Fix by turning the bit on in the spte and changing it to use the portable
accessed mask.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 853a2889b20..866d7133cad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1192,7 +1192,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 */
 	spte = shadow_base_present_pte | shadow_dirty_mask;
 	if (!speculative)
-		pte_access |= PT_ACCESSED_MASK;
+		spte |= shadow_accessed_mask;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
 	if (pte_access & ACC_EXEC_MASK)
-- 
cgit v1.2.3


From 48d150394999c542b2e828f03da226842950d46a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 28 Aug 2008 18:27:15 +0300
Subject: KVM: SVM: No need to unprotect memory during event injection when
 using npt

No memory is protected anyway.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index be86c096385..60228888d1b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1021,7 +1021,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
 
-	if (event_injection)
+	if (!npt_enabled && event_injection)
 		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
-- 
cgit v1.2.3


From a89c1ad270ca7ad0eec2667bc754362ce7b142be Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 29 Aug 2008 11:52:07 +0200
Subject: KVM: add MC5_MISC msr read support

Currently KVM implements MC0-MC4_MISC read support. When booting Linux this
results in KVM warnings in the kernel log when the guest tries to read
MC5_MISC. Fix this warnings with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 675d010995a..e3b89662cf6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -991,6 +991,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MC0_MISC+8:
 	case MSR_IA32_MC0_MISC+12:
 	case MSR_IA32_MC0_MISC+16:
+	case MSR_IA32_MC0_MISC+20:
 	case MSR_IA32_UCODE_REV:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
-- 
cgit v1.2.3


From a0046b6db1c514149585e11895cd8434e0eafa79 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 29 Aug 2008 13:29:45 +0200
Subject: KVM: s390: Make facility bits future-proof

Heiko Carstens pointed out, that its safer to activate working facilities
instead of disabling problematic facilities. The new code uses the host
facility bits and masks it with known good ones.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/s390/kvm/priv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d1faf5c5440..cce40ff2913 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 	int rc;
 
 	vcpu->stat.instruction_stfl++;
-	facility_list &= ~(1UL<<24); /* no stfle */
-	facility_list &= ~(1UL<<23); /* no large pages */
+	/* only pass the facility bits, which we can handle */
+	facility_list &= 0xfe00fff3;
 
 	rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
 			   &facility_list, sizeof(facility_list));
-- 
cgit v1.2.3


From 20766c083e6ab3c33125f07c7ffe39914c106d98 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Fri, 29 Aug 2008 13:30:56 +0200
Subject: KVM: s390: change help text of guest Kconfig

The current help text for CONFIG_S390_GUEST is not very helpful.
Lets add more text.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/s390/Kconfig | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 4c03049e7db..bc581d8a7cd 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -565,13 +565,16 @@ config ZFCPDUMP
 	  Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
 
 config S390_GUEST
-bool "s390 guest support (EXPERIMENTAL)"
+bool "s390 guest support for KVM (EXPERIMENTAL)"
 	depends on 64BIT && EXPERIMENTAL
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
 	help
-	  Select this option if you want to run the kernel under s390 linux
+	  Select this option if you want to run the kernel as a guest under
+	  the KVM hypervisor. This will add detection for KVM as well  as a
+	  virtio transport. If KVM is detected, the virtio console will be
+	  the default console.
 endmenu
 
 source "net/Kconfig"
-- 
cgit v1.2.3


From fb4616f43148c5b3f3e453a47657572d1bda39ee Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 1 Sep 2008 04:52:24 +0300
Subject: KVM: x86 emulator: Add std and cld instructions (opcodes 0xfc-0xfd)

This adds the std and cld instructions to the emulator.

Encountered while running the BIOS with invalid guest
state emulation enabled.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 66e0bd6c628..944f1f4d4be 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -187,7 +187,7 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
 	/* 0xF8 - 0xFF */
 	ImplicitOps, 0, ImplicitOps, ImplicitOps,
-	0, 0, Group | Group4, Group | Group5,
+	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
 static u16 twobyte_table[256] = {
@@ -1762,6 +1762,14 @@ special_insn:
 		ctxt->eflags |= X86_EFLAGS_IF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
+	case 0xfc: /* cld */
+		ctxt->eflags &= ~EFLG_DF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
+	case 0xfd: /* std */
+		ctxt->eflags |= EFLG_DF;
+		c->dst.type = OP_NONE;	/* Disable writeback. */
+		break;
 	case 0xfe ... 0xff:	/* Grp4/Grp5 */
 		rc = emulate_grp45(ctxt, ops);
 		if (rc != 0)
-- 
cgit v1.2.3


From 8c4b537da7eceab1246695df21beea10f180d460 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 28 Aug 2008 09:34:08 +0800
Subject: KVM: ia64: Implement kvm_arch_vcpu_ioctl_{set,get}_mpstate

Two ioctl arch functions are added to set vcpu's smp state.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/kvm-ia64.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index cd0d1a7284b..7ad759e3429 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -179,6 +179,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	switch (ext) {
 	case KVM_CAP_IRQCHIP:
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_MP_STATE:
 
 		r = 1;
 		break;
@@ -1789,11 +1790,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	vcpu_load(vcpu);
+	mp_state->mp_state = vcpu->arch.mp_state;
+	vcpu_put(vcpu);
+	return 0;
+}
+
+static int vcpu_reset(struct kvm_vcpu *vcpu)
+{
+	int r;
+	long psr;
+	local_irq_save(psr);
+	r = kvm_insert_vmm_mapping(vcpu);
+	if (r)
+		goto fail;
+
+	vcpu->arch.launched = 0;
+	kvm_arch_vcpu_uninit(vcpu);
+	r = kvm_arch_vcpu_init(vcpu);
+	if (r)
+		goto fail;
+
+	kvm_purge_vmm_mapping(vcpu);
+	r = 0;
+fail:
+	local_irq_restore(psr);
+	return r;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	return -EINVAL;
+	int r = 0;
+
+	vcpu_load(vcpu);
+	vcpu->arch.mp_state = mp_state->mp_state;
+	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+		r = vcpu_reset(vcpu);
+	vcpu_put(vcpu);
+	return r;
 }
-- 
cgit v1.2.3


From d40a1ee4859c673677c9811ae84475c4051baca5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Mon, 1 Sep 2008 19:41:20 +0800
Subject: KVM: MMU: Modify kvm_shadow_walk.entry to accept u64 addr

EPT is 4 level by default in 32pae(48 bits), but the addr parameter
of kvm_shadow_walk->entry() only accept unsigned long as virtual
address, which is 32bit in 32pae. This result in SHADOW_PT_INDEX()
overflow when try to fetch level 4 index.

Fix it by extend kvm_shadow_walk->entry() to accept 64bit addr in
parameter.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/mmu.c         | 10 +++++-----
 arch/x86/kvm/paging_tmpl.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 866d7133cad..bce3e25ec79 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -144,7 +144,7 @@ struct kvm_rmap_desc {
 
 struct kvm_shadow_walk {
 	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
-		     gva_t addr, u64 *spte, int level);
+		     u64 addr, u64 *spte, int level);
 };
 
 static struct kmem_cache *pte_chain_cache;
@@ -941,7 +941,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 }
 
 static int walk_shadow(struct kvm_shadow_walk *walker,
-		       struct kvm_vcpu *vcpu, gva_t addr)
+		       struct kvm_vcpu *vcpu, u64 addr)
 {
 	hpa_t shadow_addr;
 	int level;
@@ -1270,7 +1270,7 @@ struct direct_shadow_walk {
 
 static int direct_map_entry(struct kvm_shadow_walk *_walk,
 			    struct kvm_vcpu *vcpu,
-			    gva_t addr, u64 *sptep, int level)
+			    u64 addr, u64 *sptep, int level)
 {
 	struct direct_shadow_walk *walk =
 		container_of(_walk, struct direct_shadow_walk, walker);
@@ -1289,7 +1289,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 
 	if (*sptep == shadow_trap_nonpresent_pte) {
 		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
-		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1,
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
 				      1, ACC_ALL, sptep);
 		if (!sp) {
 			pgprintk("nonpaging_map: ENOMEM\n");
@@ -1317,7 +1317,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		.pt_write = 0,
 	};
 
-	r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT);
+	r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
 	if (r < 0)
 		return r;
 	return walker.pt_write;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b7064e1e1e1..b671f61be41 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
 static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
-				    struct kvm_vcpu *vcpu, gva_t addr,
+				    struct kvm_vcpu *vcpu, u64 addr,
 				    u64 *sptep, int level)
 {
 	struct shadow_walker *sw =
@@ -326,7 +326,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 		metaphysical = 0;
 		table_gfn = gw->table_gfn[level - 2];
 	}
-	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+	shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
 				       metaphysical, access, sptep);
 	if (!metaphysical) {
 		r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
-- 
cgit v1.2.3


From fa89a81766e33343fa8f0fe0e371819bf94a17a1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Mon, 1 Sep 2008 15:57:51 +0300
Subject: KVM: Add statistics for guest irq injections

These can help show whether a guest is making progress or not.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/svm.c | 1 +
 arch/x86/kvm/vmx.c | 1 +
 arch/x86/kvm/x86.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 60228888d1b..9b54550fa4d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1519,6 +1519,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 
 	KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
 
+	++svm->vcpu.stat.irq_injections;
 	control = &svm->vmcb->control;
 	control->int_vector = irq;
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 71e57ae1cab..e7e8c86f1b7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2341,6 +2341,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
+	++vcpu->stat.irq_injections;
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3b89662cf6..3f3cb7107c0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+	{ "irq_injections", VCPU_STAT(irq_injections) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-- 
cgit v1.2.3


From a6a3034cb979b1fa3948d8e1e91b2387fc66b89b Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Sat, 6 Sep 2008 17:22:29 +0300
Subject: KVM: x86 emulator: Add in/out instructions (opcodes 0xe4-0xe7,
 0xec-0xef)

The patch adds in/out instructions to the x86 emulator.

The instruction was encountered while running the BIOS while using
the invalid guest state emulation patch.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 944f1f4d4be..3ac2f148522 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -177,11 +177,14 @@ static u16 opcode_table[256] = {
 	/* 0xD8 - 0xDF */
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
-	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
 	ImplicitOps | Stack, SrcImm | ImplicitOps,
 	ImplicitOps, SrcImmByte | ImplicitOps,
-	0, 0, 0, 0,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -1259,6 +1262,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
+	unsigned int port;
+	int io_dir_in;
 	int rc = 0;
 
 	/* Shadow copy of register state. Committed on successful emulation.
@@ -1687,6 +1692,16 @@ special_insn:
 		c->src.val = c->regs[VCPU_REGS_RCX];
 		emulate_grp2(ctxt);
 		break;
+	case 0xe4: 	/* inb */
+	case 0xe5: 	/* in */
+		port = insn_fetch(u8, 1, c->eip);
+		io_dir_in = 1;
+		goto do_io;
+	case 0xe6: /* outb */
+	case 0xe7: /* out */
+		port = insn_fetch(u8, 1, c->eip);
+		io_dir_in = 0;
+		goto do_io;
 	case 0xe8: /* call (near) */ {
 		long int rel;
 		switch (c->op_bytes) {
@@ -1737,6 +1752,22 @@ special_insn:
 		jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		break;
+	case 0xec: /* in al,dx */
+	case 0xed: /* in (e/r)ax,dx */
+		port = c->regs[VCPU_REGS_RDX];
+		io_dir_in = 1;
+		goto do_io;
+	case 0xee: /* out al,dx */
+	case 0xef: /* out (e/r)ax,dx */
+		port = c->regs[VCPU_REGS_RDX];
+		io_dir_in = 0;
+	do_io:	if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+				   (c->d & ByteOp) ? 1 : c->op_bytes,
+				   port) != 0) {
+			c->eip = saved_eip;
+			goto cannot_emulate;
+		}
+		return 0;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
 		break;
-- 
cgit v1.2.3


From d76901750ab9f71091d33ef3d2b5909d8a9a4ad4 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 8 Sep 2008 15:23:48 -0300
Subject: KVM: x86: do not execute halted vcpus

Offline or uninitialized vcpu's can be executed if requested to perform
userspace work.

Follow Avi's suggestion to handle halted vcpu's in the main loop,
simplifying kvm_emulate_halt(). Introduce a new vcpu->requests bit to
indicate events that promote state from halted to running.

Also standardize vcpu wake sites.

Signed-off-by: Marcelo Tosatti <mtosatti <at> redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/i8254.c |   5 ++-
 arch/x86/kvm/lapic.c |  16 +++------
 arch/x86/kvm/x86.c   | 100 ++++++++++++++++++++++++++++-----------------------
 3 files changed, 61 insertions(+), 60 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4cb443026ec..634132a9a51 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,10 +200,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
 	if (!atomic_inc_and_test(&pt->pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-	if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-		vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+	if (vcpu0 && waitqueue_active(&vcpu0->wq))
 		wake_up_interruptible(&vcpu0->wq);
-	}
 
 	pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
 	pt->scheduled = ktime_to_ns(pt->timer.expires);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index be94f93a73f..fd00f698692 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -339,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-			kvm_vcpu_kick(vcpu);
-		else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
-		}
+		kvm_vcpu_kick(vcpu);
 
 		result = (orig_irr == 0);
 		break;
@@ -384,8 +378,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-			if (waitqueue_active(&vcpu->wq))
-				wake_up_interruptible(&vcpu->wq);
+			kvm_vcpu_kick(vcpu);
 		}
 		break;
 
@@ -950,10 +943,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
 	if(!atomic_inc_and_test(&apic->timer.pending))
 		set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-	if (waitqueue_active(q)) {
-		apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	if (waitqueue_active(q))
 		wake_up_interruptible(q);
-	}
+
 	if (apic_lvtt_period(apic)) {
 		result = 1;
 		apic->timer.dev.expires = ktime_add_ns(
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f3cb7107c0..bf98d40b21e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2798,11 +2798,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 	KVMTRACE_0D(HLT, vcpu, handler);
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
-		up_read(&vcpu->kvm->slots_lock);
-		kvm_vcpu_block(vcpu);
-		down_read(&vcpu->kvm->slots_lock);
-		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-			return -EINTR;
 		return 1;
 	} else {
 		vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -3097,24 +3092,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
 	up_read(&vcpu->kvm->slots_lock);
 }
 
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
 
-	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		pr_debug("vcpu %d received sipi with vector # %x\n",
-		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
-		kvm_lapic_reset(vcpu);
-		r = kvm_x86_ops->vcpu_reset(vcpu);
-		if (r)
-			return r;
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-	}
-
-	down_read(&vcpu->kvm->slots_lock);
-	vapic_enter(vcpu);
-
-again:
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
 			kvm_mmu_unload(vcpu);
@@ -3151,22 +3132,13 @@ again:
 
 	local_irq_disable();
 
-	if (vcpu->requests || need_resched()) {
+	if (vcpu->requests || need_resched() || signal_pending(current)) {
 		local_irq_enable();
 		preempt_enable();
 		r = 1;
 		goto out;
 	}
 
-	if (signal_pending(current)) {
-		local_irq_enable();
-		preempt_enable();
-		r = -EINTR;
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		++vcpu->stat.signal_exits;
-		goto out;
-	}
-
 	if (vcpu->guest_debug.enabled)
 		kvm_x86_ops->guest_debug_pre(vcpu);
 
@@ -3227,26 +3199,63 @@ again:
 	kvm_lapic_sync_from_vapic(vcpu);
 
 	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+	return r;
+}
 
-	if (r > 0) {
-		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-			r = -EINTR;
-			kvm_run->exit_reason = KVM_EXIT_INTR;
-			++vcpu->stat.request_irq_exits;
-			goto out;
-		}
-		if (!need_resched())
-			goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r;
+
+	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+		printk("vcpu %d received sipi with vector # %x\n",
+		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		kvm_lapic_reset(vcpu);
+		r = kvm_x86_ops->vcpu_reset(vcpu);
+		if (r)
+			return r;
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	}
 
-out:
-	up_read(&vcpu->kvm->slots_lock);
-	if (r > 0) {
-		kvm_resched(vcpu);
-		down_read(&vcpu->kvm->slots_lock);
-		goto again;
+	down_read(&vcpu->kvm->slots_lock);
+	vapic_enter(vcpu);
+
+	r = 1;
+	while (r > 0) {
+		if (kvm_arch_vcpu_runnable(vcpu))
+			r = vcpu_enter_guest(vcpu, kvm_run);
+		else {
+			up_read(&vcpu->kvm->slots_lock);
+			kvm_vcpu_block(vcpu);
+			down_read(&vcpu->kvm->slots_lock);
+			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+				if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+					vcpu->arch.mp_state =
+							KVM_MP_STATE_RUNNABLE;
+			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+				r = -EINTR;
+		}
+
+		if (r > 0) {
+			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				++vcpu->stat.request_irq_exits;
+			}
+			if (signal_pending(current)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				++vcpu->stat.signal_exits;
+			}
+			if (need_resched()) {
+				up_read(&vcpu->kvm->slots_lock);
+				kvm_resched(vcpu);
+				down_read(&vcpu->kvm->slots_lock);
+			}
+		}
 	}
 
+	up_read(&vcpu->kvm->slots_lock);
 	post_kvm_run_save(vcpu, kvm_run);
 
 	vapic_exit(vcpu);
@@ -3266,6 +3275,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		r = -EAGAIN;
 		goto out;
 	}
-- 
cgit v1.2.3


From d19292e457a7c1b7f6c12bccbfdfd53630de1cee Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 8 Sep 2008 21:47:19 +0300
Subject: KVM: x86 emulator: Add call near absolute instruction (opcode 0xff/2)

Add call near absolute instruction.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 3ac2f148522..0630d219876 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -286,7 +286,8 @@ static u16 group_table[] = {
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0, 0, 0,
 	[Group5*8] =
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	SrcMem | ModRM | Stack, 0,
 	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem, ModRM | SrcMem,
@@ -1162,6 +1163,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 	case 1:	/* dec */
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
+	case 2: /* call near abs */ {
+		long int old_eip;
+		old_eip = c->eip;
+		c->eip = c->src.val;
+		c->src.val = old_eip;
+		emulate_push(ctxt);
+		break;
+	}
 	case 4: /* jmp abs */
 		c->eip = c->src.val;
 		break;
-- 
cgit v1.2.3


From 9c3e4aab5ae27bf8589d61c7874e213832b4b7d2 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 10 Sep 2008 16:40:55 -0300
Subject: KVM: x86: unhalt vcpu0 on reset

Since "KVM: x86: do not execute halted vcpus", HLT by vcpu0 before system
reset by the IO thread will hang the guest.

Mark vcpu as runnable in such case.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf98d40b21e..2134f3e0a51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3959,6 +3959,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
+	/* Older userspace won't unhalt the vcpu on reset. */
+	if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+	    !(vcpu->arch.cr0 & X86_CR0_PE))
+		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
 	vcpu_put(vcpu);
 
 	return 0;
-- 
cgit v1.2.3


From 0bd595fc222583ca260f259698f72e9946c6e524 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Thu, 11 Sep 2008 10:04:29 -0500
Subject: KVM: ppc: kvmppc_44x_shadow_release() does not require mmap_sem to be
 locked

And it gets in the way of get_user_pages_fast().

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/powerpc/kvm/44x_tlb.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 3594bbd1f61..7b11fd7be54 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -110,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
 	return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
 }
 
-/* Must be called with mmap_sem locked for writing. */
 static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
                                       unsigned int index)
 {
@@ -150,17 +149,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	/* Get reference to new page. */
 	down_read(&current->mm->mmap_sem);
 	new_page = gfn_to_page(vcpu->kvm, gfn);
+	up_read(&current->mm->mmap_sem);
 	if (is_error_page(new_page)) {
 		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
 		kvm_release_page_clean(new_page);
-		up_read(&current->mm->mmap_sem);
 		return;
 	}
 	hpaddr = page_to_phys(new_page);
 
 	/* Drop reference to old page. */
 	kvmppc_44x_shadow_release(vcpu, victim);
-	up_read(&current->mm->mmap_sem);
 
 	vcpu->arch.shadow_pages[victim] = new_page;
 
@@ -194,7 +192,6 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
-	down_write(&current->mm->mmap_sem);
 	for (i = 0; i <= tlb_44x_hwater; i++) {
 		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 		unsigned int tid;
@@ -219,7 +216,6 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 				stlbe->tid, stlbe->word0, stlbe->word1,
 				stlbe->word2, handler);
 	}
-	up_write(&current->mm->mmap_sem);
 }
 
 /* Invalidate all mappings on the privilege switch after PID has been changed.
@@ -231,7 +227,6 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 
 	if (vcpu->arch.swap_pid) {
 		/* XXX Replace loop with fancy data structures. */
-		down_write(&current->mm->mmap_sem);
 		for (i = 0; i <= tlb_44x_hwater; i++) {
 			struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 
@@ -243,7 +238,6 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 			            stlbe->tid, stlbe->word0, stlbe->word1,
 			            stlbe->word2, handler);
 		}
-		up_write(&current->mm->mmap_sem);
 		vcpu->arch.swap_pid = 0;
 	}
 
-- 
cgit v1.2.3


From 4b92fe0c9d0376eb105c88d49b77595e8e5b1548 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 11 Sep 2008 12:58:00 +0200
Subject: KVM: VMX: Cleanup stalled INTR_INFO read

Commit 1c0f4f5011829dac96347b5f84ba37c2252e1e08 left a useless access
of VM_ENTRY_INTR_INFO_FIELD in vmx_intr_assist behind. Clean this up.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e7e8c86f1b7..f8e615fc874 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3135,11 +3135,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
-	u32 intr_info_field;
-
 	update_tpr_threshold(vcpu);
 
-	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
 	if (cpu_has_virtual_nmis()) {
 		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 			if (vmx_nmi_enabled(vcpu)) {
-- 
cgit v1.2.3


From ef46f18ea010359f7536afd3f56a92c9c83ac09a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Thu, 11 Sep 2008 19:47:13 +0300
Subject: KVM: x86 emulator: fix jmp r/m64 instruction

jmp r/m64 doesn't require the rex.w prefix to indicate the operand size
is 64 bits.  Set the Stack attribute (even though it doesn't involve the
stack, really) to indicate this.

Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0630d219876..0c120c4c9c0 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -288,7 +288,7 @@ static u16 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem, ModRM | SrcMem,
 	SrcNone | ModRM | DstMem | Mov, 0,
-- 
cgit v1.2.3


From 9ea542facbd0fd12a53e169953a3fdd6d0364532 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 11 Sep 2008 15:27:49 +0800
Subject: KVM: VMX: Rename IA32_FEATURE_CONTROL bits

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 18 +++++++++---------
 arch/x86/kvm/vmx.h |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f8e615fc874..046a91b5a4b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1041,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
-	    == IA32_FEATURE_CONTROL_LOCKED_BIT;
+	return (msr & (FEATURE_CONTROL_LOCKED |
+		       FEATURE_CONTROL_VMXON_ENABLED))
+	    == FEATURE_CONTROL_LOCKED;
 	/* locked but not enabled */
 }
 
@@ -1055,14 +1055,14 @@ static void hardware_enable(void *garbage)
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		    IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
-	    != (IA32_FEATURE_CONTROL_LOCKED_BIT |
-		IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT))
+	if ((old & (FEATURE_CONTROL_LOCKED |
+		    FEATURE_CONTROL_VMXON_ENABLED))
+	    != (FEATURE_CONTROL_LOCKED |
+		FEATURE_CONTROL_VMXON_ENABLED))
 		/* enable and lock */
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       IA32_FEATURE_CONTROL_LOCKED_BIT |
-		       IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT);
+		       FEATURE_CONTROL_LOCKED |
+		       FEATURE_CONTROL_VMXON_ENABLED);
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 86059f439cb..44cfab70644 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,8 +331,8 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define IA32_FEATURE_CONTROL_LOCKED_BIT		0x1
-#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT	0x4
+#define FEATURE_CONTROL_LOCKED		    (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED	    (1<<2)
 
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
-- 
cgit v1.2.3


From defed7ed926eca9f178a632df05baa866abe754e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng.yang@intel.com>
Date: Thu, 11 Sep 2008 15:27:50 +0800
Subject: x86: Move FEATURE_CONTROL bits to msr-index.h

For MSR_IA32_FEATURE_CONTROL is already there.

Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 44cfab70644..3e010d21fdd 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,9 +331,6 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define FEATURE_CONTROL_LOCKED		    (1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	    (1<<2)
-
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
 
-- 
cgit v1.2.3


From 9c9fddd0e784346a6d0ce82ed20d9ad21f3c8a2c Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:50:25 +0200
Subject: KVM: x86 emulator: Add DstAcc operand type

Add DstAcc operand type. That means that there are 4 bits now for
DstMask.

"In the good old days cpus would have only one register that was able to
 fully participate in arithmetic operations, typically called A for
 Accumulator.  The x86 retains this tradition by having special, shorter
 encodings for the A register (like the cmp opcode), and even some
 instructions that only operate on A (like mul).

 SrcAcc and DstAcc would accommodate these instructions by decoding A
 into the corresponding 'struct operand'."
  -- Avi Kivity

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 50 +++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0c120c4c9c0..4390ec8c47a 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -47,25 +47,26 @@
 #define ImplicitOps (1<<1)	/* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
-#define DstMask     (3<<1)
+#define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstMask     (7<<1)
 /* Source operand type. */
-#define SrcNone     (0<<3)	/* No source operand. */
-#define SrcImplicit (0<<3)	/* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)	/* Register operand. */
-#define SrcMem      (2<<3)	/* Memory operand. */
-#define SrcMem16    (3<<3)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<3)	/* Immediate operand. */
-#define SrcImmByte  (6<<3)	/* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
+#define SrcNone     (0<<4)	/* No source operand. */
+#define SrcImplicit (0<<4)	/* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<4)	/* Register operand. */
+#define SrcMem      (2<<4)	/* Memory operand. */
+#define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<4)	/* Immediate operand. */
+#define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<4)
 /* Generic ModRM decode. */
-#define ModRM       (1<<6)
+#define ModRM       (1<<7)
 /* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
-#define String      (1<<10)     /* String instruction (rep capable) */
-#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+#define Mov         (1<<8)
+#define BitOp       (1<<9)
+#define MemAbs      (1<<10)      /* Memory operand is absolute displacement */
+#define String      (1<<12)     /* String instruction (rep capable) */
+#define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
@@ -1060,6 +1061,23 @@ done_prefixes:
 		}
 		c->dst.type = OP_MEM;
 		break;
+	case DstAcc:
+		c->dst.type = OP_REG;
+		c->dst.bytes = c->op_bytes;
+		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->op_bytes) {
+			case 1:
+				c->dst.val = *(u8 *)c->dst.ptr;
+				break;
+			case 2:
+				c->dst.val = *(u16 *)c->dst.ptr;
+				break;
+			case 4:
+				c->dst.val = *(u32 *)c->dst.ptr;
+				break;
+		}
+		c->dst.orig_val = c->dst.val;
+		break;
 	}
 
 	if (c->rip_relative)
-- 
cgit v1.2.3


From 8a9fee67fba585b4c4731a749367e1751ebf416c Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:51:15 +0200
Subject: KVM: x86 emulator: Add cmp al, imm and cmp ax, imm instructions
 (ocodes 3c, 3d)

Add decode entries for these opcodes; execution is already implemented.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 4390ec8c47a..2b43208a38b 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -108,7 +108,8 @@ static u16 opcode_table[256] = {
 	/* 0x38 - 0x3F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+	0, 0,
 	/* 0x40 - 0x47 */
 	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
 	/* 0x48 - 0x4F */
-- 
cgit v1.2.3


From aa3a816b6d0bd59e1a9c548cc7d2dd829f26534f Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Fri, 12 Sep 2008 13:52:18 +0200
Subject: KVM: x86 emulator: Use DstAcc for 'and'

For instruction 'and al,imm' we use DstAcc instead of doing
the emulation directly into the instruction's opcode.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86_emulate.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 2b43208a38b..ea051173b0d 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -96,7 +96,7 @@ static u16 opcode_table[256] = {
 	/* 0x20 - 0x27 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	SrcImmByte, SrcImm, 0, 0,
+	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -1392,27 +1392,10 @@ special_insn:
 	      sbb:		/* sbb */
 		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 		break;
-	case 0x20 ... 0x23:
+	case 0x20 ... 0x25:
 	      and:		/* and */
 		emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
 		break;
-	case 0x24:              /* and al imm8 */
-		c->dst.type = OP_REG;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		c->dst.val = *(u8 *)c->dst.ptr;
-		c->dst.bytes = 1;
-		c->dst.orig_val = c->dst.val;
-		goto and;
-	case 0x25:              /* and ax imm16, or eax imm32 */
-		c->dst.type = OP_REG;
-		c->dst.bytes = c->op_bytes;
-		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		if (c->op_bytes == 2)
-			c->dst.val = *(u16 *)c->dst.ptr;
-		else
-			c->dst.val = *(u32 *)c->dst.ptr;
-		c->dst.orig_val = c->dst.val;
-		goto and;
 	case 0x28 ... 0x2d:
 	      sub:		/* sub */
 		emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
-- 
cgit v1.2.3


From 62c476c7c7f25a5b245b9902a935636e6316e58c Mon Sep 17 00:00:00 2001
From: Ben-Ami Yassour <benami@il.ibm.com>
Date: Sun, 14 Sep 2008 03:48:28 +0300
Subject: KVM: Device Assignment with VT-d

Based on a patch by: Kay, Allen M <allen.m.kay@intel.com>

This patch enables PCI device assignment based on VT-d support.
When a device is assigned to the guest, the guest memory is pinned and
the mapping is updated in the VT-d IOMMU.

[Amit: Expose KVM_CAP_IOMMU so we can check if an IOMMU is present
and also control enable/disable from userspace]

Signed-off-by: Kay, Allen M <allen.m.kay@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Amit Shah <amit.shah@qumranet.com>

Acked-by: Mark Gross <mgross@linux.intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/Makefile |   3 +
 arch/x86/kvm/vtd.c    | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c    |  14 ++++
 3 files changed, 215 insertions(+)
 create mode 100644 arch/x86/kvm/vtd.c

(limited to 'arch')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f4..3072b17447a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,6 +12,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
 	i8254.o
+ifeq ($(CONFIG_DMAR),y)
+kvm-objs += vtd.o
+endif
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
new file mode 100644
index 00000000000..667bf3fb64b
--- /dev/null
+++ b/arch/x86/kvm/vtd.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+				gfn_t base_gfn, unsigned long npages);
+
+int kvm_iommu_map_pages(struct kvm *kvm,
+			gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	int i, r;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	r = -EINVAL;
+	for (i = 0; i < npages; i++) {
+		/* check if already mapped */
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		if (pfn && !is_mmio_pfn(pfn))
+			continue;
+
+		pfn = gfn_to_pfn(kvm, gfn);
+		if (!is_mmio_pfn(pfn)) {
+			r = intel_iommu_page_mapping(domain,
+						     gfn_to_gpa(gfn),
+						     pfn_to_hpa(pfn),
+						     PAGE_SIZE,
+						     DMA_PTE_READ |
+						     DMA_PTE_WRITE);
+			if (r) {
+				printk(KERN_DEBUG "kvm_iommu_map_pages:"
+				       "iommu failed to map pfn=%lx\n", pfn);
+				goto unmap_pages;
+			}
+		} else {
+			printk(KERN_DEBUG "kvm_iommu_map_page:"
+			       "invalid pfn=%lx\n", pfn);
+			goto unmap_pages;
+		}
+		gfn++;
+	}
+	return 0;
+
+unmap_pages:
+	kvm_iommu_put_pages(kvm, base_gfn, i);
+	return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+	int i, r;
+
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
+					kvm->memslots[i].npages);
+		if (r)
+			break;
+	}
+	up_read(&kvm->slots_lock);
+	return r;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct pci_dev *pdev = NULL;
+	int r;
+
+	if (!intel_iommu_found()) {
+		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+		return -ENODEV;
+	}
+
+	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
+	       assigned_dev->host_busnr,
+	       PCI_SLOT(assigned_dev->host_devfn),
+	       PCI_FUNC(assigned_dev->host_devfn));
+
+	pdev = assigned_dev->dev;
+
+	if (pdev == NULL) {
+		if (kvm->arch.intel_iommu_domain) {
+			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
+			kvm->arch.intel_iommu_domain = NULL;
+		}
+		return -ENODEV;
+	}
+
+	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
+	if (!kvm->arch.intel_iommu_domain)
+		return -ENODEV;
+
+	r = kvm_iommu_map_memslots(kvm);
+	if (r)
+		goto out_unmap;
+
+	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
+			       pdev->bus->number, pdev->devfn);
+
+	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
+					pdev);
+	if (r) {
+		printk(KERN_ERR "Domain context map for %s failed",
+		       pci_name(pdev));
+		goto out_unmap;
+	}
+	return 0;
+
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+			       gfn_t base_gfn, unsigned long npages)
+{
+	gfn_t gfn = base_gfn;
+	pfn_t pfn;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	int i;
+
+	for (i = 0; i < npages; i++) {
+		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+						     gfn_to_gpa(gfn));
+		kvm_release_pfn_clean(pfn);
+		gfn++;
+	}
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+	int i;
+	down_read(&kvm->slots_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
+				    kvm->memslots[i].npages);
+	}
+	up_read(&kvm->slots_lock);
+
+	return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+	struct kvm_assigned_dev_kernel *entry;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
+		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
+		       entry->host_busnr,
+		       PCI_SLOT(entry->host_devfn),
+		       PCI_FUNC(entry->host_devfn));
+
+		/* detach kvm dmar domain */
+		intel_iommu_detach_dev(domain, entry->host_busnr,
+				       entry->host_devfn);
+	}
+	kvm_iommu_unmap_memslots(kvm);
+	intel_iommu_domain_exit(domain);
+	return 0;
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2134f3e0a51..c8a2793626e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -35,6 +35,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -277,9 +278,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+		r = kvm_iommu_map_guest(kvm, match);
+		if (r)
+			goto out_list_del;
+	}
+
 out:
 	mutex_unlock(&kvm->lock);
 	return r;
+out_list_del:
+	list_del(&match->list);
+	pci_release_regions(dev);
 out_disable:
 	pci_disable_device(dev);
 out_put:
@@ -1147,6 +1157,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PV_MMU:
 		r = !tdp_enabled;
 		break;
+	case KVM_CAP_IOMMU:
+		r = intel_iommu_found();
+		break;
 	default:
 		r = 0;
 		break;
@@ -4282,6 +4295,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_iommu_unmap_guest(kvm);
 	kvm_free_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
-- 
cgit v1.2.3


From bfadaded0dc323a1cf3f08b5068f12955b54cbaa Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 16 Sep 2008 18:04:28 +0300
Subject: KVM: Device Assignment: Free device structures if IRQ allocation
 fails

When an IRQ allocation fails, we free up the device structures and
disable the device so that we can unregister the device in the
userspace and not expose it to the guest at all.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 86 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 45 insertions(+), 41 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c8a2793626e..61eddbeabeb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -166,6 +166,43 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	enable_irq(dev->host_irq);
 }
 
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+
+	if (cancel_work_sync(&assigned_dev->interrupt_work))
+		/* We had pending work. That means we will have to take
+		 * care of kvm_put_kvm.
+		 */
+		kvm_put_kvm(kvm);
+
+	pci_release_regions(assigned_dev->dev);
+	pci_disable_device(assigned_dev->dev);
+	pci_dev_put(assigned_dev->dev);
+
+	list_del(&assigned_dev->list);
+	kfree(assigned_dev);
+}
+
+static void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+	struct list_head *ptr, *ptr2;
+	struct kvm_assigned_dev_kernel *assigned_dev;
+
+	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+		assigned_dev = list_entry(ptr,
+					  struct kvm_assigned_dev_kernel,
+					  list);
+
+		kvm_free_assigned_device(kvm, assigned_dev);
+	}
+}
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -194,8 +231,8 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 
 	if (irqchip_in_kernel(kvm)) {
 		if (!capable(CAP_SYS_RAWIO)) {
-			return -EPERM;
-			goto out;
+			r = -EPERM;
+			goto out_release;
 		}
 
 		if (assigned_irq->host_irq)
@@ -214,17 +251,18 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		 */
 		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
 				"kvm_assigned_device", (void *)match)) {
-			printk(KERN_INFO "%s: couldn't allocate irq for pv "
-			       "device\n", __func__);
 			r = -EIO;
-			goto out;
+			goto out_release;
 		}
 	}
 
 	match->irq_requested = true;
-out:
 	mutex_unlock(&kvm->lock);
 	return r;
+out_release:
+	mutex_unlock(&kvm->lock);
+	kvm_free_assigned_device(kvm, match);
+	return r;
 }
 
 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
@@ -300,40 +338,6 @@ out_free:
 	return r;
 }
 
-static void kvm_free_assigned_devices(struct kvm *kvm)
-{
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
-
-		if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) {
-			free_irq(assigned_dev->host_irq,
-				 (void *)assigned_dev);
-
-			kvm_unregister_irq_ack_notifier(kvm,
-							&assigned_dev->
-							ack_notifier);
-		}
-
-		if (cancel_work_sync(&assigned_dev->interrupt_work))
-			/* We had pending work. That means we will have to take
-			 * care of kvm_put_kvm.
-			 */
-			kvm_put_kvm(kvm);
-
-		pci_release_regions(assigned_dev->dev);
-		pci_disable_device(assigned_dev->dev);
-		pci_dev_put(assigned_dev->dev);
-
-		list_del(&assigned_dev->list);
-		kfree(assigned_dev);
-	}
-}
-
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -4296,7 +4300,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
-	kvm_free_assigned_devices(kvm);
+	kvm_free_all_assigned_devices(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
-- 
cgit v1.2.3


From 4c2155ce81c193788082d4b8cdbc26d79edebc58 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 16 Sep 2008 20:54:47 -0300
Subject: KVM: switch to get_user_pages_fast

Convert gfn_to_pfn to use get_user_pages_fast, which can do lockless
pagetable lookups on x86. Kernel compilation on 4-way guest is 3.7%
faster on VMX.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c |  2 --
 arch/x86/kvm/mmu.c         | 23 +++++++++--------------
 arch/x86/kvm/paging_tmpl.h |  8 +-------
 arch/x86/kvm/vmx.c         |  4 ----
 arch/x86/kvm/x86.c         |  6 ------
 5 files changed, 10 insertions(+), 33 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 7b11fd7be54..2e227a412bc 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -147,9 +147,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe = &vcpu->arch.shadow_tlb[victim];
 
 	/* Get reference to new page. */
-	down_read(&current->mm->mmap_sem);
 	new_page = gfn_to_page(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_page(new_page)) {
 		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
 		kvm_release_page_clean(new_page);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bce3e25ec79..5779a2323e2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -405,16 +405,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr;
+	int ret = 0;
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return 0;
+		return ret;
 
+	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
 	if (vma && is_vm_hugetlb_page(vma))
-		return 1;
+		ret = 1;
+	up_read(&current->mm->mmap_sem);
 
-	return 0;
+	return ret;
 }
 
 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -1140,9 +1143,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	return page;
 }
@@ -1330,16 +1331,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn_t pfn;
 	unsigned long mmu_seq;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
@@ -1488,15 +1487,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	if (r)
 		return r;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1809,15 +1806,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
 	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b671f61be41..6dd08e096e2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -102,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
 	pt_element_t *table;
 	struct page *page;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(kvm, table_gfn);
-	up_read(&current->mm->mmap_sem);
 
 	table = kmap_atomic(page, KM_USER0);
-
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
 	kunmap_atomic(table, KM_USER0);
 
 	kvm_release_page_dirty(page);
@@ -418,7 +414,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return 0;
 	}
 
-	down_read(&current->mm->mmap_sem);
 	if (walker.level == PT_DIRECTORY_LEVEL) {
 		gfn_t large_gfn;
 		large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -428,9 +423,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 046a91b5a4b..025bf4011ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2010,9 +2010,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-	up_read(&current->mm->mmap_sem);
 out:
 	up_write(&kvm->slots_lock);
 	return r;
@@ -2034,10 +2032,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	if (r)
 		goto out;
 
-	down_read(&current->mm->mmap_sem);
 	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
 			VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 out:
 	up_write(&kvm->slots_lock);
 	return r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 61eddbeabeb..108f07267e8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -946,10 +946,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		/* ...but clean it before doing the actual write */
 		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 
-		down_read(&current->mm->mmap_sem);
 		vcpu->arch.time_page =
 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-		up_read(&current->mm->mmap_sem);
 
 		if (is_error_page(vcpu->arch.time_page)) {
 			kvm_release_page_clean(vcpu->arch.time_page);
@@ -2322,9 +2320,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 
 		val = *(u64 *)new;
 
-		down_read(&current->mm->mmap_sem);
 		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-		up_read(&current->mm->mmap_sem);
 
 		kaddr = kmap_atomic(page, KM_USER0);
 		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -3089,9 +3085,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
 	if (!apic || !apic->vapic_addr)
 		return;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	vcpu->arch.apic->vapic_page = page;
 }
-- 
cgit v1.2.3


From 2259e3a7a6089007839cd4bbf7c9867190c67238 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 22 Aug 2008 13:29:17 -0700
Subject: KVM: x86.c make kvm_load_realmode_segment static

Noticed by sparse:
arch/x86/kvm/x86.c:3591:5: warning: symbol 'kvm_load_realmode_segment' was not declared. Should it be static?

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 108f07267e8..1b738cb0283 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3611,7 +3611,7 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
 {
 	struct kvm_segment segvar = {
 		.base = selector << 4,
-- 
cgit v1.2.3


From af2152f5457448bd90cb019c108e0a85e716fdbe Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@qumranet.com>
Date: Mon, 22 Sep 2008 14:28:53 +0300
Subject: KVM: don't enter guest after SIPI was received by a CPU

The vcpu should process pending SIPI message before entering guest mode again.
kvm_arch_vcpu_runnable() returns true if the vcpu is in SIPI state, so
we can't call it here.

Signed-off-by: Gleb Natapov <gleb@qumranet.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b738cb0283..08edeabf15e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3233,7 +3233,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	r = 1;
 	while (r > 0) {
-		if (kvm_arch_vcpu_runnable(vcpu))
+		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 			r = vcpu_enter_guest(vcpu, kvm_run);
 		else {
 			up_read(&vcpu->kvm->slots_lock);
-- 
cgit v1.2.3


From 81aec5227eedf9035e8544d8021ca6b8fb7c357a Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 12 Sep 2008 20:23:11 +0800
Subject: KVM: ia64: Implement a uniform vps interface

An uniform entry kvm_vps_entry is added for
vps_sync_write/read, vps_resume_handler/guest,
and branches to differnt PAL service according to the offset.

Singed-off-by: Anthony Xu <anthony.xu@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm_minstate.h | 23 +++++----------
 arch/ia64/kvm/optvfault.S    | 69 ++++++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kvm/process.c      |  4 +--
 arch/ia64/kvm/vmm_ivt.S      | 39 ++++---------------------
 4 files changed, 84 insertions(+), 51 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
index 13980d9b8bc..2cc41d17cf9 100644
--- a/arch/ia64/kvm/kvm_minstate.h
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -50,27 +50,18 @@
 
 #define PAL_VSA_SYNC_READ						\
 	/* begin to call pal vps sync_read */				\
+{.mii;									\
 	add r25 = VMM_VPD_BASE_OFFSET, r21;				\
-	adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21;  /* entry point */	\
+	nop 0x0;							\
+	mov r24=ip;							\
 	;;								\
+}									\
+{.mmb									\
+	add r24=0x20, r24;						\
 	ld8 r25 = [r25];      /* read vpd base */			\
-	ld8 r20 = [r20];						\
-	;;								\
-	add r20 = PAL_VPS_SYNC_READ,r20;				\
-	;;								\
-{ .mii;									\
-	nop 0x0;							\
-	mov r24 = ip;							\
-	mov b0 = r20;							\
+	br.cond.sptk kvm_vps_sync_read;		/*call the service*/	\
 	;;								\
 };									\
-{ .mmb;									\
-	add r24 = 0x20, r24;						\
-	nop 0x0;							\
-	br.cond.sptk b0;        /*  call the service */			\
-	;;								\
-};
-
 
 
 #define KVM_MINSTATE_GET_CURRENT(reg)   mov reg=r21
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index e4f15d641b2..f0bf0a8efa3 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -20,6 +20,75 @@
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+ENTRY(kvm_vps_entry)
+	adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
+	;;
+	ld8 r29 = [r29]
+	;;
+	add r29 = r29, r30
+	;;
+	mov b0 = r29
+	br.sptk.many b0
+END(kvm_vps_entry)
+
+/*
+ *	Inputs:
+ *	r24 : return address
+ *  	r25 : vpd
+ *	r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_read)
+	movl r30 = PAL_VPS_SYNC_READ
+	;;
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_read)
+
+/*
+ *	Inputs:
+ *	r24 : return address
+ *  	r25 : vpd
+ *	r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_write)
+	movl r30 = PAL_VPS_SYNC_WRITE
+	;;
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_write)
+
+/*
+ *	Inputs:
+ *	r23 : pr
+ *	r24 : guest b0
+ *  	r25 : vpd
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_resume_normal)
+	movl r30 = PAL_VPS_RESUME_NORMAL
+	;;
+	mov pr=r23,-2
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_normal)
+
+/*
+ *	Inputs:
+ *	r23 : pr
+ *	r24 : guest b0
+ *  	r25 : vpd
+ *	r17 : isr
+ */
+GLOBAL_ENTRY(kvm_vps_resume_handler)
+	movl r30 = PAL_VPS_RESUME_HANDLER
+	;;
+	ld8 r27=[r25]
+	shr r17=r17,IA64_ISR_IR_BIT
+	;;
+	dep r27=r17,r27,63,1   // bit 63 of r27 indicate whether enable CFLE
+	mov pr=r23,-2
+	br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_handler)
+
 //mov r1=ar3
 GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 #ifndef ACCE_MOV_FROM_AR
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 5a33f7ed29a..3417783ae16 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 void vmm_transition(struct kvm_vcpu *vcpu)
 {
 	ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
-			0, 0, 0, 0, 0, 0);
+			1, 0, 0, 0, 0, 0);
 	vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
 	ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
-						0, 0, 0, 0, 0, 0);
+						1, 0, 0, 0, 0, 0);
 	kvm_do_resume_op(vcpu);
 }
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 3ee5f481c06..c1d7251a148 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
     ld8 r19=[r19]        //vpsr
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
-    ;;
-    ld8 r20=[r20]
-    ;;
-//vsa_sync_write_start
     mov r25=r18
     adds r16= VMM_VCPU_GP_OFFSET,r21
     ;;
@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
     ;;
     add  r24=r24,r16
     ;;
-    add r16=PAL_VPS_SYNC_WRITE,r20
-    ;;
-    mov b0=r16
-    br.cond.sptk b0         // call the service
+    br.sptk.many  kvm_vps_sync_write       // call the service
     ;;
 END(ia64_leave_hypervisor)
 // fall through
@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r17:cr.isr
  *  r18:vpd
  *  r19:vpsr
- *  r20:__vsa_base
  *  r22:b0
  *  r23:predicate
  */
     mov r24=r22
     mov r25=r18
     tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+    (p1) br.cond.sptk.few kvm_vps_resume_normal
+    (p2) br.cond.sptk.many kvm_vps_resume_handler
     ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p1) br.sptk.many ia64_vmm_entry_out
-    ;;
-    tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT		//p1=cr.isr.ir
-    ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
-    (p2) ld8 r26=[r25]
-    ;;
-ia64_vmm_entry_out:
-    mov pr=r23,-2
-    mov b0=r29
-    ;;
-    br.cond.sptk b0             // call pal service
 END(ia64_vmm_entry)
 
 
@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     //set up ipsr, iip, vpd.vpsr, dcr
     // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
     // For DCR: all bits 0
+    bsw.0
+    ;;
+    mov r21 =r13
     adds r14=-VMM_PT_REGS_SIZE, r12
     ;;
     movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ;;
     srlz.i
     ;;
-    bsw.0
-    ;;
-    mov r21 =r13
-    ;;
-    bsw.1
-    ;;
     mov ar.rsc = 0
     ;;
     flushrs
@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ld8 r1 = [r20]
     ;;
     mov cr.iip=r4
-    ;;
     adds r16=VMM_VPD_BASE_OFFSET,r13
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
     ;;
     ld8 r18=[r16]
-    ld8 r20=[r20]
     ;;
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
-- 
cgit v1.2.3


From 1f095610aabb9d54617901aa734d2a6093f2000c Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 13 Sep 2008 06:21:22 +0800
Subject: KVM: ia64: add support for Tukwila processors

In Tukwila processor, VT-i has been enhanced in its
implementation, it is often called VT-i2 techonology.
With VTi-2 support, virtulization performance should be
improved. In this patch, we added the related stuff to
support kvm/ia64 for Tukwila processors.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/optvfault.S | 112 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 91 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index f0bf0a8efa3..634abad979b 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -1,9 +1,12 @@
 /*
- * arch/ia64/vmx/optvfault.S
+ * arch/ia64/kvm/optvfault.S
  * optimize virtualization fault handler
  *
  * Copyright (C) 2006 Intel Co
  *	Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ * Copyright (C) 2008 Intel Co
+ *      Add the support for Tukwila processors.
+ *	Xiantao Zhang <xiantao.zhang@intel.com>
  */
 
 #include <asm/asmmacro.h>
@@ -20,6 +23,29 @@
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+#define VMX_VPS_SYNC_READ			\
+	add r16=VMM_VPD_BASE_OFFSET,r21;	\
+	mov r17 = b0;				\
+	mov r18 = r24;				\
+	mov r19 = r25;				\
+	mov r20 = r31;				\
+	;;					\
+{.mii;						\
+	ld8 r16 = [r16];			\
+	nop 0x0;				\
+	mov r24 = ip;				\
+	;;					\
+};						\
+{.mmb;						\
+	add r24=0x20, r24;			\
+	mov r25 =r16;				\
+	br.sptk.many kvm_vps_sync_read;		\
+};						\
+	mov b0 = r17;				\
+	mov r24 = r18;				\
+	mov r25 = r19;				\
+	mov r31 = r20
+
 ENTRY(kvm_vps_entry)
 	adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
 	;;
@@ -226,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 #ifndef ACCE_RSM
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
+	VMX_VPS_SYNC_READ
+	;;
 	extr.u r26=r25,6,21
 	extr.u r27=r25,31,2
 	;;
-	ld8 r16=[r16]
 	extr.u r28=r25,36,1
 	dep r26=r27,r26,21,2
 	;;
@@ -265,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 	tbit.nz p6,p0=r23,0
 	;;
 	tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
-	(p6) br.dptk kvm_resume_to_guest
+	(p6) br.dptk kvm_resume_to_guest_with_sync
 	;;
 	add r26=VMM_VCPU_META_RR0_OFFSET,r21
 	add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
@@ -281,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 	mov rr[r28]=r27
 	;;
 	srlz.d
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_rsm)
 
 
@@ -290,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
 #ifndef ACCE_SSM
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
+	VMX_VPS_SYNC_READ
+	;;
 	extr.u r26=r25,6,21
 	extr.u r27=r25,31,2
 	;;
-	ld8 r16=[r16]
 	extr.u r28=r25,36,1
 	dep r26=r27,r26,21,2
 	;;  //r26 is imm24
@@ -340,7 +366,7 @@ kvm_asm_ssm_1:
 	tbit.nz p6,p0=r29,IA64_PSR_I_BIT
 	;;
 	tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
-	(p6) br.dptk kvm_resume_to_guest
+	(p6) br.dptk kvm_resume_to_guest_with_sync
 	;;
 	add r29=VPD_VTPR_START_OFFSET,r16
 	add r30=VPD_VHPI_START_OFFSET,r16
@@ -355,7 +381,7 @@ kvm_asm_ssm_1:
 	;;
 	cmp.gt p6,p0=r30,r17
 	(p6) br.dpnt.few kvm_asm_dispatch_vexirq
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_ssm)
 
 
@@ -364,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
 #ifndef ACCE_MOV_TO_PSR
 	br.many kvm_virtualization_fault_back
 #endif
-	add r16=VMM_VPD_BASE_OFFSET,r21
-	extr.u r26=r25,13,7 //r2
+	VMX_VPS_SYNC_READ
 	;;
-	ld8 r16=[r16]
+	extr.u r26=r25,13,7 //r2
 	addl r20=@gprel(asm_mov_from_reg),gp
 	;;
 	adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
@@ -443,7 +468,7 @@ kvm_asm_mov_to_psr_1:
 	;;
 	tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
 	tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
-	(p6) br.dpnt.few kvm_resume_to_guest
+	(p6) br.dpnt.few kvm_resume_to_guest_with_sync
 	;;
 	add r29=VPD_VTPR_START_OFFSET,r16
 	add r30=VPD_VHPI_START_OFFSET,r16
@@ -458,13 +483,29 @@ kvm_asm_mov_to_psr_1:
 	;;
 	cmp.gt p6,p0=r30,r17
 	(p6) br.dpnt.few kvm_asm_dispatch_vexirq
-	br.many kvm_resume_to_guest
+	br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_mov_to_psr)
 
 
 ENTRY(kvm_asm_dispatch_vexirq)
 //increment iip
+	mov r17 = b0
+	mov r18 = r31
+{.mii
+	add r25=VMM_VPD_BASE_OFFSET,r21
+	nop 0x0
+	mov r24 = ip
+	;;
+}
+{.mmb
+	add r24 = 0x20, r24
+	ld8 r25 = [r25]
+	br.sptk.many kvm_vps_sync_write
+}
+	mov b0 =r17
 	mov r16=cr.ipsr
+	mov r31 = r18
+	mov r19 = 37
 	;;
 	extr.u r17=r16,IA64_PSR_RI_BIT,2
 	tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
@@ -504,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash)
 	;;
 kvm_asm_thash_back1:
 	shr.u r23=r19,61		// get RR number
-	adds r25=VMM_VCPU_VRR0_OFFSET,r21	// get vcpu->arch.vrr[0]'s addr
+	adds r28=VMM_VCPU_VRR0_OFFSET,r21	// get vcpu->arch.vrr[0]'s addr
 	adds r16=VMM_VPD_VPTA_OFFSET,r16	// get vpta
 	;;
-	shladd r27=r23,3,r25	// get vcpu->arch.vrr[r23]'s addr
+	shladd r27=r23,3,r28	// get vcpu->arch.vrr[r23]'s addr
 	ld8 r17=[r16]		// get PTA
 	mov r26=1
 	;;
-	extr.u r29=r17,2,6		// get pta.size
-	ld8 r25=[r27]		// get vcpu->arch.vrr[r23]'s value
+	extr.u r29=r17,2,6	// get pta.size
+	ld8 r28=[r27]		// get vcpu->arch.vrr[r23]'s value
 	;;
-	extr.u r25=r25,2,6		// get rr.ps
+	mov b0=r24
+	//Fallback to C if pta.vf is set
+	tbit.nz p6,p0=r17, 8
+	;;
+	(p6) mov r24=EVENT_THASH
+	(p6) br.cond.dpnt.many kvm_virtualization_fault_back
+	extr.u r28=r28,2,6	// get rr.ps
 	shl r22=r26,r29		// 1UL << pta.size
 	;;
-	shr.u r23=r19,r25		// vaddr >> rr.ps
+	shr.u r23=r19,r28	// vaddr >> rr.ps
 	adds r26=3,r29		// pta.size + 3
 	shl r27=r17,3		// pta << 3
 	;;
 	shl r23=r23,3		// (vaddr >> rr.ps) << 3
-	shr.u r27=r27,r26		// (pta << 3) >> (pta.size+3)
+	shr.u r27=r27,r26	// (pta << 3) >> (pta.size+3)
 	movl r16=7<<61
 	;;
 	adds r22=-1,r22		// (1UL << pta.size) - 1
@@ -793,6 +840,29 @@ END(asm_mov_from_reg)
  * r31: pr
  * r24: b0
  */
+ENTRY(kvm_resume_to_guest_with_sync)
+	adds r19=VMM_VPD_BASE_OFFSET,r21
+	mov r16 = r31
+	mov r17 = r24
+	;;
+{.mii
+	ld8 r25 =[r19]
+	nop 0x0
+	mov r24 = ip
+	;;
+}
+{.mmb
+	add r24 =0x20, r24
+	nop 0x0
+	br.sptk.many kvm_vps_sync_write
+}
+
+	mov r31 = r16
+	mov r24 =r17
+	;;
+	br.sptk.many kvm_resume_to_guest
+END(kvm_resume_to_guest_with_sync)
+
 ENTRY(kvm_resume_to_guest)
 	adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
 	;;
-- 
cgit v1.2.3


From a08546001c2b0f584ffc81987340943a7d6d6acb Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 23 Sep 2008 11:01:45 -0700
Subject: x86: pvclock: fix shadowed variable warning

arch/x86/kernel/pvclock.c:102:6: warning: symbol 'tsc_khz' shadows an earlier one
include/asm/tsc.h:18:21: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/pvclock.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 1c54b5fb7ae..4f9c55f3a7c 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -99,14 +99,14 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
-	u64 tsc_khz = 1000000ULL << 32;
+	u64 pv_tsc_khz = 1000000ULL << 32;
 
-	do_div(tsc_khz, src->tsc_to_system_mul);
+	do_div(pv_tsc_khz, src->tsc_to_system_mul);
 	if (src->tsc_shift < 0)
-		tsc_khz <<= -src->tsc_shift;
+		pv_tsc_khz <<= -src->tsc_shift;
 	else
-		tsc_khz >>= src->tsc_shift;
-	return tsc_khz;
+		pv_tsc_khz >>= src->tsc_shift;
+	return pv_tsc_khz;
 }
 
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
-- 
cgit v1.2.3


From 93a423e7045cf3cf69f960ff307edda1afcd7b41 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:29 -0300
Subject: KVM: MMU: flush remote TLBs on large->normal entry overwrite

It is necessary to flush all TLB's when a large spte entry is
overwritten with a normal page directory pointer.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6dd08e096e2..e9fbaa44d44 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -310,8 +310,11 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
 		return 0;
 
-	if (is_large_pte(*sptep))
+	if (is_large_pte(*sptep)) {
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
 		rmap_remove(vcpu->kvm, sptep);
+	}
 
 	if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
 		metaphysical = 1;
-- 
cgit v1.2.3


From 1e73f9dd885957bf0c7bb5e63b350d5aeb06b726 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:30 -0300
Subject: KVM: MMU: split mmu_set_spte

Split the spte entry creation code into a new set_spte function.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 101 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 44 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5779a2323e2..9ad4cc55389 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1148,44 +1148,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+		    unsigned pte_access, int user_fault,
+		    int write_fault, int dirty, int largepage,
+		    gfn_t gfn, pfn_t pfn, bool speculative)
 {
 	u64 spte;
-	int was_rmapped = 0;
-	int was_writeble = is_writeble_pte(*shadow_pte);
-
-	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __func__, *shadow_pte, pt_access,
-		 write_fault, user_fault, gfn);
-
-	if (is_rmap_pte(*shadow_pte)) {
-		/*
-		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-		 * the parent of the now unreachable PTE.
-		 */
-		if (largepage && !is_large_pte(*shadow_pte)) {
-			struct kvm_mmu_page *child;
-			u64 pte = *shadow_pte;
-
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (pfn != spte_to_pfn(*shadow_pte)) {
-			pgprintk("hfn old %lx new %lx\n",
-				 spte_to_pfn(*shadow_pte), pfn);
-			rmap_remove(vcpu->kvm, shadow_pte);
-		} else {
-			if (largepage)
-				was_rmapped = is_large_pte(*shadow_pte);
-			else
-				was_rmapped = 1;
-		}
-	}
-
+	int ret = 0;
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1218,26 +1187,70 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
+			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
 			if (is_writeble_pte(spte)) {
 				spte &= ~PT_WRITABLE_MASK;
 				kvm_x86_ops->tlb_flush(vcpu);
 			}
-			if (write_fault)
-				*ptwrite = 1;
 		}
 	}
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
-	pgprintk("%s: setting spte %llx\n", __func__, spte);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
-		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
 	set_shadow_pte(shadow_pte, spte);
-	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
-	    && (spte & PT_PRESENT_MASK))
+	return ret;
+}
+
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+			 unsigned pt_access, unsigned pte_access,
+			 int user_fault, int write_fault, int dirty,
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
+{
+	int was_rmapped = 0;
+	int was_writeble = is_writeble_pte(*shadow_pte);
+
+	pgprintk("%s: spte %llx access %x write_fault %d"
+		 " user_fault %d gfn %lx\n",
+		 __func__, *shadow_pte, pt_access,
+		 write_fault, user_fault, gfn);
+
+	if (is_rmap_pte(*shadow_pte)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (pfn != spte_to_pfn(*shadow_pte)) {
+			pgprintk("hfn old %lx new %lx\n",
+				 spte_to_pfn(*shadow_pte), pfn);
+			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
+		}
+	}
+	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+		      dirty, largepage, gfn, pfn, speculative))
+		if (write_fault)
+			*ptwrite = 1;
+
+	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+		 is_large_pte(*shadow_pte)? "2MB" : "4kB",
+		 is_present_pte(*shadow_pte)?"RW":"R", gfn,
+		 *shadow_pte, shadow_pte);
+	if (!was_rmapped && is_large_pte(*shadow_pte))
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
-- 
cgit v1.2.3


From a378b4e64c0fef2d9e53214db167878b7673a7a3 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:31 -0300
Subject: KVM: MMU: move local TLB flush to mmu_set_spte

Since the sync page path can collapse flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9ad4cc55389..23752ef0839 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1189,10 +1189,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writeble_pte(spte)) {
+			if (is_writeble_pte(spte))
 				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-			}
 		}
 	}
 
@@ -1241,9 +1239,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative))
+		      dirty, largepage, gfn, pfn, speculative)) {
 		if (write_fault)
 			*ptwrite = 1;
+		kvm_x86_ops->tlb_flush(vcpu);
+	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
 	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-- 
cgit v1.2.3


From 38187c830cab84daecb41169948467f1f19317e3 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:32 -0300
Subject: KVM: MMU: do not write-protect large mappings

There is not much point in write protecting large mappings. This
can only happen when a page is shadowed during the window between
is_largepage_backed and mmu_lock acquision. Zap the entry instead, so
the next pagefault will find a shadowed page via is_largepage_backed and
fallback to 4k translations.

Simplifies out of sync shadow.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 23752ef0839..731e6fe9cb0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
 		struct kvm_mmu_page *shadow;
 
+		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+			ret = 1;
+			spte = shadow_trap_nonpresent_pte;
+			goto set_pte;
+		}
+
 		spte |= PT_WRITABLE_MASK;
 
 		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow ||
-		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+		if (shadow) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
@@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
+set_pte:
 	set_shadow_pte(shadow_pte, spte);
 	return ret;
 }
-- 
cgit v1.2.3


From e8bc217aef67d41d767ede6e7a7eb10f1d47c86c Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:33 -0300
Subject: KVM: MMU: mode specific sync_page

Examine guest pagetable and bring the shadow back in sync. Caller is responsible
for local TLB flush before re-entering guest mode.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 10 +++++++++
 arch/x86/kvm/paging_tmpl.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 731e6fe9cb0..90f01169c8f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -871,6 +871,12 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 		sp->spt[i] = shadow_trap_nonpresent_pte;
 }
 
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+			       struct kvm_mmu_page *sp)
+{
+	return 1;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -1547,6 +1553,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1594,6 +1601,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
+	context->sync_page = paging64_sync_page;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1615,6 +1623,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
+	context->sync_page = paging32_sync_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1634,6 +1643,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e9fbaa44d44..776fb6d2fd8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -507,6 +507,60 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 	}
 }
 
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ *   can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	int i, offset, nr_present;
+
+	offset = nr_present = 0;
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+		unsigned pte_access;
+		pt_element_t gpte;
+		gpa_t pte_gpa;
+		gfn_t gfn = sp->gfns[i];
+
+		if (!is_shadow_present_pte(sp->spt[i]))
+			continue;
+
+		pte_gpa = gfn_to_gpa(sp->gfn);
+		pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+					  sizeof(pt_element_t)))
+			return -EINVAL;
+
+		if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+		    !(gpte & PT_ACCESSED_MASK)) {
+			u64 nonpresent;
+
+			rmap_remove(vcpu->kvm, &sp->spt[i]);
+			if (is_present_pte(gpte))
+				nonpresent = shadow_trap_nonpresent_pte;
+			else
+				nonpresent = shadow_notrap_nonpresent_pte;
+			set_shadow_pte(&sp->spt[i], nonpresent);
+			continue;
+		}
+
+		nr_present++;
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+			 is_dirty_pte(gpte), 0, gfn,
+			 spte_to_pfn(sp->spt[i]), true);
+	}
+
+	return !nr_present;
+}
+
 #undef pt_element_t
 #undef guest_walker
 #undef shadow_walker
-- 
cgit v1.2.3


From 0ba73cdadb8ac172f396df7e23c4a9cebd59b550 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:34 -0300
Subject: KVM: MMU: sync roots on mmu reload

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c |  1 +
 2 files changed, 37 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 90f01169c8f..9d8c4bb68a8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1471,6 +1471,41 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+}
+
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		mmu_sync_children(vcpu, sp);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			mmu_sync_children(vcpu, sp);
+		}
+	}
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
@@ -1715,6 +1750,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
+	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08edeabf15e..88e6d9abbd2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -594,6 +594,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
 		return;
 	}
-- 
cgit v1.2.3


From a7052897b3bcd568a9f5bfaa558957039e7e7ec0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:35 -0300
Subject: KVM: x86: trap invlpg

With pages out of sync invlpg needs to be trapped. For now simply nuke
the entry.

Untested on AMD.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 18 ++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++++++++++
 arch/x86/kvm/svm.c         | 13 +++++++++++--
 arch/x86/kvm/vmx.c         | 19 ++++++++++++++++---
 arch/x86/kvm/x86.c         |  1 +
 5 files changed, 71 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9d8c4bb68a8..e89af1df4fc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -877,6 +877,10 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -1589,6 +1593,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1637,6 +1642,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
 	context->sync_page = paging64_sync_page;
+	context->invlpg = paging64_invlpg;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1659,6 +1665,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
 	context->sync_page = paging32_sync_page;
+	context->invlpg = paging32_invlpg;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1679,6 +1686,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
@@ -2071,6 +2079,16 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	vcpu->arch.mmu.invlpg(vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_flush_tlb(vcpu);
+	++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
 void kvm_enable_tdp(void)
 {
 	tdp_enabled = true;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 776fb6d2fd8..dc169e8148b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,31 @@ out_unlock:
 	return 0;
 }
 
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+				      struct kvm_vcpu *vcpu, u64 addr,
+				      u64 *sptep, int level)
+{
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		if (is_shadow_present_pte(*sptep))
+			rmap_remove(vcpu->kvm, sptep);
+		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+		return 1;
+	}
+	if (!is_shadow_present_pte(*sptep))
+		return 1;
+	return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	struct shadow_walker walker = {
+		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+	};
+
+	walk_shadow(&walker.walker, vcpu, gva);
+}
+
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	struct guest_walker walker;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9b54550fa4d..9c4ce657d96 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -525,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 				(1ULL << INTERCEPT_CPUID) |
 				(1ULL << INTERCEPT_INVD) |
 				(1ULL << INTERCEPT_HLT) |
+				(1ULL << INTERCEPT_INVLPG) |
 				(1ULL << INTERCEPT_INVLPGA) |
 				(1ULL << INTERCEPT_IOIO_PROT) |
 				(1ULL << INTERCEPT_MSR_PROT) |
@@ -589,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
 		control->nested_ctl = 1;
-		control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+		control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+					(1ULL << INTERCEPT_INVLPG));
 		control->intercept_exceptions &= ~(1 << PF_VECTOR);
 		control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
 						INTERCEPT_CR3_MASK);
@@ -1164,6 +1166,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+	return 1;
+}
+
 static int emulate_on_interception(struct vcpu_svm *svm,
 				   struct kvm_run *kvm_run)
 {
@@ -1417,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
 	[SVM_EXIT_HLT]				= halt_interception,
-	[SVM_EXIT_INVLPG]			= emulate_on_interception,
+	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invalid_op_interception,
 	[SVM_EXIT_IOIO] 		  	= io_interception,
 	[SVM_EXIT_MSR]				= msr_interception,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 025bf4011ab..4556cc3715b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1130,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_CR3_STORE_EXITING |
 	      CPU_BASED_USE_IO_BITMAPS |
 	      CPU_BASED_MOV_DR_EXITING |
-	      CPU_BASED_USE_TSC_OFFSETING;
+	      CPU_BASED_USE_TSC_OFFSETING |
+	      CPU_BASED_INVLPG_EXITING;
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1159,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
 	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
-		/* CR3 accesses don't need to cause VM Exits when EPT enabled */
+		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+		   enabled */
 		min &= ~(CPU_BASED_CR3_LOAD_EXITING |
-			 CPU_BASED_CR3_STORE_EXITING);
+			 CPU_BASED_CR3_STORE_EXITING |
+			 CPU_BASED_INVLPG_EXITING);
 		if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
 					&_cpu_based_exec_control) < 0)
 			return -EIO;
@@ -2790,6 +2793,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+	kvm_mmu_invlpg(vcpu, exit_qualification);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	skip_emulated_instruction(vcpu);
@@ -2958,6 +2970,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
 	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 	[EXIT_REASON_HLT]                     = handle_halt,
+	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88e6d9abbd2..efee85ba07e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2341,6 +2341,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
+	kvm_mmu_invlpg(vcpu, address);
 	return X86EMUL_CONTINUE;
 }
 
-- 
cgit v1.2.3


From ad8cfbe3fffdc09704f0808fde3934855620d545 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:36 -0300
Subject: KVM: MMU: mmu_parent_walk

Introduce a function to walk all parents of a given page, invoking a handler.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e89af1df4fc..b82abee78f1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,8 @@ struct kvm_shadow_walk {
 		     u64 addr, u64 *spte, int level);
 };
 
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -862,6 +864,31 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    mmu_parent_walk_fn fn)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	struct kvm_mmu_page *parent_sp;
+	int i;
+
+	if (!sp->multimapped && sp->parent_pte) {
+		parent_sp = page_header(__pa(sp->parent_pte));
+		fn(vcpu, parent_sp);
+		mmu_parent_walk(vcpu, parent_sp, fn);
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+			fn(vcpu, parent_sp);
+			mmu_parent_walk(vcpu, parent_sp, fn);
+		}
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
-- 
cgit v1.2.3


From 0738541396be165995c7f2387746eb0b47024fec Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:37 -0300
Subject: KVM: MMU: awareness of new kvm_mmu_zap_page behaviour

kvm_mmu_zap_page will soon zap the unsynced children of a page. Restart
list walk in such case.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b82abee78f1..c9b4b902527 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1078,7 +1078,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	++kvm->stat.mmu_shadow_zapped;
 	kvm_mmu_page_unlink_children(kvm, sp);
@@ -1095,6 +1095,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_reload_remote_mmus(kvm);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
+	return 0;
 }
 
 /*
@@ -1147,8 +1148,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
-			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
+			if (kvm_mmu_zap_page(kvm, sp))
+				n = bucket->first;
 		}
 	return r;
 }
@@ -1992,7 +1994,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+			if (kvm_mmu_zap_page(vcpu->kvm, sp))
+				n = bucket->first;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2226,7 +2229,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, sp);
+		if (kvm_mmu_zap_page(kvm, sp))
+			node = container_of(kvm->arch.active_mmu_pages.next,
+					    struct kvm_mmu_page, link);
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
-- 
cgit v1.2.3


From 6844dec6948679d084f054235fee19ba4e3a3096 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:38 -0300
Subject: KVM: MMU: mmu_convert_notrap helper

Need to convert shadow_notrap_nonpresent -> shadow_trap_nonpresent when
unsyncing pages.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c9b4b902527..57c7580e7f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1173,6 +1173,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, &sp->slot_bitmap);
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+	int i;
+	u64 *pt = sp->spt;
+
+	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+		return;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (pt[i] == shadow_notrap_nonpresent_pte)
+			set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+	}
+}
+
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct page *page;
-- 
cgit v1.2.3


From 4731d4c7a07769cf2926c327177b97bb8c68cafc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:39 -0300
Subject: KVM: MMU: out of sync shadow core

Allow guest pagetables to go out of sync.  Instead of emulating write
accesses to guest pagetables, or unshadowing them, we un-write-protect
the page table and allow the guest to modify it at will.  We rely on
invlpg executions to synchronize individual ptes, and will synchronize
the entire pagetable on tlb flushes.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 210 +++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/paging_tmpl.h |   2 +-
 arch/x86/kvm/x86.c         |   3 +
 3 files changed, 197 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57c7580e7f9..d88659ae777 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,10 @@ struct kvm_shadow_walk {
 		     u64 addr, u64 *spte, int level);
 };
 
+struct kvm_unsync_walk {
+	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
@@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
-
-	account_shadowed(kvm, gfn);
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_unsync_walk *walker)
+{
+	int i, ret;
+
+	if (!sp->unsync_children)
+		return 0;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			struct kvm_mmu_page *child;
+			child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+			if (child->unsync_children) {
+				ret = mmu_unsync_walk(child, walker);
+				if (ret)
+					return ret;
+			}
+
+			if (child->unsync) {
+				ret = walker->entry(child, walker);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	if (i == PT64_ENT_PER_PAGE)
+		sp->unsync_children = 0;
+
+	return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	WARN_ON(!sp->unsync);
+	sp->unsync = 0;
+	--kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	kvm_mmu_flush_tlb(vcpu);
+	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	return 0;
+}
+
+struct sync_walker {
+	struct kvm_vcpu *vcpu;
+	struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+						     walker);
+	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+	kvm_sync_page(vcpu, sp);
+	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct sync_walker walker = {
+		.walker = { .entry = mmu_sync_fn, },
+		.vcpu = vcpu,
+	};
+
+	while (mmu_unsync_walk(sp, &walker.walker))
+		cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned quadrant;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
+	struct hlist_node *node, *tmp;
 
 	role.word = 0;
 	role.glevels = vcpu->arch.mmu.root_level;
@@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && sp->role.word == role.word) {
+	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+		if (sp->gfn == gfn) {
+			if (sp->unsync)
+				if (kvm_sync_page(vcpu, sp))
+					continue;
+
+			if (sp->role.word != role.word)
+				continue;
+
+			if (sp->unsync_children)
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			pgprintk("%s: found\n", __func__);
 			return sp;
@@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
-	if (!metaphysical)
+	if (!metaphysical) {
 		rmap_write_protect(vcpu->kvm, gfn);
+		account_shadowed(vcpu->kvm, gfn);
+	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	else
@@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
+struct zap_walker {
+	struct kvm_unsync_walk walker;
+	struct kvm *kvm;
+	int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+						     walker);
+	kvm_mmu_zap_page(zap_walk->kvm, sp);
+	zap_walk->zapped = 1;
+	return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct zap_walker walker = {
+		.walker = { .entry = mmu_zap_fn, },
+		.kvm = kvm,
+		.zapped = 0,
+	};
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		return 0;
+	mmu_unsync_walk(sp, &walker.walker);
+	return walker.zapped;
+}
+
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	int ret;
 	++kvm->stat.mmu_shadow_zapped;
+	ret = mmu_zap_unsync_children(kvm, sp);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.metaphysical)
 		unaccount_shadowed(kvm, sp->gfn);
+	if (sp->unsync)
+		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
@@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_reload_remote_mmus(kvm);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
-	return 0;
+	return ret;
 }
 
 /*
@@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	return 1;
+}
+
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
+
+	index = kvm_page_table_hashfn(sp->gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	/* don't unsync if pagetable is shadowed with multiple roles */
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != sp->gfn || s->role.metaphysical)
+			continue;
+		if (s->role.word != sp->role.word)
+			return 1;
+	}
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	++vcpu->kvm->stat.mmu_unsync;
+	sp->unsync = 1;
+	mmu_convert_notrap(sp);
+	return 0;
+}
+
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				  bool can_unsync)
+{
+	struct kvm_mmu_page *shadow;
+
+	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+	if (shadow) {
+		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+		if (shadow->unsync)
+			return 0;
+		if (can_unsync)
+			return kvm_unsync_page(vcpu, shadow);
+		return 1;
+	}
+	return 0;
+}
+
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    gfn_t gfn, pfn_t pfn, bool speculative)
+		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
@@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
 
 		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
 			ret = 1;
@@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 		spte |= PT_WRITABLE_MASK;
 
-		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow) {
+		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
 			ret = 1;
@@ -1260,7 +1441,6 @@ set_pte:
 	return ret;
 }
 
-
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
@@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative)) {
+		      dirty, largepage, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-}
-
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dc169e8148b..613ec9aa674 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_pte(gpte), 0, gfn,
-			 spte_to_pfn(sp->spt[i]), true);
+			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
 	return !nr_present;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efee85ba07e..1c5864ac083 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_flooded", VM_STAT(mmu_flooded) },
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->requests) {
 		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
 			__kvm_migrate_timers(vcpu);
+		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+			kvm_mmu_sync_roots(vcpu);
 		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
 			kvm_x86_ops->tlb_flush(vcpu);
 		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
-- 
cgit v1.2.3


From 0074ff63ebc195701062ca46e0d82fcea0fa3a0a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:40 -0300
Subject: KVM: MMU: speed up mmu_unsync_walk

Cache the unsynced children information in a per-page bitmap.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 72 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d88659ae777..cb391d629af 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -891,6 +891,52 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		}
 }
 
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+	unsigned int index;
+	struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+	index = spte - sp->spt;
+	__set_bit(index, sp->unsync_child_bitmap);
+	sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!sp->parent_pte)
+		return;
+
+	if (!sp->multimapped) {
+		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+		return;
+	}
+
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+		}
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	kvm_mmu_update_parents_unsync(sp);
+	return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+					struct kvm_mmu_page *sp)
+{
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_update_parents_unsync(sp);
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
@@ -910,6 +956,11 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+#define for_each_unsync_children(bitmap, idx)		\
+	for (idx = find_first_bit(bitmap, 512);		\
+	     idx < 512;					\
+	     idx = find_next_bit(bitmap, 512, idx+1))
+
 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 			   struct kvm_unsync_walk *walker)
 {
@@ -918,7 +969,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	if (!sp->unsync_children)
 		return 0;
 
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 		u64 ent = sp->spt[i];
 
 		if (is_shadow_present_pte(ent)) {
@@ -929,17 +980,19 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 				ret = mmu_unsync_walk(child, walker);
 				if (ret)
 					return ret;
+				__clear_bit(i, sp->unsync_child_bitmap);
 			}
 
 			if (child->unsync) {
 				ret = walker->entry(child, walker);
+				__clear_bit(i, sp->unsync_child_bitmap);
 				if (ret)
 					return ret;
 			}
 		}
 	}
 
-	if (i == PT64_ENT_PER_PAGE)
+	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
 		sp->unsync_children = 0;
 
 	return 0;
@@ -1056,10 +1109,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			if (sp->role.word != role.word)
 				continue;
 
-			if (sp->unsync_children)
-				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+			if (sp->unsync_children) {
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+				kvm_mmu_mark_parents_unsync(vcpu, sp);
+			}
 			pgprintk("%s: found\n", __func__);
 			return sp;
 		}
@@ -1336,12 +1390,6 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
-	sp->unsync_children = 1;
-	return 1;
-}
-
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	unsigned index;
@@ -1358,7 +1406,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
-	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 	mmu_convert_notrap(sp);
-- 
cgit v1.2.3


From 582801a95d2f2ceab841779e1dec0e11dfec44c0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Sep 2008 13:18:41 -0300
Subject: KVM: MMU: add "oos_shadow" parameter to disable oos

Subject says it all.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb391d629af..99c239c5c0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -1424,7 +1427,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 		if (shadow->unsync)
 			return 0;
-		if (can_unsync)
+		if (can_unsync && oos_shadow)
 			return kvm_unsync_page(vcpu, shadow);
 		return 1;
 	}
-- 
cgit v1.2.3


From e48258009d941891fca35348986b8d280caf31cd Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 24 Sep 2008 20:28:34 -0300
Subject: KVM: PIC: enhance IPI avoidance

The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in
unnecessary guest exits in some conditions.

For example, if the timer interrupt is routed through the IOAPIC, IRR
for IRQ 0 will get set but not cleared, since the APIC is handling the
acks.

This means that everytime an interrupt < 16 is triggered, the priority
logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is
not masked, which is Linux's case).

Introduce a new variable isr_ack to represent the IRQ's for which the
guest has been signalled / cleared the ISR. Use it to avoid more than
one IPI per trigger-ack cycle, in addition to the avoidance when ISR is
set in get_priority().

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 17 +++++++++++++++--
 arch/x86/kvm/irq.h   |  2 ++
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 71e3eeeccae..17e41e165f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,14 @@
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
+	s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+	struct kvm_pic *s = pic_irqchip(kvm);
+	s->pics[0].isr_ack = 0xff;
+	s->pics[1].isr_ack = 0xff;
 }
 
 /*
@@ -213,6 +221,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->irr = 0;
 	s->imr = 0;
 	s->isr = 0;
+	s->isr_ack = 0xff;
 	s->priority_add = 0;
 	s->irq_base = 0;
 	s->read_reg_select = 0;
@@ -444,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
 {
 	struct kvm *kvm = opaque;
 	struct kvm_vcpu *vcpu = kvm->vcpus[0];
+	struct kvm_pic *s = pic_irqchip(kvm);
+	int irq = pic_get_irq(&s->pics[0]);
 
-	pic_irqchip(kvm)->output = level;
-	if (vcpu)
+	s->output = level;
+	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+		s->pics[0].isr_ack &= ~(1 << irq);
 		kvm_vcpu_kick(vcpu);
+	}
 }
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 479a3d2d561..4748532fd1d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
 	u8 irr;		/* interrupt request register */
 	u8 imr;		/* interrupt mask register */
 	u8 isr;		/* interrupt service register */
+	u8 isr_ack;	/* interrupt ack detection */
 	u8 priority_add;	/* highest irq priority */
 	u8 irq_base;
 	u8 read_reg_select;
@@ -70,6 +71,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
 void kvm_pic_set_irq(void *opaque, int irq, int level);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1c5864ac083..4cfdd1b6df0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3963,6 +3963,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 			pr_debug("Set back pending irq %d\n",
 				 pending_vec);
 		}
+		kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
 
 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
-- 
cgit v1.2.3


From e5fcfc821a467bd0827635db8fd39ab1213987e5 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Thu, 25 Sep 2008 23:32:02 +0800
Subject: KVM: Device Assignment: Map mmio pages into VT-d page table

Assigned device could DMA to mmio pages, so also need to map mmio pages
into VT-d page table.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vtd.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
index 667bf3fb64b..a770874f3a3 100644
--- a/arch/x86/kvm/vtd.c
+++ b/arch/x86/kvm/vtd.c
@@ -36,37 +36,30 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 {
 	gfn_t gfn = base_gfn;
 	pfn_t pfn;
-	int i, r;
+	int i, r = 0;
 	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
-	r = -EINVAL;
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
 		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
 						     gfn_to_gpa(gfn));
-		if (pfn && !is_mmio_pfn(pfn))
+		if (pfn)
 			continue;
 
 		pfn = gfn_to_pfn(kvm, gfn);
-		if (!is_mmio_pfn(pfn)) {
-			r = intel_iommu_page_mapping(domain,
-						     gfn_to_gpa(gfn),
-						     pfn_to_hpa(pfn),
-						     PAGE_SIZE,
-						     DMA_PTE_READ |
-						     DMA_PTE_WRITE);
-			if (r) {
-				printk(KERN_DEBUG "kvm_iommu_map_pages:"
-				       "iommu failed to map pfn=%lx\n", pfn);
-				goto unmap_pages;
-			}
-		} else {
-			printk(KERN_DEBUG "kvm_iommu_map_page:"
-			       "invalid pfn=%lx\n", pfn);
+		r = intel_iommu_page_mapping(domain,
+					     gfn_to_gpa(gfn),
+					     pfn_to_hpa(pfn),
+					     PAGE_SIZE,
+					     DMA_PTE_READ |
+					     DMA_PTE_WRITE);
+		if (r) {
+			printk(KERN_ERR "kvm_iommu_map_pages:"
+			       "iommu failed to map pfn=%lx\n", pfn);
 			goto unmap_pages;
 		}
 		gfn++;
-- 
cgit v1.2.3


From 1b10bf31a5de5b76e2e9c2937878a45c5ae2be37 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 30 Sep 2008 10:41:06 +0200
Subject: KVM: x86: Silence various LAPIC-related host kernel messages

KVM-x86 dumps a lot of debug messages that have no meaning for normal
operation:
 - INIT de-assertion is ignored
 - SIPIs are sent and received
 - APIC writes are unaligned or < 4 byte long
   (Windows Server 2003 triggers this on SMP)

Degrade them to true debug messages, keeping the host kernel log clean
for real problems.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 16 +++++++---------
 arch/x86/kvm/x86.c   |  4 ++--
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fd00f698692..6571926bfd3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -365,16 +365,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
-			printk(KERN_DEBUG
-			       "Ignoring de-assert INIT to vcpu %d\n",
-			       vcpu->vcpu_id);
+			apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+				   vcpu->vcpu_id);
 		}
-
 		break;
 
 	case APIC_DM_STARTUP:
-		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-		       vcpu->vcpu_id, vector);
+		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+			   vcpu->vcpu_id, vector);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
@@ -679,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
 	 * Refer SDM 8.4.1
 	 */
 	if (len != 4 || alignment) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "apic write: bad size=%d %lx\n",
-			       len, (long)address);
+		/* Don't shout loud, $infamous_os would cause only noise. */
+		apic_debug("apic write: bad size=%d %lx\n",
+			   len, (long)address);
 		return;
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4cfdd1b6df0..d6d7123d264 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3224,8 +3224,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-		printk("vcpu %d received sipi with vector # %x\n",
-		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
+		pr_debug("vcpu %d received sipi with vector # %x\n",
+			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
 		r = kvm_x86_ops->vcpu_reset(vcpu);
 		if (r)
-- 
cgit v1.2.3


From 83dbc83a0d7c88c919d769177bd1924a46c9c034 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 7 Oct 2008 17:01:27 -0300
Subject: KVM: VMX: enable invlpg exiting if EPT is disabled

Manually disabling EPT via module option fails to re-enable INVLPG
exiting.

Reported-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4556cc3715b..2643b430d83 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2118,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	}
 	if (!vm_need_ept())
 		exec_control |= CPU_BASED_CR3_STORE_EXITING |
-				CPU_BASED_CR3_LOAD_EXITING;
+				CPU_BASED_CR3_LOAD_EXITING  |
+				CPU_BASED_INVLPG_EXITING;
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
 	if (cpu_has_secondary_exec_ctrls()) {
-- 
cgit v1.2.3


From 371c01b28e4049d1fbf60a9631cdad98f7cae030 Mon Sep 17 00:00:00 2001
From: Zhang xiantao <xiantao.zhang@intel.com>
Date: Thu, 11 Sep 2008 13:19:32 +0800
Subject: KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/

Preparation for kvm/ia64 VT-d support.

Signed-off-by: Zhang xiantao <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/Makefile |   6 +-
 arch/x86/kvm/vtd.c    | 191 --------------------------------------------------
 2 files changed, 3 insertions(+), 194 deletions(-)
 delete mode 100644 arch/x86/kvm/vtd.c

(limited to 'arch')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 3072b17447a..7dce593b9c6 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,14 +7,14 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
 	i8254.o
-ifeq ($(CONFIG_DMAR),y)
-kvm-objs += vtd.o
-endif
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c
deleted file mode 100644
index a770874f3a3..00000000000
--- a/arch/x86/kvm/vtd.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/pci.h>
-#include <linux/dmar.h>
-#include <linux/intel-iommu.h>
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
-				gfn_t base_gfn, unsigned long npages);
-
-int kvm_iommu_map_pages(struct kvm *kvm,
-			gfn_t base_gfn, unsigned long npages)
-{
-	gfn_t gfn = base_gfn;
-	pfn_t pfn;
-	int i, r = 0;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	for (i = 0; i < npages; i++) {
-		/* check if already mapped */
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
-		if (pfn)
-			continue;
-
-		pfn = gfn_to_pfn(kvm, gfn);
-		r = intel_iommu_page_mapping(domain,
-					     gfn_to_gpa(gfn),
-					     pfn_to_hpa(pfn),
-					     PAGE_SIZE,
-					     DMA_PTE_READ |
-					     DMA_PTE_WRITE);
-		if (r) {
-			printk(KERN_ERR "kvm_iommu_map_pages:"
-			       "iommu failed to map pfn=%lx\n", pfn);
-			goto unmap_pages;
-		}
-		gfn++;
-	}
-	return 0;
-
-unmap_pages:
-	kvm_iommu_put_pages(kvm, base_gfn, i);
-	return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
-	int i, r;
-
-	down_read(&kvm->slots_lock);
-	for (i = 0; i < kvm->nmemslots; i++) {
-		r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
-					kvm->memslots[i].npages);
-		if (r)
-			break;
-	}
-	up_read(&kvm->slots_lock);
-	return r;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *assigned_dev)
-{
-	struct pci_dev *pdev = NULL;
-	int r;
-
-	if (!intel_iommu_found()) {
-		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
-		return -ENODEV;
-	}
-
-	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
-	       assigned_dev->host_busnr,
-	       PCI_SLOT(assigned_dev->host_devfn),
-	       PCI_FUNC(assigned_dev->host_devfn));
-
-	pdev = assigned_dev->dev;
-
-	if (pdev == NULL) {
-		if (kvm->arch.intel_iommu_domain) {
-			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
-			kvm->arch.intel_iommu_domain = NULL;
-		}
-		return -ENODEV;
-	}
-
-	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
-	if (!kvm->arch.intel_iommu_domain)
-		return -ENODEV;
-
-	r = kvm_iommu_map_memslots(kvm);
-	if (r)
-		goto out_unmap;
-
-	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
-			       pdev->bus->number, pdev->devfn);
-
-	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
-					pdev);
-	if (r) {
-		printk(KERN_ERR "Domain context map for %s failed",
-		       pci_name(pdev));
-		goto out_unmap;
-	}
-	return 0;
-
-out_unmap:
-	kvm_iommu_unmap_memslots(kvm);
-	return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
-			       gfn_t base_gfn, unsigned long npages)
-{
-	gfn_t gfn = base_gfn;
-	pfn_t pfn;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-	int i;
-
-	for (i = 0; i < npages; i++) {
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
-		kvm_release_pfn_clean(pfn);
-		gfn++;
-	}
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
-	int i;
-	down_read(&kvm->slots_lock);
-	for (i = 0; i < kvm->nmemslots; i++) {
-		kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
-				    kvm->memslots[i].npages);
-	}
-	up_read(&kvm->slots_lock);
-
-	return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-	struct kvm_assigned_dev_kernel *entry;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-
-	/* check if iommu exists and in use */
-	if (!domain)
-		return 0;
-
-	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
-		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
-		       entry->host_busnr,
-		       PCI_SLOT(entry->host_devfn),
-		       PCI_FUNC(entry->host_devfn));
-
-		/* detach kvm dmar domain */
-		intel_iommu_detach_dev(domain, entry->host_busnr,
-				       entry->host_devfn);
-	}
-	kvm_iommu_unmap_memslots(kvm);
-	intel_iommu_domain_exit(domain);
-	return 0;
-}
-- 
cgit v1.2.3


From 8a98f6648a2b0756d8f26d6c13332f5526355fec Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:47:38 +0800
Subject: KVM: Move device assignment logic to common code

To share with other archs, this patch moves device assignment
logic to common parts.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/x86/kvm/x86.c | 255 -----------------------------------------------------
 1 file changed, 255 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d6d7123d264..f8bde01ba8e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -30,7 +30,6 @@
 #include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
-#include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
@@ -107,238 +106,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-						      int assigned_dev_id)
-{
-	struct list_head *ptr;
-	struct kvm_assigned_dev_kernel *match;
-
-	list_for_each(ptr, head) {
-		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
-		if (match->assigned_dev_id == assigned_dev_id)
-			return match;
-	}
-	return NULL;
-}
-
-static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
-				    interrupt_work);
-
-	/* This is taken to safely inject irq inside the guest. When
-	 * the interrupt injection (or the ioapic code) uses a
-	 * finer-grained lock, update this
-	 */
-	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm,
-		    assigned_dev->guest_irq, 1);
-	mutex_unlock(&assigned_dev->kvm->lock);
-	kvm_put_kvm(assigned_dev->kvm);
-}
-
-/* FIXME: Implement the OR logic needed to make shared interrupts on
- * this line behave properly
- */
-static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
-{
-	struct kvm_assigned_dev_kernel *assigned_dev =
-		(struct kvm_assigned_dev_kernel *) dev_id;
-
-	kvm_get_kvm(assigned_dev->kvm);
-	schedule_work(&assigned_dev->interrupt_work);
-	disable_irq_nosync(irq);
-	return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-	struct kvm_assigned_dev_kernel *dev;
-
-	if (kian->gsi == -1)
-		return;
-
-	dev = container_of(kian, struct kvm_assigned_dev_kernel,
-			   ack_notifier);
-	kvm_set_irq(dev->kvm, dev->guest_irq, 0);
-	enable_irq(dev->host_irq);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
-{
-	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
-		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-
-	if (cancel_work_sync(&assigned_dev->interrupt_work))
-		/* We had pending work. That means we will have to take
-		 * care of kvm_put_kvm.
-		 */
-		kvm_put_kvm(kvm);
-
-	pci_release_regions(assigned_dev->dev);
-	pci_disable_device(assigned_dev->dev);
-	pci_dev_put(assigned_dev->dev);
-
-	list_del(&assigned_dev->list);
-	kfree(assigned_dev);
-}
-
-static void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
-
-		kvm_free_assigned_device(kvm, assigned_dev);
-	}
-}
-
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq
-				   *assigned_irq)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_irq->assigned_dev_id);
-	if (!match) {
-		mutex_unlock(&kvm->lock);
-		return -EINVAL;
-	}
-
-	if (match->irq_requested) {
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		mutex_unlock(&kvm->lock);
-		return 0;
-	}
-
-	INIT_WORK(&match->interrupt_work,
-		  kvm_assigned_dev_interrupt_work_handler);
-
-	if (irqchip_in_kernel(kvm)) {
-		if (!capable(CAP_SYS_RAWIO)) {
-			r = -EPERM;
-			goto out_release;
-		}
-
-		if (assigned_irq->host_irq)
-			match->host_irq = assigned_irq->host_irq;
-		else
-			match->host_irq = match->dev->irq;
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
-
-		/* Even though this is PCI, we don't want to use shared
-		 * interrupts. Sharing host devices with guest-assigned devices
-		 * on the same interrupt line is not a happy situation: there
-		 * are going to be long delays in accepting, acking, etc.
-		 */
-		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
-				"kvm_assigned_device", (void *)match)) {
-			r = -EIO;
-			goto out_release;
-		}
-	}
-
-	match->irq_requested = true;
-	mutex_unlock(&kvm->lock);
-	return r;
-out_release:
-	mutex_unlock(&kvm->lock);
-	kvm_free_assigned_device(kvm, match);
-	return r;
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-				      struct kvm_assigned_pci_dev *assigned_dev)
-{
-	int r = 0;
-	struct kvm_assigned_dev_kernel *match;
-	struct pci_dev *dev;
-
-	mutex_lock(&kvm->lock);
-
-	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-				      assigned_dev->assigned_dev_id);
-	if (match) {
-		/* device already assigned */
-		r = -EINVAL;
-		goto out;
-	}
-
-	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-	if (match == NULL) {
-		printk(KERN_INFO "%s: Couldn't allocate memory\n",
-		       __func__);
-		r = -ENOMEM;
-		goto out;
-	}
-	dev = pci_get_bus_and_slot(assigned_dev->busnr,
-				   assigned_dev->devfn);
-	if (!dev) {
-		printk(KERN_INFO "%s: host device not found\n", __func__);
-		r = -EINVAL;
-		goto out_free;
-	}
-	if (pci_enable_device(dev)) {
-		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-		r = -EBUSY;
-		goto out_put;
-	}
-	r = pci_request_regions(dev, "kvm_assigned_device");
-	if (r) {
-		printk(KERN_INFO "%s: Could not get access to device regions\n",
-		       __func__);
-		goto out_disable;
-	}
-	match->assigned_dev_id = assigned_dev->assigned_dev_id;
-	match->host_busnr = assigned_dev->busnr;
-	match->host_devfn = assigned_dev->devfn;
-	match->dev = dev;
-
-	match->kvm = kvm;
-
-	list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		r = kvm_iommu_map_guest(kvm, match);
-		if (r)
-			goto out_list_del;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-out_list_del:
-	list_del(&match->list);
-	pci_release_regions(dev);
-out_disable:
-	pci_disable_device(dev);
-out_put:
-	pci_dev_put(dev);
-out_free:
-	kfree(match);
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -2030,28 +1797,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
-	case KVM_ASSIGN_PCI_DEVICE: {
-		struct kvm_assigned_pci_dev assigned_dev;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-			goto out;
-		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-		if (r)
-			goto out;
-		break;
-	}
-	case KVM_ASSIGN_IRQ: {
-		struct kvm_assigned_irq assigned_irq;
-
-		r = -EFAULT;
-		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-			goto out;
-		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-		if (r)
-			goto out;
-		break;
-	}
 	case KVM_GET_PIT: {
 		r = -EFAULT;
 		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
-- 
cgit v1.2.3


From 3de42dc094ecd313dc7d551e007a134b52f8663d Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Mon, 6 Oct 2008 13:48:45 +0800
Subject: KVM: Separate irq ack notification out of arch/x86/kvm/irq.c

Moving irq ack notification logic as common, and make
it shared with ia64 side.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/include/asm/kvm_host.h |  4 ++++
 arch/ia64/kvm/Makefile           |  2 +-
 arch/ia64/kvm/irq.h              |  5 -----
 arch/x86/kvm/Makefile            |  2 +-
 arch/x86/kvm/irq.c               | 33 ---------------------------------
 arch/x86/kvm/irq.h               |  8 --------
 6 files changed, 6 insertions(+), 48 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 1efe513a994..da579a33db1 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -413,6 +413,10 @@ struct kvm_arch {
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
 	struct kvm_sal_data rdv_sal_data;
+
+	struct list_head assigned_dev_head;
+	struct dmar_domain *intel_iommu_domain;
+	struct hlist_head irq_ack_notifier_list;
 };
 
 union cpuid3_t {
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index bf22fb9e6dc..3b1a1c156ef 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -44,7 +44,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-		coalesced_mmio.o)
+		coalesced_mmio.o irq_comm.o)
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
index f2e6545debf..604329ac3c9 100644
--- a/arch/ia64/kvm/irq.h
+++ b/arch/ia64/kvm/irq.h
@@ -23,10 +23,5 @@
 #ifndef __IRQ_H
 #define __IRQ_H
 
-struct kvm;
-
-static inline void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
-{
-}
 
 #endif
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 7dce593b9c6..c02343594b4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,7 +3,7 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o)
+                coalesced_mmio.o irq_comm.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 8c1b9c5def7..c019b8edcdb 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -99,36 +99,3 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 	__kvm_migrate_apic_timer(vcpu);
 	__kvm_migrate_pit_timer(vcpu);
 }
-
-/* This should be called with the kvm->lock mutex held */
-void kvm_set_irq(struct kvm *kvm, int irq, int level)
-{
-	/* Not possible to detect if the guest uses the PIC or the
-	 * IOAPIC.  So set the bit in both. The guest will ignore
-	 * writes to the unused one.
-	 */
-	kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
-	kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
-}
-
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
-{
-	struct kvm_irq_ack_notifier *kian;
-	struct hlist_node *n;
-
-	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
-		if (kian->gsi == gsi)
-			kian->irq_acked(kian);
-}
-
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
-				   struct kvm_irq_ack_notifier *kian)
-{
-	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
-}
-
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian)
-{
-	hlist_del(&kian->link);
-}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 4748532fd1d..f17c8f5bbf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -68,7 +68,6 @@ struct kvm_pic {
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
 int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
 void kvm_pic_clear_isr_ack(struct kvm *kvm);
@@ -85,13 +84,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
-void kvm_set_irq(struct kvm *kvm, int irq, int level);
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
-				   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian);
-
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From 2f7497719179a9f3270b05434be989d21f9fdc09 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sat, 27 Sep 2008 11:46:36 +0800
Subject: KVM: Move irqchip_in_kernel() from ioapic.h to irq.h

Moving irqchip_in_kernel() from ioapic.h to irq.h.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/irq.h      | 4 ++++
 arch/ia64/kvm/kvm-ia64.c | 1 +
 2 files changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
index 604329ac3c9..c6786e8b1bf 100644
--- a/arch/ia64/kvm/irq.h
+++ b/arch/ia64/kvm/irq.h
@@ -23,5 +23,9 @@
 #ifndef __IRQ_H
 #define __IRQ_H
 
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return 1;
+}
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7ad759e3429..a6cf719811b 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -45,6 +45,7 @@
 #include "iodev.h"
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 static unsigned long kvm_vmm_base;
 static unsigned long kvm_vsa_base;
-- 
cgit v1.2.3


From 1cbea809c400661eecb538e0dd0bc4f3660f0a35 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 3 Oct 2008 14:58:09 +0800
Subject: KVM: ia64: Make pmt table be able to hold physical mmio entries.

Don't try to do put_page once the entries are mmio.
Set the tag to indicate the mmio space for vmm setting
TLB's memory attribute.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/kvm-ia64.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index a6cf719811b..800a4f2e917 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1437,17 +1437,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		int user_alloc)
 {
 	unsigned long i;
-	struct page *page;
+	unsigned long pfn;
 	int npages = mem->memory_size >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
 	unsigned long base_gfn = memslot->base_gfn;
 
 	for (i = 0; i < npages; i++) {
-		page = gfn_to_page(kvm, base_gfn + i);
-		kvm_set_pmt_entry(kvm, base_gfn + i,
-				page_to_pfn(page) << PAGE_SHIFT,
-				_PAGE_AR_RWX|_PAGE_MA_WB);
-		memslot->rmap[i] = (unsigned long)page;
+		pfn = gfn_to_pfn(kvm, base_gfn + i);
+		if (!kvm_is_mmio_pfn(pfn)) {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					pfn << PAGE_SHIFT,
+					_PAGE_MA_WB);
+			memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+		} else {
+			kvm_set_pmt_entry(kvm, base_gfn + i,
+					GPFN_LOW_MMIO | (pfn << PAGE_SHIFT),
+					_PAGE_MA_UC);
+			memslot->rmap[i] = 0;
+			}
 	}
 
 	return 0;
-- 
cgit v1.2.3


From b010eb5103cfbe12ae6f08a4cdb3a748bf78c410 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Sun, 28 Sep 2008 01:39:46 -0700
Subject: KVM: ia64: add directed mmio range support for kvm guests

Using vt-d, kvm guests can be assigned physcial devices, so
this patch introduce a new mmio type (directed mmio)
to handle its mmio access.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/include/asm/kvm_host.h |  2 +-
 arch/ia64/kvm/kvm-ia64.c         |  4 ++--
 arch/ia64/kvm/vcpu.h             | 26 +++++++++++++-------------
 arch/ia64/kvm/vtlb.c             | 23 +++++++++++++++++------
 4 files changed, 33 insertions(+), 22 deletions(-)

(limited to 'arch')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index da579a33db1..85db124d37f 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -132,7 +132,7 @@
 #define GPFN_IOSAPIC        (4UL << 60) /* IOSAPIC base */
 #define GPFN_LEGACY_IO      (5UL << 60) /* Legacy I/O base */
 #define GPFN_GFW        (6UL << 60) /* Guest Firmware */
-#define GPFN_HIGH_MMIO      (7UL << 60) /* High MMIO range */
+#define GPFN_PHYS_MMIO      (7UL << 60) /* Directed MMIO Range */
 
 #define GPFN_IO_MASK        (7UL << 60) /* Guest pfn is I/O type */
 #define GPFN_INV_MASK       (1UL << 63) /* Guest pfn is invalid */
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 800a4f2e917..3df82f3fe54 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1447,11 +1447,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		if (!kvm_is_mmio_pfn(pfn)) {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
 					pfn << PAGE_SHIFT,
-					_PAGE_MA_WB);
+				_PAGE_AR_RWX | _PAGE_MA_WB);
 			memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
 		} else {
 			kvm_set_pmt_entry(kvm, base_gfn + i,
-					GPFN_LOW_MMIO | (pfn << PAGE_SHIFT),
+					GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
 					_PAGE_MA_UC);
 			memslot->rmap[i] = 0;
 			}
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index b0fcfb62c49..341e3fee280 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
 	trp->rid = rid;
 }
 
-extern u64 kvm_lookup_mpa(u64 gpfn);
-extern u64 kvm_gpa_to_mpa(u64 gpa);
-
-/* Return I/O type if trye */
-#define __gpfn_is_io(gpfn)			\
-	({						\
-	 u64 pte, ret = 0;			\
-	 pte = kvm_lookup_mpa(gpfn);		\
-	 if (!(pte & GPFN_INV_MASK))		\
-	 ret = pte & GPFN_IO_MASK;	\
-	 ret;					\
-	 })
+extern u64 kvm_get_mpt_entry(u64 gpfn);
 
+/* Return I/ */
+static inline u64 __gpfn_is_io(u64 gpfn)
+{
+	u64  pte;
+	pte = kvm_get_mpt_entry(gpfn);
+	if (!(pte & GPFN_INV_MASK)) {
+		pte = pte & GPFN_IO_MASK;
+		if (pte != GPFN_PHYS_MMIO)
+			return pte;
+	}
+	return 0;
+}
 #endif
-
 #define IA64_NO_FAULT	0
 #define IA64_FAULT	1
 
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index def4576d22b..e22b93361e0 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
 
 u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 {
-	u64 ps, ps_mask, paddr, maddr;
+	u64 ps, ps_mask, paddr, maddr, io_mask;
 	union pte_flags phy_pte;
 
 	ps = itir_ps(itir);
@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 	phy_pte.val = *pte;
 	paddr = *pte;
 	paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
-	maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
-	if (maddr & GPFN_IO_MASK) {
+	maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
+	io_mask = maddr & GPFN_IO_MASK;
+	if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
 		*pte |= VTLB_PTE_IO;
 		return -1;
 	}
@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 						u64 ifa, int type)
 {
 	u64 ps;
-	u64 phy_pte;
+	u64 phy_pte, io_mask, index;
 	union ia64_rr vrr, mrr;
 	int ret = 0;
 
@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
 	vrr.val = vcpu_get_rr(v, ifa);
 	mrr.val = ia64_get_rr(ifa);
 
+	index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
+	io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
 	phy_pte = translate_phy_pte(&pte, itir, ifa);
 
 	/* Ensure WB attribute if pte is related to a normal mem page,
 	 * which is required by vga acceleration since qemu maps shared
 	 * vram buffer with WB.
 	 */
-	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
+	if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
+			io_mask != GPFN_PHYS_MMIO) {
 		pte &= ~_PAGE_MA_MASK;
 		phy_pte &= ~_PAGE_MA_MASK;
 	}
@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
 	}
 }
 
-u64 kvm_lookup_mpa(u64 gpfn)
+u64 kvm_get_mpt_entry(u64 gpfn)
 {
 	u64 *base = (u64 *) KVM_P2M_BASE;
 	return *(base + gpfn);
 }
 
+u64 kvm_lookup_mpa(u64 gpfn)
+{
+	u64 maddr;
+	maddr = kvm_get_mpt_entry(gpfn);
+	return maddr&_PAGE_PPN_MASK;
+}
+
 u64 kvm_gpa_to_mpa(u64 gpa)
 {
 	u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
-- 
cgit v1.2.3


From 2381ad241d0bea1253a37f314b270848067640bb Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Wed, 8 Oct 2008 08:29:33 +0800
Subject: KVM: ia64: Add intel iommu support for guests.

With intel iommu hardware, we can assign devices to kvm/ia64 guests.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
---
 arch/ia64/kvm/Makefile   | 4 ++++
 arch/ia64/kvm/kvm-ia64.c | 9 +++++++++
 2 files changed, 13 insertions(+)

(limited to 'arch')

diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 3b1a1c156ef..cf37f8f490c 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -46,6 +46,10 @@ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 		coalesced_mmio.o irq_comm.o)
 
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
+
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 3df82f3fe54..c0699f0e35a 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/pgtable.h>
 #include <asm/gcc_intrin.h>
@@ -187,6 +188,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+	case KVM_CAP_IOMMU:
+		r = intel_iommu_found();
+		break;
 	default:
 		r = 0;
 	}
@@ -773,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
 	 */
 	kvm_build_io_pmt(kvm);
 
+	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 }
 
 struct  kvm *kvm_arch_create_vm(void)
@@ -1336,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+	kvm_iommu_unmap_guest(kvm);
+#ifdef  KVM_CAP_DEVICE_ASSIGNMENT
+	kvm_free_all_assigned_devices(kvm);
+#endif
 	kfree(kvm->arch.vioapic);
 	kvm_release_vm_pages(kvm);
 	kvm_free_physmem(kvm);
-- 
cgit v1.2.3