Diffstat (limited to 'drivers/kvm')
-rw-r--r--  drivers/kvm/kvm.h           1
-rw-r--r--  drivers/kvm/kvm_main.c     19
-rw-r--r--  drivers/kvm/mmu.c          16
-rw-r--r--  drivers/kvm/paging_tmpl.h  79
-rw-r--r--  drivers/kvm/svm.c          28
-rw-r--r--  drivers/kvm/vmx.c           5
-rw-r--r--  drivers/kvm/x86_emulate.c  98
7 files changed, 151 insertions(+), 95 deletions(-)
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 91e0c75aca8..2db1ca4c680 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -242,6 +242,7 @@ struct kvm_vcpu {
u64 pdptrs[4]; /* pae */
u64 shadow_efer;
u64 apic_base;
+ u64 ia32_misc_enable_msr;
int nmsrs;
struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs;
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 67c1154960f..b10972ed0c9 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -272,7 +272,9 @@ static void kvm_free_physmem(struct kvm *kvm)
static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{
+ vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
kvm_mmu_destroy(vcpu);
+ vcpu_put(vcpu);
kvm_arch_ops->vcpu_free(vcpu);
}
@@ -1224,6 +1226,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_APICBASE:
data = vcpu->apic_base;
break;
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->ia32_misc_enable_msr;
+ break;
#ifdef CONFIG_X86_64
case MSR_EFER:
data = vcpu->shadow_efer;
@@ -1295,6 +1300,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_APICBASE:
vcpu->apic_base = data;
break;
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->ia32_misc_enable_msr = data;
+ break;
default:
printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
return 1;
@@ -1598,6 +1606,10 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
+static u32 emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+};
+
static __init void kvm_init_msr_list(void)
{
u32 dummy[2];
@@ -1923,7 +1935,7 @@ static long kvm_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -1933,6 +1945,11 @@ static long kvm_dev_ioctl(struct file *filp,
if (copy_to_user(user_msr_list->indices, &msrs_to_save,
num_msrs_to_save * sizeof(u32)))
goto out;
+ if (copy_to_user(user_msr_list->indices
+ + num_msrs_to_save,
+ &emulated_msrs,
+ ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ goto out;
r = 0;
break;
}
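
With emulated_msrs appended to the reply, KVM_GET_MSR_INDEX_LIST now reports MSR_IA32_MISC_ENABLE after the hardware-backed indices. (Note that indices is a __u32 array, so the copy offset is in elements, not bytes.) A minimal userspace sketch of the usual two-call pattern — probe the count, then fetch — assuming only the struct kvm_msr_list layout from <linux/kvm.h>; error handling omitted:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		struct kvm_msr_list probe = { .nmsrs = 0 };
		struct kvm_msr_list *list;
		unsigned i;

		/* First call fails with E2BIG but writes back the required
		   count, which now includes the emulated MSRs. */
		ioctl(kvm, KVM_GET_MSR_INDEX_LIST, &probe);

		list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
		list->nmsrs = probe.nmsrs;
		if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0)
			for (i = 0; i < list->nmsrs; i++)
				printf("msr 0x%x\n", list->indices[i]);
		return 0;
	}
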
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index c6f972914f0..22c426cd8cb 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -143,6 +143,7 @@ static int dbg = 1;
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
@@ -168,6 +169,11 @@ static int is_cpuid_PSE36(void)
return 1;
}
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->shadow_efer & EFER_NX;
+}
+
static int is_present_pte(unsigned long pte)
{
return pte & PT_PRESENT_MASK;
@@ -992,16 +998,6 @@ static inline int fix_read_pf(u64 *shadow_ent)
return 0;
}
-static int may_access(u64 pte, int write, int user)
-{
-
- if (user && !(pte & PT_USER_MASK))
- return 0;
- if (write && !(pte & PT_WRITABLE_MASK))
- return 0;
- return 1;
-}
-
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 2dbf4307ed9..149fa45fd9a 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -63,13 +63,15 @@ struct guest_walker {
pt_element_t *ptep;
pt_element_t inherited_ar;
gfn_t gfn;
+ u32 error_code;
};
/*
* Fetch a guest pte for a guest virtual address
*/
-static void FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr)
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ int write_fault, int user_fault, int fetch_fault)
{
hpa_t hpa;
struct kvm_memory_slot *slot;
@@ -86,7 +88,7 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
root = *walker->ptep;
if (!(root & PT_PRESENT_MASK))
- return;
+ goto not_present;
--walker->level;
}
#endif
@@ -111,11 +113,23 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
((unsigned long)ptep & PAGE_MASK));
- if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK))
- *ptep |= PT_ACCESSED_MASK;
-
if (!is_present_pte(*ptep))
- break;
+ goto not_present;
+
+ if (write_fault && !is_writeble_pte(*ptep))
+ if (user_fault || is_write_protection(vcpu))
+ goto access_error;
+
+ if (user_fault && !(*ptep & PT_USER_MASK))
+ goto access_error;
+
+#if PTTYPE == 64
+ if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
+ goto access_error;
+#endif
+
+ if (!(*ptep & PT_ACCESSED_MASK))
+ *ptep |= PT_ACCESSED_MASK; /* avoid rmw */
if (walker->level == PT_PAGE_TABLE_LEVEL) {
walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
@@ -146,6 +160,23 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
}
walker->ptep = ptep;
pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
+ return 1;
+
+not_present:
+ walker->error_code = 0;
+ goto err;
+
+access_error:
+ walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+ if (write_fault)
+ walker->error_code |= PFERR_WRITE_MASK;
+ if (user_fault)
+ walker->error_code |= PFERR_USER_MASK;
+ if (fetch_fault)
+ walker->error_code |= PFERR_FETCH_MASK;
+ return 0;
}
static void FNAME(release_walker)(struct guest_walker *walker)
@@ -274,7 +305,7 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page;
if (is_writeble_pte(*shadow_ent))
- return 0;
+ return !user || (*shadow_ent & PT_USER_MASK);
writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
if (user) {
@@ -347,8 +378,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u32 error_code)
{
int write_fault = error_code & PFERR_WRITE_MASK;
- int pte_present = error_code & PFERR_PRESENT_MASK;
int user_fault = error_code & PFERR_USER_MASK;
+ int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker;
u64 *shadow_pte;
int fixed;
@@ -365,19 +396,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* Look up the shadow pte for the faulting address.
*/
- FNAME(walk_addr)(&walker, vcpu, addr);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
+ fetch_fault);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
- if (!shadow_pte) {
- pgprintk("%s: not mapped\n", __FUNCTION__);
- inject_page_fault(vcpu, addr, error_code);
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __FUNCTION__);
+ inject_page_fault(vcpu, addr, walker.error_code);
FNAME(release_walker)(&walker);
return 0;
}
+ shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
shadow_pte, *shadow_pte);
@@ -399,22 +431,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* mmio: emulate if accessible, otherwise its a guest fault.
*/
if (is_io_pte(*shadow_pte)) {
- if (may_access(*shadow_pte, write_fault, user_fault))
- return 1;
- pgprintk("%s: io work, no access\n", __FUNCTION__);
- inject_page_fault(vcpu, addr,
- error_code | PFERR_PRESENT_MASK);
- kvm_mmu_audit(vcpu, "post page fault (io)");
- return 0;
- }
-
- /*
- * pte not present, guest page fault.
- */
- if (pte_present && !fixed && !write_pt) {
- inject_page_fault(vcpu, addr, error_code);
- kvm_mmu_audit(vcpu, "post page fault (guest)");
- return 0;
+ return 1;
}
++kvm_stat.pf_fixed;
@@ -429,7 +446,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
pt_element_t guest_pte;
gpa_t gpa;
- FNAME(walk_addr)(&walker, vcpu, vaddr);
+ FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
guest_pte = *walker.ptep;
FNAME(release_walker)(&walker);
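
walk_addr now both sets accessed bits and enforces guest permissions, returning 0 with walker->error_code filled in so page_fault() can inject an architecturally accurate fault. Condensed into one predicate, the per-level test is roughly the sketch below, with CR0.WP and EFER.NX passed explicitly; the real code reads them via is_write_protection() and is_nx(), and compiles the NX check only when PTTYPE == 64:

	#define PT_PRESENT_MASK    (1ULL << 0)
	#define PT_WRITABLE_MASK   (1ULL << 1)
	#define PT_USER_MASK       (1ULL << 2)
	#define PT64_NX_MASK       (1ULL << 63)

	/* Nonzero if the guest PTE permits the access; mirrors the
	   access_error checks above (sketch, not the kernel function). */
	static int pte_permits(unsigned long long pte, int write, int user,
			       int fetch, int cr0_wp, int efer_nx)
	{
		if (!(pte & PT_PRESENT_MASK))
			return 0;	/* not-present fault */
		if (write && !(pte & PT_WRITABLE_MASK) && (user || cr0_wp))
			return 0;	/* write to a read-only page */
		if (user && !(pte & PT_USER_MASK))
			return 0;	/* user access to a supervisor page */
		if (fetch && efer_nx && (pte & PT64_NX_MASK))
			return 0;	/* fetch from a no-execute page */
		return 1;
	}
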
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index 714f6a7841c..c79df79307e 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -502,6 +502,7 @@ static void init_vmcb(struct vmcb *vmcb)
(1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) |
(1ULL << INTERCEPT_TASK_SWITCH) |
+ (1ULL << INTERCEPT_SHUTDOWN) |
(1ULL << INTERCEPT_VMRUN) |
(1ULL << INTERCEPT_VMMCALL) |
(1ULL << INTERCEPT_VMLOAD) |
@@ -680,14 +681,14 @@ static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- dt->limit = vcpu->svm->vmcb->save.ldtr.limit;
- dt->base = vcpu->svm->vmcb->save.ldtr.base;
+ dt->limit = vcpu->svm->vmcb->save.idtr.limit;
+ dt->base = vcpu->svm->vmcb->save.idtr.base;
}
static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- vcpu->svm->vmcb->save.ldtr.limit = dt->limit;
- vcpu->svm->vmcb->save.ldtr.base = dt->base ;
+ vcpu->svm->vmcb->save.idtr.limit = dt->limit;
+ vcpu->svm->vmcb->save.idtr.base = dt->base;
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
@@ -892,6 +893,19 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 0;
}
+static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept
+ * so reinitialize it.
+ */
+ memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
+ init_vmcb(vcpu->svm->vmcb);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
static int io_get_override(struct kvm_vcpu *vcpu,
struct vmcb_seg **seg,
int *addr_override)
@@ -1149,7 +1163,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
case MSR_K6_STAR:
vcpu->svm->vmcb->save.star = data;
break;
-#ifdef CONFIG_X86_64_
+#ifdef CONFIG_X86_64
case MSR_LSTAR:
vcpu->svm->vmcb->save.lstar = data;
break;
@@ -1249,6 +1263,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
[SVM_EXIT_IOIO] = io_interception,
[SVM_EXIT_MSR] = msr_interception,
[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
[SVM_EXIT_VMRUN] = invalid_op_interception,
[SVM_EXIT_VMMCALL] = invalid_op_interception,
[SVM_EXIT_VMLOAD] = invalid_op_interception,
@@ -1407,7 +1422,8 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
again:
- do_interrupt_requests(vcpu, kvm_run);
+ if (!vcpu->mmio_read_completed)
+ do_interrupt_requests(vcpu, kvm_run);
clgi();
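
Intercepting SHUTDOWN means a guest triple fault now reaches userspace as KVM_EXIT_SHUTDOWN instead of leaving the VMCB undefined. A hedged sketch of the monitor-side handling — struct kvm_run is whatever KVM_RUN filled in (at this point it was still passed directly through the ioctl), and the reset-versus-exit policy is the monitor's choice:

	#include <stdio.h>
	#include <stdlib.h>
	#include <linux/kvm.h>

	static void handle_exit(struct kvm_run *run)
	{
		switch (run->exit_reason) {
		case KVM_EXIT_SHUTDOWN:
			/* Triple fault: the kernel has already reset the
			   VMCB, so restarting the guest or stopping are
			   both safe. */
			fprintf(stderr, "guest shutdown (triple fault)\n");
			exit(1);
		default:
			break;
		}
	}
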
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 0aa2659f6ae..54c35c0b318 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -1116,6 +1116,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
if (rdmsr_safe(index, &data_low, &data_high) < 0)
continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
data = data_low | ((u64)data_high << 32);
vcpu->host_msrs[j].index = index;
vcpu->host_msrs[j].reserved = 0;
@@ -1717,7 +1719,8 @@ again:
vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
#endif
- do_interrupt_requests(vcpu, kvm_run);
+ if (!vcpu->mmio_read_completed)
+ do_interrupt_requests(vcpu, kvm_run);
if (vcpu->guest_debug.enabled)
kvm_guest_debug_pre(vcpu);
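
The vmx_vcpu_setup() hunk tightens the host-MSR probe: an MSR joins the save/restore set only if it can be written back as well as read, so read-only MSRs no longer fault on restore. (The do_interrupt_requests() change matches the svm.c one: no interrupt injection while a vcpu re-enters just to complete a pending MMIO read, which would otherwise interrupt a half-emulated instruction.) The probe pattern in isolation, as a sketch — loop and names as in vmx.c, using the kernel's non-faulting MSR accessors:

	for (i = j = 0; i < NR_VMX_MSR; ++i) {
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;	/* MSR absent on this host */
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;	/* MSR present but read-only */
		/* keep slot j for this index and advance it */
		++j;
	}
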
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index be70795b482..7513cddb929 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -61,6 +61,7 @@
#define ModRM (1<<6)
/* Destination is only written; never read. */
#define Mov (1<<7)
+#define BitOp (1<<8)
static u8 opcode_table[256] = {
/* 0x00 - 0x07 */
@@ -148,7 +149,7 @@ static u8 opcode_table[256] = {
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
};
-static u8 twobyte_table[256] = {
+static u16 twobyte_table[256] = {
/* 0x00 - 0x0F */
0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -180,16 +181,16 @@ static u8 twobyte_table[256] = {
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
/* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
/* 0xB0 - 0xB7 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
- DstMem | SrcReg | ModRM,
+ DstMem | SrcReg | ModRM | BitOp,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xB8 - 0xBF */
- 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM,
+ 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
DstReg | SrcMem16 | ModRM | Mov,
/* 0xC0 - 0xCF */
@@ -469,7 +470,8 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
int
x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
- u8 b, d, sib, twobyte = 0, rex_prefix = 0;
+ unsigned d;
+ u8 b, sib, twobyte = 0, rex_prefix = 0;
u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
unsigned long *override_base = NULL;
unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
@@ -726,46 +728,6 @@ done_prefixes:
;
}
- /* Decode and fetch the destination operand: register or memory. */
- switch (d & DstMask) {
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- goto special_insn;
- case DstReg:
- dst.type = OP_REG;
- if ((d & ByteOp)
- && !(twobyte && (b == 0xb6 || b == 0xb7))) {
- dst.ptr = decode_register(modrm_reg, _regs,
- (rex_prefix == 0));
- dst.val = *(u8 *) dst.ptr;
- dst.bytes = 1;
- } else {
- dst.ptr = decode_register(modrm_reg, _regs, 0);
- switch ((dst.bytes = op_bytes)) {
- case 2:
- dst.val = *(u16 *)dst.ptr;
- break;
- case 4:
- dst.val = *(u32 *)dst.ptr;
- break;
- case 8:
- dst.val = *(u64 *)dst.ptr;
- break;
- }
- }
- break;
- case DstMem:
- dst.type = OP_MEM;
- dst.ptr = (unsigned long *)cr2;
- dst.bytes = (d & ByteOp) ? 1 : op_bytes;
- if (!(d & Mov) && /* optimisation - avoid slow emulated read */
- ((rc = ops->read_emulated((unsigned long)dst.ptr,
- &dst.val, dst.bytes, ctxt)) != 0))
- goto done;
- break;
- }
- dst.orig_val = dst.val;
-
/*
* Decode and fetch the source operand: register, memory
* or immediate.
@@ -838,6 +800,50 @@ done_prefixes:
break;
}
+ /* Decode and fetch the destination operand: register or memory. */
+ switch (d & DstMask) {
+ case ImplicitOps:
+ /* Special instructions do their own operand decoding. */
+ goto special_insn;
+ case DstReg:
+ dst.type = OP_REG;
+ if ((d & ByteOp)
+ && !(twobyte && (b == 0xb6 || b == 0xb7))) {
+ dst.ptr = decode_register(modrm_reg, _regs,
+ (rex_prefix == 0));
+ dst.val = *(u8 *) dst.ptr;
+ dst.bytes = 1;
+ } else {
+ dst.ptr = decode_register(modrm_reg, _regs, 0);
+ switch ((dst.bytes = op_bytes)) {
+ case 2:
+ dst.val = *(u16 *)dst.ptr;
+ break;
+ case 4:
+ dst.val = *(u32 *)dst.ptr;
+ break;
+ case 8:
+ dst.val = *(u64 *)dst.ptr;
+ break;
+ }
+ }
+ break;
+ case DstMem:
+ dst.type = OP_MEM;
+ dst.ptr = (unsigned long *)cr2;
+ dst.bytes = (d & ByteOp) ? 1 : op_bytes;
+ if (d & BitOp) {
+ dst.ptr += src.val / BITS_PER_LONG;
+ dst.bytes = sizeof(long);
+ }
+ if (!(d & Mov) && /* optimisation - avoid slow emulated read */
+ ((rc = ops->read_emulated((unsigned long)dst.ptr,
+ &dst.val, dst.bytes, ctxt)) != 0))
+ goto done;
+ break;
+ }
+ dst.orig_val = dst.val;
+
if (twobyte)
goto twobyte_insn;
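
Two things make the BitOp decoding work: twobyte_table widens from u8 to u16 because BitOp is bit 8 and no longer fits in a byte, and the destination-operand decode moves after the source decode so that src.val (the bit offset) is already known when dst.ptr is adjusted. For bt/bts/btr/btc with a register source, memory is addressed relative to the decoded effective address: the bit offset selects a long-sized word at base + offset/BITS_PER_LONG. The addressing rule modeled standalone (illustrative, not the emulator's code):

	#include <stdio.h>
	#include <limits.h>

	#define BITS_PER_LONG ((int)(sizeof(long) * CHAR_BIT))

	/* Model of 'bt base, offset' with a memory operand: returns the
	   tested bit, i.e. what the instruction leaves in CF. */
	static int bit_test(unsigned long *base, unsigned long offset)
	{
		unsigned long *word = base + offset / BITS_PER_LONG;
		unsigned long mask = 1UL << (offset % BITS_PER_LONG);

		return (*word & mask) != 0;
	}

	int main(void)
	{
		unsigned long v[2] = { 0, 1UL << 3 };

		printf("%d\n", bit_test(v, BITS_PER_LONG + 3));	/* 1 */
		return 0;
	}
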