aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c672
1 files changed, 567 insertions, 105 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e55af12e11b..2ad6f548167 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,11 +27,22 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
#undef MMU_DEBUG
#undef AUDIT
@@ -101,8 +112,6 @@ static int dbg = 1;
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9
@@ -159,6 +168,13 @@ static int dbg = 1;
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte)
static int is_shadow_present_pte(u64 pte)
{
- pte &= ~PT_SHADOW_IO_MARK;
return pte != shadow_trap_nonpresent_pte
&& pte != shadow_notrap_nonpresent_pte;
}
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
@@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte)
return pte & PT_DIRTY_MASK;
}
-static int is_io_pte(unsigned long pte)
+static int is_rmap_pte(u64 pte)
{
- return pte & PT_SHADOW_IO_MARK;
+ return is_shadow_present_pte(pte);
}
-static int is_rmap_pte(u64 pte)
+static pfn_t spte_to_pfn(u64 pte)
{
- return is_shadow_present_pte(pte);
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static gfn_t pse36_gfn_delta(u32 gpte)
@@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
}
/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count += 1;
+ WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ int *largepage_idx;
+
+ if (slot) {
+ largepage_idx = slot_largepage_idx(gfn, slot);
+ return *largepage_idx;
+ }
+
+ return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 0;
+
+ vma = find_vma(current->mm, addr);
+ if (vma && is_vm_hugetlb_page(vma))
+ return 1;
+
+ return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn))
+ return 0;
+
+ if (!host_largepage_backed(vcpu->kvm, large_gfn))
+ return 0;
+
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return 0;
+
+ return 1;
+}
+
+/*
* Take gfn and return the reverse mapping to it.
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
{
struct kvm_memory_slot *slot;
+ unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- return &slot->rmap[gfn - slot->base_gfn];
+ if (!lpage)
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+ return &slot->lpage_info[idx].rmap_pde;
}
/*
@@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
* If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
@@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
@@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- struct page *page;
+ pfn_t pfn;
unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
sp = page_header(__pa(spte));
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- mark_page_accessed(page);
+ pfn = spte_to_pfn(*spte);
+ if (*spte & PT_ACCESSED_MASK)
+ kvm_set_pfn_accessed(pfn);
if (is_writeble_pte(*spte))
- kvm_release_page_dirty(page);
+ kvm_release_pfn_dirty(pfn);
else
- kvm_release_page_clean(page);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+ kvm_release_pfn_clean(pfn);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
int write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn, 0);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
}
spte = rmap_next(kvm, rmapp, spte);
}
+ if (write_protected) {
+ pfn_t pfn;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ pfn = spte_to_pfn(*spte);
+ kvm_set_pfn_dirty(pfn);
+ }
+
+ /* check for huge page mappings */
+ rmapp = gfn_to_rmap(kvm, gfn, 1);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+
if (write_protected)
kvm_flush_remote_tlbs(kvm);
+
+ account_shadowed(kvm, gfn);
}
#ifdef MMU_DEBUG
@@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt)
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
- printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ if (*pos != shadow_trap_nonpresent_pte) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
return 0;
}
@@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- return gfn;
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
struct hlist_node *node;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical) {
+ if (sp->gfn == gfn && !sp->role.metaphysical
+ && !sp->role.invalid) {
pgprintk("%s: found role %x\n",
- __FUNCTION__, sp->role.word);
+ __func__, sp->role.word);
return sp;
}
return NULL;
@@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+ pgprintk("%s: looking gfn %lx role %x\n", __func__,
gfn, role.word);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) {
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- pgprintk("%s: found\n", __FUNCTION__);
+ pgprintk("%s: found\n", __func__);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+ pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
if (!metaphysical)
rmap_write_protect(vcpu->kvm, gfn);
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
return sp;
}
@@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
+ if (is_shadow_present_pte(ent)) {
+ if (!is_large_pte(ent)) {
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent),
+ &pt[i]);
+ } else {
+ --kvm->stat.lpages;
+ rmap_remove(kvm, &pt[i]);
+ }
+ }
pt[i] = shadow_trap_nonpresent_pte;
- if (!is_shadow_present_pte(ent))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
}
kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) {
+ if (!sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
- } else
+ } else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ sp->role.invalid = 1;
+ kvm_reload_remote_mmus(kvm);
+ }
kvm_mmu_reset_last_pte_updated(kvm);
}
@@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
struct hlist_node *node, *n;
int r;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.metaphysical) {
- pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
kvm_mmu_zap_page(kvm, sp);
r = 1;
@@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+ pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
kvm_mmu_zap_page(kvm, sp);
}
}
@@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn, struct page *page)
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
{
u64 spte;
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
- hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __FUNCTION__, *shadow_pte, pt_access,
+ __func__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
if (is_rmap_pte(*shadow_pte)) {
- if (host_pfn != page_to_pfn(page)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
pgprintk("hfn old %lx new %lx\n",
- host_pfn, page_to_pfn(page));
+ spte_to_pfn(*shadow_pte), pfn);
rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
}
- else
- was_rmapped = 1;
}
/*
@@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
* demand paging).
*/
spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+ if (!speculative)
+ pte_access |= PT_ACCESSED_MASK;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (!(pte_access & ACC_EXEC_MASK))
@@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
+ if (largepage)
+ spte |= PT_PAGE_SIZE_MASK;
- if (is_error_page(page)) {
- set_shadow_pte(shadow_pte,
- shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
- kvm_release_page_clean(page);
- return;
- }
-
- spte |= page_to_phys(page);
+ spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
+ if (shadow ||
+ (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, gfn);
+ __func__, gfn);
pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
@@ -964,18 +1119,25 @@ unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+ pgprintk("%s: setting spte %llx\n", __func__, spte);
+ pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+ (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+ (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
set_shadow_pte(shadow_pte, spte);
+ if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+ && (spte & PT_PRESENT_MASK))
+ ++vcpu->kvm->stat.lpages;
+
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn);
+ rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
} else {
if (was_writeble)
- kvm_release_page_dirty(page);
+ kvm_release_pfn_dirty(pfn);
else
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
}
if (!ptwrite || !*ptwrite)
vcpu->arch.last_pte_updated = shadow_pte;
@@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
- gfn_t gfn, struct page *page)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn,
+ int level)
{
- int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
@@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn, page);
- return pt_write || is_io_pte(table[index]);
+ 0, write, 1, &pt_write, 0, gfn, pfn, false);
+ return pt_write;
+ }
+
+ if (largepage && level == 2) {
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write, 1, gfn, pfn, false);
+ return pt_write;
}
if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
1, ACC_ALL, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
return -ENOMEM;
}
@@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
-
- struct page *page;
-
- down_read(&vcpu->kvm->slots_lock);
+ int largepage = 0;
+ pfn_t pfn;
down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gfn);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ /* mmio */
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- r = __nonpaging_map(vcpu, v, write, gfn, page);
+ r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
+ PT32E_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&vcpu->kvm->slots_lock);
return r;
}
@@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
@@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
int i;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
+ int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
+ if (tdp_enabled)
+ metaphysical = 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
+ PT64_ROOT_LEVEL, metaphysical,
+ ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
return;
}
#endif
+ metaphysical = !is_paging(vcpu);
+ if (tdp_enabled)
+ metaphysical = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, !is_paging(vcpu),
+ PT32_ROOT_LEVEL, metaphysical,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
@@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn_t gfn;
int r;
- pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ u32 error_code)
+{
+ pfn_t pfn;
+ int r;
+ int largepage = 0;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ down_read(&current->mm->mmap_sem);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+ largepage, gfn, pfn, TDP_ROOT_LEVEL);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
mmu_free_roots(vcpu);
}
@@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->shadow_root_level = TDP_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+
+ if (!is_paging(vcpu)) {
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT64_ROOT_LEVEL;
+ } else if (is_pae(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT32E_ROOT_LEVEL;
+ } else {
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->root_level = PT32_ROOT_LEVEL;
+ }
+
+ return 0;
+}
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
return paging32_init_context(vcpu);
}
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.update_pte.pfn = bad_pfn;
+
+ if (tdp_enabled)
+ return init_kvm_tdp_mmu(vcpu);
+ else
+ return init_kvm_softmmu(vcpu);
+}
+
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+ is_large_pte(pte))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
}
}
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ if (is_large_pte(pte))
+ --vcpu->kvm->stat.lpages;
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *spte,
- const void *new, int bytes,
- int offset_in_pte)
+ const void *new)
{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+ && !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging32_update_pte(vcpu, sp, spte, new);
else
- paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging64_update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn_t gfn;
int r;
u64 gpte = 0;
- struct page *page;
+ pfn_t pfn;
+
+ vcpu->arch.update_pte.largepage = 0;
if (bytes != 4 && bytes != 8)
return;
@@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gfn);
+ if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ vcpu->arch.update_pte.largepage = 1;
+ }
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.page = page;
+ vcpu->arch.update_pte.pfn = pfn;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
- u64 entry;
+ u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
unsigned pte_size;
@@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int level;
int flooded = 0;
int npte;
+ int r;
- pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
@@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pt_write_count = 1;
vcpu->arch.last_pte_updated = NULL;
}
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.metaphysical)
@@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
+ if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+ gentry = 0;
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gpa & ~(u64)(pte_size - 1),
+ &gentry, pte_size);
+ new = (const void *)&gentry;
+ if (r < 0)
+ new = NULL;
+ }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
- page_offset & (pte_size - 1));
+ if (new)
+ mmu_pte_write_new_pte(vcpu, sp, spte, new);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
}
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.update_pte.page) {
- kvm_release_page_clean(vcpu->arch.update_pte.page);
- vcpu->arch.update_pte.page = NULL;
+ if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
+ kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
+ vcpu->arch.update_pte.pfn = bad_pfn;
}
}
@@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
- up_read(&vcpu->kvm->slots_lock);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -1577,6 +1859,12 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm)
kvm_flush_remote_tlbs(kvm);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+{
+ struct kvm_mmu_page *page;
+
+ page = container_of(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(kvm, page);
+}
+
+static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+{
+ struct kvm *kvm;
+ struct kvm *kvm_freed = NULL;
+ int cache_count = 0;
+
+ spin_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int npages;
+
+ spin_lock(&kvm->mmu_lock);
+ npages = kvm->arch.n_alloc_mmu_pages -
+ kvm->arch.n_free_mmu_pages;
+ cache_count += npages;
+ if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+ kvm_mmu_remove_one_alloc_mmu_page(kvm);
+ cache_count--;
+ kvm_freed = kvm;
+ }
+ nr_to_scan--;
+
+ spin_unlock(&kvm->mmu_lock);
+ }
+ if (kvm_freed)
+ list_move_tail(&kvm_freed->vm_list, &vm_list);
+
+ spin_unlock(&kvm_lock);
+
+ return cache_count;
+}
+
+static struct shrinker mmu_shrinker = {
+ .shrink = mmu_shrink,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+void mmu_destroy_caches(void)
{
if (pte_chain_cache)
kmem_cache_destroy(pte_chain_cache);
@@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ unregister_shrinker(&mmu_shrinker);
+}
+
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
+ register_shrinker(&mmu_shrinker);
+
return 0;
nomem:
- kvm_mmu_module_exit();
+ mmu_destroy_caches();
return -ENOMEM;
}
@@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ if (len > buffer->len)
+ return NULL;
+ return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ void *ret;
+
+ ret = pv_mmu_peek_buffer(buffer, len);
+ if (!ret)
+ return ret;
+ buffer->ptr += len;
+ buffer->len -= len;
+ buffer->processed += len;
+ return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+ gpa_t addr, gpa_t value)
+{
+ int bytes = 8;
+ int r;
+
+ if (!is_long_mode(vcpu) && !is_pae(vcpu))
+ bytes = 4;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (!emulator_write_phys(vcpu, addr, &value, bytes))
+ return -EFAULT;
+
+ return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->tlb_flush(vcpu);
+ return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+ struct kvm_pv_mmu_op_buffer *buffer)
+{
+ struct kvm_mmu_op_header *header;
+
+ header = pv_mmu_peek_buffer(buffer, sizeof *header);
+ if (!header)
+ return 0;
+ switch (header->op) {
+ case KVM_MMU_OP_WRITE_PTE: {
+ struct kvm_mmu_op_write_pte *wpte;
+
+ wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+ if (!wpte)
+ return 0;
+ return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+ wpte->pte_val);
+ }
+ case KVM_MMU_OP_FLUSH_TLB: {
+ struct kvm_mmu_op_flush_tlb *ftlb;
+
+ ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+ if (!ftlb)
+ return 0;
+ return kvm_pv_mmu_flush_tlb(vcpu);
+ }
+ case KVM_MMU_OP_RELEASE_PT: {
+ struct kvm_mmu_op_release_pt *rpt;
+
+ rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+ if (!rpt)
+ return 0;
+ return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+ }
+ default: return 0;
+ }
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret)
+{
+ int r;
+ struct kvm_pv_mmu_op_buffer buffer;
+
+ buffer.ptr = buffer.buf;
+ buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
+ buffer.processed = 0;
+
+ r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ if (r)
+ goto out;
+
+ while (buffer.len) {
+ r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ if (r < 0)
+ goto out;
+ if (r == 0)
+ break;
+ }
+
+ r = 1;
+out:
+ *ret = buffer.processed;
+ return r;
+}
+
#ifdef AUDIT
static const char *audit_msg;
@@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
audit_mappings_page(vcpu, ent, va, level - 1);
} else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
- struct page *page = gpa_to_page(vcpu, gpa);
- hpa_t hpa = page_to_phys(page);
+ hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
&& !is_error_hpa(hpa))
printk(KERN_ERR "audit: (%s) notrap shadow,"
" valid guest gva %lx\n", audit_msg, va);
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
}
}
@@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
if (n_rmap != n_actual)
printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __FUNCTION__, audit_msg, n_rmap, n_actual);
+ __func__, audit_msg, n_rmap, n_actual);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
" mappings: gfn %lx role %x\n",
- __FUNCTION__, audit_msg, sp->gfn,
+ __func__, audit_msg, sp->gfn,
sp->role.word);
}
}