When using mmu notifiers, we are allowed to remove the page count
reference taken by get_user_pages to a specific page that is mapped
inside the shadow page tables.

This is needed so we can balance the pagecount against mapcount
checking.

(Right now kvm increases the pagecount and does not increase the
mapcount when mapping a page into a shadow page table entry, so when
comparing pagecount against mapcount, you have no reliable result.)
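
For illustration only (the helper name is made up and is not part of
this patch): the kind of reference-counting check that only becomes
meaningful once kvm stops holding an extra get_user_pages reference
per spte:

	/*
	 * A clean anonymous page that is referenced only through its
	 * mappings roughly satisfies page_count() == page_mapcount() + 1.
	 * An extra kvm-held reference per spte would break this balance.
	 */
	static bool page_refs_are_balanced(struct page *page)
	{
		return page_count(page) == page_mapcount(page) + 1;
	}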

Add an SPTE_HOST_WRITEABLE flag to note that the host physical page we
are pointing to from the spte is write protected, and therefore we
can't change its access to writable unless we run
get_user_pages(write = 1).

(This is needed for change_pte support in kvm.)
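
A minimal sketch of the rule the flag encodes (hypothetical helper; the
real logic lives in set_spte() and kvm_set_pte_rmapp() below):

	/*
	 * Illustration only: when building or syncing an spte, keep write
	 * access only if the host page behind it is known to be writable.
	 */
	static u64 spte_respect_host_protection(u64 spte)
	{
		if (!(spte & SPTE_HOST_WRITEABLE))
			spte &= ~PT_WRITABLE_MASK;
		return spte;
	}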

Support for change_pte mmu notifiers is needed for kvm if it wants ksm
to directly map pages into its shadow page tables.
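
Roughly, the caller side looks like this (a sketch of what ksm page
replacement is expected to do, assuming the companion set_pte_at_notify()
helper from the ksm series is also applied; the function name here is
hypothetical):

	/*
	 * Sketch only: remap one anonymous page to the shared ksm page.
	 * set_pte_at_notify() fires the ->change_pte() notifier added below,
	 * so kvm can rewrite the matching spte instead of faulting later.
	 */
	static void replace_with_ksm_page(struct vm_area_struct *vma,
					  struct mm_struct *mm,
					  unsigned long addr, pte_t *ptep,
					  struct page *kpage)
	{
		flush_cache_page(vma, addr, pte_pfn(*ptep));
		ptep_clear_flush(vma, addr, ptep);
		set_pte_at_notify(mm, addr, ptep,
				  mk_pte(kpage, vma->vm_page_prot));
	}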

Signed-off-by: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Justin M. Forbes <jforbes@redhat.com>
---
--- linux-2.6.30.x86_64/arch/x86/include/asm/kvm_host.h	2009-08-20 10:37:37.784886414 -0500
+++ linux-2.6.30.x86_64.kvm/arch/x86/include/asm/kvm_host.h	2009-08-20 10:39:33.742641558 -0500
@@ -796,5 +796,6 @@ asmlinkage void kvm_handle_fault_on_rebo
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 
 #endif /* _ASM_X86_KVM_HOST_H */
--- linux-2.6.30.x86_64/arch/x86/kvm/mmu.c	2009-08-20 10:37:37.964887039 -0500
+++ linux-2.6.30.x86_64.kvm/arch/x86/kvm/mmu.c	2009-08-20 10:41:15.231638028 -0500
@@ -139,6 +139,8 @@ module_param(oos_shadow, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
@@ -254,6 +256,11 @@ static pfn_t spte_to_pfn(u64 pte)
 	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
+static pte_t ptep_val(pte_t *ptep)
+{
+	return *ptep;
+}
+
 static gfn_t pse36_gfn_delta(u32 gpte)
 {
 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -573,9 +580,7 @@ static void rmap_remove(struct kvm *kvm,
 	if (*spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writeble_pte(*spte))
-		kvm_release_pfn_dirty(pfn);
-	else
-		kvm_release_pfn_clean(pfn);
+		kvm_set_pfn_dirty(pfn);
 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -684,7 +689,8 @@ static int rmap_write_protect(struct kvm
 	return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   unsigned long data)
 {
 	u64 *spte;
 	int need_tlb_flush = 0;
@@ -699,8 +705,48 @@ static int kvm_unmap_rmapp(struct kvm *k
 	return need_tlb_flush;
 }
 
+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			     unsigned long data)
+{
+	int need_flush = 0;
+	u64 *spte, new_spte;
+	pte_t *ptep = (pte_t *)data;
+	pfn_t new_pfn;
+
+	new_pfn = pte_pfn(ptep_val(ptep));
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!is_shadow_present_pte(*spte));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+		need_flush = 1;
+		if (pte_write(ptep_val(ptep))) {
+			rmap_remove(kvm, spte);
+			set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+			spte = rmap_next(kvm, rmapp, NULL);
+		} else {
+			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte |= new_pfn << PAGE_SHIFT;
+
+			if (!pte_write(ptep_val(ptep))) {
+				new_spte &= ~PT_WRITABLE_MASK;
+				new_spte &= ~SPTE_HOST_WRITEABLE;
+				if (is_writeble_pte(*spte))
+					kvm_set_pfn_dirty(spte_to_pfn(*spte));
+			}
+			set_shadow_pte(spte, new_spte);
+			spte = rmap_next(kvm, rmapp, spte);
+		}
+	}
+	if (need_flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
-			  int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+			  unsigned long data,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long data))
 {
 	int i;
 	int retval = 0;
@@ -721,11 +767,13 @@ static int kvm_handle_hva(struct kvm *kv
 		end = start + (memslot->npages << PAGE_SHIFT);
 		if (hva >= start && hva < end) {
 			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
-			retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+			retval |= handler(kvm, &memslot->rmap[gfn_offset],
+					  data);
 			retval |= handler(kvm,
 					  &memslot->lpage_info[
 						  gfn_offset /
-						  KVM_PAGES_PER_HPAGE].rmap_pde);
+						  KVM_PAGES_PER_HPAGE].rmap_pde,
+						  data);
 		}
 	}
 
@@ -734,10 +782,16 @@ static int kvm_handle_hva(struct kvm *kv
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
 }
 
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			 unsigned long data)
 {
 	u64 *spte;
 	int young = 0;
@@ -770,13 +824,13 @@ static void rmap_recycle(struct kvm_vcpu
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmapp);
+	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+	return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
 }
 
 #ifdef MMU_DEBUG
@@ -1686,7 +1740,7 @@ static int set_spte(struct kvm_vcpu *vcp
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
-		    bool can_unsync)
+		    bool can_unsync, bool reset_host_protection)
 {
 	u64 spte;
 	int ret = 0;
@@ -1744,6 +1798,8 @@ static int set_spte(struct kvm_vcpu *vcp
 			spte &= ~PT_WRITABLE_MASK;
 		}
 	}
+	if (reset_host_protection)
+		spte |= SPTE_HOST_WRITEABLE;
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
@@ -1757,7 +1813,8 @@ static void mmu_set_spte(struct kvm_vcpu
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
 			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 pfn_t pfn, bool speculative,
+			 bool reset_host_protection)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1787,7 +1844,8 @@ static void mmu_set_spte(struct kvm_vcpu
 			was_rmapped = 1;
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative, true)) {
+		      dirty, largepage, gfn, pfn, speculative, true,
+		      reset_host_protection)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1804,8 +1862,7 @@ static void mmu_set_spte(struct kvm_vcpu
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage);
-		if (!is_rmap_pte(*shadow_pte))
-			kvm_release_pfn_clean(pfn);
+		kvm_release_pfn_clean(pfn);
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, gfn, largepage);
 	} else {
@@ -1837,7 +1894,7 @@ static int __direct_map(struct kvm_vcpu
 		    || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
-				     largepage, gfn, pfn, false);
+				     largepage, gfn, pfn, false, true);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
--- linux-2.6.30.x86_64/arch/x86/kvm/paging_tmpl.h	2009-08-20 10:37:37.966889166 -0500
+++ linux-2.6.30.x86_64.kvm/arch/x86/kvm/paging_tmpl.h	2009-08-20 10:39:33.747636180 -0500
@@ -266,9 +266,13 @@ static void FNAME(update_pte)(struct kvm
 	if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
 		return;
 	kvm_get_pfn(pfn);
+	/*
+	 * we call mmu_set_spte() with reset_host_protection = true beacuse that
+	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+	 */
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage,
-		     gpte_to_gfn(gpte), pfn, true);
+		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
 /*
@@ -302,7 +306,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
 				     user_fault, write_fault,
 				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 				     ptwrite, largepage,
-				     gw->gfn, pfn, false);
+				     gw->gfn, pfn, false, true);
 			break;
 		}
 
@@ -552,6 +556,7 @@ static void FNAME(prefetch_page)(struct
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
+	bool reset_host_protection = 1;
 
 	offset = nr_present = 0;
 
@@ -589,9 +594,13 @@ static int FNAME(sync_page)(struct kvm_v
 
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
+			pte_access &= ~PT_WRITABLE_MASK;
+			reset_host_protection = 0;
+		} else { reset_host_protection = 1; }
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
 			 is_dirty_pte(gpte), 0, gfn,
-			 spte_to_pfn(sp->spt[i]), true, false);
+			 spte_to_pfn(sp->spt[i]), true, false, reset_host_protection);
 	}
 
 	return !nr_present;
--- linux-2.6.30.x86_64/virt/kvm/kvm_main.c	2009-08-20 10:37:45.448886340 -0500
+++ linux-2.6.30.x86_64.kvm/virt/kvm/kvm_main.c	2009-08-20 10:39:33.749636212 -0500
@@ -859,6 +859,19 @@ static void kvm_mmu_notifier_invalidate_
 
 }
 
+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address,
+					pte_t pte)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+	spin_lock(&kvm->mmu_lock);
+	kvm->mmu_notifier_seq++;
+	kvm_set_spte_hva(kvm, address, pte);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 						    struct mm_struct *mm,
 						    unsigned long start,
@@ -938,6 +951,7 @@ static const struct mmu_notifier_ops kvm
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */