Merge branch 'kvm-tdpmmu-fixes' into HEAD

Merge topic branch with fixes for 5.14-rc6 and 5.15 merge window.
This commit is contained in:
Paolo Bonzini 2021-08-13 03:35:01 -04:00
commit 9a63b4517c
4 changed files with 63 additions and 15 deletions

View File

@ -31,10 +31,10 @@ On x86:
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is
taken inside kvm->arch.mmu_lock, and cannot be taken without already
holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise
there's no need to take kvm->arch.tdp_mmu_pages_lock at all).
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
cannot be taken without already holding kvm->arch.mmu_lock (typically with
``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
Everything else is a leaf: no other lock is taken inside the critical
sections.

View File

@ -1038,6 +1038,13 @@ struct kvm_arch {
struct list_head lpage_disallowed_mmu_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
/*
* Protects marking pages unsync during page faults, as TDP MMU page
* faults only take mmu_lock for read. For simplicity, the unsync
* pages lock is always taken when marking pages unsync regardless of
* whether mmu_lock is held for read or write.
*/
spinlock_t mmu_unsync_pages_lock;
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;

View File

@ -2575,6 +2575,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
struct kvm_mmu_page *sp;
bool locked = false;
/*
* Force write-protection if the page is being tracked. Note, the page
@ -2597,9 +2598,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
if (sp->unsync)
continue;
/*
* TDP MMU page faults require an additional spinlock as they
* run with mmu_lock held for read, not write, and the unsync
* logic is not thread safe. Take the spinlock regardless of
* the MMU type to avoid extra conditionals/parameters, there's
* no meaningful penalty if mmu_lock is held for write.
*/
if (!locked) {
locked = true;
spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
/*
* Recheck after taking the spinlock, a different vCPU
* may have since marked the page unsync. A false
* positive on the unprotected check above is not
* possible as clearing sp->unsync _must_ hold mmu_lock
* for write, i.e. unsync cannot transition from 1->0
* while this CPU holds mmu_lock for read (or write).
*/
if (READ_ONCE(sp->unsync))
continue;
}
WARN_ON(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(vcpu, sp);
}
if (locked)
spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
/*
* We need to ensure that the marking of unsync pages is visible
@ -5605,6 +5631,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
if (!kvm_mmu_init_tdp_mmu(kvm))
/*
* No smp_load/store wrappers needed here as we are in

View File

@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
list_del_rcu(&root->link);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@ -753,13 +752,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield, bool flush,
bool shared)
{
gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool zap_all = (start == 0 && end >= max_gfn_host);
struct tdp_iter iter;
/*
* No need to try to step down in the iterator when zapping all SPTEs,
* zapping the top-level non-leaf SPTEs will recurse on their children.
*/
int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
/*
* Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
* hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
* and so KVM will never install a SPTE for such addresses.
*/
end = min(end, max_gfn_host);
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
retry:
if (can_yield &&
tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
@ -773,9 +788,10 @@ retry:
/*
* If this is a non-last-level SPTE that covers a larger range
* than should be zapped, continue, and zap the mappings at a
* lower level.
* lower level, except when zapping all SPTEs.
*/
if ((iter.gfn < start ||
if (!zap_all &&
(iter.gfn < start ||
iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
!is_last_spte(iter.old_spte, iter.level))
continue;
@ -823,12 +839,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
bool flush = false;
int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
flush, false);
if (flush)
@ -867,7 +882,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
*/
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
struct kvm_mmu_page *next_root;
struct kvm_mmu_page *root;
bool flush = false;
@ -883,8 +897,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
rcu_read_unlock();
flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
true);
flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
/*
* Put the reference acquired in