
Commit ba3a612

sean-jc authored and bonzini committed
KVM: x86/mmu: Use atomic XCHG to write TDP MMU SPTEs with volatile bits
Use an atomic XCHG to write TDP MMU SPTEs that have volatile bits, even if mmu_lock is held for write, as volatile SPTEs can be written by other tasks/vCPUs outside of mmu_lock. If a vCPU uses the to-be-modified SPTE to write a page, the CPU can cache the translation as WRITABLE in the TLB despite it being seen by KVM as !WRITABLE, and/or KVM can clobber the Accessed/Dirty bits and not properly tag the backing page.

Exempt non-leaf SPTEs from atomic updates: KVM itself doesn't modify non-leaf SPTEs without holding mmu_lock, they do not have Dirty bits, and KVM doesn't consume the Accessed bit of non-leaf SPTEs.

Dropping the Dirty and/or Writable bits is most problematic for dirty logging, as doing so can result in a missed TLB flush and eventually a missed dirty page. In the unlikely event that the only dirty page(s) is a clobbered SPTE, clear_dirty_gfn_range() will see the SPTE as not dirty (based on the Dirty or Writable bit, depending on the method) and so not update the SPTE and ultimately not flush. If the SPTE is cached in the TLB as writable before it is clobbered, the guest can continue writing the associated page without ever taking a write-protect fault.

For most (all?) file-backed memory, dropping the Dirty bit is a non-issue. The primary MMU write-protects its PTEs on writeback, i.e. KVM's dirty bit is effectively ignored because the primary MMU will mark that page dirty when the write-protection is lifted, e.g. when KVM faults the page back in for write.

The Accessed bit is a complete non-issue. Aside from being unused for non-leaf SPTEs, KVM doesn't do a TLB flush when aging SPTEs, i.e. the Accessed bit may be dropped anyways.

Lastly, the Writable bit is also problematic as an extension of the Dirty bit, as KVM (correctly) treats the Dirty bit as volatile iff the SPTE is !DIRTY && WRITABLE. If KVM fixes an MMU-writable, but !WRITABLE, SPTE out of mmu_lock, then it can allow the CPU to set the Dirty bit despite the SPTE being !WRITABLE when it is checked by KVM. But that all depends on the Dirty bit being problematic in the first place.

Fixes: 2f2fad0 ("kvm: x86/mmu: Add functions to handle changed TDP SPTEs")
Cc: stable@vger.kernel.org
Cc: Ben Gardon <bgardon@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: Venkatesh Srinivas <venkateshs@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220423034752.1161007-4-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1 parent 54eb3ef commit ba3a612

Showing 2 changed files with 85 additions and 31 deletions.
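Why XCHG matters here: even when each access is individually atomic, a read-then-write of the SPTE can silently drop a Dirty bit that hardware sets between the two steps, whereas an exchange returns the value it displaced. The following is a minimal, compilable userspace model of that race (build with gcc -pthread); the SPTE_* flags and the hw_dirty_assist() thread are illustrative stand-ins, not KVM code, and the interleaving is forced deterministically so the lost update always reproduces.

/* Model: a non-atomic read-modify-write loses a concurrently set Dirty
 * bit; an atomic exchange captures it in the returned old value. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ull << 1)	/* illustrative bit layout */
#define SPTE_DIRTY    (1ull << 2)

static _Atomic uint64_t spte = SPTE_WRITABLE;

/* Stand-in for the CPU's A/D assist, which sets Dirty outside mmu_lock. */
static void *hw_dirty_assist(void *arg)
{
	(void)arg;
	atomic_fetch_or(&spte, SPTE_DIRTY);
	return NULL;
}

int main(void)
{
	pthread_t hw;

	/* Buggy pattern: read, then store a derived value.  The assist
	 * fires in between, and the store clobbers the new Dirty bit. */
	uint64_t snap = atomic_load(&spte);
	pthread_create(&hw, NULL, hw_dirty_assist, NULL);
	pthread_join(hw, NULL);
	atomic_store(&spte, snap & ~SPTE_WRITABLE);
	printf("plain store: Dirty bit %s\n",
	       (atomic_load(&spte) & SPTE_DIRTY) ? "kept" : "LOST");

	/* Fixed pattern: XCHG returns the displaced value, so a
	 * concurrently set Dirty bit is visible to the caller. */
	atomic_store(&spte, SPTE_WRITABLE | SPTE_DIRTY);
	uint64_t old = atomic_exchange(&spte, 0);
	printf("xchg: displaced value had Dirty bit: %s\n",
	       (old & SPTE_DIRTY) ? "yes" : "no");
	return 0;
}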


arch/x86/kvm/mmu/tdp_iter.h

Lines changed: 32 additions & 2 deletions
@@ -6,6 +6,7 @@
 #include <linux/kvm_host.h>
 
 #include "mmu.h"
+#include "spte.h"
 
 /*
  * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
 {
 	return READ_ONCE(*rcu_dereference(sptep));
 }
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+	return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+					 u64 new_spte, int level)
 {
-	WRITE_ONCE(*rcu_dereference(sptep), val);
+	/*
+	 * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+	 * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+	 * The Writable bit can be set by KVM's fast page fault handler, and
+	 * Accessed and Dirty bits can be set by the CPU.
+	 *
+	 * Note, non-leaf SPTEs do have Accessed bits and those bits are
+	 * technically volatile, but KVM doesn't consume the Accessed bit of
+	 * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
+	 * logic needs to be reassessed if KVM were to use non-leaf Accessed
+	 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+	 */
+	if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+	    spte_has_volatile_bits(old_spte))
+		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+	__kvm_tdp_mmu_write_spte(sptep, new_spte);
+	return old_spte;
 }
 
 /*
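As a compilable condensation of the routing logic in the new helper above, the sketch below swaps atomically only when the old SPTE is present, leaf, and has volatile bits. Everything in it (the PRESENT/WRITABLE/DIRTY flags, the stubbed predicates, write_spte()) is a simplified stand-in for the kernel code, and has_volatile_bits() models only the Dirty rule from the commit message, not the full spte_has_volatile_bits() check.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PRESENT  (1ull << 0)	/* illustrative bit layout */
#define WRITABLE (1ull << 1)
#define DIRTY    (1ull << 2)

static bool is_shadow_present(uint64_t spte) { return spte & PRESENT; }

static bool is_leaf(uint64_t spte, int level)
{
	(void)spte;		/* the real check also inspects the SPTE itself */
	return level == 1;	/* stub: only level 1 maps a page here */
}

static bool has_volatile_bits(uint64_t spte)
{
	/* Per the commit message: the CPU may still set Dirty iff the
	 * SPTE is writable and Dirty isn't already set. */
	return (spte & WRITABLE) && !(spte & DIRTY);
}

static uint64_t write_spte(_Atomic uint64_t *sptep, uint64_t old_spte,
			   uint64_t new_spte, int level)
{
	/* Swap atomically only when another agent could legally change
	 * the SPTE underneath us; a plain store suffices otherwise. */
	if (is_shadow_present(old_spte) && is_leaf(old_spte, level) &&
	    has_volatile_bits(old_spte))
		return atomic_exchange(sptep, new_spte);

	atomic_store(sptep, new_spte);
	return old_spte;
}

int main(void)
{
	_Atomic uint64_t spte = PRESENT | WRITABLE;	/* volatile case */
	uint64_t old = write_spte(&spte, PRESENT | WRITABLE, 0, 1);
	printf("captured old SPTE: %#llx\n", (unsigned long long)old);
	return 0;
}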

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 53 additions & 29 deletions
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 	tdp_mmu_unlink_sp(kvm, sp, shared);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		u64 *sptep = rcu_dereference(pt) + i;
+		tdp_ptep_t sptep = pt + i;
 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
-		u64 old_child_spte;
+		u64 old_spte;
 
 		if (shared) {
 			/*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 			 * value to the removed SPTE value.
 			 */
 			for (;;) {
-				old_child_spte = xchg(sptep, REMOVED_SPTE);
-				if (!is_removed_spte(old_child_spte))
+				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+				if (!is_removed_spte(old_spte))
 					break;
 				cpu_relax();
 			}
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 			 * are guarded by the memslots generation, not by being
 			 * unreachable.
 			 */
-			old_child_spte = READ_ONCE(*sptep);
-			if (!is_shadow_present_pte(old_child_spte))
+			old_spte = kvm_tdp_mmu_read_spte(sptep);
+			if (!is_shadow_present_pte(old_spte))
 				continue;
 
 			/*
-			 * Marking the SPTE as a removed SPTE is not
-			 * strictly necessary here as the MMU lock will
-			 * stop other threads from concurrently modifying
-			 * this SPTE. Using the removed SPTE value keeps
-			 * the two branches consistent and simplifies
-			 * the function.
+			 * Use the common helper instead of a raw WRITE_ONCE as
+			 * the SPTE needs to be updated atomically if it can be
+			 * modified by a different vCPU outside of mmu_lock.
+			 * Even though the parent SPTE is !PRESENT, the TLB
+			 * hasn't yet been flushed, and both Intel and AMD
+			 * document that A/D assists can use upper-level PxE
+			 * entries that are cached in the TLB, i.e. the CPU can
+			 * still access the page and mark it dirty.
+			 *
+			 * No retry is needed in the atomic update path as the
+			 * sole concern is dropping a Dirty bit, i.e. no other
+			 * task can zap/remove the SPTE as mmu_lock is held for
+			 * write.  Marking the SPTE as a removed SPTE is not
+			 * strictly necessary for the same reason, but using
+			 * the removed SPTE value keeps the shared/exclusive
+			 * paths consistent and allows the handle_changed_spte()
+			 * call below to hardcode the new value to REMOVED_SPTE.
+			 *
+			 * Note, even though dropping a Dirty bit is the only
+			 * scenario where a non-atomic update could result in a
+			 * functional bug, simply checking the Dirty bit isn't
+			 * sufficient as a fast page fault could read the upper
+			 * level SPTE before it is zapped, and then make this
+			 * target SPTE writable, resume the guest, and set the
+			 * Dirty bit between reading the SPTE above and writing
+			 * it here.
 			 */
-			WRITE_ONCE(*sptep, REMOVED_SPTE);
+			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+							  REMOVED_SPTE, level);
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-				    old_child_spte, REMOVED_SPTE, level,
-				    shared);
+				    old_spte, REMOVED_SPTE, level, shared);
 	}
 
 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 					   KVM_PAGES_PER_HPAGE(iter->level));
 
 	/*
-	 * No other thread can overwrite the removed SPTE as they
-	 * must either wait on the MMU lock or use
-	 * tdp_mmu_set_spte_atomic which will not overwrite the
-	 * special removed SPTE value. No bookkeeping is needed
-	 * here since the SPTE is going from non-present
-	 * to non-present.
+	 * No other thread can overwrite the removed SPTE as they must either
+	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+	 * overwrite the special removed SPTE value.  No bookkeeping is needed
+	 * here since the SPTE is going from non-present to non-present.  Use
+	 * the raw write helper to avoid an unnecessary check on volatile bits.
 	 */
-	kvm_tdp_mmu_write_spte(iter->sptep, 0);
+	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
 
 	return 0;
 }
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
  * unless performing certain dirty logging operations.
  * Leaving record_dirty_log unset in that case prevents page
  * writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
  */
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
-			       bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+			      bool record_acc_track, bool record_dirty_log)
 {
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	 */
 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
 
-	kvm_tdp_mmu_write_spte(sptep, new_spte);
+	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
 
 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
 
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	if (record_dirty_log)
 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 					      new_spte, level);
+	return old_spte;
 }
 
 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 {
 	WARN_ON_ONCE(iter->yielded);
 
-	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
-			   new_spte, iter->gfn, iter->level,
-			   record_acc_track, record_dirty_log);
+	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+					    iter->old_spte, new_spte,
+					    iter->gfn, iter->level,
+					    record_acc_track, record_dirty_log);
 }
 
 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
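One more piece worth isolating is the shared-path loop in handle_removed_pt() above: the swap must retry while it keeps seeing the removed-SPTE sentinel, because that means another task transiently owns the entry and the live value isn't visible yet. A self-contained sketch, in which the REMOVED_SPTE constant's value, remove_spte(), and sched_yield() (standing in for cpu_relax()) are all illustrative:

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sentinel; KVM's REMOVED_SPTE is a special non-present value. */
#define REMOVED_SPTE 0x5a5a5a5a5a5a5a5aull

/* Claim an SPTE for removal, waiting out any task that has transiently
 * stamped the sentinel into it, and return the displaced live value. */
static uint64_t remove_spte(_Atomic uint64_t *sptep)
{
	uint64_t old;

	for (;;) {
		old = atomic_exchange(sptep, REMOVED_SPTE);
		if (old != REMOVED_SPTE)
			break;		/* we own the removal; old is live */
		sched_yield();		/* userspace stand-in for cpu_relax() */
	}
	return old;
}

int main(void)
{
	_Atomic uint64_t spte = 0x7;	/* arbitrary live SPTE value */
	printf("captured old SPTE: %#llx\n",
	       (unsigned long long)remove_spte(&spte));
	return 0;
}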
