@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 	tdp_mmu_unlink_sp(kvm, sp, shared);

 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
-		u64 *sptep = rcu_dereference(pt) + i;
+		tdp_ptep_t sptep = pt + i;
 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
-		u64 old_child_spte;
+		u64 old_spte;

 		if (shared) {
 			/*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 			 * value to the removed SPTE value.
 			 */
 			for (;;) {
-				old_child_spte = xchg(sptep, REMOVED_SPTE);
-				if (!is_removed_spte(old_child_spte))
+				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+				if (!is_removed_spte(old_spte))
 					break;
 				cpu_relax();
 			}
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 			 * are guarded by the memslots generation, not by being
 			 * unreachable.
 			 */
-			old_child_spte = READ_ONCE(*sptep);
-			if (!is_shadow_present_pte(old_child_spte))
+			old_spte = kvm_tdp_mmu_read_spte(sptep);
+			if (!is_shadow_present_pte(old_spte))
 				continue;

 			/*
-			 * Marking the SPTE as a removed SPTE is not
-			 * strictly necessary here as the MMU lock will
-			 * stop other threads from concurrently modifying
-			 * this SPTE. Using the removed SPTE value keeps
-			 * the two branches consistent and simplifies
-			 * the function.
+			 * Use the common helper instead of a raw WRITE_ONCE as
+			 * the SPTE needs to be updated atomically if it can be
+			 * modified by a different vCPU outside of mmu_lock.
+			 * Even though the parent SPTE is !PRESENT, the TLB
+			 * hasn't yet been flushed, and both Intel and AMD
+			 * document that A/D assists can use upper-level PxE
+			 * entries that are cached in the TLB, i.e. the CPU can
+			 * still access the page and mark it dirty.
+			 *
+			 * No retry is needed in the atomic update path as the
+			 * sole concern is dropping a Dirty bit, i.e. no other
+			 * task can zap/remove the SPTE as mmu_lock is held for
+			 * write. Marking the SPTE as a removed SPTE is not
+			 * strictly necessary for the same reason, but using
+			 * the removed SPTE value keeps the shared/exclusive
+			 * paths consistent and allows the handle_changed_spte()
+			 * call below to hardcode the new value to REMOVED_SPTE.
+			 *
+			 * Note, even though dropping a Dirty bit is the only
+			 * scenario where a non-atomic update could result in a
+			 * functional bug, simply checking the Dirty bit isn't
+			 * sufficient as a fast page fault could read the upper
+			 * level SPTE before it is zapped, and then make this
+			 * target SPTE writable, resume the guest, and set the
+			 * Dirty bit between reading the SPTE above and writing
+			 * it here.
 			 */
-			WRITE_ONCE(*sptep, REMOVED_SPTE);
+			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+							  REMOVED_SPTE, level);
 		}
 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
-				    old_child_spte, REMOVED_SPTE, level,
-				    shared);
+				    old_spte, REMOVED_SPTE, level, shared);
 	}

 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
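For context, the common read/write helpers this hunk switches to are defined in arch/x86/kvm/mmu/tdp_iter.h by earlier patches in this series. Roughly, their shape is as follows; treat this as a simplified sketch rather than verbatim upstream code:

static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
	return READ_ONCE(*rcu_dereference(sptep));
}

static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
	/* xchg() returns the prior value, so a racing A/D update is never lost. */
	return xchg(rcu_dereference(sptep), new_spte);
}

/* Raw write; only safe when nothing else can modify the SPTE concurrently. */
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}

static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
					 u64 new_spte, int level)
{
	/*
	 * Update the SPTE atomically if it is shadow-present, is a leaf, and
	 * has volatile bits, i.e. bits that can be set outside of mmu_lock:
	 * the Writable bit can be set by KVM's fast page fault handler, and
	 * Accessed/Dirty bits can be set by hardware A/D assists.
	 */
	if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
	    spte_has_volatile_bits(old_spte))
		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);

	__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return old_spte;
}

The key point: kvm_tdp_mmu_write_spte() only pays for an atomic xchg when the old SPTE actually has bits that can change underneath it; every other write stays a plain WRITE_ONCE() store.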
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 					   KVM_PAGES_PER_HPAGE(iter->level));

 	/*
-	 * No other thread can overwrite the removed SPTE as they
-	 * must either wait on the MMU lock or use
-	 * tdp_mmu_set_spte_atomic which will not overwrite the
-	 * special removed SPTE value. No bookkeeping is needed
-	 * here since the SPTE is going from non-present
-	 * to non-present.
+	 * No other thread can overwrite the removed SPTE as they must either
+	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+	 * overwrite the special removed SPTE value. No bookkeeping is needed
+	 * here since the SPTE is going from non-present to non-present. Use
+	 * the raw write helper to avoid an unnecessary check on volatile bits.
 	 */
-	kvm_tdp_mmu_write_spte(iter->sptep, 0);
+	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

 	return 0;
 }
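The "unnecessary check on volatile bits" the comment refers to is spte_has_volatile_bits() in arch/x86/kvm/mmu/spte.c. As of this series it looks approximately like the following sketch (the exact conditions may differ slightly from upstream):

bool spte_has_volatile_bits(u64 spte)
{
	/*
	 * A SPTE that the fast page fault handler can make writable can also
	 * have its Dirty bit set by hardware, so it must be written atomically.
	 */
	if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
		return true;

	/* Access-tracked SPTEs can be restored for access outside of mmu_lock. */
	if (is_access_track_spte(spte))
		return true;

	/* Hardware can set a not-yet-set Accessed or Dirty bit at any time. */
	if (spte_ad_enabled(spte)) {
		if (!(spte & shadow_accessed_mask) ||
		    (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
			return true;
	}

	return false;
}

Here the SPTE was already set to REMOVED_SPTE by tdp_mmu_set_spte_atomic(), a value with no volatile bits, so the check would always choose the non-atomic path anyway.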
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
  * unless performing certain dirty logging operations.
  * Leaving record_dirty_log unset in that case prevents page
  * writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had volatile bits.
  */
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-			       u64 old_spte, u64 new_spte, gfn_t gfn, int level,
-			       bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+			      bool record_acc_track, bool record_dirty_log)
 {
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	 */
 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

-	kvm_tdp_mmu_write_spte(sptep, new_spte);
+	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 	if (record_dirty_log)
 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
 					      new_spte, level);
+	return old_spte;
 }

 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 {
 	WARN_ON_ONCE(iter->yielded);

-	__tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
-			   new_spte, iter->gfn, iter->level,
-			   record_acc_track, record_dirty_log);
+	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+					    iter->old_spte, new_spte,
+					    iter->gfn, iter->level,
+					    record_acc_track, record_dirty_log);
 }

 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
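Why plumb the return value back into iter->old_spte? A hypothetical caller pattern illustrates the payoff; this snippet is illustrative only and is not part of the patch:

	/*
	 * Hypothetical caller, not from this patch: because
	 * __tdp_mmu_set_spte() now returns the value that was actually
	 * replaced, iter->old_spte reflects any Dirty/Accessed bits a
	 * hardware A/D assist set after the iterator first read the SPTE,
	 * so follow-on bookkeeping does not lose them.
	 */
	tdp_mmu_set_spte(kvm, iter, 0);		/* zap the SPTE */
	if (is_dirty_spte(iter->old_spte))	/* sees a late Dirty bit */
		kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));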