@@ -40,7 +40,17 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
 
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
-        /* Also waits for any queued work items. */
+        /*
+         * Invalidate all roots, which besides the obvious, schedules all roots
+         * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+         * ultimately frees all roots.
+         */
+        kvm_tdp_mmu_invalidate_all_roots(kvm);
+
+        /*
+         * Destroying a workqueue also first flushes the workqueue, i.e. no
+         * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
+         */
         destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
 
         WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
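
The second comment relies on a workqueue guarantee that is easy to miss: destroy_workqueue() drains all pending work items before freeing the queue. A minimal, hypothetical module-style sketch of that ordering (every name below is invented for illustration and is not part of the patch):

#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct fake_root {
        struct work_struct zap_work;
        int id;
};

static void fake_zap_fn(struct work_struct *work)
{
        struct fake_root *root = container_of(work, struct fake_root, zap_work);

        pr_info("zapping root %d\n", root->id);
        kfree(root);
}

static void fake_uninit(struct workqueue_struct *zap_wq)
{
        /*
         * destroy_workqueue() runs every queued fake_zap_fn() to
         * completion before freeing the queue, so no separate
         * flush_workqueue() call is needed here.
         */
        destroy_workqueue(zap_wq);
}
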
@@ -116,16 +126,6 @@ static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root
         queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
 }
 
-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
-{
-        union kvm_mmu_page_role role = page->role;
-        role.invalid = true;
-
-        /* No need to use cmpxchg, only the invalid bit can change. */
-        role.word = xchg(&page->role.word, role.word);
-        return role.invalid;
-}
-
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                           bool shared)
 {
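
For context on why the xchg()-based helper becomes dead code: after this patch, roots are marked invalid only with mmu_lock held for write (or during VM teardown when nothing else can touch them), so a plain store to role.invalid suffices and kvm_tdp_mmu_put_root() no longer races to be the first to mark a root invalid. The reader side stays a plain try-get; a simplified model of that behavior, with hypothetical names:

#include <linux/refcount.h>

/*
 * Model of reader-side acquisition: a reader may use a root only if it
 * can bump the refcount from a non-zero value.  Once the count hits
 * zero the root is dead and cannot be resurrected, which is what
 * previously forced put_root() to "re-add" a reference before handing
 * an invalidated root to the zap worker.
 */
static bool fake_try_get_root(refcount_t *count)
{
        return refcount_inc_not_zero(count);
}
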
@@ -134,45 +134,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
         if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                 return;
 
-        WARN_ON(!is_tdp_mmu_page(root));
-
         /*
-         * The root now has refcount=0.  It is valid, but readers already
-         * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
-         * rejects it.  This remains true for the rest of the execution
-         * of this function, because readers visit valid roots only
-         * (except for tdp_mmu_zap_root_work(), which however
-         * does not acquire any reference itself).
-         *
-         * Even though there are flows that need to visit all roots for
-         * correctness, they all take mmu_lock for write, so they cannot yet
-         * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
-         * since the root still has refcount=0.
-         *
-         * However, tdp_mmu_zap_root can yield, and writers do not expect to
-         * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
-         * So the root temporarily gets an extra reference, going to refcount=1
-         * while staying invalid.  Readers still cannot acquire any reference;
-         * but writers are now allowed to run if tdp_mmu_zap_root yields and
-         * they might take an extra reference if they themselves yield.
-         * Therefore, when the reference is given back by the worker,
-         * there is no guarantee that the refcount is still 1.  If not, whoever
-         * puts the last reference will free the page, but they will not have to
-         * zap the root because a root cannot go from invalid to valid.
+         * The TDP MMU itself holds a reference to each root until the root is
+         * explicitly invalidated, i.e. the final reference should never be
+         * put for a valid root.
          */
-        if (!kvm_tdp_root_mark_invalid(root)) {
-                refcount_set(&root->tdp_mmu_root_count, 1);
-
-                /*
-                 * Zapping the root in a worker is not just "nice to have";
-                 * it is required because kvm_tdp_mmu_invalidate_all_roots()
-                 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
-                 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
-                 * might return with some roots not zapped yet.
-                 */
-                tdp_mmu_schedule_zap_root(kvm, root);
-                return;
-        }
+        KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
 
         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_del_rcu(&root->link);
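
With the special cases gone, the put path reduces to a familiar shape: the last reference frees the object, under the invariant that the subsystem's own reference pins every valid object. A self-contained sketch of that shape, using invented types that merely model the real code:

#include <linux/bug.h>
#include <linux/rculist.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct fake_root2 {
        struct list_head link;
        struct rcu_head rcu;
        refcount_t count;
        bool invalid;
};

static void fake_put_root(struct fake_root2 *root, spinlock_t *lock)
{
        if (!refcount_dec_and_test(&root->count))
                return;

        /*
         * The subsystem's own reference pins every valid root, so the
         * count can only hit zero after the root has been invalidated.
         */
        WARN_ON(!root->invalid);

        spin_lock(lock);
        list_del_rcu(&root->link);
        spin_unlock(lock);

        /* Readers may still be walking the list under RCU; defer the free. */
        kfree_rcu(root, rcu);
}
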
@@ -320,7 +287,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
         root = tdp_mmu_alloc_sp(vcpu);
         tdp_mmu_init_sp(root, NULL, 0, role);
 
-        refcount_set(&root->tdp_mmu_root_count, 1);
+        /*
+         * TDP MMU roots are kept until they are explicitly invalidated, either
+         * by a memslot update or by the destruction of the VM.  Initialize the
+         * refcount to two; one reference for the vCPU, and one reference for
+         * the TDP MMU itself, which is held until the root is invalidated and
+         * is ultimately put by tdp_mmu_zap_root_work().
+         */
+        refcount_set(&root->tdp_mmu_root_count, 2);
 
         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
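
The refcount-of-two initialization encodes the root's entire lifecycle. Spelled out as a hypothetical sequence, reusing fake_put_root() from the sketch above (in the real code the vCPU's reference is put when the root is freed, and the TDP MMU's reference by tdp_mmu_zap_root_work()):

static void fake_lifecycle(struct fake_root2 *root, spinlock_t *lock)
{
        refcount_set(&root->count, 2);  /* 1 for the vCPU, 1 for the TDP MMU */
        /* ... the vCPU runs with this root loaded ... */
        fake_put_root(root, lock);      /* vCPU unloads the root: 2 -> 1 */
        root->invalid = true;           /* memslot update or VM destruction */
        /* ... the zap worker tears down the page tables, then ... */
        fake_put_root(root, lock);      /* TDP MMU's reference: 1 -> 0, freed */
}
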
@@ -946,32 +920,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 /*
  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
  * is about to be zapped, e.g. in response to a memslots update.  The actual
- * zapping is performed asynchronously, so a reference is taken on all roots.
- * Using a separate workqueue makes it easy to ensure that the destruction is
- * performed before the "fast zap" completes, without keeping a separate list
- * of invalidated roots; the list is effectively the list of work items in
- * the workqueue.
- *
- * Get a reference even if the root is already invalid, the asynchronous worker
- * assumes it was gifted a reference to the root it processes.  Because mmu_lock
- * is held for write, it should be impossible to observe a root with zero refcount,
- * i.e. the list of roots cannot be stale.
+ * zapping is performed asynchronously.  Using a separate workqueue makes it
+ * easy to ensure that the destruction is performed before the "fast zap"
+ * completes, without keeping a separate list of invalidated roots; the list is
+ * effectively the list of work items in the workqueue.
  *
- * This has essentially the same effect for the TDP MMU
- * as updating mmu_valid_gen does for the shadow MMU.
+ * Note, the asynchronous worker is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_get_vcpu_root_hpa().
  */
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 {
         struct kvm_mmu_page *root;
 
-        lockdep_assert_held_write(&kvm->mmu_lock);
-        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
-                if (!root->role.invalid &&
-                    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
+        /*
+         * mmu_lock must be held for write to ensure that a root doesn't become
+         * invalid while there are active readers (invalidating a root while
+         * there are active readers may or may not be problematic in practice,
+         * but it's uncharted territory and not supported).
+         *
+         * Waive the assertion if there are no users of @kvm, i.e. the VM is
+         * being destroyed after all references have been put, or if no vCPUs
+         * have been created (which means there are no roots), i.e. the VM is
+         * being destroyed in an error path of KVM_CREATE_VM.
+         */
+        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+            refcount_read(&kvm->users_count) && kvm->created_vcpus)
+                lockdep_assert_held_write(&kvm->mmu_lock);
+
+        /*
+         * As above, mmu_lock isn't held when destroying the VM!  There can't
+         * be other references to @kvm, i.e. nothing else can invalidate roots
+         * or be consuming roots, but walking the list of roots does need to be
+         * guarded against roots being deleted by the asynchronous zap worker.
+         */
+        rcu_read_lock();
+
+        list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+                if (!root->role.invalid) {
                         root->role.invalid = true;
                         tdp_mmu_schedule_zap_root(kvm, root);
                 }
         }
+
+        rcu_read_unlock();
 }
 
 /*
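
The rewritten function is a textbook RCU list walk: the zap worker may concurrently unlink entries with list_del_rcu() and free them only after a grace period, so holding the RCU read lock keeps every visited entry allocated for the duration of the traversal. A minimal sketch of the same pattern, reusing the invented fake_root2 type from above:

#include <linux/rculist.h>

static void fake_invalidate_all(struct list_head *roots)
{
        struct fake_root2 *root;

        rcu_read_lock();
        list_for_each_entry_rcu(root, roots, link) {
                /*
                 * Entries seen here cannot be freed until
                 * rcu_read_unlock(), though the walk may or may not
                 * observe entries added or removed concurrently.
                 */
                if (!root->invalid)
                        root->invalid = true;
        }
        rcu_read_unlock();
}
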