@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 				   CONFIG_PGTABLE_LEVELS),
 		.mm_ops		= &kvm_user_mm_ops,
 	};
+	unsigned long flags;
 	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
 	u32 level = ~0;
 	int ret;
 
+	/*
+	 * Disable IRQs so that we hazard against a concurrent
+	 * teardown of the userspace page tables (which relies on
+	 * IPI-ing threads).
+	 */
+	local_irq_save(flags);
 	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
-	VM_BUG_ON(ret);
-	VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
-	VM_BUG_ON(!(pte & PTE_VALID));
+	local_irq_restore(flags);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Not seeing an error, but not updating level? Something went
+	 * deeply wrong...
+	 */
+	if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
+		return -EFAULT;
+
+	/* Oops, the userspace PTs are gone... Replay the fault */
+	if (!kvm_pte_valid(pte))
+		return -EAGAIN;
 
 	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
 }
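The walk still ends by converting the leaf level into a mapping size via ARM64_HW_PGTABLE_LEVEL_SHIFT(). As a worked example, here is a standalone sketch of that computation, assuming 4KiB pages; the macro body is the one from the arm64 headers, everything else is illustrative only.

/*
 * Level-to-size computation, assuming PAGE_SHIFT == 12 (4KiB pages).
 * ARM64_HW_PGTABLE_LEVEL_SHIFT() is copied from the arm64 headers.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n) \
	((PAGE_SHIFT - 3) * (4 - (n)) + 3)

int main(void)
{
	/* Level 3 leaf: 1 << 12 = 4KiB; level 2: 2MiB; level 1: 1GiB */
	for (int level = 1; level <= 3; level++)
		printf("level %d -> %lu bytes\n", level,
		       1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
	return 0;
}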
@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  *
  * Returns the size of the mapping.
  */
-static unsigned long
+static long
 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			    unsigned long hva, kvm_pfn_t *pfnp,
 			    phys_addr_t *ipap)
@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	 * sure that the HVA and IPA are sufficiently aligned and that the
 	 * block map is contained within the memslot.
 	 */
-	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
-	    get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
+	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+		int sz = get_user_mapping_size(kvm, hva);
+
+		if (sz < 0)
+			return sz;
+
+		if (sz < PMD_SIZE)
+			return PAGE_SIZE;
+
 		/*
 		 * The address we faulted on is backed by a transparent huge
 		 * page. However, because we map the compound huge page and
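This hunk is also why the return type flipped to a signed long above: with the old unsigned long, a propagated errno such as the new -EAGAIN would wrap to a huge positive value and sail past any size comparison unnoticed. A standalone userspace illustration of the hazard (EAGAIN and PMD_SIZE here are stand-ins matching the usual Linux values, not kernel definitions):

#include <stdio.h>

#define EAGAIN		11
#define PMD_SIZE	(1UL << 21)	/* 2MiB, assuming 4KiB pages */

int main(void)
{
	/* An errno smuggled through an unsigned return value... */
	unsigned long usz = (unsigned long)-EAGAIN;

	/* ...wraps and masquerades as an enormous "mapping size". */
	if (usz >= PMD_SIZE)
		printf("bug: -EAGAIN looks like a %lx-byte mapping\n", usz);

	/* With a signed type, the error stays distinguishable. */
	long sz = -EAGAIN;
	if (sz < 0)
		printf("ok: error %ld propagates to the caller\n", sz);

	return 0;
}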
@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
 	int ret = 0;
 	bool write_fault, writable, force_pte = false;
-	bool exec_fault;
+	bool exec_fault, mte_allowed;
 	bool device = false;
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu->kvm;
@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
 	unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
-	unsigned long vma_pagesize, fault_granule;
+	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 
@@ -1217,6 +1243,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
+	/*
+	 * Permission faults just need to update the existing leaf entry,
+	 * and so normally don't require allocations from the memcache. The
+	 * only exception to this is when dirty logging is enabled at runtime
+	 * and a write fault needs to collapse a block entry into a table.
+	 */
+	if (fault_status != ESR_ELx_FSC_PERM ||
+	    (logging_active && write_fault)) {
+		ret = kvm_mmu_topup_memory_cache(memcache,
+						 kvm_mmu_cache_min_pages(kvm));
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
 	 * get block mapping for device MMIO region.
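Hoisting the memcache top-up to this point means the sleepable allocation now happens before mmap_read_lock() is taken further down, which in turn is what allows the mmu_invalidate_seq read to move before the unlock in the next hunk. The general shape of the pattern, as a sketch (the topup helper and the lock are the real KVM ones; the wrapper function is illustrative):

/*
 * Sketch of the allocate-then-consume pattern. Table pages are
 * allocated here, where sleeping is legal, and only consumed later
 * under kvm->mmu_lock, where it is not.
 */
static int topup_then_map(struct kvm_vcpu *vcpu,
			  struct kvm_mmu_memory_cache *memcache)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;

	/* May sleep: pre-fill the cache for the worst-case walk depth. */
	ret = kvm_mmu_topup_memory_cache(memcache,
					 kvm_mmu_cache_min_pages(kvm));
	if (ret)
		return ret;

	read_lock(&kvm->mmu_lock);
	/*
	 * The stage-2 map call would go here; any intermediate table it
	 * needs is popped from memcache instead of being allocated.
	 */
	read_unlock(&kvm->mmu_lock);

	return 0;
}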
@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	fault_ipa &= ~(vma_pagesize - 1);
 
 	gfn = fault_ipa >> PAGE_SHIFT;
-	mmap_read_unlock(current->mm);
+	mte_allowed = kvm_vma_mte_allowed(vma);
 
-	/*
-	 * Permission faults just need to update the existing leaf entry,
-	 * and so normally don't require allocations from the memcache. The
-	 * only exception to this is when dirty logging is enabled at runtime
-	 * and a write fault needs to collapse a block entry into a table.
-	 */
-	if (fault_status != ESR_ELx_FSC_PERM ||
-	    (logging_active && write_fault)) {
-		ret = kvm_mmu_topup_memory_cache(memcache,
-						 kvm_mmu_cache_min_pages(kvm));
-		if (ret)
-			return ret;
-	}
+	/* Don't use the VMA after the unlock -- it may have vanished */
+	vma = NULL;
 
-	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	/*
-	 * Ensure the read of mmu_invalidate_seq happens before we call
-	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-	 * the page we just got a reference to gets unmapped before we have a
-	 * chance to grab the mmu_lock, which ensure that if the page gets
-	 * unmapped afterwards, the call to kvm_unmap_gfn will take it away
-	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
-	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
+	 * Read mmu_invalidate_seq so that KVM can detect if the results of
+	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+	 * acquiring kvm->mmu_lock.
 	 *
-	 * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
-	 * used to avoid unnecessary overhead introduced to locate the memory
-	 * slot because it's always fixed even @gfn is adjusted for huge pages.
+	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+	 * with the smp_wmb() in kvm_mmu_invalidate_end().
 	 */
-	smp_rmb();
+	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+	mmap_read_unlock(current->mm);
 
 	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
 				   write_fault, &writable, NULL);
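The counterpart to this reordered read is a retry check taken later under kvm->mmu_lock, outside the lines shown here: if an invalidation ran between the seq read and lock acquisition, the count will have advanced and the fault is replayed. A condensed sketch of the whole protocol (abridged, with locals and error paths trimmed; not the verbatim handler):

/*
 * mmap_read_unlock() provides the implicit smp_rmb() that keeps the
 * seq read ahead of the pfn lookup; kvm_mmu_invalidate_end()
 * supplies the pairing smp_wmb().
 */
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);		/* implies smp_rmb() */

	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
				   write_fault, &writable, NULL);

	read_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		/* An invalidation raced with us: drop everything, replay. */
		ret = -EAGAIN;
		goto out_unlock;
	}
	/* From here, invalidations serialize against us via mmu_lock. */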
@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
 							   hva, &pfn,
 							   &fault_ipa);
+
+		if (vma_pagesize < 0) {
+			ret = vma_pagesize;
+			goto out_unlock;
+		}
 	}
 
 	if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
 		/* Check the VMM hasn't introduced a new disallowed VMA */
-		if (kvm_vma_mte_allowed(vma)) {
+		if (mte_allowed) {
 			sanitise_mte_tags(kvm, pfn, vma_pagesize);
 		} else {
 			ret = -EFAULT;
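The mte_allowed local consumed here was snapshotted in an earlier hunk while mmap_lock was still held, and the vma = NULL poisoning backs up the rule that comment states: extract whatever the fault path needs from the VMA before the unlock, and carry it in locals afterwards. A minimal sketch of that rule (illustrative wrapper; the locking, lookup, and kvm_vma_mte_allowed() helper are the real ones):

/*
 * Everything needed from the VMA is captured into locals under
 * mmap_read_lock(); after the unlock, the VMA pointer is dead and
 * must not be dereferenced.
 */
static bool vma_allows_mte(struct mm_struct *mm, unsigned long hva)
{
	struct vm_area_struct *vma;
	bool mte_allowed = false;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, hva);
	if (vma)
		mte_allowed = kvm_vma_mte_allowed(vma);
	vma = NULL;	/* don't use the VMA after the unlock */
	mmap_read_unlock(mm);

	return mte_allowed;
}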