@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
28042804 const struct kvm_memory_slot * slot )
28052805{
28062806 unsigned long hva ;
2807- pte_t * pte ;
2808- int level ;
2807+ unsigned long flags ;
2808+ int level = PG_LEVEL_4K ;
2809+ pgd_t pgd ;
2810+ p4d_t p4d ;
2811+ pud_t pud ;
2812+ pmd_t pmd ;
28092813
28102814 if (!PageCompound (pfn_to_page (pfn )) && !kvm_is_zone_device_pfn (pfn ))
28112815 return PG_LEVEL_4K ;
@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
28202824 */
28212825 hva = __gfn_to_hva_memslot (slot , gfn );
28222826
2823- pte = lookup_address_in_mm (kvm -> mm , hva , & level );
2824- if (unlikely (!pte ))
2825- return PG_LEVEL_4K ;
2827+ /*
2828+ * Lookup the mapping level in the current mm. The information
2829+ * may become stale soon, but it is safe to use as long as
2830+ * 1) mmu_notifier_retry was checked after taking mmu_lock, and
2831+ * 2) mmu_lock is taken now.
2832+ *
2833+ * We still need to disable IRQs to prevent concurrent tear down
2834+ * of page tables.
2835+ */
2836+ local_irq_save (flags );
2837+
2838+ pgd = READ_ONCE (* pgd_offset (kvm -> mm , hva ));
2839+ if (pgd_none (pgd ))
2840+ goto out ;
2841+
2842+ p4d = READ_ONCE (* p4d_offset (& pgd , hva ));
2843+ if (p4d_none (p4d ) || !p4d_present (p4d ))
2844+ goto out ;
28262845
2846+ pud = READ_ONCE (* pud_offset (& p4d , hva ));
2847+ if (pud_none (pud ) || !pud_present (pud ))
2848+ goto out ;
2849+
2850+ if (pud_large (pud )) {
2851+ level = PG_LEVEL_1G ;
2852+ goto out ;
2853+ }
2854+
2855+ pmd = READ_ONCE (* pmd_offset (& pud , hva ));
2856+ if (pmd_none (pmd ) || !pmd_present (pmd ))
2857+ goto out ;
2858+
2859+ if (pmd_large (pmd ))
2860+ level = PG_LEVEL_2M ;
2861+
2862+ out :
2863+ local_irq_restore (flags );
28272864 return level ;
28282865}
28292866
@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
29923029 /*
29933030 * If MMIO caching is disabled, emulate immediately without
29943031 * touching the shadow page tables as attempting to install an
2995- * MMIO SPTE will just be an expensive nop.
3032+ * MMIO SPTE will just be an expensive nop. Do not cache MMIO
3033+ * whose gfn is greater than host.MAXPHYADDR, any guest that
3034+ * generates such gfns is running nested and is being tricked
3035+ * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3036+ * and only if L1's MAXPHYADDR is inaccurate with respect to
3037+ * the hardware's).
29963038 */
2997- if (unlikely (!shadow_mmio_value )) {
3039+ if (unlikely (!shadow_mmio_value ) ||
3040+ unlikely (fault -> gfn > kvm_mmu_max_gfn ())) {
29983041 * ret_val = RET_PF_EMULATE ;
29993042 return true;
30003043 }