@@ -3262,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
32623262{
32633263 gva_t gva = fault -> is_tdp ? 0 : fault -> addr ;
32643264
3265+ if (fault -> is_private ) {
3266+ kvm_mmu_prepare_memory_fault_exit (vcpu , fault );
3267+ return - EFAULT ;
3268+ }
3269+
32653270 vcpu_cache_mmio_info (vcpu , gva , fault -> gfn ,
32663271 access & shadow_mmio_access_mask );
32673272
3273+ fault -> slot = NULL ;
3274+ fault -> pfn = KVM_PFN_NOSLOT ;
3275+ fault -> map_writable = false;
3276+ fault -> hva = KVM_HVA_ERR_BAD ;
3277+
32683278 /*
32693279 * If MMIO caching is disabled, emulate immediately without
32703280 * touching the shadow page tables as attempting to install an
@@ -4207,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
42074217 return (vcpu -> arch .apf .id ++ << 12 ) | vcpu -> vcpu_id ;
42084218}
42094219
4210- static bool kvm_arch_setup_async_pf (struct kvm_vcpu * vcpu , gpa_t cr2_or_gpa ,
4211- gfn_t gfn )
4220+ static bool kvm_arch_setup_async_pf (struct kvm_vcpu * vcpu ,
4221+ struct kvm_page_fault * fault )
42124222{
42134223 struct kvm_arch_async_pf arch ;
42144224
42154225 arch .token = alloc_apf_token (vcpu );
4216- arch .gfn = gfn ;
4226+ arch .gfn = fault -> gfn ;
4227+ arch .error_code = fault -> error_code ;
42174228 arch .direct_map = vcpu -> arch .mmu -> root_role .direct ;
42184229 arch .cr3 = kvm_mmu_get_guest_pgd (vcpu , vcpu -> arch .mmu );
42194230
4220- return kvm_setup_async_pf (vcpu , cr2_or_gpa ,
4221- kvm_vcpu_gfn_to_hva (vcpu , gfn ), & arch );
4231+ return kvm_setup_async_pf (vcpu , fault -> addr ,
4232+ kvm_vcpu_gfn_to_hva (vcpu , fault -> gfn ), & arch );
42224233}
42234234
42244235void kvm_arch_async_page_ready (struct kvm_vcpu * vcpu , struct kvm_async_pf * work )
42254236{
42264237 int r ;
42274238
4239+ if (WARN_ON_ONCE (work -> arch .error_code & PFERR_PRIVATE_ACCESS ))
4240+ return ;
4241+
42284242 if ((vcpu -> arch .mmu -> root_role .direct != work -> arch .direct_map ) ||
42294243 work -> wakeup_all )
42304244 return ;
@@ -4237,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
42374251 work -> arch .cr3 != kvm_mmu_get_guest_pgd (vcpu , vcpu -> arch .mmu ))
42384252 return ;
42394253
4240- kvm_mmu_do_page_fault (vcpu , work -> cr2_or_gpa , 0 , true, NULL );
4254+ kvm_mmu_do_page_fault (vcpu , work -> cr2_or_gpa , work -> arch . error_code , true, NULL );
42414255}
42424256
42434257static inline u8 kvm_max_level_for_order (int order )
@@ -4257,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order)
42574271 return PG_LEVEL_4K ;
42584272}
42594273
4260- static void kvm_mmu_prepare_memory_fault_exit (struct kvm_vcpu * vcpu ,
4261- struct kvm_page_fault * fault )
4262- {
4263- kvm_prepare_memory_fault_exit (vcpu , fault -> gfn << PAGE_SHIFT ,
4264- PAGE_SIZE , fault -> write , fault -> exec ,
4265- fault -> is_private );
4266- }
4267-
42684274static int kvm_faultin_pfn_private (struct kvm_vcpu * vcpu ,
42694275 struct kvm_page_fault * fault )
42704276{
@@ -4291,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
42914297
42924298static int __kvm_faultin_pfn (struct kvm_vcpu * vcpu , struct kvm_page_fault * fault )
42934299{
4294- struct kvm_memory_slot * slot = fault -> slot ;
42954300 bool async ;
42964301
4297- /*
4298- * Retry the page fault if the gfn hit a memslot that is being deleted
4299- * or moved. This ensures any existing SPTEs for the old memslot will
4300- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4301- */
4302- if (slot && (slot -> flags & KVM_MEMSLOT_INVALID ))
4303- return RET_PF_RETRY ;
4304-
4305- if (!kvm_is_visible_memslot (slot )) {
4306- /* Don't expose private memslots to L2. */
4307- if (is_guest_mode (vcpu )) {
4308- fault -> slot = NULL ;
4309- fault -> pfn = KVM_PFN_NOSLOT ;
4310- fault -> map_writable = false;
4311- return RET_PF_CONTINUE ;
4312- }
4313- /*
4314- * If the APIC access page exists but is disabled, go directly
4315- * to emulation without caching the MMIO access or creating a
4316- * MMIO SPTE. That way the cache doesn't need to be purged
4317- * when the AVIC is re-enabled.
4318- */
4319- if (slot && slot -> id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4320- !kvm_apicv_activated (vcpu -> kvm ))
4321- return RET_PF_EMULATE ;
4322- }
4323-
4324- if (fault -> is_private != kvm_mem_is_private (vcpu -> kvm , fault -> gfn )) {
4325- kvm_mmu_prepare_memory_fault_exit (vcpu , fault );
4326- return - EFAULT ;
4327- }
4328-
43294302 if (fault -> is_private )
43304303 return kvm_faultin_pfn_private (vcpu , fault );
43314304
43324305 async = false;
4333- fault -> pfn = __gfn_to_pfn_memslot (slot , fault -> gfn , false, false, & async ,
4334- fault -> write , & fault -> map_writable ,
4335- & fault -> hva );
4306+ fault -> pfn = __gfn_to_pfn_memslot (fault -> slot , fault -> gfn , false, false,
4307+ & async , fault -> write ,
4308+ & fault -> map_writable , & fault -> hva );
43364309 if (!async )
43374310 return RET_PF_CONTINUE ; /* *pfn has correct page already */
43384311
@@ -4342,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
43424315 trace_kvm_async_pf_repeated_fault (fault -> addr , fault -> gfn );
43434316 kvm_make_request (KVM_REQ_APF_HALT , vcpu );
43444317 return RET_PF_RETRY ;
4345- } else if (kvm_arch_setup_async_pf (vcpu , fault -> addr , fault -> gfn )) {
4318+ } else if (kvm_arch_setup_async_pf (vcpu , fault )) {
43464319 return RET_PF_RETRY ;
43474320 }
43484321 }
@@ -4352,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
43524325 * to wait for IO. Note, gup always bails if it is unable to quickly
43534326 * get a page and a fatal signal, i.e. SIGKILL, is pending.
43544327 */
4355- fault -> pfn = __gfn_to_pfn_memslot (slot , fault -> gfn , false, true, NULL ,
4356- fault -> write , & fault -> map_writable ,
4357- & fault -> hva );
4328+ fault -> pfn = __gfn_to_pfn_memslot (fault -> slot , fault -> gfn , false, true,
4329+ NULL , fault -> write ,
4330+ & fault -> map_writable , & fault -> hva );
43584331 return RET_PF_CONTINUE ;
43594332}
43604333
43614334static int kvm_faultin_pfn (struct kvm_vcpu * vcpu , struct kvm_page_fault * fault ,
43624335 unsigned int access )
43634336{
4337+ struct kvm_memory_slot * slot = fault -> slot ;
43644338 int ret ;
43654339
4340+ /*
4341+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
4342+ * change in attributes. is_page_fault_stale() will detect an
4343+ * invalidation related to fault -> gfn and resume the guest without
4344+ * installing a mapping in the page tables.
4345+ */
4346+ fault -> mmu_seq = vcpu -> kvm -> mmu_invalidate_seq ;
4347+ smp_rmb ();
4348+
4349+ /*
4350+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
4351+ * private vs. shared mismatch.
4352+ */
4353+ if (fault -> is_private != kvm_mem_is_private (vcpu -> kvm , fault -> gfn )) {
4354+ kvm_mmu_prepare_memory_fault_exit (vcpu , fault );
4355+ return - EFAULT ;
4356+ }
4357+
4358+ if (unlikely (!slot ))
4359+ return kvm_handle_noslot_fault (vcpu , fault , access );
4360+
4361+ /*
4362+ * Retry the page fault if the gfn hit a memslot that is being deleted
4363+ * or moved. This ensures any existing SPTEs for the old memslot will
4364+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4365+ */
4366+ if (slot -> flags & KVM_MEMSLOT_INVALID )
4367+ return RET_PF_RETRY ;
4368+
4369+ if (slot -> id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT ) {
4370+ /*
4371+ * Don't map L1's APIC access page into L2, KVM doesn't support
4372+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
4373+ * i.e. the access needs to be emulated. Emulating access to
4374+ * L1's APIC is also correct if L1 is accelerating L2's own
4375+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
4376+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
4377+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
4378+ * uses different roots for L1 vs. L2, i.e. there is no danger
4379+ * of breaking APICv/AVIC for L1.
4380+ */
4381+ if (is_guest_mode (vcpu ))
4382+ return kvm_handle_noslot_fault (vcpu , fault , access );
4383+
4384+ /*
4385+ * If the APIC access page exists but is disabled, go directly
4386+ * to emulation without caching the MMIO access or creating a
4387+ * MMIO SPTE. That way the cache doesn't need to be purged
4388+ * when the AVIC is re-enabled.
4389+ */
4390+ if (!kvm_apicv_activated (vcpu -> kvm ))
4391+ return RET_PF_EMULATE ;
4392+ }
4393+
43664394 fault -> mmu_seq = vcpu -> kvm -> mmu_invalidate_seq ;
43674395 smp_rmb ();
43684396
@@ -4387,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
43874415 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
43884416 * to detect retry guarantees the worst case latency for the vCPU.
43894417 */
4390- if (fault -> slot &&
4391- mmu_invalidate_retry_gfn_unsafe (vcpu -> kvm , fault -> mmu_seq , fault -> gfn ))
4418+ if (mmu_invalidate_retry_gfn_unsafe (vcpu -> kvm , fault -> mmu_seq , fault -> gfn ))
43924419 return RET_PF_RETRY ;
43934420
43944421 ret = __kvm_faultin_pfn (vcpu , fault );
@@ -4398,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
43984425 if (unlikely (is_error_pfn (fault -> pfn )))
43994426 return kvm_handle_error_pfn (vcpu , fault );
44004427
4401- if (unlikely (!fault -> slot ))
4428+ if (WARN_ON_ONCE (!fault -> slot || is_noslot_pfn ( fault -> pfn ) ))
44024429 return kvm_handle_noslot_fault (vcpu , fault , access );
44034430
44044431 /*
@@ -4509,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
45094536 if (WARN_ON_ONCE (fault_address >> 32 ))
45104537 return - EFAULT ;
45114538#endif
4539+ /*
4540+ * Legacy #PF exceptions only have a 32-bit error code. Simply drop the
4541+ * upper bits as KVM doesn't use them for #PF (because they are never
4542+ * set), and to ensure there are no collisions with KVM-defined bits.
4543+ */
4544+ if (WARN_ON_ONCE (error_code >> 32 ))
4545+ error_code = lower_32_bits (error_code );
4546+
4547+ /* Ensure the above sanity check also covers KVM-defined flags. */
4548+ BUILD_BUG_ON (lower_32_bits (PFERR_SYNTHETIC_MASK ));
45124549
45134550 vcpu -> arch .l1tf_flush_l1d = true;
45144551 if (!flags ) {
@@ -5794,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
57945831 int r , emulation_type = EMULTYPE_PF ;
57955832 bool direct = vcpu -> arch .mmu -> root_role .direct ;
57965833
5797- /*
5798- * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
5799- * checks when emulating instructions that triggers implicit access.
5800- * WARN if hardware generates a fault with an error code that collides
5801- * with the KVM-defined value. Clear the flag and continue on, i.e.
5802- * don't terminate the VM, as KVM can't possibly be relying on a flag
5803- * that KVM doesn't know about.
5804- */
5805- if (WARN_ON_ONCE (error_code & PFERR_IMPLICIT_ACCESS ))
5806- error_code &= ~PFERR_IMPLICIT_ACCESS ;
5807-
58085834 if (WARN_ON_ONCE (!VALID_PAGE (vcpu -> arch .mmu -> root .hpa )))
58095835 return RET_PF_RETRY ;
58105836
5837+ /*
5838+ * Except for reserved faults (emulated MMIO is shared-only), set the
5839+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
5840+ * current attributes, which are the source of truth for such VMs. Note,
5841+ * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
5842+ * currently support nested virtualization (among many other things)
5843+ * for software-protected VMs.
5844+ */
5845+ if (IS_ENABLED (CONFIG_KVM_SW_PROTECTED_VM ) &&
5846+ !(error_code & PFERR_RSVD_MASK ) &&
5847+ vcpu -> kvm -> arch .vm_type == KVM_X86_SW_PROTECTED_VM &&
5848+ kvm_mem_is_private (vcpu -> kvm , gpa_to_gfn (cr2_or_gpa )))
5849+ error_code |= PFERR_PRIVATE_ACCESS ;
5850+
58115851 r = RET_PF_INVALID ;
58125852 if (unlikely (error_code & PFERR_RSVD_MASK )) {
5853+ if (WARN_ON_ONCE (error_code & PFERR_PRIVATE_ACCESS ))
5854+ return - EFAULT ;
5855+
58135856 r = handle_mmio_page_fault (vcpu , cr2_or_gpa , direct );
58145857 if (r == RET_PF_EMULATE )
58155858 goto emulate ;
58165859 }
58175860
58185861 if (r == RET_PF_INVALID ) {
5819- r = kvm_mmu_do_page_fault (vcpu , cr2_or_gpa ,
5820- lower_32_bits (error_code ), false,
5862+ r = kvm_mmu_do_page_fault (vcpu , cr2_or_gpa , error_code , false,
58215863 & emulation_type );
58225864 if (KVM_BUG_ON (r == RET_PF_INVALID , vcpu -> kvm ))
58235865 return - EIO ;
0 commit comments