
Commit a3522ac

sean-jc authored and bonzini committed
KVM: x86/mmu: Enforce guest_memfd's max order when recovering hugepages
Rework kvm_mmu_max_mapping_level() to provide the plumbing to consult guest_memfd (and relevant vendor code) when recovering hugepages, e.g. after disabling live migration. The flaw has existed since guest_memfd was originally added, but has gone unnoticed due to lack of guest_memfd support for hugepages or dirty logging.

Don't actually call into guest_memfd at this time, as it's unclear what the API should be. Ideally, KVM would simply use kvm_gmem_get_pfn(), but invoking kvm_gmem_get_pfn() would lead to sleeping in atomic context if guest_memfd needed to allocate memory (mmu_lock is held). Luckily, the path isn't actually reachable, so just add a TODO and WARN to ensure the functionality is added alongside guest_memfd hugepage support, and punt the guest_memfd API design question to the future.

Note, calling kvm_mem_is_private() in the non-fault path is safe, so long as mmu_lock is held, as hugepage recovery operates on shadow-present SPTEs, i.e. calling kvm_mmu_max_mapping_level() with @fault=NULL is mutually exclusive with kvm_vm_set_mem_attributes() changing the PRIVATE attribute of the gfn.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
Message-ID: <20250729225455.670324-15-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
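For orientation, here are the two call sites of the reworked helper as they appear in the diff below: the fault path passes the fault so the gmem-resolved pfn/order are reused, while the hugepage-recovery path passes NULL and lets the helper derive the max level and privateness itself.

	/* Fault path (kvm_mmu_hugepage_adjust): reuse the resolved fault info. */
	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
						     fault->slot, fault->gfn);

	/*
	 * Recovery path (recover_huge_pages_range): no fault context, so pass
	 * NULL; the helper then uses PG_LEVEL_NUM and kvm_mem_is_private().
	 */
	max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);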
1 parent 1c3fdf1 commit a3522ac

3 files changed

Lines changed: 47 additions & 35 deletions


arch/x86/kvm/mmu/mmu.c

Lines changed: 45 additions & 33 deletions
@@ -3302,31 +3302,54 @@ static u8 kvm_max_level_for_order(int order)
 	return PG_LEVEL_4K;
 }
 
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
-					u8 max_level, int gmem_order)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+					const struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	u8 req_max_level;
+	u8 max_level, coco_level;
+	kvm_pfn_t pfn;
 
-	if (max_level == PG_LEVEL_4K)
-		return PG_LEVEL_4K;
+	/* For faults, use the gmem information that was resolved earlier. */
+	if (fault) {
+		pfn = fault->pfn;
+		max_level = fault->max_level;
+	} else {
+		/* TODO: Call into guest_memfd once hugepages are supported. */
+		WARN_ONCE(1, "Get pfn+order from guest_memfd");
+		pfn = KVM_PFN_ERR_FAULT;
+		max_level = PG_LEVEL_4K;
+	}
 
-	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
 	if (max_level == PG_LEVEL_4K)
-		return PG_LEVEL_4K;
+		return max_level;
 
-	req_max_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
-	if (req_max_level)
-		max_level = min(max_level, req_max_level);
+	/*
+	 * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
+	 * restrictions.  A return of '0' means "no additional restrictions", to
+	 * allow for using an optional "ret0" static call.
+	 */
+	coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn);
+	if (coco_level)
+		max_level = min(max_level, coco_level);
 
 	return max_level;
 }
 
-static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
-				       const struct kvm_memory_slot *slot,
-				       gfn_t gfn, int max_level, bool is_private)
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+			      const struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	struct kvm_lpage_info *linfo;
-	int host_level;
+	int host_level, max_level;
+	bool is_private;
+
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	if (fault) {
+		max_level = fault->max_level;
+		is_private = fault->is_private;
+	} else {
+		max_level = PG_LEVEL_NUM;
+		is_private = kvm_mem_is_private(kvm, gfn);
+	}
 
 	max_level = min(max_level, max_huge_page_level);
 	for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -3335,25 +3358,16 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
 		break;
 	}
 
-	if (is_private)
-		return max_level;
-
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
-	host_level = host_pfn_mapping_level(kvm, gfn, slot);
+	if (is_private)
+		host_level = kvm_max_private_mapping_level(kvm, fault, slot, gfn);
+	else
+		host_level = host_pfn_mapping_level(kvm, gfn, slot);
 	return min(host_level, max_level);
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
-			      const struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	bool is_private = kvm_slot_has_gmem(slot) &&
-			  kvm_mem_is_private(kvm, gfn);
-
-	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
-}
-
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_memory_slot *slot = fault->slot;
@@ -3374,9 +3388,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * Enforce the iTLB multihit workaround after capturing the requested
 	 * level, which will be used to do precise, accurate accounting.
 	 */
-	fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
-						       fault->gfn, fault->max_level,
-						       fault->is_private);
+	fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
+						     fault->slot, fault->gfn);
 	if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
 		return;
 
@@ -4564,8 +4577,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 	}
 
 	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
-	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
-							 fault->max_level, max_order);
+	fault->max_level = kvm_max_level_for_order(max_order);
 
 	return RET_PF_CONTINUE;
 }
@@ -7165,7 +7177,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 	 * mapping if the indirect sp has level = 1.
 	 */
 	if (sp->role.direct &&
-	    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
+	    sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
 		kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
 		if (kvm_available_flush_remote_tlbs_range())

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 1 addition & 1 deletion
@@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 		return r;
 }
 
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
 			      const struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 1 addition & 1 deletion
@@ -1813,7 +1813,7 @@ static void recover_huge_pages_range(struct kvm *kvm,
 		if (iter.gfn < start || iter.gfn >= end)
 			continue;
 
-		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
+		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
 		if (max_mapping_level < iter.level)
 			continue;
 