Skip to content

Commit ed7ae7a

Browse files
Jessica Liu authored and avpatel committed
RISC-V: KVM: Transparent huge page support
Use block mapping if backed by a THP, as implemented in architectures like ARM and x86_64. Signed-off-by: Jessica Liu <liu.xuemei1@zte.com.cn> Reviewed-by: Anup Patel <anup@brainfault.org> Link: https://lore.kernel.org/r/20251127165137780QbUOVPKPAfWSGAFl5qtRy@zte.com.cn Signed-off-by: Anup Patel <anup@brainfault.org>
1 parent 671995f commit ed7ae7a

2 files changed

Lines changed: 142 additions & 0 deletions

File tree

arch/riscv/kvm/mmu.c

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,142 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
305305
return pte_young(ptep_get(ptep));
306306
}
307307

308+
static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot,
309+
unsigned long hva)
310+
{
311+
hva_t uaddr_start, uaddr_end;
312+
gpa_t gpa_start;
313+
size_t size;
314+
315+
size = memslot->npages * PAGE_SIZE;
316+
uaddr_start = memslot->userspace_addr;
317+
uaddr_end = uaddr_start + size;
318+
319+
gpa_start = memslot->base_gfn << PAGE_SHIFT;
320+
321+
/*
322+
* Pages belonging to memslots that don't have the same alignment
323+
* within a PMD for userspace and GPA cannot be mapped with g-stage
324+
* PMD entries, because we'll end up mapping the wrong pages.
325+
*
326+
* Consider a layout like the following:
327+
*
328+
* memslot->userspace_addr:
329+
* +-----+--------------------+--------------------+---+
330+
* |abcde|fgh vs-stage block | vs-stage block tv|xyz|
331+
* +-----+--------------------+--------------------+---+
332+
*
333+
* memslot->base_gfn << PAGE_SHIFT:
334+
* +---+--------------------+--------------------+-----+
335+
* |abc|def g-stage block | g-stage block |tvxyz|
336+
* +---+--------------------+--------------------+-----+
337+
*
338+
* If we create those g-stage blocks, we'll end up with this incorrect
339+
* mapping:
340+
* d -> f
341+
* e -> g
342+
* f -> h
343+
*/
344+
if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
345+
return false;
346+
347+
/*
348+
* Next, let's make sure we're not trying to map anything not covered
349+
* by the memslot. This means we have to prohibit block size mappings
350+
* for the beginning and end of a non-block aligned and non-block sized
351+
* memory slot (illustrated by the head and tail parts of the
352+
* userspace view above containing pages 'abcde' and 'xyz',
353+
* respectively).
354+
*
355+
* Note that it doesn't matter if we do the check using the
356+
* userspace_addr or the base_gfn, as both are equally aligned (per
357+
* the check above) and equally sized.
358+
*/
359+
return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE));
360+
}
361+
362+
/*
 * Look up the host page-table mapping size backing @hva in @kvm->mm.
 *
 * Returns PUD_SIZE or PMD_SIZE if the host maps @hva with a huge page at
 * that level, PAGE_SIZE otherwise (including when no mapping is present).
 * This is a lockless walk: correctness relies on disabling IRQs and on
 * reading each page-table entry exactly once (see comments below).
 */
static int get_hva_mapping_size(struct kvm *kvm,
				unsigned long hva)
{
	int size = PAGE_SIZE;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = pgdp_get(pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = p4dp_get(p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = pudp_get(pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	if (pud_leaf(pud)) {
		size = PUD_SIZE;
		goto out;
	}

	pmd = pmdp_get(pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (pmd_leaf(pmd))
		size = PMD_SIZE;

out:
	local_irq_restore(flags);
	return size;
}
414+
415+
static unsigned long transparent_hugepage_adjust(struct kvm *kvm,
416+
struct kvm_memory_slot *memslot,
417+
unsigned long hva,
418+
kvm_pfn_t *hfnp, gpa_t *gpa)
419+
{
420+
kvm_pfn_t hfn = *hfnp;
421+
422+
/*
423+
* Make sure the adjustment is done only for THP pages. Also make
424+
* sure that the HVA and GPA are sufficiently aligned and that the
425+
* block map is contained within the memslot.
426+
*/
427+
if (fault_supports_gstage_huge_mapping(memslot, hva)) {
428+
int sz;
429+
430+
sz = get_hva_mapping_size(kvm, hva);
431+
if (sz < PMD_SIZE)
432+
return sz;
433+
434+
*gpa &= PMD_MASK;
435+
hfn &= ~(PTRS_PER_PMD - 1);
436+
*hfnp = hfn;
437+
438+
return PMD_SIZE;
439+
}
440+
441+
return PAGE_SIZE;
442+
}
443+
308444
int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
309445
gpa_t gpa, unsigned long hva, bool is_write,
310446
struct kvm_gstage_mapping *out_map)
@@ -398,6 +534,10 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
398534
if (mmu_invalidate_retry(kvm, mmu_seq))
399535
goto out_unlock;
400536

537+
/* Check if we are backed by a THP and thus use block mapping if possible */
538+
if (vma_pagesize == PAGE_SIZE)
539+
vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa);
540+
401541
if (writable) {
402542
mark_page_dirty_in_slot(kvm, memslot, gfn);
403543
ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,

arch/riscv/mm/pgtable.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pud_t *pud_offset(p4d_t *p4d, unsigned long address)
4747

4848
return (pud_t *)p4d;
4949
}
50+
EXPORT_SYMBOL_GPL(pud_offset);
5051

5152
p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
5253
{
@@ -55,6 +56,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
5556

5657
return (p4d_t *)pgd;
5758
}
59+
EXPORT_SYMBOL_GPL(p4d_offset);
5860
#endif
5961

6062
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

0 commit comments

Comments (0)