Skip to content

Commit b9a66cd

Browse files
Stanislav Kinsburskii authored and liuw committed
mshv: Add support for movable memory regions
Introduce support for movable memory regions in the Hyper-V root partition driver to improve memory management flexibility and enable advanced use cases such as dynamic memory remapping. Mirror the address space between the Linux root partition and guest VMs using HMM. The root partition owns the memory, while guest VMs act as devices with page tables managed via hypercalls. MSHV handles VP intercepts by invoking hmm_range_fault() and updating SLAT entries. When memory is reclaimed, HMM invalidates the relevant regions, prompting MSHV to clear SLAT entries; guest VMs will fault again on access. Integrate mmu_interval_notifier for movable regions, implement handlers for HMM faults and memory invalidation, and update memory region mapping logic to support movable regions. While MMU notifiers are commonly used in virtualization drivers, this implementation leverages HMM (Heterogeneous Memory Management) for its specialized functionality. HMM provides a framework for mirroring, invalidation, and fault handling, reducing boilerplate and improving maintainability compared to generic MMU notifiers. Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Reviewed-by: Nuno Das Neves <nunodasneves@linux.microsoft.com> Signed-off-by: Wei Liu <wei.liu@kernel.org>
1 parent c39dda0 commit b9a66cd

4 files changed

Lines changed: 346 additions & 36 deletions

File tree

drivers/hv/Kconfig

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ config MSHV_ROOT
7676
depends on PAGE_SIZE_4KB
7777
select EVENTFD
7878
select VIRT_XFER_TO_GUEST_WORK
79+
select HMM_MIRROR
80+
select MMU_NOTIFIER
7981
default n
8082
help
8183
Select this option to enable support for booting and running as root

drivers/hv/mshv_regions.c

Lines changed: 212 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
* Authors: Microsoft Linux virtualization team
88
*/
99

10+
#include <linux/hmm.h>
11+
#include <linux/hyperv.h>
1012
#include <linux/kref.h>
1113
#include <linux/mm.h>
1214
#include <linux/vmalloc.h>
@@ -15,6 +17,8 @@
1517

1618
#include "mshv_root.h"
1719

20+
#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
21+
1822
/**
1923
* mshv_region_process_chunk - Processes a contiguous chunk of memory pages
2024
* in a region.
@@ -134,8 +138,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
134138
}
135139

136140
struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
137-
u64 uaddr, u32 flags,
138-
bool is_mmio)
141+
u64 uaddr, u32 flags)
139142
{
140143
struct mshv_mem_region *region;
141144

@@ -152,9 +155,6 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
152155
if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
153156
region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
154157

155-
if (!is_mmio)
156-
region->flags.range_pinned = true;
157-
158158
kref_init(&region->refcount);
159159

160160
return region;
@@ -245,7 +245,7 @@ int mshv_region_map(struct mshv_mem_region *region)
245245
static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
246246
u64 page_offset, u64 page_count)
247247
{
248-
if (region->flags.range_pinned)
248+
if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
249249
unpin_user_pages(region->pages + page_offset, page_count);
250250

251251
memset(region->pages + page_offset, 0,
@@ -321,6 +321,9 @@ static void mshv_region_destroy(struct kref *ref)
321321
struct mshv_partition *partition = region->partition;
322322
int ret;
323323

324+
if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
325+
mshv_region_movable_fini(region);
326+
324327
if (mshv_partition_encrypted(partition)) {
325328
ret = mshv_region_share(region);
326329
if (ret) {
@@ -347,3 +350,206 @@ int mshv_region_get(struct mshv_mem_region *region)
347350
{
348351
return kref_get_unless_zero(&region->refcount);
349352
}
353+
354+
/**
355+
* mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
356+
* @region: Pointer to the memory region structure
357+
* @range: Pointer to the HMM range structure
358+
*
359+
* This function performs the following steps:
360+
* 1. Reads the notifier sequence for the HMM range.
361+
* 2. Acquires a read lock on the memory map.
362+
* 3. Handles HMM faults for the specified range.
363+
* 4. Releases the read lock on the memory map.
364+
* 5. If successful, locks the memory region mutex.
365+
* 6. Verifies if the notifier sequence has changed during the operation.
366+
* If it has, releases the mutex and returns -EBUSY to match with
367+
* hmm_range_fault() return code for repeating.
368+
*
369+
* Return: 0 on success, a negative error code otherwise.
370+
*/
371+
static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
372+
struct hmm_range *range)
373+
{
374+
int ret;
375+
376+
range->notifier_seq = mmu_interval_read_begin(range->notifier);
377+
mmap_read_lock(region->mni.mm);
378+
ret = hmm_range_fault(range);
379+
mmap_read_unlock(region->mni.mm);
380+
if (ret)
381+
return ret;
382+
383+
mutex_lock(&region->mutex);
384+
385+
if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
386+
mutex_unlock(&region->mutex);
387+
cond_resched();
388+
return -EBUSY;
389+
}
390+
391+
return 0;
392+
}
393+
394+
/**
395+
* mshv_region_range_fault - Handle memory range faults for a given region.
396+
* @region: Pointer to the memory region structure.
397+
* @page_offset: Offset of the page within the region.
398+
* @page_count: Number of pages to handle.
399+
*
400+
* This function resolves memory faults for a specified range of pages
401+
* within a memory region. It uses HMM (Heterogeneous Memory Management)
402+
* to fault in the required pages and updates the region's page array.
403+
*
404+
* Return: 0 on success, negative error code on failure.
405+
*/
406+
static int mshv_region_range_fault(struct mshv_mem_region *region,
407+
u64 page_offset, u64 page_count)
408+
{
409+
struct hmm_range range = {
410+
.notifier = &region->mni,
411+
.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
412+
};
413+
unsigned long *pfns;
414+
int ret;
415+
u64 i;
416+
417+
pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
418+
if (!pfns)
419+
return -ENOMEM;
420+
421+
range.hmm_pfns = pfns;
422+
range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
423+
range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
424+
425+
do {
426+
ret = mshv_region_hmm_fault_and_lock(region, &range);
427+
} while (ret == -EBUSY);
428+
429+
if (ret)
430+
goto out;
431+
432+
for (i = 0; i < page_count; i++)
433+
region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
434+
435+
ret = mshv_region_remap_pages(region, region->hv_map_flags,
436+
page_offset, page_count);
437+
438+
mutex_unlock(&region->mutex);
439+
out:
440+
kfree(pfns);
441+
return ret;
442+
}
443+
444+
bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
445+
{
446+
u64 page_offset, page_count;
447+
int ret;
448+
449+
/* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */
450+
page_offset = ALIGN_DOWN(gfn - region->start_gfn,
451+
MSHV_MAP_FAULT_IN_PAGES);
452+
453+
/* Map more pages than requested to reduce the number of faults. */
454+
page_count = min(region->nr_pages - page_offset,
455+
MSHV_MAP_FAULT_IN_PAGES);
456+
457+
ret = mshv_region_range_fault(region, page_offset, page_count);
458+
459+
WARN_ONCE(ret,
460+
"p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
461+
region->partition->pt_id, region->start_uaddr,
462+
region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
463+
gfn, page_offset, page_count);
464+
465+
return !ret;
466+
}
467+
468+
/**
469+
* mshv_region_interval_invalidate - Invalidate a range of memory region
470+
* @mni: Pointer to the mmu_interval_notifier structure
471+
* @range: Pointer to the mmu_notifier_range structure
472+
* @cur_seq: Current sequence number for the interval notifier
473+
*
474+
* This function invalidates a memory region by remapping its pages with
475+
* no access permissions. It locks the region's mutex to ensure thread safety
476+
* and updates the sequence number for the interval notifier. If the range
477+
* is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
478+
* lock and returns false if unsuccessful.
479+
*
480+
* NOTE: Failure to invalidate a region is a serious error, as the pages will
481+
* be considered freed while they are still mapped by the hypervisor.
482+
* Any attempt to access such pages will likely crash the system.
483+
*
484+
* Return: true if the region was successfully invalidated, false otherwise.
485+
*/
486+
static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
487+
const struct mmu_notifier_range *range,
488+
unsigned long cur_seq)
489+
{
490+
struct mshv_mem_region *region = container_of(mni,
491+
struct mshv_mem_region,
492+
mni);
493+
u64 page_offset, page_count;
494+
unsigned long mstart, mend;
495+
int ret = -EPERM;
496+
497+
if (mmu_notifier_range_blockable(range))
498+
mutex_lock(&region->mutex);
499+
else if (!mutex_trylock(&region->mutex))
500+
goto out_fail;
501+
502+
mmu_interval_set_seq(mni, cur_seq);
503+
504+
mstart = max(range->start, region->start_uaddr);
505+
mend = min(range->end, region->start_uaddr +
506+
(region->nr_pages << HV_HYP_PAGE_SHIFT));
507+
508+
page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
509+
page_count = HVPFN_DOWN(mend - mstart);
510+
511+
ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
512+
page_offset, page_count);
513+
if (ret)
514+
goto out_fail;
515+
516+
mshv_region_invalidate_pages(region, page_offset, page_count);
517+
518+
mutex_unlock(&region->mutex);
519+
520+
return true;
521+
522+
out_fail:
523+
WARN_ONCE(ret,
524+
"Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
525+
region->start_uaddr,
526+
region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
527+
range->start, range->end, range->event,
528+
page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
529+
return false;
530+
}
531+
532+
static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
533+
.invalidate = mshv_region_interval_invalidate,
534+
};
535+
536+
void mshv_region_movable_fini(struct mshv_mem_region *region)
537+
{
538+
mmu_interval_notifier_remove(&region->mni);
539+
}
540+
541+
bool mshv_region_movable_init(struct mshv_mem_region *region)
542+
{
543+
int ret;
544+
545+
ret = mmu_interval_notifier_insert(&region->mni, current->mm,
546+
region->start_uaddr,
547+
region->nr_pages << HV_HYP_PAGE_SHIFT,
548+
&mshv_region_mni_ops);
549+
if (ret)
550+
return false;
551+
552+
mutex_init(&region->mutex);
553+
554+
return true;
555+
}

drivers/hv/mshv_root.h

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <linux/hashtable.h>
1616
#include <linux/dev_printk.h>
1717
#include <linux/build_bug.h>
18+
#include <linux/mmu_notifier.h>
1819
#include <uapi/linux/mshv.h>
1920

2021
/*
@@ -70,18 +71,23 @@ do { \
7071
#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
7172
#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
7273

74+
enum mshv_region_type {
75+
MSHV_REGION_TYPE_MEM_PINNED,
76+
MSHV_REGION_TYPE_MEM_MOVABLE,
77+
MSHV_REGION_TYPE_MMIO
78+
};
79+
7380
struct mshv_mem_region {
7481
struct hlist_node hnode;
7582
struct kref refcount;
7683
u64 nr_pages;
7784
u64 start_gfn;
7885
u64 start_uaddr;
7986
u32 hv_map_flags;
80-
struct {
81-
u64 range_pinned: 1;
82-
u64 reserved: 63;
83-
} flags;
8487
struct mshv_partition *partition;
88+
enum mshv_region_type type;
89+
struct mmu_interval_notifier mni;
90+
struct mutex mutex; /* protects region pages remapping */
8591
struct page *pages[];
8692
};
8793

@@ -315,14 +321,16 @@ extern enum hv_scheduler_type hv_scheduler_type;
315321
extern u8 * __percpu *hv_synic_eventring_tail;
316322

317323
struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
318-
u64 uaddr, u32 flags,
319-
bool is_mmio);
324+
u64 uaddr, u32 flags);
320325
int mshv_region_share(struct mshv_mem_region *region);
321326
int mshv_region_unshare(struct mshv_mem_region *region);
322327
int mshv_region_map(struct mshv_mem_region *region);
323328
void mshv_region_invalidate(struct mshv_mem_region *region);
324329
int mshv_region_pin(struct mshv_mem_region *region);
325330
void mshv_region_put(struct mshv_mem_region *region);
326331
int mshv_region_get(struct mshv_mem_region *region);
332+
bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
333+
void mshv_region_movable_fini(struct mshv_mem_region *region);
334+
bool mshv_region_movable_init(struct mshv_mem_region *region);
327335

328336
#endif /* _MSHV_ROOT_H_ */

0 commit comments

Comments
 (0)