@@ -7,6 +7,8 @@
  * Authors: Microsoft Linux virtualization team
  */
 
+#include <linux/hmm.h>
+#include <linux/hyperv.h>
 #include <linux/kref.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
@@ -15,6 +17,8 @@
 
 #include "mshv_root.h"
 
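+/*
+ * Fault in up to a PMD's worth of pages (512 with 4 KiB pages on x86-64)
+ * per GPA intercept to amortize fault-handling cost.
+ */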
+#define MSHV_MAP_FAULT_IN_PAGES	PTRS_PER_PMD
+
 /**
  * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
  * in a region.
@@ -134,8 +138,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
 }
 
 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
-					   u64 uaddr, u32 flags,
-					   bool is_mmio)
+					   u64 uaddr, u32 flags)
 {
 	struct mshv_mem_region *region;
 
@@ -152,9 +155,6 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
 	if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
 		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
 
-	if (!is_mmio)
-		region->flags.range_pinned = true;
-
 	kref_init(&region->refcount);
 
 	return region;
@@ -245,7 +245,7 @@ int mshv_region_map(struct mshv_mem_region *region)
 static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
 					 u64 page_offset, u64 page_count)
 {
-	if (region->flags.range_pinned)
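+	/* Only pinned regions hold long-term page references to drop. */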
+	if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
 		unpin_user_pages(region->pages + page_offset, page_count);
 
 	memset(region->pages + page_offset, 0,
@@ -321,6 +321,9 @@ static void mshv_region_destroy(struct kref *ref)
 	struct mshv_partition *partition = region->partition;
 	int ret;
 
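+	/* Stop MMU-notifier callbacks before the region's pages go away. */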
+	if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+		mshv_region_movable_fini(region);
+
 	if (mshv_partition_encrypted(partition)) {
 		ret = mshv_region_share(region);
 		if (ret) {
@@ -347,3 +350,206 @@ int mshv_region_get(struct mshv_mem_region *region)
 {
 	return kref_get_unless_zero(&region->refcount);
 }
+
+/**
+ * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
+ * @region: Pointer to the memory region structure
+ * @range: Pointer to the HMM range structure
+ *
+ * This function performs the following steps:
+ * 1. Reads the notifier sequence for the HMM range.
+ * 2. Acquires a read lock on the memory map.
+ * 3. Handles HMM faults for the specified range.
+ * 4. Releases the read lock on the memory map.
+ * 5. If successful, locks the memory region mutex.
+ * 6. Verifies that the notifier sequence has not changed during the
+ *    operation. If it has, releases the mutex and returns -EBUSY, matching
+ *    the hmm_range_fault() convention so the caller retries.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+					  struct hmm_range *range)
+{
+	int ret;
+
+	range->notifier_seq = mmu_interval_read_begin(range->notifier);
+	mmap_read_lock(region->mni.mm);
+	ret = hmm_range_fault(range);
+	mmap_read_unlock(region->mni.mm);
+	if (ret)
+		return ret;
+
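+	/*
+	 * The invalidate callback takes this same mutex, so holding it while
+	 * the notifier sequence is unchanged guarantees a stable snapshot.
+	 */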
+	mutex_lock(&region->mutex);
+
+	if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+		mutex_unlock(&region->mutex);
+		cond_resched();
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/**
+ * mshv_region_range_fault - Handle memory range faults for a given region.
+ * @region: Pointer to the memory region structure.
+ * @page_offset: Offset of the page within the region.
+ * @page_count: Number of pages to handle.
+ *
+ * This function resolves memory faults for a specified range of pages
+ * within a memory region. It uses HMM (Heterogeneous Memory Management)
+ * to fault in the required pages and updates the region's page array.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+				   u64 page_offset, u64 page_count)
+{
+	struct hmm_range range = {
+		.notifier = &region->mni,
+		.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+	};
+	unsigned long *pfns;
+	int ret;
+	u64 i;
+
+	pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
+	if (!pfns)
+		return -ENOMEM;
+
+	range.hmm_pfns = pfns;
+	range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
+	range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
+
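+	/*
+	 * hmm_range_fault() returns -EBUSY when the snapshot is invalidated
+	 * mid-walk; retry until the range is faulted in with the region
+	 * mutex held and the notifier sequence unchanged.
+	 */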
+	do {
+		ret = mshv_region_hmm_fault_and_lock(region, &range);
+	} while (ret == -EBUSY);
+
+	if (ret)
+		goto out;
+
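+	/*
+	 * Publish the faulted pages and remap them while still holding the
+	 * mutex, so a concurrent invalidation cannot race with the update.
+	 */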
+	for (i = 0; i < page_count; i++)
+		region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+
+	ret = mshv_region_remap_pages(region, region->hv_map_flags,
+				      page_offset, page_count);
+
+	mutex_unlock(&region->mutex);
+out:
+	kfree(pfns);
+	return ret;
+}
+
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+{
+	u64 page_offset, page_count;
+	int ret;
+
+	/* Align the page offset down to a MSHV_MAP_FAULT_IN_PAGES boundary. */
+	page_offset = ALIGN_DOWN(gfn - region->start_gfn,
+				 MSHV_MAP_FAULT_IN_PAGES);
+
+	/* Map more pages than requested to reduce the number of faults. */
+	page_count = min(region->nr_pages - page_offset,
+			 MSHV_MAP_FAULT_IN_PAGES);
+
+	ret = mshv_region_range_fault(region, page_offset, page_count);
+
+	WARN_ONCE(ret,
+		  "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
+		  region->partition->pt_id, region->start_uaddr,
+		  region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+		  gfn, page_offset, page_count);
+
+	return !ret;
+}
+
+/**
+ * mshv_region_interval_invalidate - Invalidate a range of a memory region
+ * @mni: Pointer to the mmu_interval_notifier structure
+ * @range: Pointer to the mmu_notifier_range structure
+ * @cur_seq: Current sequence number for the interval notifier
+ *
+ * This function invalidates a memory region by remapping its pages with
+ * no access permissions. It locks the region's mutex to ensure thread safety
+ * and updates the sequence number for the interval notifier. If the range
+ * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
+ * lock and returns false if unsuccessful.
+ *
+ * NOTE: Failure to invalidate a region is a serious error, as the pages will
+ * be considered freed while they are still mapped by the hypervisor.
+ * Any attempt to access such pages will likely crash the system.
+ *
+ * Return: true if the region was successfully invalidated, false otherwise.
+ */
+static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
+					    const struct mmu_notifier_range *range,
+					    unsigned long cur_seq)
+{
+	struct mshv_mem_region *region = container_of(mni,
+						      struct mshv_mem_region,
+						      mni);
+	u64 page_offset = 0, page_count = 0;
+	unsigned long mstart, mend;
+	int ret = -EPERM;
+
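+	/* Must not sleep when the range is non-blockable. */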
+	if (mmu_notifier_range_blockable(range))
+		mutex_lock(&region->mutex);
+	else if (!mutex_trylock(&region->mutex))
+		goto out_fail;
+
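+	/* Make concurrent mshv_region_hmm_fault_and_lock() callers retry. */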
+	mmu_interval_set_seq(mni, cur_seq);
+
+	mstart = max(range->start, region->start_uaddr);
+	mend = min(range->end, region->start_uaddr +
+		   (region->nr_pages << HV_HYP_PAGE_SHIFT));
+
+	page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
+	page_count = HVPFN_DOWN(mend - mstart);
+
+	ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
+				      page_offset, page_count);
+	if (ret) {
+		mutex_unlock(&region->mutex);
+		goto out_fail;
+	}
+
+	mshv_region_invalidate_pages(region, page_offset, page_count);
+
+	mutex_unlock(&region->mutex);
+
+	return true;
+
+out_fail:
+	WARN_ONCE(ret,
+		  "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
+		  region->start_uaddr,
+		  region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+		  range->start, range->end, range->event,
+		  page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
+	return false;
+}
+
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
+	.invalidate = mshv_region_interval_invalidate,
+};
+
+void mshv_region_movable_fini(struct mshv_mem_region *region)
+{
+	mmu_interval_notifier_remove(&region->mni);
+}
+
+bool mshv_region_movable_init(struct mshv_mem_region *region)
+{
+	int ret;
+
+	/*
+	 * Initialize the mutex first: the invalidate callback may take it
+	 * as soon as the interval notifier is registered.
+	 */
+	mutex_init(&region->mutex);
+
+	ret = mmu_interval_notifier_insert(&region->mni, current->mm,
+					   region->start_uaddr,
+					   region->nr_pages << HV_HYP_PAGE_SHIFT,
+					   &mshv_region_mni_ops);
+	if (ret)
+		return false;
+
+	return true;
+}