Skip to content

Commit e5f19b6

Browse files
ankita-nvawilliam
authored and committed
vfio/nvgrace-gpu: register device memory for poison handling
The nvgrace-gpu module [1] maps the device memory to the user VA (Qemu) without adding the memory to the kernel. The device memory pages are PFNMAP and not backed by struct page. The module can thus utilize the MM's PFNMAP memory_failure mechanism that handles ECC/poison on regions with no struct pages. The kernel MM code exposes register/unregister APIs allowing modules to register the device memory for memory_failure handling. Make nvgrace-gpu register the GPU memory with the MM on open. The module registers its memory region, the address_space with the kernel MM for ECC handling and implements a callback function to convert the PFN to the file page offset. The callback functions checks if the PFN belongs to the device memory region and is also contained in the VMA range, an error is returned otherwise. Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1] Suggested-by: Alex Williamson <alex@shazbot.org> Suggested-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Ankit Agrawal <ankita@nvidia.com> Reviewed-by: Jiaqi Yan <jiaqiyan@google.com> Link: https://lore.kernel.org/r/20260115202849.2921-3-ankita@nvidia.com Signed-off-by: Alex Williamson <alex@shazbot.org>
1 parent 205e6d1 commit e5f19b6

1 file changed

Lines changed: 109 additions & 4 deletions

File tree

  • drivers/vfio/pci/nvgrace-gpu

drivers/vfio/pci/nvgrace-gpu/main.c

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <linux/jiffies.h>
1010
#include <linux/pci-p2pdma.h>
1111
#include <linux/pm_runtime.h>
12+
#include <linux/memory-failure.h>
1213

1314
/*
1415
* The device memory usable to the workloads running in the VM is cached
@@ -49,6 +50,7 @@ struct mem_region {
4950
void *memaddr;
5051
void __iomem *ioaddr;
5152
}; /* Base virtual address of the region */
53+
struct pfn_address_space pfn_address_space;
5254
};
5355

5456
struct nvgrace_gpu_pci_core_device {
@@ -88,6 +90,80 @@ nvgrace_gpu_memregion(int index,
8890
return NULL;
8991
}
9092

93+
static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev,
94+
unsigned int index,
95+
unsigned long pfn,
96+
pgoff_t *pfn_offset_in_region)
97+
{
98+
struct mem_region *region;
99+
unsigned long start_pfn, num_pages;
100+
101+
region = nvgrace_gpu_memregion(index, nvdev);
102+
if (!region)
103+
return -EINVAL;
104+
105+
start_pfn = PHYS_PFN(region->memphys);
106+
num_pages = region->memlength >> PAGE_SHIFT;
107+
108+
if (pfn < start_pfn || pfn >= start_pfn + num_pages)
109+
return -EFAULT;
110+
111+
*pfn_offset_in_region = pfn - start_pfn;
112+
113+
return 0;
114+
}
115+
116+
static inline
117+
struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma);
118+
119+
static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma,
120+
unsigned long pfn,
121+
pgoff_t *pgoff)
122+
{
123+
struct nvgrace_gpu_pci_core_device *nvdev;
124+
unsigned int index =
125+
vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
126+
pgoff_t vma_offset_in_region = vma->vm_pgoff &
127+
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
128+
pgoff_t pfn_offset_in_region;
129+
int ret;
130+
131+
nvdev = vma_to_nvdev(vma);
132+
if (!nvdev)
133+
return -ENOENT;
134+
135+
ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region);
136+
if (ret)
137+
return ret;
138+
139+
/* Ensure PFN is not before VMA's start within the region */
140+
if (pfn_offset_in_region < vma_offset_in_region)
141+
return -EFAULT;
142+
143+
/* Calculate offset from VMA start */
144+
*pgoff = vma->vm_pgoff +
145+
(pfn_offset_in_region - vma_offset_in_region);
146+
147+
return 0;
148+
}
149+
150+
static int
151+
nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev,
152+
struct mem_region *region)
153+
{
154+
unsigned long pfn, nr_pages;
155+
156+
pfn = PHYS_PFN(region->memphys);
157+
nr_pages = region->memlength >> PAGE_SHIFT;
158+
159+
region->pfn_address_space.node.start = pfn;
160+
region->pfn_address_space.node.last = pfn + nr_pages - 1;
161+
region->pfn_address_space.mapping = core_vdev->inode->i_mapping;
162+
region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff;
163+
164+
return register_pfn_address_space(&region->pfn_address_space);
165+
}
166+
91167
static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
92168
{
93169
struct vfio_pci_core_device *vdev =
@@ -114,14 +190,28 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
114190
* memory mapping.
115191
*/
116192
ret = vfio_pci_core_setup_barmap(vdev, 0);
117-
if (ret) {
118-
vfio_pci_core_disable(vdev);
119-
return ret;
193+
if (ret)
194+
goto error_exit;
195+
196+
if (nvdev->resmem.memlength) {
197+
ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem);
198+
if (ret && ret != -EOPNOTSUPP)
199+
goto error_exit;
120200
}
121201

122-
vfio_pci_core_finish_enable(vdev);
202+
ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem);
203+
if (ret && ret != -EOPNOTSUPP)
204+
goto register_mem_failed;
123205

206+
vfio_pci_core_finish_enable(vdev);
124207
return 0;
208+
209+
register_mem_failed:
210+
if (nvdev->resmem.memlength)
211+
unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
212+
error_exit:
213+
vfio_pci_core_disable(vdev);
214+
return ret;
125215
}
126216

127217
static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
@@ -130,6 +220,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
130220
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
131221
core_device.vdev);
132222

223+
if (nvdev->resmem.memlength)
224+
unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
225+
226+
unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
227+
133228
/* Unmap the mapping to the device memory cached region */
134229
if (nvdev->usemem.memaddr) {
135230
memunmap(nvdev->usemem.memaddr);
@@ -247,6 +342,16 @@ static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
247342
#endif
248343
};
249344

345+
static inline
346+
struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma)
347+
{
348+
/* Check if this VMA belongs to us */
349+
if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops)
350+
return NULL;
351+
352+
return vma->vm_private_data;
353+
}
354+
250355
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
251356
struct vm_area_struct *vma)
252357
{

0 commit comments

Comments
 (0)