Skip to content

Commit 463f46e

Browse files
committed
Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe: "This brings three new iommufd capabilities: - Dirty tracking for DMA. AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the IOPTEs within the IO page table. This can be used to generate a record of what memory is being dirtied by DMA activities during a VM migration process. A VMM like qemu will combine the IOMMU dirty bits with the CPU's dirty log to determine what memory to transfer. VFIO already has a DMA dirty tracking framework that requires PCI devices to implement tracking HW internally. The iommufd version provides an alternative that the VMM can select, if available. The two are designed to have very similar APIs. - Userspace controlled attributes for hardware page tables (HWPT/iommu_domain). There are currently a few generic attributes for HWPTs (support dirty tracking, and parent of a nest). This is an entry point for the userspace iommu driver to control the HW in detail. - Nested translation support for HWPTs. This is a 2D translation scheme similar to the CPU where a DMA goes through a first stage to determine an intermediate address which is then translated through a second stage to a physical address. Like for CPU translation the first stage table would exist in VM controlled memory and the second stage is in the kernel and matches the VM's guest to physical map. As every IOMMU has a unique set of parameters to describe the S1 IO page table and its associated parameters the userspace IOMMU driver has to marshal the information into the correct format. This is 1/3 of the feature, it allows creating the nested translation and binding it to VFIO devices, however the API to support IOTLB and ATC invalidation of the stage 1 io page table, and forwarding of IO faults are still in progress. The series includes AMD and Intel support for dirty tracking. Intel support for nested translation. 
Along the way are a number of internal items: - New iommu core items: ops->domain_alloc_user(), ops->set_dirty_tracking, ops->read_and_clear_dirty(), IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user - UAF fix in iopt_area_split() - Spelling fixes and some test suite improvement" * tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits) iommufd: Organize the mock domain alloc functions closer to Joerg's tree iommufd/selftest: Fix page-size check in iommufd_test_dirty() iommufd: Add iopt_area_alloc() iommufd: Fix missing update of domains_itree after splitting iopt_area iommu/vt-d: Disallow read-only mappings to nest parent domain iommu/vt-d: Add nested domain allocation iommu/vt-d: Set the nested domain to a device iommu/vt-d: Make domain attach helpers to be extern iommu/vt-d: Add helper to setup pasid nested translation iommu/vt-d: Add helper for nested domain allocation iommu/vt-d: Extend dmar_domain to support nested domain iommufd: Add data structure for Intel VT-d stage-1 domain allocation iommu/vt-d: Enhance capability check for nested parent domain allocation iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs iommufd/selftest: Add nested domain allocation for mock domain iommu: Add iommu_copy_struct_from_user helper iommufd: Add a nested HW pagetable object iommu: Pass in parent domain with user_data to domain_alloc_user op iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable ...
2 parents ff269e2 + b2b67c9 commit 463f46e

36 files changed

Lines changed: 2722 additions & 218 deletions

drivers/iommu/Kconfig

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ config IOMMU_IOVA
77
config IOMMU_API
88
bool
99

10+
config IOMMUFD_DRIVER
11+
bool
12+
default n
13+
1014
menuconfig IOMMU_SUPPORT
1115
bool "IOMMU Hardware Support"
1216
depends on MMU

drivers/iommu/amd/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ config AMD_IOMMU
1010
select IOMMU_API
1111
select IOMMU_IOVA
1212
select IOMMU_IO_PGTABLE
13+
select IOMMUFD_DRIVER if IOMMUFD
1314
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
1415
help
1516
With this option you can enable support for AMD IOMMU hardware in

drivers/iommu/amd/amd_iommu_types.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,9 @@
9797
#define FEATURE_GATS_MASK (3ULL)
9898
#define FEATURE_GAM_VAPIC BIT_ULL(21)
9999
#define FEATURE_GIOSUP BIT_ULL(48)
100+
#define FEATURE_HASUP BIT_ULL(49)
100101
#define FEATURE_EPHSUP BIT_ULL(50)
102+
#define FEATURE_HDSUP BIT_ULL(52)
101103
#define FEATURE_SNP BIT_ULL(63)
102104

103105
#define FEATURE_PASID_SHIFT 32
@@ -212,6 +214,7 @@
212214
/* macros and definitions for device table entries */
213215
#define DEV_ENTRY_VALID 0x00
214216
#define DEV_ENTRY_TRANSLATION 0x01
217+
#define DEV_ENTRY_HAD 0x07
215218
#define DEV_ENTRY_PPR 0x34
216219
#define DEV_ENTRY_IR 0x3d
217220
#define DEV_ENTRY_IW 0x3e
@@ -370,10 +373,16 @@
370373
#define PTE_LEVEL_PAGE_SIZE(level) \
371374
(1ULL << (12 + (9 * (level))))
372375

376+
/*
377+
* The IOPTE dirty bit
378+
*/
379+
#define IOMMU_PTE_HD_BIT (6)
380+
373381
/*
374382
* Bit value definition for I/O PTE fields
375383
*/
376384
#define IOMMU_PTE_PR BIT_ULL(0)
385+
#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
377386
#define IOMMU_PTE_U BIT_ULL(59)
378387
#define IOMMU_PTE_FC BIT_ULL(60)
379388
#define IOMMU_PTE_IR BIT_ULL(61)
@@ -384,6 +393,7 @@
384393
*/
385394
#define DTE_FLAG_V BIT_ULL(0)
386395
#define DTE_FLAG_TV BIT_ULL(1)
396+
#define DTE_FLAG_HAD (3ULL << 7)
387397
#define DTE_FLAG_GIOV BIT_ULL(54)
388398
#define DTE_FLAG_GV BIT_ULL(55)
389399
#define DTE_GLX_SHIFT (56)
@@ -413,6 +423,7 @@
413423

414424
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
415425
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
426+
#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
416427
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
417428
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
418429

@@ -563,6 +574,7 @@ struct protection_domain {
563574
int nid; /* Node ID */
564575
u64 *gcr3_tbl; /* Guest CR3 table */
565576
unsigned long flags; /* flags to find out type of domain */
577+
bool dirty_tracking; /* dirty tracking is enabled in the domain */
566578
unsigned dev_cnt; /* devices assigned to this domain */
567579
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
568580
};

drivers/iommu/amd/io_pgtable.c

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
486486
return (__pte & ~offset_mask) | (iova & offset_mask);
487487
}
488488

489+
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
490+
unsigned long flags)
491+
{
492+
bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
493+
bool dirty = false;
494+
int i, count;
495+
496+
/*
497+
* 2.2.3.2 Host Dirty Support
498+
* When a non-default page size is used, software must OR the
499+
* Dirty bits in all of the replicated host PTEs used to map
500+
* the page. The IOMMU does not guarantee the Dirty bits are
501+
* set in all of the replicated PTEs. Any portion of the page
502+
* may have been written even if the Dirty bit is set in only
503+
* one of the replicated PTEs.
504+
*/
505+
count = PAGE_SIZE_PTE_COUNT(size);
506+
for (i = 0; i < count && test_only; i++) {
507+
if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
508+
dirty = true;
509+
break;
510+
}
511+
}
512+
513+
for (i = 0; i < count && !test_only; i++) {
514+
if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
515+
(unsigned long *)&ptep[i])) {
516+
dirty = true;
517+
}
518+
}
519+
520+
return dirty;
521+
}
522+
523+
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
524+
unsigned long iova, size_t size,
525+
unsigned long flags,
526+
struct iommu_dirty_bitmap *dirty)
527+
{
528+
struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
529+
unsigned long end = iova + size - 1;
530+
531+
do {
532+
unsigned long pgsize = 0;
533+
u64 *ptep, pte;
534+
535+
ptep = fetch_pte(pgtable, iova, &pgsize);
536+
if (ptep)
537+
pte = READ_ONCE(*ptep);
538+
if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
539+
pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
540+
iova += pgsize;
541+
continue;
542+
}
543+
544+
/*
545+
* Mark the whole IOVA range as dirty even if only one of
546+
* the replicated PTEs were marked dirty.
547+
*/
548+
if (pte_test_and_clear_dirty(ptep, pgsize, flags))
549+
iommu_dirty_bitmap_record(dirty, iova, pgsize);
550+
iova += pgsize;
551+
} while (iova < end);
552+
553+
return 0;
554+
}
555+
489556
/*
490557
* ----------------------------------------------------
491558
*/
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
527594
pgtable->iop.ops.map_pages = iommu_v1_map_pages;
528595
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
529596
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
597+
pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
530598

531599
return &pgtable->iop;
532600
}

drivers/iommu/amd/iommu.c

Lines changed: 144 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <asm/iommu.h>
3838
#include <asm/gart.h>
3939
#include <asm/dma.h>
40+
#include <uapi/linux/iommufd.h>
4041

4142
#include "amd_iommu.h"
4243
#include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
6566
LIST_HEAD(acpihid_map);
6667

6768
const struct iommu_ops amd_iommu_ops;
69+
const struct iommu_dirty_ops amd_dirty_ops;
6870

6971
static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
7072
int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
16101612
pte_root |= 1ULL << DEV_ENTRY_PPR;
16111613
}
16121614

1615+
if (domain->dirty_tracking)
1616+
pte_root |= DTE_FLAG_HAD;
1617+
16131618
if (domain->flags & PD_IOMMUV2_MASK) {
16141619
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
16151620
u64 glx = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
21552160
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
21562161
}
21572162

2158-
static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
2163+
static bool amd_iommu_hd_support(struct amd_iommu *iommu)
21592164
{
2165+
return iommu && (iommu->features & FEATURE_HDSUP);
2166+
}
2167+
2168+
static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2169+
struct device *dev, u32 flags)
2170+
{
2171+
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
21602172
struct protection_domain *domain;
2173+
struct amd_iommu *iommu = NULL;
2174+
2175+
if (dev) {
2176+
iommu = rlookup_amd_iommu(dev);
2177+
if (!iommu)
2178+
return ERR_PTR(-ENODEV);
2179+
}
21612180

21622181
/*
21632182
* Since DTE[Mode]=0 is prohibited on SNP-enabled system,
21642183
* default to use IOMMU_DOMAIN_DMA[_FQ].
21652184
*/
21662185
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2167-
return NULL;
2186+
return ERR_PTR(-EINVAL);
2187+
2188+
if (dirty_tracking && !amd_iommu_hd_support(iommu))
2189+
return ERR_PTR(-EOPNOTSUPP);
21682190

21692191
domain = protection_domain_alloc(type);
21702192
if (!domain)
2171-
return NULL;
2193+
return ERR_PTR(-ENOMEM);
21722194

21732195
domain->domain.geometry.aperture_start = 0;
21742196
domain->domain.geometry.aperture_end = dma_max_address();
21752197
domain->domain.geometry.force_aperture = true;
21762198

2199+
if (iommu) {
2200+
domain->domain.type = type;
2201+
domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2202+
domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2203+
2204+
if (dirty_tracking)
2205+
domain->domain.dirty_ops = &amd_dirty_ops;
2206+
}
2207+
21772208
return &domain->domain;
21782209
}
21792210

2211+
static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2212+
{
2213+
struct iommu_domain *domain;
2214+
2215+
domain = do_iommu_domain_alloc(type, NULL, 0);
2216+
if (IS_ERR(domain))
2217+
return NULL;
2218+
2219+
return domain;
2220+
}
2221+
2222+
static struct iommu_domain *
2223+
amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2224+
struct iommu_domain *parent,
2225+
const struct iommu_user_data *user_data)
2226+
2227+
{
2228+
unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2229+
2230+
if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2231+
return ERR_PTR(-EOPNOTSUPP);
2232+
2233+
return do_iommu_domain_alloc(type, dev, flags);
2234+
}
2235+
21802236
static void amd_iommu_domain_free(struct iommu_domain *dom)
21812237
{
21822238
struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
22142270

22152271
dev_data->defer_attach = false;
22162272

2273+
/*
2274+
* Restrict to devices with compatible IOMMU hardware support
2275+
* when enforcement of dirty tracking is enabled.
2276+
*/
2277+
if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2278+
return -EINVAL;
2279+
22172280
if (dev_data->domain)
22182281
detach_device(dev);
22192282

@@ -2332,13 +2395,85 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
23322395
return true;
23332396
case IOMMU_CAP_DEFERRED_FLUSH:
23342397
return true;
2398+
case IOMMU_CAP_DIRTY_TRACKING: {
2399+
struct amd_iommu *iommu = rlookup_amd_iommu(dev);
2400+
2401+
return amd_iommu_hd_support(iommu);
2402+
}
23352403
default:
23362404
break;
23372405
}
23382406

23392407
return false;
23402408
}
23412409

2410+
static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2411+
bool enable)
2412+
{
2413+
struct protection_domain *pdomain = to_pdomain(domain);
2414+
struct dev_table_entry *dev_table;
2415+
struct iommu_dev_data *dev_data;
2416+
bool domain_flush = false;
2417+
struct amd_iommu *iommu;
2418+
unsigned long flags;
2419+
u64 pte_root;
2420+
2421+
spin_lock_irqsave(&pdomain->lock, flags);
2422+
if (!(pdomain->dirty_tracking ^ enable)) {
2423+
spin_unlock_irqrestore(&pdomain->lock, flags);
2424+
return 0;
2425+
}
2426+
2427+
list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2428+
iommu = rlookup_amd_iommu(dev_data->dev);
2429+
if (!iommu)
2430+
continue;
2431+
2432+
dev_table = get_dev_table(iommu);
2433+
pte_root = dev_table[dev_data->devid].data[0];
2434+
2435+
pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2436+
pte_root & ~DTE_FLAG_HAD);
2437+
2438+
/* Flush device DTE */
2439+
dev_table[dev_data->devid].data[0] = pte_root;
2440+
device_flush_dte(dev_data);
2441+
domain_flush = true;
2442+
}
2443+
2444+
/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2445+
if (domain_flush) {
2446+
amd_iommu_domain_flush_tlb_pde(pdomain);
2447+
amd_iommu_domain_flush_complete(pdomain);
2448+
}
2449+
pdomain->dirty_tracking = enable;
2450+
spin_unlock_irqrestore(&pdomain->lock, flags);
2451+
2452+
return 0;
2453+
}
2454+
2455+
static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2456+
unsigned long iova, size_t size,
2457+
unsigned long flags,
2458+
struct iommu_dirty_bitmap *dirty)
2459+
{
2460+
struct protection_domain *pdomain = to_pdomain(domain);
2461+
struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2462+
unsigned long lflags;
2463+
2464+
if (!ops || !ops->read_and_clear_dirty)
2465+
return -EOPNOTSUPP;
2466+
2467+
spin_lock_irqsave(&pdomain->lock, lflags);
2468+
if (!pdomain->dirty_tracking && dirty->bitmap) {
2469+
spin_unlock_irqrestore(&pdomain->lock, lflags);
2470+
return -EINVAL;
2471+
}
2472+
spin_unlock_irqrestore(&pdomain->lock, lflags);
2473+
2474+
return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2475+
}
2476+
23422477
static void amd_iommu_get_resv_regions(struct device *dev,
23432478
struct list_head *head)
23442479
{
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
24612596
return true;
24622597
}
24632598

2599+
const struct iommu_dirty_ops amd_dirty_ops = {
2600+
.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2601+
.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2602+
};
2603+
24642604
const struct iommu_ops amd_iommu_ops = {
24652605
.capable = amd_iommu_capable,
24662606
.domain_alloc = amd_iommu_domain_alloc,
2607+
.domain_alloc_user = amd_iommu_domain_alloc_user,
24672608
.probe_device = amd_iommu_probe_device,
24682609
.release_device = amd_iommu_release_device,
24692610
.probe_finalize = amd_iommu_probe_finalize,

drivers/iommu/intel/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ config INTEL_IOMMU
1515
select DMA_OPS
1616
select IOMMU_API
1717
select IOMMU_IOVA
18+
select IOMMUFD_DRIVER if IOMMUFD
1819
select NEED_DMA_MAP_STATE
1920
select DMAR_TABLE
2021
select SWIOTLB

0 commit comments

Comments
 (0)