Skip to content

Commit c279e83

Browse files
nicolinc authored and joergroedel committed
iommu: Introduce pci_dev_reset_iommu_prepare/done()
PCIe permits a device to ignore ATS invalidation TLPs while processing a reset. This creates a problem visible to the OS where an ATS invalidation command will time out. E.g. an SVA domain will have no coordination with a reset event and can racily issue ATS invalidations to a resetting device. The OS should do something to mitigate this as we do not want production systems to be reporting critical ATS failures, especially in a hypervisor environment. Broadly, OS could arrange to ignore the timeouts, block page table mutations to prevent invalidations, or disable and block ATS. The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends SW to disable and block ATS before initiating a Function Level Reset. It also mentions that other reset methods could have the same vulnerability as well. Provide a callback from the PCI subsystem that will enclose the reset and have the iommu core temporarily change all the attached RID/PASID domains to group->blocking_domain so that the IOMMU hardware would fence any incoming ATS queries. And IOMMU drivers should also synchronously stop issuing new ATS invalidations and wait for all ATS invalidations to complete. This can avoid any ATS invalidation timeouts. However, if there is a domain attachment/replacement happening during an ongoing reset, ATS routines may be re-activated between the two function calls. So, introduce a new resetting_domain in the iommu_group structure to reject any concurrent attach_dev/set_dev_pasid call during a reset for a concern of compatibility failure. Since this changes the behavior of an attach operation, update the uAPI accordingly. Note that there are two corner cases: 1. Devices in the same iommu_group Since an attachment is always per iommu_group, this means that any sibling devices in the iommu_group cannot change domain, to prevent race conditions. 2. An SR-IOV PF that is being reset while its VF is not In such case, the VF itself is already broken. 
So, there is no point in preventing PF from going through the iommu reset. Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Tested-by: Dheeraj Kumar Srivastava <dheerajkumar.srivastava@amd.com> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
1 parent a75b2be commit c279e83

3 files changed

Lines changed: 190 additions & 0 deletions

File tree

drivers/iommu/iommu.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ struct iommu_group {
6161
int id;
6262
struct iommu_domain *default_domain;
6363
struct iommu_domain *blocking_domain;
64+
/*
65+
* During a group device reset, @resetting_domain points to the physical
66+
* domain, while @domain points to the attached domain before the reset.
67+
*/
68+
struct iommu_domain *resetting_domain;
6469
struct iommu_domain *domain;
6570
struct list_head entry;
6671
unsigned int owner_cnt;
@@ -2195,6 +2200,15 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
21952200

21962201
guard(mutex)(&dev->iommu_group->mutex);
21972202

2203+
/*
2204+
* This is a concurrent attach during a device reset. Reject it until
2205+
* pci_dev_reset_iommu_done() attaches the device to group->domain.
2206+
*
2207+
* Note that this might fail the iommu_dma_map(). But there's nothing
2208+
* more we can do here.
2209+
*/
2210+
if (dev->iommu_group->resetting_domain)
2211+
return -EBUSY;
21982212
return __iommu_attach_device(domain, dev, NULL);
21992213
}
22002214

@@ -2253,6 +2267,17 @@ struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev)
22532267

22542268
lockdep_assert_held(&group->mutex);
22552269

2270+
/*
2271+
* Driver handles the low-level __iommu_attach_device(), including the
2272+
* one invoked by pci_dev_reset_iommu_done() re-attaching the device to
2273+
* the cached group->domain. In this case, the driver must get the old
2274+
* domain from group->resetting_domain rather than group->domain. This
2275+
* prevents it from re-attaching the device from group->domain (old) to
2276+
* group->domain (new).
2277+
*/
2278+
if (group->resetting_domain)
2279+
return group->resetting_domain;
2280+
22562281
return group->domain;
22572282
}
22582283
EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev);
@@ -2409,6 +2434,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
24092434
if (WARN_ON(!new_domain))
24102435
return -EINVAL;
24112436

2437+
/*
2438+
* This is a concurrent attach during a device reset. Reject it until
2439+
* pci_dev_reset_iommu_done() attaches the device to group->domain.
2440+
*/
2441+
if (group->resetting_domain)
2442+
return -EBUSY;
2443+
24122444
/*
24132445
* Changing the domain is done by calling attach_dev() on the new
24142446
* domain. This switch does not have to be atomic and DMA can be
@@ -3527,6 +3559,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
35273559
return -EINVAL;
35283560

35293561
mutex_lock(&group->mutex);
3562+
3563+
/*
3564+
* This is a concurrent attach during a device reset. Reject it until
3565+
* pci_dev_reset_iommu_done() attaches the device to group->domain.
3566+
*/
3567+
if (group->resetting_domain) {
3568+
ret = -EBUSY;
3569+
goto out_unlock;
3570+
}
3571+
35303572
for_each_group_device(group, device) {
35313573
/*
35323574
* Skip PASID validation for devices without PASID support
@@ -3610,6 +3652,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
36103652
return -EINVAL;
36113653

36123654
mutex_lock(&group->mutex);
3655+
3656+
/*
3657+
* This is a concurrent attach during a device reset. Reject it until
3658+
* pci_dev_reset_iommu_done() attaches the device to group->domain.
3659+
*/
3660+
if (group->resetting_domain) {
3661+
ret = -EBUSY;
3662+
goto out_unlock;
3663+
}
3664+
36133665
entry = iommu_make_pasid_array_entry(domain, handle);
36143666
curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
36153667
XA_ZERO_ENTRY, GFP_KERNEL);
@@ -3867,6 +3919,127 @@ int iommu_replace_group_handle(struct iommu_group *group,
38673919
}
38683920
EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
38693921

3922+
/**
3923+
* pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset
3924+
* @pdev: PCI device that is going to enter a reset routine
3925+
*
3926+
* The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and block
3927+
* ATS before initiating a reset. This means that a PCIe device during the reset
3928+
* routine wants to block any IOMMU activity: translation and ATS invalidation.
3929+
*
3930+
* This function attaches the device's RID/PASID(s) the group->blocking_domain,
3931+
* setting the group->resetting_domain. This allows the IOMMU driver pausing any
3932+
* IOMMU activity while leaving the group->domain pointer intact. Later when the
3933+
* reset is finished, pci_dev_reset_iommu_done() can restore everything.
3934+
*
3935+
* Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done()
3936+
* before/after the core-level reset routine, to unset the resetting_domain.
3937+
*
3938+
* Return: 0 on success or negative error code if the preparation failed.
3939+
*
3940+
* These two functions are designed to be used by PCI reset functions that would
3941+
* not invoke any racy iommu_release_device(), since PCI sysfs node gets removed
3942+
* before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using them in other
3943+
* case, callers must ensure there will be no racy iommu_release_device() call,
3944+
* which otherwise would UAF the dev->iommu_group pointer.
3945+
*/
3946+
int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
3947+
{
3948+
struct iommu_group *group = pdev->dev.iommu_group;
3949+
unsigned long pasid;
3950+
void *entry;
3951+
int ret;
3952+
3953+
if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
3954+
return 0;
3955+
3956+
guard(mutex)(&group->mutex);
3957+
3958+
/* Re-entry is not allowed */
3959+
if (WARN_ON(group->resetting_domain))
3960+
return -EBUSY;
3961+
3962+
ret = __iommu_group_alloc_blocking_domain(group);
3963+
if (ret)
3964+
return ret;
3965+
3966+
/* Stage RID domain at blocking_domain while retaining group->domain */
3967+
if (group->domain != group->blocking_domain) {
3968+
ret = __iommu_attach_device(group->blocking_domain, &pdev->dev,
3969+
group->domain);
3970+
if (ret)
3971+
return ret;
3972+
}
3973+
3974+
/*
3975+
* Stage PASID domains at blocking_domain while retaining pasid_array.
3976+
*
3977+
* The pasid_array is mostly fenced by group->mutex, except one reader
3978+
* in iommu_attach_handle_get(), so it's safe to read without xa_lock.
3979+
*/
3980+
xa_for_each_start(&group->pasid_array, pasid, entry, 1)
3981+
iommu_remove_dev_pasid(&pdev->dev, pasid,
3982+
pasid_array_entry_to_domain(entry));
3983+
3984+
group->resetting_domain = group->blocking_domain;
3985+
return ret;
3986+
}
3987+
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
3988+
3989+
/**
3990+
* pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
3991+
* @pdev: PCI device that has finished a reset routine
3992+
*
3993+
* After a PCIe device finishes a reset routine, it wants to restore its IOMMU
3994+
* IOMMU activity, including new translation as well as cache invalidation, by
3995+
* re-attaching all RID/PASID of the device's back to the domains retained in
3996+
* the core-level structure.
3997+
*
3998+
* Caller must pair it with a successful pci_dev_reset_iommu_prepare().
3999+
*
4000+
* Note that, although unlikely, there is a risk that re-attaching domains might
4001+
* fail due to some unexpected happening like OOM.
4002+
*/
4003+
void pci_dev_reset_iommu_done(struct pci_dev *pdev)
4004+
{
4005+
struct iommu_group *group = pdev->dev.iommu_group;
4006+
unsigned long pasid;
4007+
void *entry;
4008+
4009+
if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
4010+
return;
4011+
4012+
guard(mutex)(&group->mutex);
4013+
4014+
/* pci_dev_reset_iommu_prepare() was bypassed for the device */
4015+
if (!group->resetting_domain)
4016+
return;
4017+
4018+
/* pci_dev_reset_iommu_prepare() was not successfully called */
4019+
if (WARN_ON(!group->blocking_domain))
4020+
return;
4021+
4022+
/* Re-attach RID domain back to group->domain */
4023+
if (group->domain != group->blocking_domain) {
4024+
WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
4025+
group->blocking_domain));
4026+
}
4027+
4028+
/*
4029+
* Re-attach PASID domains back to the domains retained in pasid_array.
4030+
*
4031+
* The pasid_array is mostly fenced by group->mutex, except one reader
4032+
* in iommu_attach_handle_get(), so it's safe to read without xa_lock.
4033+
*/
4034+
xa_for_each_start(&group->pasid_array, pasid, entry, 1)
4035+
WARN_ON(__iommu_set_group_pasid(
4036+
pasid_array_entry_to_domain(entry), group, pasid,
4037+
group->blocking_domain));
4038+
4039+
group->resetting_domain = NULL;
4040+
}
4041+
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
4042+
38704043
#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
38714044
/**
38724045
* iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain

include/linux/iommu.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain,
11881188
struct device *dev, ioasid_t pasid);
11891189
ioasid_t iommu_alloc_global_pasid(struct device *dev);
11901190
void iommu_free_global_pasid(ioasid_t pasid);
1191+
1192+
/* PCI device reset functions */
1193+
int pci_dev_reset_iommu_prepare(struct pci_dev *pdev);
1194+
void pci_dev_reset_iommu_done(struct pci_dev *pdev);
11911195
#else /* CONFIG_IOMMU_API */
11921196

11931197
struct iommu_ops {};
@@ -1511,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
15111515
}
15121516

15131517
static inline void iommu_free_global_pasid(ioasid_t pasid) {}
1518+
1519+
static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
1520+
{
1521+
return 0;
1522+
}
1523+
1524+
static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev)
1525+
{
1526+
}
15141527
#endif /* CONFIG_IOMMU_API */
15151528

15161529
#ifdef CONFIG_IRQ_MSI_IOMMU

include/uapi/linux/vfio.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd {
964964
* hwpt corresponding to the given pt_id.
965965
*
966966
* Return: 0 on success, -errno on failure.
967+
*
968+
* When a device is resetting, -EBUSY will be returned to reject any concurrent
969+
* attachment to the resetting device itself or any sibling device in the IOMMU
970+
* group having the resetting device.
967971
*/
968972
struct vfio_device_attach_iommufd_pt {
969973
__u32 argsz;

0 commit comments

Comments
 (0)