* [PATCH v4 0/2] iommu/s390: fixes related to repeat attach_dev calls @ 2022-08-31 20:12 Matthew Rosato 2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato 2022-08-31 20:12 ` [PATCH v4 2/2] iommu/s390: fix leak of s390_domain_device Matthew Rosato 0 siblings, 2 replies; 24+ messages in thread From: Matthew Rosato @ 2022-08-31 20:12 UTC (permalink / raw) To: iommu Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel This series contains a few fixes related to supporting multiple unmanaged iommu domains on s390. Changelog since v3: - add a 2nd patch to fix issues related to leaking s390_domain_device Matthew Rosato (2): iommu/s390: Fix race with release_device ops iommu/s390: fix leak of s390_domain_device arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 1 + drivers/iommu/s390-iommu.c | 62 ++++++++++++++++++++++++++++++++++--- 3 files changed, 59 insertions(+), 5 deletions(-) -- 2.37.2 ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-08-31 20:12 [PATCH v4 0/2] iommu/s390: fixes related to repeat attach_dev calls Matthew Rosato @ 2022-08-31 20:12 ` Matthew Rosato 2022-09-01 7:56 ` Pierre Morel 2022-09-01 10:25 ` Robin Murphy 2022-08-31 20:12 ` [PATCH v4 2/2] iommu/s390: fix leak of s390_domain_device Matthew Rosato 1 sibling, 2 replies; 24+ messages in thread From: Matthew Rosato @ 2022-08-31 20:12 UTC (permalink / raw) To: iommu Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") s390-iommu is supposed to handle dynamic switching between IOMMU domains and the DMA API handling. However, this commit does not sufficiently handle the case where the device is released via a call to the release_device op as it may occur at the same time as an opposing attach_dev or detach_dev since the group mutex is not held over release_device. This was observed if the device is deconfigured during a small window during vfio-pci initialization and can result in WARNs and potential kernel panics. Handle this by tracking when the device is probed/released via dev_iommu_priv_set/get(). Ensure that once the device is released only release_device handles the re-init of the device DMA. Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 1 + drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 7b4cdadbc023..080251e7b275 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -157,6 +157,7 @@ struct zpci_dev { /* DMA stuff */ unsigned long *dma_table; spinlock_t dma_table_lock; + struct mutex dma_domain_lock; /* protects s390_domain value */ int tlb_refresh; spinlock_t iommu_bitmap_lock; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 73cdc5539384..973edd32ecc9 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) kref_init(&zdev->kref); mutex_init(&zdev->lock); mutex_init(&zdev->kzdev_lock); + mutex_init(&zdev->dma_domain_lock); rc = zpci_init_iommu(zdev); if (rc) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index c898bcbbce11..1137d669e849 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, if (!domain_device) return -ENOMEM; + /* Leave now if the device has already been released */ + mutex_lock(&zdev->dma_domain_lock); + if (!dev_iommu_priv_get(dev)) { + mutex_unlock(&zdev->dma_domain_lock); + kfree(domain_device); + return 0; + } + if (zdev->dma_table && !zdev->s390_domain) { cc = zpci_dma_exit_device(zdev); if (cc) { @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, goto out_restore; } domain_device->zdev = zdev; - zdev->s390_domain = s390_domain; list_add(&domain_device->list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); + zdev->s390_domain = s390_domain; + mutex_unlock(&zdev->dma_domain_lock); return 0; @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, virt_to_phys(zdev->dma_table)); } out_free: + mutex_unlock(&zdev->dma_domain_lock); kfree(domain_device); return rc; @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, } spin_unlock_irqrestore(&s390_domain->list_lock, flags); - if (found && (zdev->s390_domain == s390_domain)) { + mutex_lock(&zdev->dma_domain_lock); + if (found && (zdev->s390_domain == s390_domain) && + dev_iommu_priv_get(dev)) { zdev->s390_domain = NULL; zpci_unregister_ioat(zdev, 0); zpci_dma_init_device(zdev); } + mutex_unlock(&zdev->dma_domain_lock); } static struct iommu_device *s390_iommu_probe_device(struct device *dev) { struct zpci_dev *zdev = to_zpci_dev(dev); + dev_iommu_priv_set(dev, zdev); + return &zdev->iommu_dev; } @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) * * So let's call detach_dev from here if it hasn't been called before. */ - if (zdev && zdev->s390_domain) { + if (zdev) { + /* + * Clear priv to block further attaches for this device, + * ensure detaches don't init DMA. Hold the domain lock + * to ensure that attach/detach get a consistent view of + * whether or not the device is released. + */ + mutex_lock(&zdev->dma_domain_lock); + dev_iommu_priv_set(dev, NULL); + mutex_unlock(&zdev->dma_domain_lock); + /* Make sure this device is removed from the domain list */ domain = iommu_get_domain_for_dev(dev); if (domain) s390_iommu_detach_device(domain, dev); + /* Now ensure DMA is initialized from here */ + mutex_lock(&zdev->dma_domain_lock); + if (zdev->s390_domain) { + zdev->s390_domain = NULL; + zpci_unregister_ioat(zdev, 0); + zpci_dma_init_device(zdev); + } + mutex_unlock(&zdev->dma_domain_lock); } } -- 2.37.2 ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato @ 2022-09-01 7:56 ` Pierre Morel 2022-09-01 9:37 ` Niklas Schnelle 2022-09-01 10:25 ` Robin Murphy 1 sibling, 1 reply; 24+ messages in thread From: Pierre Morel @ 2022-09-01 7:56 UTC (permalink / raw) To: Matthew Rosato, iommu Cc: linux-s390, schnelle, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel On 8/31/22 22:12, Matthew Rosato wrote: > With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > calls") s390-iommu is supposed to handle dynamic switching between IOMMU > domains and the DMA API handling. However, this commit does not > sufficiently handle the case where the device is released via a call > to the release_device op as it may occur at the same time as an opposing > attach_dev or detach_dev since the group mutex is not held over > release_device. This was observed if the device is deconfigured during a > small window during vfio-pci initialization and can result in WARNs and > potential kernel panics. > > Handle this by tracking when the device is probed/released via > dev_iommu_priv_set/get(). Ensure that once the device is released only > release_device handles the re-init of the device DMA. > > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > arch/s390/include/asm/pci.h | 1 + > arch/s390/pci/pci.c | 1 + > drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- > 3 files changed, 38 insertions(+), 3 deletions(-) > > diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h > index 7b4cdadbc023..080251e7b275 100644 > --- a/arch/s390/include/asm/pci.h > +++ b/arch/s390/include/asm/pci.h > @@ -157,6 +157,7 @@ struct zpci_dev { > /* DMA stuff */ > unsigned long *dma_table; > spinlock_t dma_table_lock; > + struct mutex dma_domain_lock; /* protects s390_domain value */ > int tlb_refresh; > > spinlock_t iommu_bitmap_lock; > diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c > index 73cdc5539384..973edd32ecc9 100644 > --- a/arch/s390/pci/pci.c > +++ b/arch/s390/pci/pci.c > @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) > kref_init(&zdev->kref); > mutex_init(&zdev->lock); > mutex_init(&zdev->kzdev_lock); > + mutex_init(&zdev->dma_domain_lock); > > rc = zpci_init_iommu(zdev); > if (rc) > diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c > index c898bcbbce11..1137d669e849 100644 > --- a/drivers/iommu/s390-iommu.c > +++ b/drivers/iommu/s390-iommu.c > @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > if (!domain_device) > return -ENOMEM; > > + /* Leave now if the device has already been released */ > + mutex_lock(&zdev->dma_domain_lock); > + if (!dev_iommu_priv_get(dev)) { > + mutex_unlock(&zdev->dma_domain_lock); > + kfree(domain_device); > + return 0; > + } > + > if (zdev->dma_table && !zdev->s390_domain) { > cc = zpci_dma_exit_device(zdev); > if (cc) { > @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > goto out_restore; > } > domain_device->zdev = zdev; > - zdev->s390_domain = s390_domain; > list_add(&domain_device->list, &s390_domain->devices); > spin_unlock_irqrestore(&s390_domain->list_lock, flags); > + zdev->s390_domain = s390_domain; > + mutex_unlock(&zdev->dma_domain_lock); > > return 0; > > @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > virt_to_phys(zdev->dma_table)); > } > out_free: > + mutex_unlock(&zdev->dma_domain_lock); > kfree(domain_device); > > return rc; > @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, > } > spin_unlock_irqrestore(&s390_domain->list_lock, flags); > > - if (found && (zdev->s390_domain == s390_domain)) { > + mutex_lock(&zdev->dma_domain_lock); > + if (found && (zdev->s390_domain == s390_domain) && > + dev_iommu_priv_get(dev)) { > zdev->s390_domain = NULL; > zpci_unregister_ioat(zdev, 0); > zpci_dma_init_device(zdev); > } > + mutex_unlock(&zdev->dma_domain_lock); > } > > static struct iommu_device *s390_iommu_probe_device(struct device *dev) > { > struct zpci_dev *zdev = to_zpci_dev(dev); > > + dev_iommu_priv_set(dev, zdev); > + > return &zdev->iommu_dev; > } > > @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) > * > * So let's call detach_dev from here if it hasn't been called before. > */ > - if (zdev && zdev->s390_domain) { > + if (zdev) { > + /* > + * Clear priv to block further attaches for this device, > + * ensure detaches don't init DMA. Hold the domain lock > + * to ensure that attach/detach get a consistent view of > + * whether or not the device is released. > + */ > + mutex_lock(&zdev->dma_domain_lock); > + dev_iommu_priv_set(dev, NULL); > + mutex_unlock(&zdev->dma_domain_lock); We release the lock here to later call s390_iommu_detach_device safely right? Couldn't we keep the lock and put the common code from s390_iommu_release_device and s390_iommu_detach_device inside a common function? > + /* Make sure this device is removed from the domain list */ > domain = iommu_get_domain_for_dev(dev); > if (domain) > s390_iommu_detach_device(domain, dev); > + /* Now ensure DMA is initialized from here */ > + mutex_lock(&zdev->dma_domain_lock); > + if (zdev->s390_domain) { > + zdev->s390_domain = NULL; > + zpci_unregister_ioat(zdev, 0); > + zpci_dma_init_device(zdev); Sorry if it is a stupid question, but two things looks strange to me: - having DMA initialized just after having unregistered the IOAT Is that really all we need to unregister before calling dma_init_device? - having DMA initialized inside the release_device callback: Why isn't it done in the device_probe ? > + } > + mutex_unlock(&zdev->dma_domain_lock); > } > } > > -- Pierre Morel IBM Lab Boeblingen ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 7:56 ` Pierre Morel @ 2022-09-01 9:37 ` Niklas Schnelle 2022-09-01 11:01 ` Robin Murphy 2022-09-01 20:28 ` Matthew Rosato 0 siblings, 2 replies; 24+ messages in thread From: Niklas Schnelle @ 2022-09-01 9:37 UTC (permalink / raw) To: Pierre Morel, Matthew Rosato, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: > > On 8/31/22 22:12, Matthew Rosato wrote: > > With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > > calls") s390-iommu is supposed to handle dynamic switching between IOMMU > > domains and the DMA API handling. However, this commit does not > > sufficiently handle the case where the device is released via a call > > to the release_device op as it may occur at the same time as an opposing > > attach_dev or detach_dev since the group mutex is not held over > > release_device. This was observed if the device is deconfigured during a > > small window during vfio-pci initialization and can result in WARNs and > > potential kernel panics. > > > > Handle this by tracking when the device is probed/released via > > dev_iommu_priv_set/get(). Ensure that once the device is released only > > release_device handles the re-init of the device DMA. > > > > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > > --- > > arch/s390/include/asm/pci.h | 1 + > > arch/s390/pci/pci.c | 1 + > > drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- > > 3 files changed, 38 insertions(+), 3 deletions(-) > > > > ---8<--- > > > > @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) > > > ---8<--- > > + /* Make sure this device is removed from the domain list */ > > domain = iommu_get_domain_for_dev(dev); > > if (domain) > > s390_iommu_detach_device(domain, dev); > > + /* Now ensure DMA is initialized from here */ > > + mutex_lock(&zdev->dma_domain_lock); > > + if (zdev->s390_domain) { > > + zdev->s390_domain = NULL; > > + zpci_unregister_ioat(zdev, 0); > > + zpci_dma_init_device(zdev); > > Sorry if it is a stupid question, but two things looks strange to me: > > - having DMA initialized just after having unregistered the IOAT > Is that really all we need to unregister before calling dma_init_device? > > - having DMA initialized inside the release_device callback: > Why isn't it done in the device_probe ? As I understand it iommu_release_device() which calls this code is only used when a device goes away. So, I think you're right in that it makes little sense to re-initialize DMA at this point, it's going to be torn down immediately after anyway. I do wonder if it would be an acceptably small change to just set zdev->s390_domain = NULL here and leave DMA uninitialized while making zpci_dma_exit_device() deal with that e.g. by doing nothing if zdev->dma_table is NULL but I'm not sure. Either way I fear this mess really is just a symptom of our current design oddity of driving the same IOMMU hardware through both our DMA API implementation (arch/s390/pci_dma.c) and the IOMMU driver (driver/iommu/s390-iommu.c) and trying to hand off between them smoothly where common code instead just layers one atop the other when using an IOMMU at all. I think the correct medium term solution is to use the common DMA API implementation (drivers/iommu/dma-iommu.c) like everyone else. But that isn't the minimal fix we need now. I do have a working prototype of using the common implementation but the big problem that I'm still searching a solution for is its performance with a virtualized IOMMU where IOTLB flushes (RPCIT on s390) are used for shadowing and are expensive and serialized. The optimization we used so far for unmap, only doing one global IOTLB flush once we run out of IOVA space, is just too much better in that scenario to just ignore. As one data point, on an NVMe I get about _twice_ the IOPS when using our existing scheme compared to strict mode. Which makes sense as IOTLB flushes are known as the bottleneck and optimizing unmap like that reduces them by almost half. Queued flushing is still much worse likely due to serialization of the shadowing, though again it works great on LPAR. To make sure it's not due to some bug in the IOMMU driver I even tried converting our existing DMA driver to layer on top of the IOMMU driver with the same result. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 9:37 ` Niklas Schnelle @ 2022-09-01 11:01 ` Robin Murphy 2022-09-01 13:42 ` Niklas Schnelle 2022-09-01 20:28 ` Matthew Rosato 1 sibling, 1 reply; 24+ messages in thread From: Robin Murphy @ 2022-09-01 11:01 UTC (permalink / raw) To: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, jgg, linux-kernel On 2022-09-01 10:37, Niklas Schnelle wrote: > On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: >> >> On 8/31/22 22:12, Matthew Rosato wrote: >>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>> domains and the DMA API handling. However, this commit does not >>> sufficiently handle the case where the device is released via a call >>> to the release_device op as it may occur at the same time as an opposing >>> attach_dev or detach_dev since the group mutex is not held over >>> release_device. This was observed if the device is deconfigured during a >>> small window during vfio-pci initialization and can result in WARNs and >>> potential kernel panics. >>> >>> Handle this by tracking when the device is probed/released via >>> dev_iommu_priv_set/get(). Ensure that once the device is released only >>> release_device handles the re-init of the device DMA. >>> >>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") >>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >>> --- >>> arch/s390/include/asm/pci.h | 1 + >>> arch/s390/pci/pci.c | 1 + >>> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- >>> 3 files changed, 38 insertions(+), 3 deletions(-) >>> >>> > ---8<--- >>> >>> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) >>> >> > ---8<--- >>> + /* Make sure this device is removed from the domain list */ >>> domain = iommu_get_domain_for_dev(dev); >>> if (domain) >>> s390_iommu_detach_device(domain, dev); >>> + /* Now ensure DMA is initialized from here */ >>> + mutex_lock(&zdev->dma_domain_lock); >>> + if (zdev->s390_domain) { >>> + zdev->s390_domain = NULL; >>> + zpci_unregister_ioat(zdev, 0); >>> + zpci_dma_init_device(zdev); >> >> Sorry if it is a stupid question, but two things looks strange to me: >> >> - having DMA initialized just after having unregistered the IOAT >> Is that really all we need to unregister before calling dma_init_device? >> >> - having DMA initialized inside the release_device callback: >> Why isn't it done in the device_probe ? > > As I understand it iommu_release_device() which calls this code is only > used when a device goes away. So, I think you're right in that it makes > little sense to re-initialize DMA at this point, it's going to be torn > down immediately after anyway. I do wonder if it would be an acceptably > small change to just set zdev->s390_domain = NULL here and leave DMA > uninitialized while making zpci_dma_exit_device() deal with that e.g. > by doing nothing if zdev->dma_table is NULL but I'm not sure. > > Either way I fear this mess really is just a symptom of our current > design oddity of driving the same IOMMU hardware through both our DMA > API implementation (arch/s390/pci_dma.c) and the IOMMU driver > (driver/iommu/s390-iommu.c) and trying to hand off between them > smoothly where common code instead just layers one atop the other when > using an IOMMU at all. > > I think the correct medium term solution is to use the common DMA API > implementation (drivers/iommu/dma-iommu.c) like everyone else. But that > isn't the minimal fix we need now. > > I do have a working prototype of using the common implementation but > the big problem that I'm still searching a solution for is its > performance with a virtualized IOMMU where IOTLB flushes (RPCIT on > s390) are used for shadowing and are expensive and serialized. The > optimization we used so far for unmap, only doing one global IOTLB > flush once we run out of IOVA space, is just too much better in that > scenario to just ignore. As one data point, on an NVMe I get about > _twice_ the IOPS when using our existing scheme compared to strict > mode. Which makes sense as IOTLB flushes are known as the bottleneck > and optimizing unmap like that reduces them by almost half. Queued > flushing is still much worse likely due to serialization of the > shadowing, though again it works great on LPAR. To make sure it's not > due to some bug in the IOMMU driver I even tried converting our > existing DMA driver to layer on top of the IOMMU driver with the same > result. FWIW, can you approximate the same behaviour by just making IOVA_FQ_SIZE and IOVA_FQ_TIMEOUT really big, and deferring your zpci_refresh_trans() hook from .unmap to .flush_iotlb_all when in non-strict mode? I'm not against the idea of trying to support this mode of operation better in the common code, since it seems like it could potentially be useful for *any* virtualised scenario where trapping to invalidate is expensive and the user is happy to trade off the additional address space/memory overhead (and even greater loss of memory protection) against that. Robin. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 11:01 ` Robin Murphy @ 2022-09-01 13:42 ` Niklas Schnelle 2022-09-01 14:17 ` Niklas Schnelle 2022-09-01 14:29 ` Robin Murphy 0 siblings, 2 replies; 24+ messages in thread From: Niklas Schnelle @ 2022-09-01 13:42 UTC (permalink / raw) To: Robin Murphy, Pierre Morel, Matthew Rosato, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, jgg, linux-kernel On Thu, 2022-09-01 at 12:01 +0100, Robin Murphy wrote: > On 2022-09-01 10:37, Niklas Schnelle wrote: > > On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: > > > On 8/31/22 22:12, Matthew Rosato wrote: > > > > With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > > > > calls") s390-iommu is supposed to handle dynamic switching between IOMMU > > > > domains and the DMA API handling. However, this commit does not > > > > sufficiently handle the case where the device is released via a call > > > > to the release_device op as it may occur at the same time as an opposing > > > > attach_dev or detach_dev since the group mutex is not held over > > > > release_device. This was observed if the device is deconfigured during a > > > > small window during vfio-pci initialization and can result in WARNs and > > > > potential kernel panics. > > > > > > > > Handle this by tracking when the device is probed/released via > > > > dev_iommu_priv_set/get(). Ensure that once the device is released only > > > > release_device handles the re-init of the device DMA. > > > > > > > > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") > > > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > > > > --- > > > > arch/s390/include/asm/pci.h | 1 + > > > > arch/s390/pci/pci.c | 1 + > > > > drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- > > > > 3 files changed, 38 insertions(+), 3 deletions(-) > > > > > > > > > > ---8<--- > > > > > > > > @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) > > > > > > ---8<--- > > > > + /* Make sure this device is removed from the domain list */ > > > > domain = iommu_get_domain_for_dev(dev); > > > > if (domain) > > > > s390_iommu_detach_device(domain, dev); > > > > + /* Now ensure DMA is initialized from here */ > > > > + mutex_lock(&zdev->dma_domain_lock); > > > > + if (zdev->s390_domain) { > > > > + zdev->s390_domain = NULL; > > > > + zpci_unregister_ioat(zdev, 0); > > > > + zpci_dma_init_device(zdev); > > > > > > Sorry if it is a stupid question, but two things looks strange to me: > > > > > > - having DMA initialized just after having unregistered the IOAT > > > Is that really all we need to unregister before calling dma_init_device? > > > > > > - having DMA initialized inside the release_device callback: > > > Why isn't it done in the device_probe ? > > > > As I understand it iommu_release_device() which calls this code is only > > used when a device goes away. So, I think you're right in that it makes > > little sense to re-initialize DMA at this point, it's going to be torn > > down immediately after anyway. I do wonder if it would be an acceptably > > small change to just set zdev->s390_domain = NULL here and leave DMA > > uninitialized while making zpci_dma_exit_device() deal with that e.g. > > by doing nothing if zdev->dma_table is NULL but I'm not sure. > > > > Either way I fear this mess really is just a symptom of our current > > design oddity of driving the same IOMMU hardware through both our DMA > > API implementation (arch/s390/pci_dma.c) and the IOMMU driver > > (driver/iommu/s390-iommu.c) and trying to hand off between them > > smoothly where common code instead just layers one atop the other when > > using an IOMMU at all. > > > > I think the correct medium term solution is to use the common DMA API > > implementation (drivers/iommu/dma-iommu.c) like everyone else. But that > > isn't the minimal fix we need now. > > > > I do have a working prototype of using the common implementation but > > the big problem that I'm still searching a solution for is its > > performance with a virtualized IOMMU where IOTLB flushes (RPCIT on > > s390) are used for shadowing and are expensive and serialized. The > > optimization we used so far for unmap, only doing one global IOTLB > > flush once we run out of IOVA space, is just too much better in that > > scenario to just ignore. As one data point, on an NVMe I get about > > _twice_ the IOPS when using our existing scheme compared to strict > > mode. Which makes sense as IOTLB flushes are known as the bottleneck > > and optimizing unmap like that reduces them by almost half. Queued > > flushing is still much worse likely due to serialization of the > > shadowing, though again it works great on LPAR. To make sure it's not > > due to some bug in the IOMMU driver I even tried converting our > > existing DMA driver to layer on top of the IOMMU driver with the same > > result. > > FWIW, can you approximate the same behaviour by just making IOVA_FQ_SIZE > and IOVA_FQ_TIMEOUT really big, and deferring your zpci_refresh_trans() > hook from .unmap to .flush_iotlb_all when in non-strict mode? > > I'm not against the idea of trying to support this mode of operation > better in the common code, since it seems like it could potentially be > useful for *any* virtualised scenario where trapping to invalidate is > expensive and the user is happy to trade off the additional address > space/memory overhead (and even greater loss of memory protection) > against that. > > Robin. Ah thanks for reminding me. I had tried that earlier but quickly ran into the size limit of per-CPU allocations. This time I turned the "struct iova_fq_entry entries" member into a pointer and allocted that with vmalloc(). Also thankfully the ops->flush_iotlb_all(), iommu_iotlb_sync(), and iommu_iotlb_sync_map() already perfectly match our needs. Okay, this is _very_ interesting. With the above cranking IOVA_FQ_SIZE all the way to 32768 and IOVA_FQ_TIMEOUT to 4000 ms, I can get to about 91% of the performance of our scheme (layered on the IOMMU API). That also seems to be the limit. I guess there is also more overhead than with our bitset IOVA allocation that doesn't need any bookkeeping besides a "lazily unmapped" bit per page. With a more sane IOVA_FQ_SIZE of 8192 and 100 ms timeout I still get about 76% of the performance. Interestingly with the above changes but default values for IOVA_FQ_SIZE/IOVA_FQ_TIMEOUT things are much worse than even strict mode (~50%) and I get less than 8% the IOPS with this NVMe. So yeah it seems you're right and one can largely emulate our scheme with this. I do wonder if we could go further and do a "flush on running out of IOVAs" domain type with acceptable changes. My rough idea would be to collect lazily freed IOVAs in the same data structure as the free IOVAs, then on running out of those one can simply do a global IOTLB flush and the lazily freed IOVAs become the new free IOVAs. With that the global reset would be even cheaper than with our bitmaps. For a generic case one would of course also need to track the gather->freelist that we don't use in s390 but e.g. virtio-iommu doesn't seem to use that either. What do you think? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 13:42 ` Niklas Schnelle @ 2022-09-01 14:17 ` Niklas Schnelle 2022-09-01 14:29 ` Robin Murphy 1 sibling, 0 replies; 24+ messages in thread From: Niklas Schnelle @ 2022-09-01 14:17 UTC (permalink / raw) To: Robin Murphy, Pierre Morel, Matthew Rosato, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, jgg, linux-kernel ---8<--- > > > > > > I do have a working prototype of using the common implementation but > > > the big problem that I'm still searching a solution for is its > > > performance with a virtualized IOMMU where IOTLB flushes (RPCIT on > > > s390) are used for shadowing and are expensive and serialized. The > > > optimization we used so far for unmap, only doing one global IOTLB > > > flush once we run out of IOVA space, is just too much better in that > > > scenario to just ignore. As one data point, on an NVMe I get about > > > _twice_ the IOPS when using our existing scheme compared to strict > > > mode. Which makes sense as IOTLB flushes are known as the bottleneck > > > and optimizing unmap like that reduces them by almost half. Queued > > > flushing is still much worse likely due to serialization of the > > > shadowing, though again it works great on LPAR. To make sure it's not > > > due to some bug in the IOMMU driver I even tried converting our > > > existing DMA driver to layer on top of the IOMMU driver with the same > > > result. > > > > FWIW, can you approximate the same behaviour by just making IOVA_FQ_SIZE > > and IOVA_FQ_TIMEOUT really big, and deferring your zpci_refresh_trans() > > hook from .unmap to .flush_iotlb_all when in non-strict mode? > > > > I'm not against the idea of trying to support this mode of operation > > better in the common code, since it seems like it could potentially be > > useful for *any* virtualised scenario where trapping to invalidate is > > expensive and the user is happy to trade off the additional address > > space/memory overhead (and even greater loss of memory protection) > > against that. > > > > Robin. > > Ah thanks for reminding me. I had tried that earlier but quickly ran > into the size limit of per-CPU allocations. This time I turned the > "struct iova_fq_entry entries" member into a pointer and allocted that > with vmalloc(). Also thankfully the ops->flush_iotlb_all(), iommu_iotlb_sync(), and iommu_iotlb_sync_map() already perfectly match > our needs. > > Okay, this is _very_ interesting. With the above cranking IOVA_FQ_SIZE > all the way to 32768 and IOVA_FQ_TIMEOUT to 4000 ms, I can get to about > 91% of the performance of our scheme (layered on the IOMMU API). That > also seems to be the limit. I guess there is also more overhead than > with our bitset IOVA allocation that doesn't need any bookkeeping > besides a "lazily unmapped" bit per page. With a more sane IOVA_FQ_SIZE > of 8192 and 100 ms timeout I still get about 76% of the performance. > > Interestingly with the above changes but default values for > IOVA_FQ_SIZE/IOVA_FQ_TIMEOUT things are much worse than even strict > mode (~50%) and I get less than 8% the IOPS with this NVMe. > > So yeah it seems you're right and one can largely emulate our scheme > with this. I do wonder if we could go further and do a "flush on > running out of IOVAs" domain type with acceptable changes. My rough > idea would be to collect lazily freed IOVAs in the same data structure > as the free IOVAs, then on running out of those one can simply do a > global IOTLB flush and the lazily freed IOVAs become the new free > IOVAs. With that the global reset would be even cheaper than with our > bitmaps. Ok disregard the last part, that's obviously not how the IOVA allocation works. Will have to take an actual look. > For a generic case one would of course also need to track the > gather->freelist that we don't use in s390 but e.g. virtio-iommu > doesn't seem to use that either. What do you think? > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 13:42 ` Niklas Schnelle 2022-09-01 14:17 ` Niklas Schnelle @ 2022-09-01 14:29 ` Robin Murphy 2022-09-01 14:34 ` Jason Gunthorpe 1 sibling, 1 reply; 24+ messages in thread From: Robin Murphy @ 2022-09-01 14:29 UTC (permalink / raw) To: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, jgg, linux-kernel On 2022-09-01 14:42, Niklas Schnelle wrote: > On Thu, 2022-09-01 at 12:01 +0100, Robin Murphy wrote: >> On 2022-09-01 10:37, Niklas Schnelle wrote: >>> On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: >>>> On 8/31/22 22:12, Matthew Rosato wrote: >>>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>>>> domains and the DMA API handling. However, this commit does not >>>>> sufficiently handle the case where the device is released via a call >>>>> to the release_device op as it may occur at the same time as an opposing >>>>> attach_dev or detach_dev since the group mutex is not held over >>>>> release_device. This was observed if the device is deconfigured during a >>>>> small window during vfio-pci initialization and can result in WARNs and >>>>> potential kernel panics. >>>>> >>>>> Handle this by tracking when the device is probed/released via >>>>> dev_iommu_priv_set/get(). Ensure that once the device is released only >>>>> release_device handles the re-init of the device DMA. >>>>> >>>>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") >>>>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >>>>> --- >>>>> arch/s390/include/asm/pci.h | 1 + >>>>> arch/s390/pci/pci.c | 1 + >>>>> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- >>>>> 3 files changed, 38 insertions(+), 3 deletions(-) >>>>> >>>>> >>> ---8<--- >>>>> >>>>> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) >>>>> >>> ---8<--- >>>>> + /* Make sure this device is removed from the domain list */ >>>>> domain = iommu_get_domain_for_dev(dev); >>>>> if (domain) >>>>> s390_iommu_detach_device(domain, dev); >>>>> + /* Now ensure DMA is initialized from here */ >>>>> + mutex_lock(&zdev->dma_domain_lock); >>>>> + if (zdev->s390_domain) { >>>>> + zdev->s390_domain = NULL; >>>>> + zpci_unregister_ioat(zdev, 0); >>>>> + zpci_dma_init_device(zdev); >>>> >>>> Sorry if it is a stupid question, but two things looks strange to me: >>>> >>>> - having DMA initialized just after having unregistered the IOAT >>>> Is that really all we need to unregister before calling dma_init_device? >>>> >>>> - having DMA initialized inside the release_device callback: >>>> Why isn't it done in the device_probe ? >>> >>> As I understand it iommu_release_device() which calls this code is only >>> used when a device goes away. So, I think you're right in that it makes >>> little sense to re-initialize DMA at this point, it's going to be torn >>> down immediately after anyway. I do wonder if it would be an acceptably >>> small change to just set zdev->s390_domain = NULL here and leave DMA >>> uninitialized while making zpci_dma_exit_device() deal with that e.g. >>> by doing nothing if zdev->dma_table is NULL but I'm not sure. >>> >>> Either way I fear this mess really is just a symptom of our current >>> design oddity of driving the same IOMMU hardware through both our DMA >>> API implementation (arch/s390/pci_dma.c) and the IOMMU driver >>> (driver/iommu/s390-iommu.c) and trying to hand off between them >>> smoothly where common code instead just layers one atop the other when >>> using an IOMMU at all. >>> >>> I think the correct medium term solution is to use the common DMA API >>> implementation (drivers/iommu/dma-iommu.c) like everyone else. But that >>> isn't the minimal fix we need now. >>> >>> I do have a working prototype of using the common implementation but >>> the big problem that I'm still searching a solution for is its >>> performance with a virtualized IOMMU where IOTLB flushes (RPCIT on >>> s390) are used for shadowing and are expensive and serialized. The >>> optimization we used so far for unmap, only doing one global IOTLB >>> flush once we run out of IOVA space, is just too much better in that >>> scenario to just ignore. As one data point, on an NVMe I get about >>> _twice_ the IOPS when using our existing scheme compared to strict >>> mode. Which makes sense as IOTLB flushes are known as the bottleneck >>> and optimizing unmap like that reduces them by almost half. Queued >>> flushing is still much worse likely due to serialization of the >>> shadowing, though again it works great on LPAR. To make sure it's not >>> due to some bug in the IOMMU driver I even tried converting our >>> existing DMA driver to layer on top of the IOMMU driver with the same >>> result. >> >> FWIW, can you approximate the same behaviour by just making IOVA_FQ_SIZE >> and IOVA_FQ_TIMEOUT really big, and deferring your zpci_refresh_trans() >> hook from .unmap to .flush_iotlb_all when in non-strict mode? >> >> I'm not against the idea of trying to support this mode of operation >> better in the common code, since it seems like it could potentially be >> useful for *any* virtualised scenario where trapping to invalidate is >> expensive and the user is happy to trade off the additional address >> space/memory overhead (and even greater loss of memory protection) >> against that. >> >> Robin. > > > Ah thanks for reminding me. I had tried that earlier but quickly ran > into the size limit of per-CPU allocations. This time I turned the > "struct iova_fq_entry entries" member into a pointer and allocted that > with vmalloc(). Also thankfully the ops->flush_iotlb_all(), iommu_iotlb_sync(), and iommu_iotlb_sync_map() already perfectly match > our needs. > > Okay, this is _very_ interesting. With the above cranking IOVA_FQ_SIZE > all the way to 32768 and IOVA_FQ_TIMEOUT to 4000 ms, I can get to about > 91% of the performance of our scheme (layered on the IOMMU API). That > also seems to be the limit. I guess there is also more overhead than > with our bitset IOVA allocation that doesn't need any bookkeeping > besides a "lazily unmapped" bit per page. With a more sane IOVA_FQ_SIZE > of 8192 and 100 ms timeout I still get about 76% of the performance. Promising indeed... come to think of it, if you weren't already using it then "iommu.forcedac=1" should save a bit more time in the IOVA allocator especially if we're deliberately letting the address space fill up. Clearly your current allocator doesn't have to work around broken x86 PCs, so you're at liberty to convince the common one not to either. > Interestingly with the above changes but default values for > IOVA_FQ_SIZE/IOVA_FQ_TIMEOUT things are much worse than even strict > mode (~50%) and I get less than 8% the IOPS with this NVMe. > > So yeah it seems you're right and one can largely emulate our scheme > with this. I do wonder if we could go further and do a "flush on > running out of IOVAs" domain type with acceptable changes. My rough > idea would be to collect lazily freed IOVAs in the same data structure > as the free IOVAs, then on running out of those one can simply do a > global IOTLB flush and the lazily freed IOVAs become the new free > IOVAs. With that the global reset would be even cheaper than with our > bitmaps. For a generic case one would of course also need to track the > gather->freelist that we don't use in s390 but e.g. virtio-iommu > doesn't seem to use that either. What do you think? Right, the next step would be to bridge that gap to iommu-dma to dump the flush queue when IOVA allocation failure implies we've reached the "rollover" point, and perhaps not use the timer at all. By that point a dedicated domain type, or at least some definite internal flag, for this alternate behaviour seems like the logical way to go. Cheers, Robin. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 14:29 ` Robin Murphy @ 2022-09-01 14:34 ` Jason Gunthorpe 2022-09-01 15:03 ` Robin Murphy 0 siblings, 1 reply; 24+ messages in thread From: Jason Gunthorpe @ 2022-09-01 14:34 UTC (permalink / raw) To: Robin Murphy Cc: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu, linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On Thu, Sep 01, 2022 at 03:29:16PM +0100, Robin Murphy wrote: > Right, the next step would be to bridge that gap to iommu-dma to dump the > flush queue when IOVA allocation failure implies we've reached the > "rollover" point, and perhaps not use the timer at all. By that point a > dedicated domain type, or at least some definite internal flag, for this > alternate behaviour seems like the logical way to go. At least on this direction, I've been thinking it would be nice to replace the domain type _FQ with a flag inside the domain, maybe the ops, saying how the domain wants the common DMA API to operate. If it wants FQ mode or other tuning parameters Jason ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 14:34 ` Jason Gunthorpe @ 2022-09-01 15:03 ` Robin Murphy 2022-09-01 15:49 ` Jason Gunthorpe 0 siblings, 1 reply; 24+ messages in thread From: Robin Murphy @ 2022-09-01 15:03 UTC (permalink / raw) To: Jason Gunthorpe Cc: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu, linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 2022-09-01 15:34, Jason Gunthorpe wrote: > On Thu, Sep 01, 2022 at 03:29:16PM +0100, Robin Murphy wrote: > >> Right, the next step would be to bridge that gap to iommu-dma to dump the >> flush queue when IOVA allocation failure implies we've reached the >> "rollover" point, and perhaps not use the timer at all. By that point a >> dedicated domain type, or at least some definite internal flag, for this >> alternate behaviour seems like the logical way to go. > > At least on this direction, I've been thinking it would be nice to > replace the domain type _FQ with a flag inside the domain, maybe the > ops, saying how the domain wants the common DMA API to operate. If it > wants FQ mode or other tuning parameters Compare the not-necessarily-obvious matrix of "strict" and "passthrough" command-line parameters with the nice understandable kconfig and sysfs controls for a reminder of why I moved things *from* that paradigm in the first place ;) This idea still fits perfectly into the the "continuum of strictness" notion underlying that domain type rework, since it potentially leaves a lot more address space mapped for a much longer time than our current FQ implementation. I would agree that exposing FQ tuneables in their own right may well have some potential value, much like John's equivalent idea for the IOVA cache layer, but I for one have no desire to bring back DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE, much less any further mess of disjoint properties at that level. Thanks, Robin. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 15:03 ` Robin Murphy @ 2022-09-01 15:49 ` Jason Gunthorpe 2022-09-01 17:00 ` Robin Murphy 0 siblings, 1 reply; 24+ messages in thread From: Jason Gunthorpe @ 2022-09-01 15:49 UTC (permalink / raw) To: Robin Murphy Cc: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu, linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On Thu, Sep 01, 2022 at 04:03:36PM +0100, Robin Murphy wrote: > On 2022-09-01 15:34, Jason Gunthorpe wrote: > > On Thu, Sep 01, 2022 at 03:29:16PM +0100, Robin Murphy wrote: > > > > > Right, the next step would be to bridge that gap to iommu-dma to dump the > > > flush queue when IOVA allocation failure implies we've reached the > > > "rollover" point, and perhaps not use the timer at all. By that point a > > > dedicated domain type, or at least some definite internal flag, for this > > > alternate behaviour seems like the logical way to go. > > > > At least on this direction, I've been thinking it would be nice to > > replace the domain type _FQ with a flag inside the domain, maybe the > > ops, saying how the domain wants the common DMA API to operate. If it > > wants FQ mode or other tuning parameters > > Compare the not-necessarily-obvious matrix of "strict" and "passthrough" > command-line parameters with the nice understandable kconfig and sysfs > controls for a reminder of why I moved things *from* that paradigm in the > first place ;) I'm looking at it from a code perspective, where the drivers don't seem to actually care about DMA_FQ. eg search for it in the drivers and you mostly see: (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ)) The exception is domain_alloc which fails in cases where the domain doesn't 'support' FQ. But support FQ or not can be cast as a simple capability flag in the domain. We don't need a whole type for the driver to communicate it. The strictness level belongs completely in the core code, it shouldn't leak into the driver. The same general comment seems to be true of IOMMU_DOMAIN_DMA. All the drivers implement this as an UNMANAGED domain. There are only two places in the Intel driver that do anything special with DOMAIN_DMA vs DOMAIN_UNMANAGED (and possibly that is just cruft). So the "does this support DMA API" is also just a capability flag, and doesn't really need a whole type. This is what I mean, not going backwards to the driver specifying strictness policy. Jason ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 15:49 ` Jason Gunthorpe @ 2022-09-01 17:00 ` Robin Murphy 0 siblings, 0 replies; 24+ messages in thread From: Robin Murphy @ 2022-09-01 17:00 UTC (permalink / raw) To: Jason Gunthorpe Cc: Niklas Schnelle, Pierre Morel, Matthew Rosato, iommu, linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 2022-09-01 16:49, Jason Gunthorpe wrote: > On Thu, Sep 01, 2022 at 04:03:36PM +0100, Robin Murphy wrote: >> On 2022-09-01 15:34, Jason Gunthorpe wrote: >>> On Thu, Sep 01, 2022 at 03:29:16PM +0100, Robin Murphy wrote: >>> >>>> Right, the next step would be to bridge that gap to iommu-dma to dump the >>>> flush queue when IOVA allocation failure implies we've reached the >>>> "rollover" point, and perhaps not use the timer at all. By that point a >>>> dedicated domain type, or at least some definite internal flag, for this >>>> alternate behaviour seems like the logical way to go. >>> >>> At least on this direction, I've been thinking it would be nice to >>> replace the domain type _FQ with a flag inside the domain, maybe the >>> ops, saying how the domain wants the common DMA API to operate. If it >>> wants FQ mode or other tuning parameters >> >> Compare the not-necessarily-obvious matrix of "strict" and "passthrough" >> command-line parameters with the nice understandable kconfig and sysfs >> controls for a reminder of why I moved things *from* that paradigm in the >> first place ;) > > I'm looking at it from a code perspective, where the drivers don't > seem to actually care about DMA_FQ. eg search for it in the drivers > and you mostly see: > > (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ)) > > The exception is domain_alloc which fails in cases where the domain > doesn't 'support' FQ. > > But support FQ or not can be cast as a simple capability flag in the > domain. We don't need a whole type for the driver to communicate it. Right, since the rest of DMA domain setup got streamlined into the core code this one remaining vestige of the old world order is ripe for cleanup, I've just had bigger fish to fry. Or as the case may be, bigger chunks of repetitive boilerplate to remove from elsewhere in the drivers :) > The strictness level belongs completely in the core code, it shouldn't > leak into the driver. It's already not about drivers having any influence on strictness, it's about there being good reason to treat these as distinct domain types through the core code, and as things stand those domain types are exposed to drivers, so drivers need to not freak out at seeing them. Indeed Any driver can "support" IOMMU_DOMAIN_DMA now, so if you've got time to come up with a way of making that completely transparent to drivers then please go ahead, though IIRC there were one or two cases where folks explicitly *didn't* want it being used, so those might need proper IOMMU_DOMAIN_IDENTITY support and a def_domain_type override adding. The thing with IOMMU_DOMAIN_DMA_FQ is that drivers *do* need to somehow indicate that they implement the relevant optimisations in their unmap path, otherwise using a flush queue is objectively worse in every respect than not using a flush queue. Given the status quo, rejecting the domain type at allocation was by far the simplest and most obvious way to achieve that. Once again, please do propose moving it to a more explicit "I can support deferred unmap" driver capability if you'd like, otherwise I'll get there eventually. Thanks, Robin. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 9:37 ` Niklas Schnelle 2022-09-01 11:01 ` Robin Murphy @ 2022-09-01 20:28 ` Matthew Rosato 2022-09-02 7:49 ` Niklas Schnelle 1 sibling, 1 reply; 24+ messages in thread From: Matthew Rosato @ 2022-09-01 20:28 UTC (permalink / raw) To: Niklas Schnelle, Pierre Morel, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel On 9/1/22 5:37 AM, Niklas Schnelle wrote: > On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: >> >> On 8/31/22 22:12, Matthew Rosato wrote: >>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>> domains and the DMA API handling. However, this commit does not >>> sufficiently handle the case where the device is released via a call >>> to the release_device op as it may occur at the same time as an opposing >>> attach_dev or detach_dev since the group mutex is not held over >>> release_device. This was observed if the device is deconfigured during a >>> small window during vfio-pci initialization and can result in WARNs and >>> potential kernel panics. >>> >>> Handle this by tracking when the device is probed/released via >>> dev_iommu_priv_set/get(). Ensure that once the device is released only >>> release_device handles the re-init of the device DMA. >>> >>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") >>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >>> --- >>> arch/s390/include/asm/pci.h | 1 + >>> arch/s390/pci/pci.c | 1 + >>> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- >>> 3 files changed, 38 insertions(+), 3 deletions(-) >>> >>> > ---8<--- >>> >>> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) >>> >> > ---8<--- >>> + /* Make sure this device is removed from the domain list */ >>> domain = iommu_get_domain_for_dev(dev); >>> if (domain) >>> s390_iommu_detach_device(domain, dev); >>> + /* Now ensure DMA is initialized from here */ >>> + mutex_lock(&zdev->dma_domain_lock); >>> + if (zdev->s390_domain) { >>> + zdev->s390_domain = NULL; >>> + zpci_unregister_ioat(zdev, 0); >>> + zpci_dma_init_device(zdev); >> >> Sorry if it is a stupid question, but two things looks strange to me: >> >> - having DMA initialized just after having unregistered the IOAT >> Is that really all we need to unregister before calling dma_init_device? This is also how s390-iommu has been handling detach_dev (and still does) >> >> - having DMA initialized inside the release_device callback: >> Why isn't it done in the device_probe ? > > As I understand it iommu_release_device() which calls this code is only > used when a device goes away. So, I think you're right in that it makes > little sense to re-initialize DMA at this point, it's going to be torn > down immediately after anyway. I do wonder if it would be an acceptably > small change to just set zdev->s390_domain = NULL here and leave DMA > uninitialized while making zpci_dma_exit_device() deal with that e.g. > by doing nothing if zdev->dma_table is NULL but I'm not sure. Right -- since it's a fix, I was trying to keep the changes minimal and this behavior (re-init DMA even on release_device) was existing, it was just always done within s390_iommu_detach_device before. If you want, I could experiment with setting zdev->dma_table = NULL on the release path only (and checking it in zpci_dma_exit_device()) > > Either way I fear this mess really is just a symptom of our current > design oddity of driving the same IOMMU hardware through both our DMA > API implementation (arch/s390/pci_dma.c) and the IOMMU driver > (driver/iommu/s390-iommu.c) and trying to hand off between them > smoothly where common code instead just layers one atop the other when > using an IOMMU at all. > > I think the correct medium term solution is to use the common DMA API > implementation (drivers/iommu/dma-iommu.c) like everyone else. But that > isn't the minimal fix we need now. Agree > > I do have a working prototype of using the common implementation but > the big problem that I'm still searching a solution for is its > performance with a virtualized IOMMU where IOTLB flushes (RPCIT on > s390) are used for shadowing and are expensive and serialized. The > optimization we used so far for unmap, only doing one global IOTLB > flush once we run out of IOVA space, is just too much better in that > scenario to just ignore. As one data point, on an NVMe I get about > _twice_ the IOPS when using our existing scheme compared to strict > mode. Which makes sense as IOTLB flushes are known as the bottleneck > and optimizing unmap like that reduces them by almost half. Queued > flushing is still much worse likely due to serialization of the > shadowing, though again it works great on LPAR. To make sure it's not > due to some bug in the IOMMU driver I even tried converting our > existing DMA driver to layer on top of the IOMMU driver with the same > result. > > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 20:28 ` Matthew Rosato @ 2022-09-02 7:49 ` Niklas Schnelle 0 siblings, 0 replies; 24+ messages in thread From: Niklas Schnelle @ 2022-09-02 7:49 UTC (permalink / raw) To: Matthew Rosato, Pierre Morel, iommu Cc: linux-s390, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel On Thu, 2022-09-01 at 16:28 -0400, Matthew Rosato wrote: > > > On 9/1/22 5:37 AM, Niklas Schnelle wrote: > > On Thu, 2022-09-01 at 09:56 +0200, Pierre Morel wrote: > > > On 8/31/22 22:12, Matthew Rosato wrote: > > > > With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > > > > calls") s390-iommu is supposed to handle dynamic switching between IOMMU > > > > domains and the DMA API handling. However, this commit does not > > > > sufficiently handle the case where the device is released via a call > > > > to the release_device op as it may occur at the same time as an opposing > > > > attach_dev or detach_dev since the group mutex is not held over > > > > release_device. This was observed if the device is deconfigured during a > > > > small window during vfio-pci initialization and can result in WARNs and > > > > potential kernel panics. > > > > > > > > Handle this by tracking when the device is probed/released via > > > > dev_iommu_priv_set/get(). Ensure that once the device is released only > > > > release_device handles the re-init of the device DMA. > > > > > > > > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") > > > > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > > > > --- > > > > arch/s390/include/asm/pci.h | 1 + > > > > arch/s390/pci/pci.c | 1 + > > > > drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- > > > > 3 files changed, 38 insertions(+), 3 deletions(-) > > > > > > > > > > ---8<--- > > > > > > > > @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) > > > > > > ---8<--- > > > > + /* Make sure this device is removed from the domain list */ > > > > domain = iommu_get_domain_for_dev(dev); > > > > if (domain) > > > > s390_iommu_detach_device(domain, dev); > > > > + /* Now ensure DMA is initialized from here */ > > > > + mutex_lock(&zdev->dma_domain_lock); > > > > + if (zdev->s390_domain) { > > > > + zdev->s390_domain = NULL; > > > > + zpci_unregister_ioat(zdev, 0); > > > > + zpci_dma_init_device(zdev); > > > > > > Sorry if it is a stupid question, but two things looks strange to me: > > > > > > - having DMA initialized just after having unregistered the IOAT > > > Is that really all we need to unregister before calling dma_init_device? > > This is also how s390-iommu has been handling detach_dev (and still does) > > > > - having DMA initialized inside the release_device callback: > > > Why isn't it done in the device_probe ? > > > > As I understand it iommu_release_device() which calls this code is only > > used when a device goes away. So, I think you're right in that it makes > > little sense to re-initialize DMA at this point, it's going to be torn > > down immediately after anyway. I do wonder if it would be an acceptably > > small change to just set zdev->s390_domain = NULL here and leave DMA > > uninitialized while making zpci_dma_exit_device() deal with that e.g. > > by doing nothing if zdev->dma_table is NULL but I'm not sure. > > Right -- since it's a fix, I was trying to keep the changes minimal and this behavior (re-init DMA even on release_device) was existing, it was just always done within s390_iommu_detach_device before. > > If you want, I could experiment with setting zdev->dma_table = NULL on the release path only (and checking it in zpci_dma_exit_device()) > Your current approach is fine with me. After all this oddity of detaching on release and initializing DMA is existing behavior. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato 2022-09-01 7:56 ` Pierre Morel @ 2022-09-01 10:25 ` Robin Murphy 2022-09-01 16:14 ` Matthew Rosato 1 sibling, 1 reply; 24+ messages in thread From: Robin Murphy @ 2022-09-01 10:25 UTC (permalink / raw) To: Matthew Rosato, iommu Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, jgg, linux-kernel On 2022-08-31 21:12, Matthew Rosato wrote: > With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > calls") s390-iommu is supposed to handle dynamic switching between IOMMU > domains and the DMA API handling. However, this commit does not > sufficiently handle the case where the device is released via a call > to the release_device op as it may occur at the same time as an opposing > attach_dev or detach_dev since the group mutex is not held over > release_device. This was observed if the device is deconfigured during a > small window during vfio-pci initialization and can result in WARNs and > potential kernel panics. Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? Robin. > Handle this by tracking when the device is probed/released via > dev_iommu_priv_set/get(). Ensure that once the device is released only > release_device handles the re-init of the device DMA. > > Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") > Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> > --- > arch/s390/include/asm/pci.h | 1 + > arch/s390/pci/pci.c | 1 + > drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- > 3 files changed, 38 insertions(+), 3 deletions(-) > > diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h > index 7b4cdadbc023..080251e7b275 100644 > --- a/arch/s390/include/asm/pci.h > +++ b/arch/s390/include/asm/pci.h > @@ -157,6 +157,7 @@ struct zpci_dev { > /* DMA stuff */ > unsigned long *dma_table; > spinlock_t dma_table_lock; > + struct mutex dma_domain_lock; /* protects s390_domain value */ > int tlb_refresh; > > spinlock_t iommu_bitmap_lock; > diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c > index 73cdc5539384..973edd32ecc9 100644 > --- a/arch/s390/pci/pci.c > +++ b/arch/s390/pci/pci.c > @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) > kref_init(&zdev->kref); > mutex_init(&zdev->lock); > mutex_init(&zdev->kzdev_lock); > + mutex_init(&zdev->dma_domain_lock); > > rc = zpci_init_iommu(zdev); > if (rc) > diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c > index c898bcbbce11..1137d669e849 100644 > --- a/drivers/iommu/s390-iommu.c > +++ b/drivers/iommu/s390-iommu.c > @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > if (!domain_device) > return -ENOMEM; > > + /* Leave now if the device has already been released */ > + mutex_lock(&zdev->dma_domain_lock); > + if (!dev_iommu_priv_get(dev)) { > + mutex_unlock(&zdev->dma_domain_lock); > + kfree(domain_device); > + return 0; > + } > + > if (zdev->dma_table && !zdev->s390_domain) { > cc = zpci_dma_exit_device(zdev); > if (cc) { > @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > goto out_restore; > } > domain_device->zdev = zdev; > - zdev->s390_domain = s390_domain; > list_add(&domain_device->list, &s390_domain->devices); > spin_unlock_irqrestore(&s390_domain->list_lock, flags); > + zdev->s390_domain = s390_domain; > + mutex_unlock(&zdev->dma_domain_lock); > > return 0; > > @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, > virt_to_phys(zdev->dma_table)); > } > out_free: > + mutex_unlock(&zdev->dma_domain_lock); > kfree(domain_device); > > return rc; > @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, > } > spin_unlock_irqrestore(&s390_domain->list_lock, flags); > > - if (found && (zdev->s390_domain == s390_domain)) { > + mutex_lock(&zdev->dma_domain_lock); > + if (found && (zdev->s390_domain == s390_domain) && > + dev_iommu_priv_get(dev)) { > zdev->s390_domain = NULL; > zpci_unregister_ioat(zdev, 0); > zpci_dma_init_device(zdev); > } > + mutex_unlock(&zdev->dma_domain_lock); > } > > static struct iommu_device *s390_iommu_probe_device(struct device *dev) > { > struct zpci_dev *zdev = to_zpci_dev(dev); > > + dev_iommu_priv_set(dev, zdev); > + > return &zdev->iommu_dev; > } > > @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) > * > * So let's call detach_dev from here if it hasn't been called before. > */ > - if (zdev && zdev->s390_domain) { > + if (zdev) { > + /* > + * Clear priv to block further attaches for this device, > + * ensure detaches don't init DMA. Hold the domain lock > + * to ensure that attach/detach get a consistent view of > + * whether or not the device is released. > + */ > + mutex_lock(&zdev->dma_domain_lock); > + dev_iommu_priv_set(dev, NULL); > + mutex_unlock(&zdev->dma_domain_lock); > + /* Make sure this device is removed from the domain list */ > domain = iommu_get_domain_for_dev(dev); > if (domain) > s390_iommu_detach_device(domain, dev); > + /* Now ensure DMA is initialized from here */ > + mutex_lock(&zdev->dma_domain_lock); > + if (zdev->s390_domain) { > + zdev->s390_domain = NULL; > + zpci_unregister_ioat(zdev, 0); > + zpci_dma_init_device(zdev); > + } > + mutex_unlock(&zdev->dma_domain_lock); > } > } > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 10:25 ` Robin Murphy @ 2022-09-01 16:14 ` Matthew Rosato 2022-09-01 20:37 ` Jason Gunthorpe 2022-09-02 10:48 ` Robin Murphy 0 siblings, 2 replies; 24+ messages in thread From: Matthew Rosato @ 2022-09-01 16:14 UTC (permalink / raw) To: Robin Murphy, iommu, jgg, Alex Williamson Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 9/1/22 6:25 AM, Robin Murphy wrote: > On 2022-08-31 21:12, Matthew Rosato wrote: >> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >> domains and the DMA API handling. However, this commit does not >> sufficiently handle the case where the device is released via a call >> to the release_device op as it may occur at the same time as an opposing >> attach_dev or detach_dev since the group mutex is not held over >> release_device. This was observed if the device is deconfigured during a >> small window during vfio-pci initialization and can result in WARNs and >> potential kernel panics. > > Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? > > Robin. So, I generally have seen the issue manifest as one of the calls into the iommu core from __vfio_group_unset_container (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. This happens when the vfio group fd is released, which could be coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. AFAICT there's nothing serializing the notion of calling into the iommu core here against a device that is simultaneously going through release_device (because we don't enter release_device with the group mutex held), resulting in unpredictable behavior between the dueling attach_dev/detach_dev and the release_device for s390-iommu at least. > >> Handle this by tracking when the device is probed/released via >> dev_iommu_priv_set/get(). Ensure that once the device is released only >> release_device handles the re-init of the device DMA. >> >> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") >> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >> --- >> arch/s390/include/asm/pci.h | 1 + >> arch/s390/pci/pci.c | 1 + >> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- >> 3 files changed, 38 insertions(+), 3 deletions(-) >> >> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h >> index 7b4cdadbc023..080251e7b275 100644 >> --- a/arch/s390/include/asm/pci.h >> +++ b/arch/s390/include/asm/pci.h >> @@ -157,6 +157,7 @@ struct zpci_dev { >> /* DMA stuff */ >> unsigned long *dma_table; >> spinlock_t dma_table_lock; >> + struct mutex dma_domain_lock; /* protects s390_domain value */ >> int tlb_refresh; >> spinlock_t iommu_bitmap_lock; >> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c >> index 73cdc5539384..973edd32ecc9 100644 >> --- a/arch/s390/pci/pci.c >> +++ b/arch/s390/pci/pci.c >> @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) >> kref_init(&zdev->kref); >> mutex_init(&zdev->lock); >> mutex_init(&zdev->kzdev_lock); >> + mutex_init(&zdev->dma_domain_lock); >> rc = zpci_init_iommu(zdev); >> if (rc) >> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c >> index c898bcbbce11..1137d669e849 100644 >> --- a/drivers/iommu/s390-iommu.c >> +++ b/drivers/iommu/s390-iommu.c >> @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >> if (!domain_device) >> return -ENOMEM; >> + /* Leave now if the device has already been released */ >> + mutex_lock(&zdev->dma_domain_lock); >> + if (!dev_iommu_priv_get(dev)) { >> + mutex_unlock(&zdev->dma_domain_lock); >> + kfree(domain_device); >> + return 0; >> + } >> + >> if (zdev->dma_table && !zdev->s390_domain) { >> cc = zpci_dma_exit_device(zdev); >> if (cc) { >> @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >> goto out_restore; >> } >> domain_device->zdev = zdev; >> - zdev->s390_domain = s390_domain; >> list_add(&domain_device->list, &s390_domain->devices); >> spin_unlock_irqrestore(&s390_domain->list_lock, flags); >> + zdev->s390_domain = s390_domain; >> + mutex_unlock(&zdev->dma_domain_lock); >> return 0; >> @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >> virt_to_phys(zdev->dma_table)); >> } >> out_free: >> + mutex_unlock(&zdev->dma_domain_lock); >> kfree(domain_device); >> return rc; >> @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, >> } >> spin_unlock_irqrestore(&s390_domain->list_lock, flags); >> - if (found && (zdev->s390_domain == s390_domain)) { >> + mutex_lock(&zdev->dma_domain_lock); >> + if (found && (zdev->s390_domain == s390_domain) && >> + dev_iommu_priv_get(dev)) { >> zdev->s390_domain = NULL; >> zpci_unregister_ioat(zdev, 0); >> zpci_dma_init_device(zdev); >> } >> + mutex_unlock(&zdev->dma_domain_lock); >> } >> static struct iommu_device *s390_iommu_probe_device(struct device *dev) >> { >> struct zpci_dev *zdev = to_zpci_dev(dev); >> + dev_iommu_priv_set(dev, zdev); >> + >> return &zdev->iommu_dev; >> } >> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) >> * >> * So let's call detach_dev from here if it hasn't been called before. >> */ >> - if (zdev && zdev->s390_domain) { >> + if (zdev) { >> + /* >> + * Clear priv to block further attaches for this device, >> + * ensure detaches don't init DMA. Hold the domain lock >> + * to ensure that attach/detach get a consistent view of >> + * whether or not the device is released. >> + */ >> + mutex_lock(&zdev->dma_domain_lock); >> + dev_iommu_priv_set(dev, NULL); >> + mutex_unlock(&zdev->dma_domain_lock); >> + /* Make sure this device is removed from the domain list */ >> domain = iommu_get_domain_for_dev(dev); >> if (domain) >> s390_iommu_detach_device(domain, dev); >> + /* Now ensure DMA is initialized from here */ >> + mutex_lock(&zdev->dma_domain_lock); >> + if (zdev->s390_domain) { >> + zdev->s390_domain = NULL; >> + zpci_unregister_ioat(zdev, 0); >> + zpci_dma_init_device(zdev); >> + } >> + mutex_unlock(&zdev->dma_domain_lock); >> } >> } >> ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 16:14 ` Matthew Rosato @ 2022-09-01 20:37 ` Jason Gunthorpe 2022-09-02 17:11 ` Matthew Rosato 2022-09-02 10:48 ` Robin Murphy 1 sibling, 1 reply; 24+ messages in thread From: Jason Gunthorpe @ 2022-09-01 20:37 UTC (permalink / raw) To: Matthew Rosato Cc: Robin Murphy, iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: > On 9/1/22 6:25 AM, Robin Murphy wrote: > > On 2022-08-31 21:12, Matthew Rosato wrote: > >> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > >> calls") s390-iommu is supposed to handle dynamic switching between IOMMU > >> domains and the DMA API handling. However, this commit does not > >> sufficiently handle the case where the device is released via a call > >> to the release_device op as it may occur at the same time as an opposing > >> attach_dev or detach_dev since the group mutex is not held over > >> release_device. This was observed if the device is deconfigured during a > >> small window during vfio-pci initialization and can result in WARNs and > >> potential kernel panics. > > > > Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? > > > > Robin. > > So, I generally have seen the issue manifest as one of the calls > into the iommu core from __vfio_group_unset_container > (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. > This happens when the vfio group fd is released, which could be > coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. > AFAICT there's nothing serializing the notion of calling into the > iommu core here against a device that is simultaneously going > through release_device (because we don't enter release_device with > the group mutex held), resulting in unpredictable behavior between > the dueling attach_dev/detach_dev and the release_device for > s390-iommu at least. Oh, this is a vfio bug. dev->iommu_group is only a valid pointer as long as a driver is attach to the device. vfio copies the dev->iommu_group into struct vfio_group during probe() but then lets vfio_group live independently. Particularly the driver can be removed()'d and the vfio_group keeps on going. Thus it is possible to UAF the iommu_group by referencing it through the vfio_group. We must wait during remove for all the vfio_groups to stop referencing iommu_group. Something like this or so: diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index eb714a484662fc..d8f40b22c98ddb 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -65,7 +65,15 @@ struct vfio_container { struct vfio_group { struct device dev; struct cdev cdev; + /* + * When drivers is non-zero a driver is attached to the struct device + * that provided the iommu_group and thus the iommu_group is a valid + * pointer. When drivers is 0 the driver is being detached. Once users + * reaches 0 then the iommu_group is invalid. + */ + refcount_t drivers; refcount_t users; + struct completion comp; unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; @@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) } EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); -static void vfio_group_get(struct vfio_group *group); - /* * Container objects - containers are created when /dev/vfio/vfio is * opened, but their lifecycle extends until the last user is done, so @@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container) /* * Group objects - create, release, get, put, search */ + + /* + * This returns a driver reference. It can only be used in the probe function + * of a device_driver, eg as part of the internal implementation of + * __vfio_register_dev(). + */ static struct vfio_group * __vfio_group_get_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) { - vfio_group_get(group); + if (group->iommu_group == iommu_group && + refcount_inc_not_zero(&group->drivers)) return group; - } } return NULL; } @@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->users, 1); + refcount_set(&group->drivers, 1); + init_completion(&group->comp); init_rwsem(&group->group_rwsem); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); @@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_group_put(struct vfio_group *group) { - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) - return; + if (refcount_dec_and_test(&group->users)) + complete(&group->comp); +} + +/* + * When the drivers count reaches 0 then the group must be destroyed + * immediately. A zero driver group is a zombie awaiting destruction. + */ +static void vfio_group_remove(struct vfio_group *group) +{ + /* Matches the get from vfio_group_alloc() */ + vfio_group_put(group); + + cdev_device_del(&group->cdev, &group->dev); + + /* + * Before we allow the last driver in the group to be unplugged the + * group must be sanitized so nothing else is or can reference it. This + * is because the group->iommu_group pointer is only valid so long as a + * VFIO device driver is attached to a device belonging to the group. + */ + wait_for_completion(&group->comp); /* * These data structures all have paired operations that can only be @@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group) WARN_ON(!list_empty(&group->device_list)); WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); + group->iommu_group = NULL; + mutex_lock(&vfio.group_lock); list_del(&group->vfio_next); - cdev_device_del(&group->cdev, &group->dev); mutex_unlock(&vfio.group_lock); put_device(&group->dev); } -static void vfio_group_get(struct vfio_group *group) -{ - refcount_inc(&group->users); -} - /* * Device objects - create, release, get, put, search */ @@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device) return refcount_inc_not_zero(&device->refcount); } -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, - struct device *dev) -{ - struct vfio_device *device; - - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && vfio_device_try_get(device)) { - mutex_unlock(&group->device_lock); - return device; - } - } - mutex_unlock(&group->device_lock); - return NULL; -} - /* * VFIO driver API */ @@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) static int __vfio_register_dev(struct vfio_device *device, struct vfio_group *group) { - struct vfio_device *existing_device; - + /* + * In all cases group is the output of one of the group allocation functions + * and we have group->drivers incremetned for us + */ if (IS_ERR(group)) return PTR_ERR(group); @@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device, if (!device->dev_set) vfio_assign_device_set(device, device); - existing_device = vfio_group_get_device(group, device->dev); - if (existing_device) { - dev_WARN(device->dev, "Device already exists on group %d\n", - iommu_group_id(group->iommu_group)); - vfio_device_put(existing_device); - if (group->type == VFIO_NO_IOMMU || - group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - vfio_group_put(group); - return -EBUSY; - } - /* Our reference on group is moved to the device */ device->group = group; @@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); - /* Matches the get in vfio_register_group_dev() */ - vfio_group_put(group); + /* Matches the alloc get in vfio_register_group_dev() */ + if (refcount_dec_and_test(&group->drivers)) + vfio_group_remove(group); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 20:37 ` Jason Gunthorpe @ 2022-09-02 17:11 ` Matthew Rosato 2022-09-02 17:21 ` Jason Gunthorpe 0 siblings, 1 reply; 24+ messages in thread From: Matthew Rosato @ 2022-09-02 17:11 UTC (permalink / raw) To: Jason Gunthorpe Cc: Robin Murphy, iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 9/1/22 4:37 PM, Jason Gunthorpe wrote: > On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: >> On 9/1/22 6:25 AM, Robin Murphy wrote: >>> On 2022-08-31 21:12, Matthew Rosato wrote: >>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>>> domains and the DMA API handling. However, this commit does not >>>> sufficiently handle the case where the device is released via a call >>>> to the release_device op as it may occur at the same time as an opposing >>>> attach_dev or detach_dev since the group mutex is not held over >>>> release_device. This was observed if the device is deconfigured during a >>>> small window during vfio-pci initialization and can result in WARNs and >>>> potential kernel panics. >>> >>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? >>> >>> Robin. >> >> So, I generally have seen the issue manifest as one of the calls >> into the iommu core from __vfio_group_unset_container >> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. >> This happens when the vfio group fd is released, which could be >> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. >> AFAICT there's nothing serializing the notion of calling into the >> iommu core here against a device that is simultaneously going >> through release_device (because we don't enter release_device with >> the group mutex held), resulting in unpredictable behavior between >> the dueling attach_dev/detach_dev and the release_device for >> s390-iommu at least. > > Oh, this is a vfio bug. I've been running with your diff applied today on s390 and this indeed fixes the issue by preventing the detach-after-release coming out of vfio. Can you send as a patch for review? > > dev->iommu_group is only a valid pointer as long as a driver is attach > to the device. > > vfio copies the dev->iommu_group into struct vfio_group during probe() > but then lets vfio_group live independently. Particularly the driver > can be removed()'d and the vfio_group keeps on going. > > Thus it is possible to UAF the iommu_group by referencing it through > the vfio_group. > > We must wait during remove for all the vfio_groups to stop > referencing iommu_group. > > Something like this or so: > > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c > index eb714a484662fc..d8f40b22c98ddb 100644 > --- a/drivers/vfio/vfio_main.c > +++ b/drivers/vfio/vfio_main.c > @@ -65,7 +65,15 @@ struct vfio_container { > struct vfio_group { > struct device dev; > struct cdev cdev; > + /* > + * When drivers is non-zero a driver is attached to the struct device > + * that provided the iommu_group and thus the iommu_group is a valid > + * pointer. When drivers is 0 the driver is being detached. Once users > + * reaches 0 then the iommu_group is invalid. > + */ > + refcount_t drivers; > refcount_t users; > + struct completion comp; > unsigned int container_users; > struct iommu_group *iommu_group; > struct vfio_container *container; > @@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) > } > EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); > > -static void vfio_group_get(struct vfio_group *group); > - > /* > * Container objects - containers are created when /dev/vfio/vfio is > * opened, but their lifecycle extends until the last user is done, so > @@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container) > /* > * Group objects - create, release, get, put, search > */ > + > + /* > + * This returns a driver reference. It can only be used in the probe function > + * of a device_driver, eg as part of the internal implementation of > + * __vfio_register_dev(). > + */ > static struct vfio_group * > __vfio_group_get_from_iommu(struct iommu_group *iommu_group) > { > struct vfio_group *group; > > list_for_each_entry(group, &vfio.group_list, vfio_next) { > - if (group->iommu_group == iommu_group) { > - vfio_group_get(group); > + if (group->iommu_group == iommu_group && > + refcount_inc_not_zero(&group->drivers)) > return group; > - } > } > return NULL; > } > @@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, > group->cdev.owner = THIS_MODULE; > > refcount_set(&group->users, 1); > + refcount_set(&group->drivers, 1); > + init_completion(&group->comp); > init_rwsem(&group->group_rwsem); > INIT_LIST_HEAD(&group->device_list); > mutex_init(&group->device_lock); > @@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, > > static void vfio_group_put(struct vfio_group *group) > { > - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) > - return; > + if (refcount_dec_and_test(&group->users)) > + complete(&group->comp); > +} > + > +/* > + * When the drivers count reaches 0 then the group must be destroyed > + * immediately. A zero driver group is a zombie awaiting destruction. > + */ > +static void vfio_group_remove(struct vfio_group *group) > +{ > + /* Matches the get from vfio_group_alloc() */ > + vfio_group_put(group); > + > + cdev_device_del(&group->cdev, &group->dev); > + > + /* > + * Before we allow the last driver in the group to be unplugged the > + * group must be sanitized so nothing else is or can reference it. This > + * is because the group->iommu_group pointer is only valid so long as a > + * VFIO device driver is attached to a device belonging to the group. > + */ > + wait_for_completion(&group->comp); > > /* > * These data structures all have paired operations that can only be > @@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group) > WARN_ON(!list_empty(&group->device_list)); > WARN_ON(group->container || group->container_users); > WARN_ON(group->notifier.head); > + group->iommu_group = NULL; > > + mutex_lock(&vfio.group_lock); > list_del(&group->vfio_next); > - cdev_device_del(&group->cdev, &group->dev); > mutex_unlock(&vfio.group_lock); > > put_device(&group->dev); > } > > -static void vfio_group_get(struct vfio_group *group) > -{ > - refcount_inc(&group->users); > -} > - > /* > * Device objects - create, release, get, put, search > */ > @@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device) > return refcount_inc_not_zero(&device->refcount); > } > > -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, > - struct device *dev) > -{ > - struct vfio_device *device; > - > - mutex_lock(&group->device_lock); > - list_for_each_entry(device, &group->device_list, group_next) { > - if (device->dev == dev && vfio_device_try_get(device)) { > - mutex_unlock(&group->device_lock); > - return device; > - } > - } > - mutex_unlock(&group->device_lock); > - return NULL; > -} > - > /* > * VFIO driver API > */ > @@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) > static int __vfio_register_dev(struct vfio_device *device, > struct vfio_group *group) > { > - struct vfio_device *existing_device; > - > + /* > + * In all cases group is the output of one of the group allocation functions > + * and we have group->drivers incremetned for us > + */ > if (IS_ERR(group)) > return PTR_ERR(group); > > @@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device, > if (!device->dev_set) > vfio_assign_device_set(device, device); > > - existing_device = vfio_group_get_device(group, device->dev); > - if (existing_device) { > - dev_WARN(device->dev, "Device already exists on group %d\n", > - iommu_group_id(group->iommu_group)); > - vfio_device_put(existing_device); > - if (group->type == VFIO_NO_IOMMU || > - group->type == VFIO_EMULATED_IOMMU) > - iommu_group_remove_device(device->dev); > - vfio_group_put(group); > - return -EBUSY; > - } > - > /* Our reference on group is moved to the device */ > device->group = group; > > @@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) > if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) > iommu_group_remove_device(device->dev); > > - /* Matches the get in vfio_register_group_dev() */ > - vfio_group_put(group); > + /* Matches the alloc get in vfio_register_group_dev() */ > + if (refcount_dec_and_test(&group->drivers)) > + vfio_group_remove(group); > } > EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-02 17:11 ` Matthew Rosato @ 2022-09-02 17:21 ` Jason Gunthorpe 2022-09-02 18:20 ` Matthew Rosato 2022-09-05 9:46 ` Robin Murphy 0 siblings, 2 replies; 24+ messages in thread From: Jason Gunthorpe @ 2022-09-02 17:21 UTC (permalink / raw) To: Matthew Rosato Cc: Robin Murphy, iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On Fri, Sep 02, 2022 at 01:11:09PM -0400, Matthew Rosato wrote: > On 9/1/22 4:37 PM, Jason Gunthorpe wrote: > > On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: > >> On 9/1/22 6:25 AM, Robin Murphy wrote: > >>> On 2022-08-31 21:12, Matthew Rosato wrote: > >>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev > >>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU > >>>> domains and the DMA API handling. However, this commit does not > >>>> sufficiently handle the case where the device is released via a call > >>>> to the release_device op as it may occur at the same time as an opposing > >>>> attach_dev or detach_dev since the group mutex is not held over > >>>> release_device. This was observed if the device is deconfigured during a > >>>> small window during vfio-pci initialization and can result in WARNs and > >>>> potential kernel panics. > >>> > >>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? > >>> > >>> Robin. > >> > >> So, I generally have seen the issue manifest as one of the calls > >> into the iommu core from __vfio_group_unset_container > >> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. > >> This happens when the vfio group fd is released, which could be > >> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. > >> AFAICT there's nothing serializing the notion of calling into the > >> iommu core here against a device that is simultaneously going > >> through release_device (because we don't enter release_device with > >> the group mutex held), resulting in unpredictable behavior between > >> the dueling attach_dev/detach_dev and the release_device for > >> s390-iommu at least. > > > > Oh, this is a vfio bug. > > I've been running with your diff applied today on s390 and this > indeed fixes the issue by preventing the detach-after-release coming > out of vfio. Heh, I'm shocked it worked at all I've been trying to understand Robin's latest remarks because maybe I don't really understand your situation right. IMHO this is definately a VFIO bug, because in a single-device group we must not allow the domain to remain attached past remove(). Or more broadly we shouldn't be holding ownership of a group without also having a driver attached. But this dicussion with Robin about multi-device groups and hotplug makes me wonder what your situation is? There is certainly something interesting there too, and this can't be a solution to that problem. > Can you send as a patch for review? After I wrote this I had a better idea, to avoid the completion and just fully orphan the group fd. And the patch is kind of messy Can you forward me the backtrace you hit also? (Though I'm not sure I can get to this promptly, I have only 4 working days before LPC and still many things to do) Thanks, Jason ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-02 17:21 ` Jason Gunthorpe @ 2022-09-02 18:20 ` Matthew Rosato 2022-09-05 9:46 ` Robin Murphy 1 sibling, 0 replies; 24+ messages in thread From: Matthew Rosato @ 2022-09-02 18:20 UTC (permalink / raw) To: Jason Gunthorpe Cc: Robin Murphy, iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 9/2/22 1:21 PM, Jason Gunthorpe wrote: > On Fri, Sep 02, 2022 at 01:11:09PM -0400, Matthew Rosato wrote: >> On 9/1/22 4:37 PM, Jason Gunthorpe wrote: >>> On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: >>>> On 9/1/22 6:25 AM, Robin Murphy wrote: >>>>> On 2022-08-31 21:12, Matthew Rosato wrote: >>>>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>>>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>>>>> domains and the DMA API handling. However, this commit does not >>>>>> sufficiently handle the case where the device is released via a call >>>>>> to the release_device op as it may occur at the same time as an opposing >>>>>> attach_dev or detach_dev since the group mutex is not held over >>>>>> release_device. This was observed if the device is deconfigured during a >>>>>> small window during vfio-pci initialization and can result in WARNs and >>>>>> potential kernel panics. >>>>> >>>>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? >>>>> >>>>> Robin. >>>> >>>> So, I generally have seen the issue manifest as one of the calls >>>> into the iommu core from __vfio_group_unset_container >>>> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. >>>> This happens when the vfio group fd is released, which could be >>>> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. >>>> AFAICT there's nothing serializing the notion of calling into the >>>> iommu core here against a device that is simultaneously going >>>> through release_device (because we don't enter release_device with >>>> the group mutex held), resulting in unpredictable behavior between >>>> the dueling attach_dev/detach_dev and the release_device for >>>> s390-iommu at least. >>> >>> Oh, this is a vfio bug. >> >> I've been running with your diff applied today on s390 and this >> indeed fixes the issue by preventing the detach-after-release coming >> out of vfio. > > Heh, I'm shocked it worked at all > > I've been trying to understand Robin's latest remarks because maybe I > don't really understand your situation right. > > IMHO this is definately a VFIO bug, because in a single-device group > we must not allow the domain to remain attached past remove(). Or more > broadly we shouldn't be holding ownership of a group without also > having a driver attached.> > But this dicussion with Robin about multi-device groups and hotplug > makes me wonder what your situation is? There is certainly something > interesting there too, and this can't be a solution to that problem. > It's just a single-device group, that's all we do in s390 today. Of course, as we move forward to consume s390-iommu for other use-cases (e.g. Niklas DMA conversion) then yeah I think we will still need something within s390-iommu to handle competing e.g. attach/detach and release_device calls. For this particular issue, the scenario is triggered by deconfiguring that one host device (effectively, pull the plug) while the device is passed to QEMU via vfio-pci. >> Can you send as a patch for review? > > After I wrote this I had a better idea, to avoid the completion and > just fully orphan the group fd. > > And the patch is kind of messy > > Can you forward me the backtrace you hit also? It manifests in a few different ways (depends on the timing of the iommu_detach_group vs release_device), but this is by far the most common: [ 402.718355] iommu driver failed to attach the default/blocking domain [ 402.718380] WARNING: CPU: 0 PID: 5082 at drivers/iommu/iommu.c:1961 iommu_detach_group+0x6c/0x80 [ 402.718389] Modules linked in: macvtap macvlan tap vfio_pci vfio_pci_core irqbypass vfio_virqfd kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink mlx5_ib sunrpc ib_uverbs ism smc uvdevice ib_core s390_trng eadm_sch tape_3590 tape tape_class vfio_ccw mdev vfio_iommu_type1 vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 mlx5_core des_s390 libdes sha3_512_s390 nvme sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common nvme_core zfcp scsi_transport_fc pkey zcrypt rng_core autofs4 [ 402.718439] CPU: 0 PID: 5082 Comm: qemu-system-s39 Tainted: G W 6.0.0-rc3 #5 [ 402.718442] Hardware name: IBM 3931 A01 782 (LPAR) [ 402.718443] Krnl PSW : 0704c00180000000 000000095bb10d28 (iommu_detach_group+0x70/0x80) [ 402.718447] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 [ 402.718450] Krnl GPRS: 0000000000000001 0000000900000027 0000000000000039 000000095c97ffe0 [ 402.718453] 00000000fffeffff 00000009fc290000 00000000af1fda50 00000000af590b58 [ 402.718455] 00000000af1fdaf0 0000000135c7a320 0000000135e52258 0000000135e52200 [ 402.718457] 00000000a29e8000 00000000af590b40 000000095bb10d24 0000038004b13c98 [ 402.718464] Krnl Code: 000000095bb10d18: c020003d56fc larl %r2,000000095c2bbb10 000000095bb10d1e: c0e50019d901 brasl %r14,000000095be4bf20 #000000095bb10d24: af000000 mc 0,0 >000000095bb10d28: b904002a lgr %r2,%r10 000000095bb10d2c: ebaff0a00004 lmg %r10,%r15,160(%r15) 000000095bb10d32: c0f4001aa867 brcl 15,000000095be65e00 000000095bb10d38: c004002168e0 brcl 0,000000095bf3def8 000000095bb10d3e: eb6ff0480024 stmg %r6,%r15,72(%r15) [ 402.718532] Call Trace: [ 402.718534] [<000000095bb10d28>] iommu_detach_group+0x70/0x80 [ 402.718538] ([<000000095bb10d24>] iommu_detach_group+0x6c/0x80) [ 402.718540] [<000003ff80243b0e>] vfio_iommu_type1_detach_group+0x136/0x6c8 [vfio_iommu_type1] [ 402.718546] [<000003ff80137780>] __vfio_group_unset_container+0x58/0x158 [vfio] [ 402.718552] [<000003ff80138a16>] vfio_group_fops_unl_ioctl+0x1b6/0x210 [vfio] [ 402.718557] pci 0004:00:00.0: Removing from iommu group 4 [ 402.718555] [<000000095b5b62e8>] __s390x_sys_ioctl+0xc0/0x100 [ 402.718562] [<000000095be5d3b4>] __do_syscall+0x1d4/0x200 [ 402.718566] [<000000095be6c072>] system_call+0x82/0xb0 [ 402.718570] Last Breaking-Event-Address: [ 402.718571] [<000000095be4bf80>] __warn_printk+0x60/0x68 The WARN is hit because the attach fails due to the simultaneously in-flight release_device. The subject patch was basically attempting to serialize the value that was being stomped over within s390-iommu (zdev->s390_domain) as a result of the competing iommu_detach_group + release_device. By ensuring the vfio group is cleaned up before the release, I no longer see the competing threads, with release_device happening after the group is cleaned up. > > (Though I'm not sure I can get to this promptly, I have only 4 working > days before LPC and still many things to do) > > Thanks, > Jason ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-02 17:21 ` Jason Gunthorpe 2022-09-02 18:20 ` Matthew Rosato @ 2022-09-05 9:46 ` Robin Murphy 2022-09-06 13:36 ` Jason Gunthorpe 1 sibling, 1 reply; 24+ messages in thread From: Robin Murphy @ 2022-09-05 9:46 UTC (permalink / raw) To: Jason Gunthorpe, Matthew Rosato Cc: iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 2022-09-02 18:21, Jason Gunthorpe wrote: > On Fri, Sep 02, 2022 at 01:11:09PM -0400, Matthew Rosato wrote: >> On 9/1/22 4:37 PM, Jason Gunthorpe wrote: >>> On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote: >>>> On 9/1/22 6:25 AM, Robin Murphy wrote: >>>>> On 2022-08-31 21:12, Matthew Rosato wrote: >>>>>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>>>>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>>>>> domains and the DMA API handling. However, this commit does not >>>>>> sufficiently handle the case where the device is released via a call >>>>>> to the release_device op as it may occur at the same time as an opposing >>>>>> attach_dev or detach_dev since the group mutex is not held over >>>>>> release_device. This was observed if the device is deconfigured during a >>>>>> small window during vfio-pci initialization and can result in WARNs and >>>>>> potential kernel panics. >>>>> >>>>> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? >>>>> >>>>> Robin. >>>> >>>> So, I generally have seen the issue manifest as one of the calls >>>> into the iommu core from __vfio_group_unset_container >>>> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. >>>> This happens when the vfio group fd is released, which could be >>>> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. >>>> AFAICT there's nothing serializing the notion of calling into the >>>> iommu core here against a device that is simultaneously going >>>> through release_device (because we don't enter release_device with >>>> the group mutex held), resulting in unpredictable behavior between >>>> the dueling attach_dev/detach_dev and the release_device for >>>> s390-iommu at least. >>> >>> Oh, this is a vfio bug. >> >> I've been running with your diff applied today on s390 and this >> indeed fixes the issue by preventing the detach-after-release coming >> out of vfio. > > Heh, I'm shocked it worked at all > > I've been trying to understand Robin's latest remarks because maybe I > don't really understand your situation right. That was really just me thinking out loud to guess at how it must be happening - I wasn't sure whether VFIO is actually intended to allow that or not, so if not then by all means let's look at fixing that, but as I say I think we're only seeing it provoke a problem at the driver level because of 9ac8545199a1, and fixing VFIO doesn't fix that in general. And conversely if we *can* fix that properly at the IOMMU API level then the current VFIO behaviour should become benign again anyway. > IMHO this is definately a VFIO bug, because in a single-device group > we must not allow the domain to remain attached past remove(). Or more > broadly we shouldn't be holding ownership of a group without also > having a driver attached. FWIW I was assuming it might be fine for a VFIO user to hold the group open if they expect the device to come back again and re-bind (for example, perhaps over some reconfiguration that requires turning SR-IOV off and on again?) Cheers, Robin. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-05 9:46 ` Robin Murphy @ 2022-09-06 13:36 ` Jason Gunthorpe 0 siblings, 0 replies; 24+ messages in thread From: Jason Gunthorpe @ 2022-09-06 13:36 UTC (permalink / raw) To: Robin Murphy Cc: Matthew Rosato, iommu, Alex Williamson, linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On Mon, Sep 05, 2022 at 10:46:44AM +0100, Robin Murphy wrote: > > I've been trying to understand Robin's latest remarks because maybe I > > don't really understand your situation right. > > That was really just me thinking out loud to guess at how it must be > happening - I wasn't sure whether VFIO is actually intended to allow that or > not, so if not then by all means let's look at fixing that, but as I say I > think we're only seeing it provoke a problem at the driver level because of > 9ac8545199a1, and fixing VFIO doesn't fix that in general. And conversely if > we *can* fix that properly at the IOMMU API level then the current VFIO > behaviour should become benign again anyway. Okay, so there are probably other problems here that highlighted this.. > > IMHO this is definately a VFIO bug, because in a single-device group > > we must not allow the domain to remain attached past remove(). Or more > > broadly we shouldn't be holding ownership of a group without also > > having a driver attached. > > FWIW I was assuming it might be fine for a VFIO user to hold the group open > if they expect the device to come back again and re-bind (for example, > perhaps over some reconfiguration that requires turning SR-IOV off and on > again?) Once all the devices in the group are removed then something like pci_device_group() will have no way to discover the group again. eg in the SRIOV case it will just fall right down to iommu_group_alloc(), and that gives a new struct iommu_group and new IDR allocation. So in the general case this doesn't happen, I don't think any VFIO userspace should attempt to rely on it. From an API perspective is a much saner API toward iommu using drivers like VFIO if those drivers only use the iommu api while they have a device driver attached. Regards, Jason ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops 2022-09-01 16:14 ` Matthew Rosato 2022-09-01 20:37 ` Jason Gunthorpe @ 2022-09-02 10:48 ` Robin Murphy 1 sibling, 0 replies; 24+ messages in thread From: Robin Murphy @ 2022-09-02 10:48 UTC (permalink / raw) To: Matthew Rosato, iommu, jgg, Alex Williamson Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, linux-kernel On 2022-09-01 17:14, Matthew Rosato wrote: > On 9/1/22 6:25 AM, Robin Murphy wrote: >> On 2022-08-31 21:12, Matthew Rosato wrote: >>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev >>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU >>> domains and the DMA API handling. However, this commit does not >>> sufficiently handle the case where the device is released via a call >>> to the release_device op as it may occur at the same time as an opposing >>> attach_dev or detach_dev since the group mutex is not held over >>> release_device. This was observed if the device is deconfigured during a >>> small window during vfio-pci initialization and can result in WARNs and >>> potential kernel panics. >> >> Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all? >> >> Robin. > > So, I generally have seen the issue manifest as one of the calls into the iommu core from __vfio_group_unset_container (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN. This happens when the vfio group fd is released, which could be coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER. AFAICT there's nothing serializing the notion of calling into the iommu core here against a device that is simultaneously going through release_device (because we don't enter release_device with the group mutex held), resulting in unpredictable behavior between the dueling attach_dev/detach_dev and the release_device for s390-iommu at least. Oh, of course, I keep forgetting the way VFIO works with groups, so I guess as long as the group contained a VFIO device and passed the viability check at some point in the past, the VFIO user can then hold the fd open and continue to monkey about with it even after the device itself has unbound from vfio-pci. Plus there's also the obvious case of a device being hotplugged out of a group that still contains other live devices, so yes, this seems entirely normal and expected, derp. From a quick skim through the current mess, I think that 9ac8545199a1 is the culprit here, and has probably broken at least as much as it fixed - AFAICS it's still possible for release_device to race with a detach/domain_free and potentially lead to another use-after-free (or double-detach), it's just less likely than the previous case. Removing the device from the group *before* calling release_device is certainly more consistent with the other paths where release_device can be called with the device group-less, and is also symmetrical with probe_device being called before the device is first added... It would almost work if iommu_group_remove_device() captured the current domain at the point the device is removed from the group list, then passed that to the release_device callback directly. Except we still can't guarantee that that domain won't suddenly disappear, so those drivers would need to synchronise between their domain_free and release_device in some form, ho hum... Alternatively, a really sly way around the whole thing would be to find some pretence for doing an iommu_group_for_each_dev() in your release_device, but people will probably throw things at me for suggesting that :) Robin. >>> Handle this by tracking when the device is probed/released via >>> dev_iommu_priv_set/get(). Ensure that once the device is released only >>> release_device handles the re-init of the device DMA. >>> >>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") >>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> >>> --- >>> arch/s390/include/asm/pci.h | 1 + >>> arch/s390/pci/pci.c | 1 + >>> drivers/iommu/s390-iommu.c | 39 ++++++++++++++++++++++++++++++++++--- >>> 3 files changed, 38 insertions(+), 3 deletions(-) >>> >>> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h >>> index 7b4cdadbc023..080251e7b275 100644 >>> --- a/arch/s390/include/asm/pci.h >>> +++ b/arch/s390/include/asm/pci.h >>> @@ -157,6 +157,7 @@ struct zpci_dev { >>> /* DMA stuff */ >>> unsigned long *dma_table; >>> spinlock_t dma_table_lock; >>> + struct mutex dma_domain_lock; /* protects s390_domain value */ >>> int tlb_refresh; >>> spinlock_t iommu_bitmap_lock; >>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c >>> index 73cdc5539384..973edd32ecc9 100644 >>> --- a/arch/s390/pci/pci.c >>> +++ b/arch/s390/pci/pci.c >>> @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) >>> kref_init(&zdev->kref); >>> mutex_init(&zdev->lock); >>> mutex_init(&zdev->kzdev_lock); >>> + mutex_init(&zdev->dma_domain_lock); >>> rc = zpci_init_iommu(zdev); >>> if (rc) >>> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c >>> index c898bcbbce11..1137d669e849 100644 >>> --- a/drivers/iommu/s390-iommu.c >>> +++ b/drivers/iommu/s390-iommu.c >>> @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >>> if (!domain_device) >>> return -ENOMEM; >>> + /* Leave now if the device has already been released */ >>> + mutex_lock(&zdev->dma_domain_lock); >>> + if (!dev_iommu_priv_get(dev)) { >>> + mutex_unlock(&zdev->dma_domain_lock); >>> + kfree(domain_device); >>> + return 0; >>> + } >>> + >>> if (zdev->dma_table && !zdev->s390_domain) { >>> cc = zpci_dma_exit_device(zdev); >>> if (cc) { >>> @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >>> goto out_restore; >>> } >>> domain_device->zdev = zdev; >>> - zdev->s390_domain = s390_domain; >>> list_add(&domain_device->list, &s390_domain->devices); >>> spin_unlock_irqrestore(&s390_domain->list_lock, flags); >>> + zdev->s390_domain = s390_domain; >>> + mutex_unlock(&zdev->dma_domain_lock); >>> return 0; >>> @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, >>> virt_to_phys(zdev->dma_table)); >>> } >>> out_free: >>> + mutex_unlock(&zdev->dma_domain_lock); >>> kfree(domain_device); >>> return rc; >>> @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, >>> } >>> spin_unlock_irqrestore(&s390_domain->list_lock, flags); >>> - if (found && (zdev->s390_domain == s390_domain)) { >>> + mutex_lock(&zdev->dma_domain_lock); >>> + if (found && (zdev->s390_domain == s390_domain) && >>> + dev_iommu_priv_get(dev)) { >>> zdev->s390_domain = NULL; >>> zpci_unregister_ioat(zdev, 0); >>> zpci_dma_init_device(zdev); >>> } >>> + mutex_unlock(&zdev->dma_domain_lock); >>> } >>> static struct iommu_device *s390_iommu_probe_device(struct device *dev) >>> { >>> struct zpci_dev *zdev = to_zpci_dev(dev); >>> + dev_iommu_priv_set(dev, zdev); >>> + >>> return &zdev->iommu_dev; >>> } >>> @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev) >>> * >>> * So let's call detach_dev from here if it hasn't been called before. >>> */ >>> - if (zdev && zdev->s390_domain) { >>> + if (zdev) { >>> + /* >>> + * Clear priv to block further attaches for this device, >>> + * ensure detaches don't init DMA. Hold the domain lock >>> + * to ensure that attach/detach get a consistent view of >>> + * whether or not the device is released. >>> + */ >>> + mutex_lock(&zdev->dma_domain_lock); >>> + dev_iommu_priv_set(dev, NULL); >>> + mutex_unlock(&zdev->dma_domain_lock); >>> + /* Make sure this device is removed from the domain list */ >>> domain = iommu_get_domain_for_dev(dev); >>> if (domain) >>> s390_iommu_detach_device(domain, dev); >>> + /* Now ensure DMA is initialized from here */ >>> + mutex_lock(&zdev->dma_domain_lock); >>> + if (zdev->s390_domain) { >>> + zdev->s390_domain = NULL; >>> + zpci_unregister_ioat(zdev, 0); >>> + zpci_dma_init_device(zdev); >>> + } >>> + mutex_unlock(&zdev->dma_domain_lock); >>> } >>> } >>> > ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v4 2/2] iommu/s390: fix leak of s390_domain_device 2022-08-31 20:12 [PATCH v4 0/2] iommu/s390: fixes related to repeat attach_dev calls Matthew Rosato 2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato @ 2022-08-31 20:12 ` Matthew Rosato 1 sibling, 0 replies; 24+ messages in thread From: Matthew Rosato @ 2022-08-31 20:12 UTC (permalink / raw) To: iommu Cc: linux-s390, schnelle, pmorel, borntraeger, hca, gor, gerald.schaefer, agordeev, svens, joro, will, robin.murphy, jgg, linux-kernel Since allowing multiple domains to be attached via fa7e9ecc5e1c, it's now possible that a kfree of s390_domain_device is missed, either because a corresponding detach_dev was never called or because a repeat attach_dev was called for the same device and domain pair (resulting in unnecessary duplicates). Check for duplicates during attach_dev and ensure the list of s390_domain_device structures is cleared up when the domain is freed. Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com> --- drivers/iommu/s390-iommu.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 1137d669e849..db4dfbcf161b 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -78,7 +78,17 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type) static void s390_domain_free(struct iommu_domain *domain) { struct s390_domain *s390_domain = to_s390_domain(domain); + struct s390_domain_device *domain_device, *tmp; + unsigned long flags; + /* Ensure all device entries are cleaned up */ + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices, + list) { + list_del(&domain_device->list); + kfree(domain_device); + } + spin_unlock_irqrestore(&s390_domain->list_lock, flags); dma_cleanup_tables(s390_domain->dma_table); kfree(s390_domain); } @@ -88,7 +98,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); - struct s390_domain_device *domain_device; + struct s390_domain_device *domain_device, *ddev; unsigned long flags; int cc, rc; @@ -140,7 +150,16 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, goto out_restore; } domain_device->zdev = zdev; - list_add(&domain_device->list, &s390_domain->devices); + /* If already attached don't add another instance */ + list_for_each_entry(ddev, &s390_domain->devices, list) { + if (ddev->zdev == zdev) { + kfree(domain_device); + domain_device = NULL; + break; + } + } + if (domain_device) + list_add(&domain_device->list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); zdev->s390_domain = s390_domain; mutex_unlock(&zdev->dma_domain_lock); -- 2.37.2 ^ permalink raw reply related [flat|nested] 24+ messages in thread
end of thread, other threads:[~2022-09-06 13:45 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-08-31 20:12 [PATCH v4 0/2] iommu/s390: fixes related to repeat attach_dev calls Matthew Rosato 2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato 2022-09-01 7:56 ` Pierre Morel 2022-09-01 9:37 ` Niklas Schnelle 2022-09-01 11:01 ` Robin Murphy 2022-09-01 13:42 ` Niklas Schnelle 2022-09-01 14:17 ` Niklas Schnelle 2022-09-01 14:29 ` Robin Murphy 2022-09-01 14:34 ` Jason Gunthorpe 2022-09-01 15:03 ` Robin Murphy 2022-09-01 15:49 ` Jason Gunthorpe 2022-09-01 17:00 ` Robin Murphy 2022-09-01 20:28 ` Matthew Rosato 2022-09-02 7:49 ` Niklas Schnelle 2022-09-01 10:25 ` Robin Murphy 2022-09-01 16:14 ` Matthew Rosato 2022-09-01 20:37 ` Jason Gunthorpe 2022-09-02 17:11 ` Matthew Rosato 2022-09-02 17:21 ` Jason Gunthorpe 2022-09-02 18:20 ` Matthew Rosato 2022-09-05 9:46 ` Robin Murphy 2022-09-06 13:36 ` Jason Gunthorpe 2022-09-02 10:48 ` Robin Murphy 2022-08-31 20:12 ` [PATCH v4 2/2] iommu/s390: fix leak of s390_domain_device Matthew Rosato
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.