linux-kernel.vger.kernel.org archive mirror
* [RFC PATCH v2 0/2] Reduce IOTLB flush when pass-through dGPU devices
@ 2017-12-27  9:20 Suravee Suthikulpanit
  2017-12-27  9:20 ` [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs Suravee Suthikulpanit
  2017-12-27  9:20 ` [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing Suravee Suthikulpanit
  0 siblings, 2 replies; 10+ messages in thread
From: Suravee Suthikulpanit @ 2017-12-27  9:20 UTC (permalink / raw)
  To: linux-kernel, iommu; +Cc: joro, jroedel, alex.williamson, Suravee Suthikulpanit

From: Suravee Suthikulpanit <ssuthiku@redhat.com>

Currently, when passing through a dGPU to a guest VM, there are thousands
of IOTLB flush commands sent from the IOMMU to the end-point device. This causes
a performance issue when launching new VMs, and could cause IOTLB invalidation
time-out issues on certain dGPUs.

This can be avoided by adopting the new fast IOTLB flush APIs.
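
For reference, the usage pattern of the fast flush interface (as adopted by
patch 1/2 below) looks roughly like the sketch here; the function name and its
arguments are made up for illustration, the iommu_unmap_fast(),
iommu_tlb_range_add() and iommu_tlb_sync() helpers are the new IOMMU API:

static void example_unmap_deferred(struct iommu_domain *dom,
				   unsigned long iova, unsigned long end)
{
	size_t unmapped;

	while (iova < end) {
		/* Unmap without flushing the IOTLB on every call ... */
		unmapped = iommu_unmap_fast(dom, iova, end - iova);
		if (!unmapped)
			break;
		/* ... queue the unmapped range for invalidation ... */
		iommu_tlb_range_add(dom, iova, unmapped);
		iova += unmapped;
	}
	/* ... and issue a single IOTLB flush for everything queued. */
	iommu_tlb_sync(dom);
}

The existing path calls iommu_unmap(), which flushes the IOTLB (and sends the
corresponding invalidations to the device) on every call.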

Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Joerg Roedel <jroedel@suse.de>

Changes from V1: (https://lkml.org/lkml/2017/11/17/764)

  * Rebased on top of v4.15-rc5

  * Patch 1/2: Fix iommu_tlb_range_add() size parameter to use unmapped
    instead of len. (per Alex)

  * Patch 1/2: Use a list to keep track of unmapped IOVAs for VFIO remote
    unpinning. However, I am still not sure whether using a list is the best
    way to keep track of the IOVAs. (per Alex)

  * Patch 2/2: Fix logic due to missing spin unlock. (per Tom)
     
Suravee Suthikulpanit (2):
  vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  iommu/amd: Add support for fast IOTLB flushing

 drivers/iommu/amd_iommu.c       | 73 ++++++++++++++++++++++++++++++++-
 drivers/iommu/amd_iommu_init.c  |  7 ++++
 drivers/iommu/amd_iommu_types.h |  7 ++++
 drivers/vfio/vfio_iommu_type1.c | 89 +++++++++++++++++++++++++++++++++++------
 4 files changed, 163 insertions(+), 13 deletions(-)

-- 
1.8.3.1


* [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2017-12-27  9:20 [RFC PATCH v2 0/2] Reduce IOTLB flush when pass-through dGPU devices Suravee Suthikulpanit
@ 2017-12-27  9:20 ` Suravee Suthikulpanit
  2018-01-08 20:53   ` Alex Williamson
  2017-12-27  9:20 ` [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing Suravee Suthikulpanit
  1 sibling, 1 reply; 10+ messages in thread
From: Suravee Suthikulpanit @ 2017-12-27  9:20 UTC (permalink / raw)
  To: linux-kernel, iommu; +Cc: joro, jroedel, alex.williamson, Suravee Suthikulpanit

VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which requires
an IOTLB flush for every unmapping. This results in large IOTLB flushing
overhead when handling pass-through devices that have a large number of mapped
IOVAs.

This can be avoided by using the new IOTLB flushing interface.

Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/vfio/vfio_iommu_type1.c | 89 +++++++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..f000844 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -102,6 +102,13 @@ struct vfio_pfn {
 	atomic_t		ref_count;
 };
 
+struct vfio_regions{
+	struct list_head list;
+	dma_addr_t iova;
+	phys_addr_t phys;
+	size_t len;
+};
+
 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
 					(!list_empty(&iommu->domain_list))
 
@@ -479,6 +486,40 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 	return unlocked;
 }
 
+/*
+ * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
+ * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
+ * track of these regions (currently using a list).
+ *
+ * This specifies the maximum number of regions for each IOTLB flush sync.
+ */
+#define VFIO_IOMMU_TLB_SYNC_MAX		512
+
+static long vfio_sync_and_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
+				struct list_head *regions, bool do_accounting)
+{
+	long unlocked = 0;
+	struct vfio_regions *entry, *next;
+
+	iommu_tlb_sync(domain->domain);
+
+	list_for_each_entry_safe(entry, next, regions, list) {
+		unlocked += vfio_unpin_pages_remote(dma,
+						    entry->iova,
+						    entry->phys >> PAGE_SHIFT,
+						    entry->len >> PAGE_SHIFT,
+						    false);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	if (do_accounting) {
+		vfio_lock_acct(dma->task, -unlocked, NULL);
+		return 0;
+	}
+	return unlocked;
+}
+
 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 				  unsigned long *pfn_base, bool do_accounting)
 {
@@ -653,7 +694,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 {
 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 	struct vfio_domain *domain, *d;
+	struct list_head unmapped_regions;
+	struct vfio_regions *entry;
 	long unlocked = 0;
+	int cnt = 0;
 
 	if (!dma->size)
 		return 0;
@@ -661,6 +705,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
 		return 0;
 
+	INIT_LIST_HEAD(&unmapped_regions);
+
 	/*
 	 * We use the IOMMU to track the physical addresses, otherwise we'd
 	 * need a much more complicated tracking system.  Unfortunately that
@@ -698,24 +744,36 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 				break;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, len);
-		if (WARN_ON(!unmapped))
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry)
 			break;
 
-		unlocked += vfio_unpin_pages_remote(dma, iova,
-						    phys >> PAGE_SHIFT,
-						    unmapped >> PAGE_SHIFT,
-						    false);
+		unmapped = iommu_unmap_fast(domain->domain, iova, len);
+		if (WARN_ON(!unmapped)) {
+			kfree(entry);
+			break;
+		}
+
+		iommu_tlb_range_add(domain->domain, iova, unmapped);
+		entry->iova = iova;
+		entry->phys = phys;
+		entry->len  = unmapped;
+		list_add_tail(&entry->list, &unmapped_regions);
+		cnt++;
 		iova += unmapped;
 
+		if (cnt >= VFIO_IOMMU_TLB_SYNC_MAX) {
+			unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
+							do_accounting);
+			cnt = 0;
+		}
 		cond_resched();
 	}
 
+	if (cnt)
+		unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
+						do_accounting);
 	dma->iommu_mapped = false;
-	if (do_accounting) {
-		vfio_lock_acct(dma->task, -unlocked, NULL);
-		return 0;
-	}
 	return unlocked;
 }
 
@@ -878,6 +936,7 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 {
 	long i;
 	int ret = 0;
+	size_t unmapped = 0;
 
 	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 		ret = iommu_map(domain->domain, iova,
@@ -887,8 +946,14 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
 			break;
 	}
 
-	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
-		iommu_unmap(domain->domain, iova, PAGE_SIZE);
+	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
+		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
+		if (WARN_ON(!unmapped))
+			break;
+		iommu_tlb_range_add(domain->domain, iova, unmapped);
+	}
+	if (unmapped)
+		iommu_tlb_sync(domain->domain);
 
 	return ret;
 }
-- 
1.8.3.1


* [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing
  2017-12-27  9:20 [RFC PATCH v2 0/2] Reduce IOTLB flush when pass-through dGPU devices Suravee Suthikulpanit
  2017-12-27  9:20 ` [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs Suravee Suthikulpanit
@ 2017-12-27  9:20 ` Suravee Suthikulpanit
  2018-01-22  3:41   ` Suravee Suthikulpanit
  2018-01-24 11:43   ` Suravee Suthikulpanit
  1 sibling, 2 replies; 10+ messages in thread
From: Suravee Suthikulpanit @ 2017-12-27  9:20 UTC (permalink / raw)
  To: linux-kernel, iommu; +Cc: joro, jroedel, alex.williamson, Suravee Suthikulpanit

Implement the newly added IOTLB flushing interface for AMD IOMMU.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd_iommu.c       | 73 ++++++++++++++++++++++++++++++++++++++++-
 drivers/iommu/amd_iommu_init.c  |  7 ++++
 drivers/iommu/amd_iommu_types.h |  7 ++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 7d5eb00..42fe365 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -129,6 +129,12 @@ struct dma_ops_domain {
 static struct iova_domain reserved_iova_ranges;
 static struct lock_class_key reserved_rbtree_key;
 
+struct amd_iommu_flush_entries {
+	struct list_head list;
+	unsigned long iova;
+	size_t size;
+};
+
 /****************************************************************************
  *
  * Helper functions
@@ -3043,7 +3049,6 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
 	unmap_size = iommu_unmap_page(domain, iova, page_size);
 	mutex_unlock(&domain->api_lock);
 
-	domain_flush_tlb_pde(domain);
 	domain_flush_complete(domain);
 
 	return unmap_size;
@@ -3163,6 +3168,69 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
 	return dev_data->defer_attach;
 }
 
+static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
+{
+	struct protection_domain *dom = to_pdomain(domain);
+
+	domain_flush_tlb_pde(dom);
+}
+
+static void amd_iommu_iotlb_range_add(struct iommu_domain *domain,
+				      unsigned long iova, size_t size)
+{
+	struct amd_iommu_flush_entries *entry, *p;
+	unsigned long flags;
+	bool found = false;
+
+	spin_lock_irqsave(&amd_iommu_flush_list_lock, flags);
+	list_for_each_entry(p, &amd_iommu_flush_list, list) {
+		if (iova != p->iova)
+			continue;
+
+		if (size > p->size) {
+			p->size = size;
+			pr_debug("%s: update range: iova=%#lx, size = %#lx\n",
+				 __func__, p->iova, p->size);
+		}
+		found = true;
+		break;
+	}
+
+	if (!found) {
+		entry = kzalloc(sizeof(struct amd_iommu_flush_entries),
+				GFP_KERNEL);
+		if (entry) {
+			pr_debug("%s: new range: iova=%lx, size=%#lx\n",
+				 __func__, iova, size);
+
+			entry->iova = iova;
+			entry->size = size;
+			list_add(&entry->list, &amd_iommu_flush_list);
+		}
+	}
+	spin_unlock_irqrestore(&amd_iommu_flush_list_lock, flags);
+}
+
+static void amd_iommu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct protection_domain *pdom = to_pdomain(domain);
+	struct amd_iommu_flush_entries *entry, *next;
+	unsigned long flags;
+
+	/* Note:
+	 * Currently, IOMMU driver just flushes the whole IO/TLB for
+	 * a given domain. So, just remove entries from the list here.
+	 */
+	spin_lock_irqsave(&amd_iommu_flush_list_lock, flags);
+	list_for_each_entry_safe(entry, next, &amd_iommu_flush_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	spin_unlock_irqrestore(&amd_iommu_flush_list_lock, flags);
+
+	domain_flush_tlb_pde(pdom);
+}
+
 const struct iommu_ops amd_iommu_ops = {
 	.capable = amd_iommu_capable,
 	.domain_alloc = amd_iommu_domain_alloc,
@@ -3181,6 +3249,9 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
 	.apply_resv_region = amd_iommu_apply_resv_region,
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
 	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
+	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
+	.iotlb_range_add = amd_iommu_iotlb_range_add,
+	.iotlb_sync = amd_iommu_iotlb_sync,
 };
 
 /*****************************************************************************
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 6fe2d03..e8f8cee 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -185,6 +185,12 @@ struct ivmd_header {
 bool amd_iommu_force_isolation __read_mostly;
 
 /*
+ * IOTLB flush list
+ */
+LIST_HEAD(amd_iommu_flush_list);
+spinlock_t amd_iommu_flush_list_lock;
+
+/*
  * List of protection domains - used during resume
  */
 LIST_HEAD(amd_iommu_pd_list);
@@ -2490,6 +2496,7 @@ static int __init early_amd_iommu_init(void)
 	__set_bit(0, amd_iommu_pd_alloc_bitmap);
 
 	spin_lock_init(&amd_iommu_pd_lock);
+	spin_lock_init(&amd_iommu_flush_list_lock);
 
 	/*
 	 * now the data structures are allocated and basically initialized
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index f6b24c7..c3f4a7e 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -668,6 +668,13 @@ struct iommu_dev_data {
 extern struct list_head amd_iommu_pd_list;
 
 /*
+ * Declarations for the global flush list to support
+ * iotlb_range_add() and iotlb_sync.
+ */
+extern spinlock_t amd_iommu_flush_list_lock;
+extern struct list_head amd_iommu_flush_list;
+
+/*
  * Structure defining one entry in the device table
  */
 struct dev_table_entry {
-- 
1.8.3.1


* Re: [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2017-12-27  9:20 ` [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs Suravee Suthikulpanit
@ 2018-01-08 20:53   ` Alex Williamson
  2018-01-08 21:07     ` Alex Williamson
  2018-01-18  3:25     ` Suravee Suthikulpanit
  0 siblings, 2 replies; 10+ messages in thread
From: Alex Williamson @ 2018-01-08 20:53 UTC (permalink / raw)
  To: Suravee Suthikulpanit; +Cc: linux-kernel, iommu, joro, jroedel

On Wed, 27 Dec 2017 04:20:34 -0500
Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> wrote:

> VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which requires
> an IOTLB flush for every unmapping. This results in large IOTLB flushing
> overhead when handling pass-through devices that have a large number of mapped
> IOVAs.
> 
> This can be avoided by using the new IOTLB flushing interface.

Hi Suravee,

I've been playing with other ways we might do this, but I can't come up
with anything better.  A few comments below...

> 
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Joerg Roedel <jroedel@suse.de>
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 89 +++++++++++++++++++++++++++++++++++------
>  1 file changed, 77 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e30e29a..f000844 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -102,6 +102,13 @@ struct vfio_pfn {
>  	atomic_t		ref_count;
>  };
>  
> +struct vfio_regions{
> +	struct list_head list;
> +	dma_addr_t iova;
> +	phys_addr_t phys;
> +	size_t len;
> +};
> +
>  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
>  					(!list_empty(&iommu->domain_list))
>  
> @@ -479,6 +486,40 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
>  	return unlocked;
>  }
>  
> +/*
> + * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
> + * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
> + * track of these regions (currently using a list).
> + *
> + * This specifies the maximum number of regions for each IOTLB flush sync.
> + */
> +#define VFIO_IOMMU_TLB_SYNC_MAX		512

Is this an arbitrary value or are there non-obvious considerations for
this value should we want to further tune it in the future?

> +
> +static long vfio_sync_and_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
> +				struct list_head *regions, bool do_accounting)
> +{
> +	long unlocked = 0;
> +	struct vfio_regions *entry, *next;
> +
> +	iommu_tlb_sync(domain->domain);
> +
> +	list_for_each_entry_safe(entry, next, regions, list) {
> +		unlocked += vfio_unpin_pages_remote(dma,
> +						    entry->iova,
> +						    entry->phys >> PAGE_SHIFT,
> +						    entry->len >> PAGE_SHIFT,
> +						    false);
> +		list_del(&entry->list);
> +		kfree(entry);
> +	}
> +
> +	if (do_accounting) {
> +		vfio_lock_acct(dma->task, -unlocked, NULL);
> +		return 0;
> +	}
> +	return unlocked;
> +}
> +
>  static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>  				  unsigned long *pfn_base, bool do_accounting)
>  {
> @@ -653,7 +694,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  {
>  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
>  	struct vfio_domain *domain, *d;
> +	struct list_head unmapped_regions;
> +	struct vfio_regions *entry;
>  	long unlocked = 0;
> +	int cnt = 0;
>  
>  	if (!dma->size)
>  		return 0;
> @@ -661,6 +705,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
>  		return 0;
>  
> +	INIT_LIST_HEAD(&unmapped_regions);
> +
>  	/*
>  	 * We use the IOMMU to track the physical addresses, otherwise we'd
>  	 * need a much more complicated tracking system.  Unfortunately that
> @@ -698,24 +744,36 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  				break;
>  		}
>  
> -		unmapped = iommu_unmap(domain->domain, iova, len);
> -		if (WARN_ON(!unmapped))
> +		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> +		if (!entry)
>  			break;
>  
> -		unlocked += vfio_unpin_pages_remote(dma, iova,
> -						    phys >> PAGE_SHIFT,
> -						    unmapped >> PAGE_SHIFT,
> -						    false);
> +		unmapped = iommu_unmap_fast(domain->domain, iova, len);
> +		if (WARN_ON(!unmapped)) {
> +			kfree(entry);
> +			break;
> +		}
> +
> +		iommu_tlb_range_add(domain->domain, iova, unmapped);
> +		entry->iova = iova;
> +		entry->phys = phys;
> +		entry->len  = unmapped;
> +		list_add_tail(&entry->list, &unmapped_regions);
> +		cnt++;
>  		iova += unmapped;
>  
> +		if (cnt >= VFIO_IOMMU_TLB_SYNC_MAX) {
> +			unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
> +							do_accounting);

Exceeds 80 columns here.

> +			cnt = 0;
> +		}
>  		cond_resched();
>  	}
>  
> +	if (cnt)
> +		unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
> +						do_accounting);
>  	dma->iommu_mapped = false;
> -	if (do_accounting) {
> -		vfio_lock_acct(dma->task, -unlocked, NULL);
> -		return 0;
> -	}
>  	return unlocked;
>  }
>  
> @@ -878,6 +936,7 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
>  {
>  	long i;
>  	int ret = 0;
> +	size_t unmapped = 0;
>  
>  	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
>  		ret = iommu_map(domain->domain, iova,
> @@ -887,8 +946,14 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
>  			break;
>  	}
>  
> -	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> -		iommu_unmap(domain->domain, iova, PAGE_SIZE);
> +	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
> +		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
> +		if (WARN_ON(!unmapped))
> +			break;
> +		iommu_tlb_range_add(domain->domain, iova, unmapped);
> +	}
> +	if (unmapped)
> +		iommu_tlb_sync(domain->domain);

Using unmapped here seems a little sketchy, for instance if we got back
zero on the last call to iommu_unmap_fast() but had other ranges queued
for flush.  Do we even need a WARN_ON and break here, are we just
trying to skip adding a zero range?  The intent is that we either leave
this function with everything mapped or nothing mapped, so perhaps we
should warn and continue.  Assuming a spurious sync is ok, we could
check (i < npage) for the sync condition, the only risk being we had no
mappings at all and therefore no unmaps.
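
For illustration, the alternative described above might look roughly like this
(just a sketch of the suggestion, not a tested change):

	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
		/* Warn on a failed unmap, but keep unwinding the rest. */
		if (WARN_ON(!unmapped))
			continue;
		iommu_tlb_range_add(domain->domain, iova, unmapped);
	}
	/*
	 * i < npage means the map loop stopped early; the only risk is
	 * i == 0 (nothing was mapped at all), which just costs a
	 * spurious sync.
	 */
	if (i < npage)
		iommu_tlb_sync(domain->domain);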

TBH, I wonder if this function is even needed anymore or if the mapping
problem in amd_iommu has since been fixed.

Also, I'm not sure why you're gating adding fast flushing to amd_iommu
on vfio making use of it.  These can be done independently.  Thanks,

Alex


* Re: [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2018-01-08 20:53   ` Alex Williamson
@ 2018-01-08 21:07     ` Alex Williamson
  2018-01-17 16:54       ` Suravee Suthikulpanit
  2018-01-18  3:25     ` Suravee Suthikulpanit
  1 sibling, 1 reply; 10+ messages in thread
From: Alex Williamson @ 2018-01-08 21:07 UTC (permalink / raw)
  To: Suravee Suthikulpanit; +Cc: iommu, jroedel, linux-kernel

On Mon, 8 Jan 2018 13:53:29 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 27 Dec 2017 04:20:34 -0500
> Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> wrote:
> 
> > VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which requires
> > an IOTLB flush for every unmapping. This results in large IOTLB flushing
> > overhead when handling pass-through devices that have a large number of mapped
> > IOVAs.
> > 
> > This can be avoided by using the new IOTLB flushing interface.  
> 
> Hi Suravee,
> 
> I've been playing with other ways we might do this, but I can't come up
> with anything better.  A few comments below...
> 
> > 
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Joerg Roedel <jroedel@suse.de>
> > Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 89 +++++++++++++++++++++++++++++++++++------
> >  1 file changed, 77 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index e30e29a..f000844 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -102,6 +102,13 @@ struct vfio_pfn {
> >  	atomic_t		ref_count;
> >  };
> >  
> > +struct vfio_regions{
> > +	struct list_head list;
> > +	dma_addr_t iova;
> > +	phys_addr_t phys;
> > +	size_t len;
> > +};
> > +
> >  #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> >  					(!list_empty(&iommu->domain_list))
> >  
> > @@ -479,6 +486,40 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> >  	return unlocked;
> >  }
> >  
> > +/*
> > + * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
> > + * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
> > + * track of these regions (currently using a list).
> > + *
> > + * This specifies the maximum number of regions for each IOTLB flush sync.
> > + */
> > +#define VFIO_IOMMU_TLB_SYNC_MAX		512  
> 
> Is this an arbitrary value or are there non-obvious considerations for
> this value should we want to further tune it in the future?
> 
> > +
> > +static long vfio_sync_and_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
> > +				struct list_head *regions, bool do_accounting)
> > +{
> > +	long unlocked = 0;
> > +	struct vfio_regions *entry, *next;
> > +
> > +	iommu_tlb_sync(domain->domain);
> > +
> > +	list_for_each_entry_safe(entry, next, regions, list) {
> > +		unlocked += vfio_unpin_pages_remote(dma,
> > +						    entry->iova,
> > +						    entry->phys >> PAGE_SHIFT,
> > +						    entry->len >> PAGE_SHIFT,
> > +						    false);
> > +		list_del(&entry->list);
> > +		kfree(entry);
> > +	}
> > +
> > +	if (do_accounting) {
> > +		vfio_lock_acct(dma->task, -unlocked, NULL);
> > +		return 0;
> > +	}
> > +	return unlocked;
> > +}
> > +
> >  static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >  				  unsigned long *pfn_base, bool do_accounting)
> >  {
> > @@ -653,7 +694,10 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >  {
> >  	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> >  	struct vfio_domain *domain, *d;
> > +	struct list_head unmapped_regions;
> > +	struct vfio_regions *entry;
> >  	long unlocked = 0;
> > +	int cnt = 0;
> >  
> >  	if (!dma->size)
> >  		return 0;
> > @@ -661,6 +705,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >  	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> >  		return 0;
> >  
> > +	INIT_LIST_HEAD(&unmapped_regions);
> > +
> >  	/*
> >  	 * We use the IOMMU to track the physical addresses, otherwise we'd
> >  	 * need a much more complicated tracking system.  Unfortunately that
> > @@ -698,24 +744,36 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >  				break;
> >  		}
> >  
> > -		unmapped = iommu_unmap(domain->domain, iova, len);
> > -		if (WARN_ON(!unmapped))
> > +		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> > +		if (!entry)
> >  			break;


Turns out this nagged at me a bit too, this function only gets called
once to dump the vfio_dma, so bailing out here leaves pages pinned and
IOMMU mappings in place, for a performance optimization that we could
just skip.  We could sync&unpin anything collected up to this point and
continue this step with a synchronous unmap/unpin.  Thanks,

Alex

> >  
> > -		unlocked += vfio_unpin_pages_remote(dma, iova,
> > -						    phys >> PAGE_SHIFT,
> > -						    unmapped >> PAGE_SHIFT,
> > -						    false);
> > +		unmapped = iommu_unmap_fast(domain->domain, iova, len);
> > +		if (WARN_ON(!unmapped)) {
> > +			kfree(entry);
> > +			break;
> > +		}
> > +
> > +		iommu_tlb_range_add(domain->domain, iova, unmapped);
> > +		entry->iova = iova;
> > +		entry->phys = phys;
> > +		entry->len  = unmapped;
> > +		list_add_tail(&entry->list, &unmapped_regions);
> > +		cnt++;
> >  		iova += unmapped;
> >  
> > +		if (cnt >= VFIO_IOMMU_TLB_SYNC_MAX) {
> > +			unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
> > +							do_accounting);  
> 
> Exceeds 80 columns here.
> 
> > +			cnt = 0;
> > +		}
> >  		cond_resched();
> >  	}
> >  
> > +	if (cnt)
> > +		unlocked += vfio_sync_and_unpin(dma, domain, &unmapped_regions,
> > +						do_accounting);
> >  	dma->iommu_mapped = false;
> > -	if (do_accounting) {
> > -		vfio_lock_acct(dma->task, -unlocked, NULL);
> > -		return 0;
> > -	}
> >  	return unlocked;
> >  }
> >  
> > @@ -878,6 +936,7 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
> >  {
> >  	long i;
> >  	int ret = 0;
> > +	size_t unmapped = 0;
> >  
> >  	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
> >  		ret = iommu_map(domain->domain, iova,
> > @@ -887,8 +946,14 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
> >  			break;
> >  	}
> >  
> > -	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> > -		iommu_unmap(domain->domain, iova, PAGE_SIZE);
> > +	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
> > +		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
> > +		if (WARN_ON(!unmapped))
> > +			break;
> > +		iommu_tlb_range_add(domain->domain, iova, unmapped);
> > +	}
> > +	if (unmapped)
> > +		iommu_tlb_sync(domain->domain);  
> 
> Using unmapped here seems a little sketchy, for instance if we got back
> zero on the last call to iommu_unmap_fast() but had other ranges queued
> for flush.  Do we even need a WARN_ON and break here, are we just
> trying to skip adding a zero range?  The intent is that we either leave
> this function with everything mapped or nothing mapped, so perhaps we
> should warn and continue.  Assuming a spurious sync is ok, we could
> check (i < npage) for the sync condition, the only risk being we had no
> mappings at all and therefore no unmaps.
> 
> TBH, I wonder if this function is even needed anymore or if the mapping
> problem in amd_iommu has since been fixed.
> 
> Also, I'm not sure why you're gating adding fast flushing to amd_iommu
> on vfio making use of it.  These can be done independently.  Thanks,
> 
> Alex
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu


* Re: [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2018-01-08 21:07     ` Alex Williamson
@ 2018-01-17 16:54       ` Suravee Suthikulpanit
  0 siblings, 0 replies; 10+ messages in thread
From: Suravee Suthikulpanit @ 2018-01-17 16:54 UTC (permalink / raw)
  To: Alex Williamson; +Cc: iommu, jroedel, linux-kernel

Hi Alex,

On 1/9/18 4:07 AM, Alex Williamson wrote:
>>> @@ -661,6 +705,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>>>   	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
>>>   		return 0;
>>>   
>>> +	INIT_LIST_HEAD(&unmapped_regions);
>>> +
>>>   	/*
>>>   	 * We use the IOMMU to track the physical addresses, otherwise we'd
>>>   	 * need a much more complicated tracking system.  Unfortunately that
>>> @@ -698,24 +744,36 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>>>   				break;
>>>   		}
>>>   
>>> -		unmapped = iommu_unmap(domain->domain, iova, len);
>>> -		if (WARN_ON(!unmapped))
>>> +		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
>>> +		if (!entry)
>>>   			break;
> 
> Turns out this nagged at me a bit too, this function only gets called
> once to dump the vfio_dma, so bailing out here leaves pages pinned and
> IOMMU mappings in place, for a performance optimization that we could
> just skip.  We could sync&unpin anything collected up to this point and
> continue this step with a synchronous unmap/unpin.  Thanks,

Ah, that's an oversight on my part as well. Thanks for catching this. I'll implement
the fallback mechanism per your suggestion in v3.
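
Roughly along these lines, replacing the bare "break" when the allocation fails
(just a sketch of the idea, not the actual v3 code; "unpinned" would be a new
local variable):

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry) {
		/* Flush and unpin whatever has been batched so far ... */
		unlocked += vfio_sync_and_unpin(dma, domain,
						&unmapped_regions,
						do_accounting);
		cnt = 0;

		/* ... then fall back to the synchronous unmap/unpin path. */
		unmapped = iommu_unmap(domain->domain, iova, len);
		if (WARN_ON(!unmapped))
			break;
		unpinned = vfio_unpin_pages_remote(dma, iova,
						   phys >> PAGE_SHIFT,
						   unmapped >> PAGE_SHIFT,
						   false);
		if (do_accounting)
			vfio_lock_acct(dma->task, -unpinned, NULL);
		else
			unlocked += unpinned;
		iova += unmapped;
		continue;
	}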

Thanks,
Suravee


* Re: [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2018-01-08 20:53   ` Alex Williamson
  2018-01-08 21:07     ` Alex Williamson
@ 2018-01-18  3:25     ` Suravee Suthikulpanit
  2018-01-18 17:34       ` Alex Williamson
  1 sibling, 1 reply; 10+ messages in thread
From: Suravee Suthikulpanit @ 2018-01-18  3:25 UTC (permalink / raw)
  To: Alex Williamson; +Cc: linux-kernel, iommu, joro, jroedel

Hi Alex,

On 1/9/18 3:53 AM, Alex Williamson wrote:
> On Wed, 27 Dec 2017 04:20:34 -0500
> Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> wrote:
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index e30e29a..f000844 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>
>> ... >>
>> @@ -479,6 +486,40 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
>>   	return unlocked;
>>   }
>>   
>> +/*
>> + * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
>> + * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
>> + * track of these regions (currently using a list).
>> + *
>> + * This specifies the maximum number of regions for each IOTLB flush sync.
>> + */
>> +#define VFIO_IOMMU_TLB_SYNC_MAX		512
> 
> Is this an arbitrary value or are there non-obvious considerations for
> this value should we want to further tune it in the future?

This is just an arbitrary value for now, which we could try tuning further.
On some dGPUs that I have been using, I have seen a max of ~1500 regions within an unmap call.
In most cases, I see fewer than 100 regions in an unmap call. The structure is currently 40 bytes,
so I figured capping the list at 512 entries (20 KB) is reasonable. Let me know what you think.

>> ....
>>
>> @@ -887,8 +946,14 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
>>   			break;
>>   	}
>>   
>> -	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
>> -		iommu_unmap(domain->domain, iova, PAGE_SIZE);
>> +	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
>> +		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
>> +		if (WARN_ON(!unmapped))
>> +			break;
>> +		iommu_tlb_range_add(domain->domain, iova, unmapped);
>> +	}
>> +	if (unmapped)
>> +		iommu_tlb_sync(domain->domain);
> 
> Using unmapped here seems a little sketchy, for instance if we got back
> zero on the last call to iommu_unmap_fast() but had other ranges queued
> for flush.  Do we even need a WARN_ON and break here, are we just
> trying to skip adding a zero range?  The intent is that we either leave
> this function with everything mapped or nothing mapped, so perhaps we
> should warn and continue.  Assuming a spurious sync is ok, we could
> check (i < npage) for the sync condition, the only risk being we had no
> mappings at all and therefore no unmaps.
>
> TBH, I wonder if this function is even needed anymore or if the mapping
> problem in amd_iommu has since been fixed.

Actually, I never hit this execution path in my test runs. I could just leave this
unchanged and use the slow unmap path to simplify the logic. I'm not aware of
the history of why this logic is needed for AMD IOMMU. Is this a bug in the driver or
the hardware?

> Also, I'm not sure why you're gating adding fast flushing to amd_iommu
> on vfio making use of it.  These can be done independently.  Thanks,

Currently, the fast unmap interface is mainly called by VFIO. So, I thought I would
submit the patches together for review. If you would prefer, I can submit the IOMMU part
separately.

Thanks,
Suravee


* Re: [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs
  2018-01-18  3:25     ` Suravee Suthikulpanit
@ 2018-01-18 17:34       ` Alex Williamson
  0 siblings, 0 replies; 10+ messages in thread
From: Alex Williamson @ 2018-01-18 17:34 UTC (permalink / raw)
  To: Suravee Suthikulpanit; +Cc: linux-kernel, iommu, joro, jroedel

On Thu, 18 Jan 2018 10:25:12 +0700
Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> wrote:

> Hi Alex,
> 
> On 1/9/18 3:53 AM, Alex Williamson wrote:
> > On Wed, 27 Dec 2017 04:20:34 -0500
> > Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> wrote:  
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index e30e29a..f000844 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>
> >> ... >>
> >> @@ -479,6 +486,40 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> >>   	return unlocked;
> >>   }
> >>   
> >> +/*
> >> + * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
> >> + * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
> >> + * track of these regions (currently using a list).
> >> + *
> >> + * This specifies the maximum number of regions for each IOTLB flush sync.
> >> + */
> >> +#define VFIO_IOMMU_TLB_SYNC_MAX		512  
> > 
> > Is this an arbitrary value or are there non-obvious considerations for
> > this value should we want to further tune it in the future?  
> 
> This is just an arbitrary value for now, which we could try tuning further.
> On some dGPUs that I have been using, I have seen a max of ~1500 regions within an unmap call.
> In most cases, I see fewer than 100 regions in an unmap call. The structure is currently 40 bytes,
> so I figured capping the list at 512 entries (20 KB) is reasonable. Let me know what you think.

Seems like a reasonable starting point.  For a VM user, the maximum
size of an unmap will largely depend on the size of the VM,
predominantly the size of memory above the 4G boundary in the VM.  That
could be 100s of GB.  In the best case, iommu_unmap could return 1GB
ranges, in the worst case, 4K.  So I'm not sure there's really any
upper bound, thus the cond_resched(), but we will opportunistically
coalesce pages unless disabled via module option.

> >> ....
> >>
> >> @@ -887,8 +946,14 @@ static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
> >>   			break;
> >>   	}
> >>   
> >> -	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> >> -		iommu_unmap(domain->domain, iova, PAGE_SIZE);
> >> +	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) {
> >> +		unmapped = iommu_unmap_fast(domain->domain, iova, PAGE_SIZE);
> >> +		if (WARN_ON(!unmapped))
> >> +			break;
> >> +		iommu_tlb_range_add(domain->domain, iova, unmapped);
> >> +	}
> >> +	if (unmapped)
> >> +		iommu_tlb_sync(domain->domain);  
> > 
> > Using unmapped here seems a little sketchy, for instance if we got back
> > zero on the last call to iommu_unmap_fast() but had other ranges queued
> > for flush.  Do we even need a WARN_ON and break here, are we just
> > trying to skip adding a zero range?  The intent is that we either leave
> > this function with everything mapped or nothing mapped, so perhaps we
> > should warn and continue.  Assuming a spurious sync is ok, we could
> > check (i < npage) for the sync condition, the only risk being we had no
> > mappings at all and therefore no unmaps.
> >
> > TBH, I wonder if this function is even needed anymore or if the mapping
> > problem in amd_iommu has since been fixed.
> 
> Actually, I never hit this execution path in my test runs. I could just leave this
> unchanged and use the slow unmap path to simplify the logic. I'm not aware of
> the history of why this logic is needed for AMD IOMMU. Is this a bug in the driver or
> the hardware?

It was a bug in amd_iommu code and unfortunately I've lost details
beyond what we see in the comments there, iommu ptes previously mapped
as 4k can't be unmapped and remapped using large page sizes,
presumably pmds still in place or something along those lines.  If
you're testing on AMD and not triggering this, let's not try to
optimize this path, it's just a crusty old safety net.
 
> > Also, I'm not sure why you're gating adding fast flushing to amd_iommu
> > on vfio making use of it.  These can be done independently.  Thanks,  
> 
> Currently, the fast unmap interface is mainly called by VFIO. So, I thought I would
> submit the patches together for review. If you would prefer, I can submit the IOMMU part
> separately.

I think it makes more sense and is most efficient to uncouple them.
Joerg probably isn't paying attention to the amd_iommu changes because
I'm commenting on the vfio parts and I'm not looking at the amd_iommu
changes.  The code doesn't depend on them going in together, so we can
do them in parallel if they're split.  Thanks,

Alex


* Re: [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing
  2017-12-27  9:20 ` [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing Suravee Suthikulpanit
@ 2018-01-22  3:41   ` Suravee Suthikulpanit
  2018-01-24 11:43   ` Suravee Suthikulpanit
  1 sibling, 0 replies; 10+ messages in thread
From: Suravee Suthikulpanit @ 2018-01-22  3:41 UTC (permalink / raw)
  To: linux-kernel, iommu; +Cc: joro, jroedel, alex.williamson

Hi Joerg,

Do you have any feedback regarding this patch for AMD IOMMU? I'm re-sending patch 1/2
separately per Alex's suggestion.

Thanks,
Suravee

On 12/27/17 4:20 PM, Suravee Suthikulpanit wrote:
> Implement the newly added IOTLB flushing interface for AMD IOMMU.
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
>   drivers/iommu/amd_iommu.c       | 73 ++++++++++++++++++++++++++++++++++++++++-
>   drivers/iommu/amd_iommu_init.c  |  7 ++++
>   drivers/iommu/amd_iommu_types.h |  7 ++++
>   3 files changed, 86 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index 7d5eb00..42fe365 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -129,6 +129,12 @@ struct dma_ops_domain {
>   static struct iova_domain reserved_iova_ranges;
>   static struct lock_class_key reserved_rbtree_key;
>   
> +struct amd_iommu_flush_entries {
> +	struct list_head list;
> +	unsigned long iova;
> +	size_t size;
> +};
> +
>   /****************************************************************************
>    *
>    * Helper functions
> @@ -3043,7 +3049,6 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
>   	unmap_size = iommu_unmap_page(domain, iova, page_size);
>   	mutex_unlock(&domain->api_lock);
>   
> -	domain_flush_tlb_pde(domain);
>   	domain_flush_complete(domain);
>   
>   	return unmap_size;
> @@ -3163,6 +3168,69 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
>   	return dev_data->defer_attach;
>   }
>   
> +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
> +{
> +	struct protection_domain *dom = to_pdomain(domain);
> +
> +	domain_flush_tlb_pde(dom);
> +}
> +
> +static void amd_iommu_iotlb_range_add(struct iommu_domain *domain,
> +				      unsigned long iova, size_t size)
> +{
> +	struct amd_iommu_flush_entries *entry, *p;
> +	unsigned long flags;
> +	bool found = false;
> +
> +	spin_lock_irqsave(&amd_iommu_flush_list_lock, flags);
> +	list_for_each_entry(p, &amd_iommu_flush_list, list) {
> +		if (iova != p->iova)
> +			continue;
> +
> +		if (size > p->size) {
> +			p->size = size;
> +			pr_debug("%s: update range: iova=%#lx, size = %#lx\n",
> +				 __func__, p->iova, p->size);
> +		}
> +		found = true;
> +		break;
> +	}
> +
> +	if (!found) {
> +		entry = kzalloc(sizeof(struct amd_iommu_flush_entries),
> +				GFP_KERNEL);
> +		if (entry) {
> +			pr_debug("%s: new range: iova=%lx, size=%#lx\n",
> +				 __func__, iova, size);
> +
> +			entry->iova = iova;
> +			entry->size = size;
> +			list_add(&entry->list, &amd_iommu_flush_list);
> +		}
> +	}
> +	spin_unlock_irqrestore(&amd_iommu_flush_list_lock, flags);
> +}
> +
> +static void amd_iommu_iotlb_sync(struct iommu_domain *domain)
> +{
> +	struct protection_domain *pdom = to_pdomain(domain);
> +	struct amd_iommu_flush_entries *entry, *next;
> +	unsigned long flags;
> +
> +	/* Note:
> +	 * Currently, IOMMU driver just flushes the whole IO/TLB for
> +	 * a given domain. So, just remove entries from the list here.
> +	 */
> +	spin_lock_irqsave(&amd_iommu_flush_list_lock, flags);
> +	list_for_each_entry_safe(entry, next, &amd_iommu_flush_list, list) {
> +		list_del(&entry->list);
> +		kfree(entry);
> +	}
> +	spin_unlock_irqrestore(&amd_iommu_flush_list_lock, flags);
> +
> +	domain_flush_tlb_pde(pdom);
> +}
> +
>   const struct iommu_ops amd_iommu_ops = {
>   	.capable = amd_iommu_capable,
>   	.domain_alloc = amd_iommu_domain_alloc,
> @@ -3181,6 +3249,9 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
>   	.apply_resv_region = amd_iommu_apply_resv_region,
>   	.is_attach_deferred = amd_iommu_is_attach_deferred,
>   	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
> +	.flush_iotlb_all = amd_iommu_flush_iotlb_all,
> +	.iotlb_range_add = amd_iommu_iotlb_range_add,
> +	.iotlb_sync = amd_iommu_iotlb_sync,
>   };
>   
>   /*****************************************************************************
> diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
> index 6fe2d03..e8f8cee 100644
> --- a/drivers/iommu/amd_iommu_init.c
> +++ b/drivers/iommu/amd_iommu_init.c
> @@ -185,6 +185,12 @@ struct ivmd_header {
>   bool amd_iommu_force_isolation __read_mostly;
>   
>   /*
> + * IOTLB flush list
> + */
> +LIST_HEAD(amd_iommu_flush_list);
> +spinlock_t amd_iommu_flush_list_lock;
> +
> +/*
>    * List of protection domains - used during resume
>    */
>   LIST_HEAD(amd_iommu_pd_list);
> @@ -2490,6 +2496,7 @@ static int __init early_amd_iommu_init(void)
>   	__set_bit(0, amd_iommu_pd_alloc_bitmap);
>   
>   	spin_lock_init(&amd_iommu_pd_lock);
> +	spin_lock_init(&amd_iommu_flush_list_lock);
>   
>   	/*
>   	 * now the data structures are allocated and basically initialized
> diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
> index f6b24c7..c3f4a7e 100644
> --- a/drivers/iommu/amd_iommu_types.h
> +++ b/drivers/iommu/amd_iommu_types.h
> @@ -668,6 +668,13 @@ struct iommu_dev_data {
>   extern struct list_head amd_iommu_pd_list;
>   
>   /*
> + * Declarations for the global flush list to support
> + * iotlb_range_add() and iotlb_sync.
> + */
> +extern spinlock_t amd_iommu_flush_list_lock;
> +extern struct list_head amd_iommu_flush_list;
> +
> +/*
>    * Structure defining one entry in the device table
>    */
>   struct dev_table_entry {
> 


* Re: [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing
  2017-12-27  9:20 ` [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing Suravee Suthikulpanit
  2018-01-22  3:41   ` Suravee Suthikulpanit
@ 2018-01-24 11:43   ` Suravee Suthikulpanit
  1 sibling, 0 replies; 10+ messages in thread
From: Suravee Suthikulpanit @ 2018-01-24 11:43 UTC (permalink / raw)
  To: linux-kernel, iommu; +Cc: joro, jroedel, alex.williamson

Hi Joerg,

On 12/27/17 4:20 PM, Suravee Suthikulpanit wrote:
> Implement the newly added IOTLB flushing interface for AMD IOMMU.
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
>   drivers/iommu/amd_iommu.c       | 73 ++++++++++++++++++++++++++++++++++++++++-
>   drivers/iommu/amd_iommu_init.c  |  7 ++++
>   drivers/iommu/amd_iommu_types.h |  7 ++++
>   3 files changed, 86 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index 7d5eb00..42fe365 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
>  ...
> @@ -3163,6 +3168,69 @@ static bool amd_iommu_is_attach_deferred(struct iommu_domain *domain,
>   	return dev_data->defer_attach;
>   }
>   
> +static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
> +{
> +	struct protection_domain *dom = to_pdomain(domain);
> +
> +	domain_flush_tlb_pde(dom);
> +}

I made a mistake here ...

> ...
> +static void amd_iommu_iotlb_sync(struct iommu_domain *domain)
> +{
> +	struct protection_domain *pdom = to_pdomain(domain);
> +	struct amd_iommu_flush_entries *entry, *next;
> +	unsigned long flags;
> +
> +	/* Note:
> +	 * Currently, IOMMU driver just flushes the whole IO/TLB for
> +	 * a given domain. So, just remove entries from the list here.
> +	 */
> +	spin_lock_irqsave(&amd_iommu_flush_list_lock, flags);
> +	list_for_each_entry_safe(entry, next, &amd_iommu_flush_list, list) {
> +		list_del(&entry->list);
> +		kfree(entry);
> +	}
> +	spin_unlock_irqrestore(&amd_iommu_flush_list_lock, flags);
> +
> +	domain_flush_tlb_pde(pdom);
> +}

... and here, where I should also call domain_flush_complete() after
domain_flush_tlb_pde(). I'll send out a new version (v3) as a separate patch
from the series.
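
For clarity, the intended fix is just to wait for the flush to complete in
both callbacks, roughly like this (a sketch only, not the posted v3 patch):

static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
{
	struct protection_domain *dom = to_pdomain(domain);

	domain_flush_tlb_pde(dom);
	domain_flush_complete(dom);	/* this wait was missing in v2 */
}

/*
 * ... and likewise a domain_flush_complete(pdom) at the end of
 * amd_iommu_iotlb_sync(), after domain_flush_tlb_pde(pdom).
 */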

Thanks,
Suravee


Thread overview: 10+ messages
2017-12-27  9:20 [RFC PATCH v2 0/2] Reduce IOTLB flush when pass-through dGPU devices Suravee Suthikulpanit
2017-12-27  9:20 ` [RFC PATCH v2 1/2] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs Suravee Suthikulpanit
2018-01-08 20:53   ` Alex Williamson
2018-01-08 21:07     ` Alex Williamson
2018-01-17 16:54       ` Suravee Suthikulpanit
2018-01-18  3:25     ` Suravee Suthikulpanit
2018-01-18 17:34       ` Alex Williamson
2017-12-27  9:20 ` [RFC PATCH v2 2/2] iommu/amd: Add support for fast IOTLB flushing Suravee Suthikulpanit
2018-01-22  3:41   ` Suravee Suthikulpanit
2018-01-24 11:43   ` Suravee Suthikulpanit
