* [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
@ 2016-01-09  0:44 Shaohua Li
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2016-01-09  0:44 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Kernel-team-b10kYP2dOMg

Hi,

iova alloc/free causes heavy lock contention, which can easily be demonstrated
with an iperf workload. Previously I posted a patchset:
http://lists.linuxfoundation.org/pipermail/iommu/2015-November/014984.html

The concern was that it is not generic. This is another attempt at the issue. This
version implements a per-cpu iova cache for small DMA allocations (<= 64k),
which should be generic enough, and it lets us do batch allocation. iova frees
can easily be batched too. With batched alloc/free, the iova lock contention
disappears. Performance with this patchset is nearly the same as with the
previous one in the same test.

After this patchset, async_umap_flush_lock becomes the hottest lock in
intel-iommu, but it is not very bad. That is something we will need to work on
in the future.

Thanks,
Shaohua

Shaohua Li (4):
  IOMMU: add a percpu cache for iova allocation
  iommu: free_iova doesn't need lock twice
  intel-iommu: remove find_iova in unmap path
  intel-iommu: do batch iova free

 drivers/iommu/intel-iommu.c |  77 +++++++++++++------
 drivers/iommu/iova.c        | 177 +++++++++++++++++++++++++++++++++++---------
 include/linux/iova.h        |  10 +++
 3 files changed, 208 insertions(+), 56 deletions(-)

-- 
2.4.6


* [PATCH V2 1/4] IOMMU: add a percpu cache for iova allocation
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
@ 2016-01-09  0:44   ` Shaohua Li
  2016-01-09  0:44   ` [PATCH V2 2/4] iommu: free_iova doesn't need lock twice Shaohua Li
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2016-01-09  0:44 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

This adds a percpu cache for small iova allocations. Allocations bigger
than 64k bypass the cache. The cache allows iova allocations to be
batched, which dramatically reduces iovad lock contention; in testing,
the lock contention becomes negligible. The cache has 5 size classes
(4k, 8k, 16k, 32k, 64k), and each class caches up to 512k of DMA address
space, so each CPU caches at most 512k * 5 of DMA address space. For an
IOMMU system the cached DMA address space is tiny, so we don't bother
draining the cache on CPU hotplug.

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/iova.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/iova.h |  9 ++++++
 2 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index fa0adef..5c86c5c 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -25,6 +25,7 @@ void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	unsigned long start_pfn, unsigned long pfn_32bit)
 {
+	int cpu;
 	/*
 	 * IOVA granularity will normally be equal to the smallest
 	 * supported IOMMU page size; both *must* be capable of
@@ -38,6 +39,13 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->granule = granule;
 	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
+	iovad->percpu_cache = alloc_percpu(struct iova_cache);
+	for_each_possible_cpu(cpu) {
+		struct iova_cache *cache = per_cpu_ptr(iovad->percpu_cache, cpu);
+		int i;
+		for (i = 0; i <= ilog2(MAX_CACHE_SIZE); i++)
+			INIT_LIST_HEAD(&cache->cache_lists[i]);
+	}
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -103,12 +111,10 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 			struct iova *new, bool size_aligned)
 {
 	struct rb_node *prev, *curr = NULL;
-	unsigned long flags;
 	unsigned long saved_pfn;
 	unsigned int pad_size = 0;
 
 	/* Walk the tree backwards */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	saved_pfn = limit_pfn;
 	curr = __get_cached_rbnode(iovad, &limit_pfn);
 	prev = curr;
@@ -135,10 +141,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	if (!curr) {
 		if (size_aligned)
 			pad_size = iova_get_pad_size(size, limit_pfn);
-		if ((iovad->start_pfn + size + pad_size) > limit_pfn) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+		if ((iovad->start_pfn + size + pad_size) > limit_pfn)
 			return -ENOMEM;
-		}
 	}
 
 	/* pfn_lo will point to size aligned address if size_aligned is set */
@@ -177,9 +181,6 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	}
 	__cached_rbnode_insert_update(iovad, saved_pfn, new);
 
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-
-
 	return 0;
 }
 
@@ -256,6 +257,69 @@ void iova_cache_put(void)
 }
 EXPORT_SYMBOL_GPL(iova_cache_put);
 
+static void refill_percpu_iova(struct iova_domain *iovad, int index)
+{
+	struct iova_cache *cache;
+	struct iova *new_iova;
+	unsigned long flags;
+	int ret;
+	int count = 0;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	cache = this_cpu_ptr(iovad->percpu_cache);
+
+	if (!list_empty(&cache->cache_lists[index]))
+		goto out;
+
+	while (count < (MAX_CACHE_UNIT_SIZE >> index)) {
+		new_iova = alloc_iova_mem();
+		if (!new_iova)
+			goto out;
+		ret = __alloc_and_insert_iova_range(iovad, 1 << index,
+			iovad->dma_32bit_pfn, new_iova, true);
+		if (ret) {
+			free_iova_mem(new_iova);
+			goto out;
+		}
+		count++;
+		list_add(&new_iova->sibling, &cache->cache_lists[index]);
+	}
+out:
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+
+static struct iova *alloc_percpu_iova(struct iova_domain *iovad,
+	unsigned long size, unsigned long limit_pfn)
+{
+	int cache_index;
+	struct iova_cache *cache;
+	struct iova *iova;
+	bool refilled = false;
+
+	if (size > MAX_CACHE_SIZE || limit_pfn < iovad->dma_32bit_pfn)
+		return NULL;
+	cache_index = order_base_2(size);
+
+again:
+	preempt_disable();
+	cache = this_cpu_ptr(iovad->percpu_cache);
+	iova = list_first_entry_or_null(&cache->cache_lists[cache_index],
+		struct iova, sibling);
+	if (iova)
+		list_del(&iova->sibling);
+	preempt_enable();
+	if (iova)
+		return iova;
+
+	if (!refilled) {
+		refill_percpu_iova(iovad, cache_index);
+		refilled = true;
+		goto again;
+	}
+
+	return NULL;
+}
+
 /**
  * alloc_iova - allocates an iova
  * @iovad: - iova domain in question
@@ -274,13 +338,20 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 {
 	struct iova *new_iova;
 	int ret;
+	unsigned long flags;
+
+	new_iova = alloc_percpu_iova(iovad, size, limit_pfn);
+	if (new_iova)
+		return new_iova;
 
 	new_iova = alloc_iova_mem();
 	if (!new_iova)
 		return NULL;
 
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
 			new_iova, size_aligned);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 
 	if (ret) {
 		free_iova_mem(new_iova);
@@ -389,6 +460,7 @@ void put_iova_domain(struct iova_domain *iovad)
 		node = rb_first(&iovad->rbroot);
 	}
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	free_percpu(iovad->percpu_cache);
 }
 EXPORT_SYMBOL_GPL(put_iova_domain);
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 92f7177..1386a7b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -21,6 +21,14 @@ struct iova {
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head sibling;
+};
+
+#define MAX_CACHE_SIZE 16 /* 16 * 4k, only cache <= 64K allocation */
+#define MAX_CACHE_UNIT_SIZE (512 / 4) /* 512k total cache size */
+
+struct iova_cache {
+	struct list_head cache_lists[ilog2(MAX_CACHE_SIZE) + 1];
 };
 
 /* holds all the iova translations for a domain */
@@ -31,6 +39,7 @@ struct iova_domain {
 	unsigned long	granule;	/* pfn granularity for this domain */
 	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
+	struct iova_cache __percpu *percpu_cache;
 };
 
 static inline unsigned long iova_size(struct iova *iova)
-- 
2.4.6


* [PATCH V2 2/4] iommu: free_iova doesn't need lock twice
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
  2016-01-09  0:44   ` [PATCH V2 1/4] IOMMU: add a percpu cache for " Shaohua Li
@ 2016-01-09  0:44   ` Shaohua Li
  2016-01-09  0:44   ` [PATCH V2 3/4] intel-iommu: remove find_iova in unmap path Shaohua Li
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2016-01-09  0:44 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

Clean up the code a bit: free_iova doesn't need to take the lock twice.

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/iova.c | 62 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 5c86c5c..bb8a328 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -362,35 +362,17 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *__find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	unsigned long flags;
 	struct rb_node *node;
 
-	/* Take the lock so that no other thread is manipulating the rbtree */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	node = iovad->rbroot.rb_node;
 	while (node) {
 		struct iova *iova = container_of(node, struct iova, node);
 
 		/* If pfn falls within iova's range, return iova */
-		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			/* We are not holding the lock while this iova
-			 * is referenced by the caller as the same thread
-			 * which called this function also calls __free_iova()
-			 * and it is by design that only one thread can possibly
-			 * reference a particular iova and hence no conflict.
-			 */
+		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi))
 			return iova;
-		}
 
 		if (pfn < iova->pfn_lo)
 			node = node->rb_left;
@@ -398,9 +380,33 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
 			node = node->rb_right;
 	}
 
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 	return NULL;
 }
+
+/**
+ * find_iova - find's an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct iova *iova;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = __find_iova(iovad, pfn);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	/* We are not holding the lock while this iova
+	 * is referenced by the caller as the same thread
+	 * which called this function also calls __free_iova()
+	 * and it is by design that only one thread can possibly
+	 * reference a particular iova and hence no conflict.
+	 */
+	return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -432,11 +438,17 @@ EXPORT_SYMBOL_GPL(__free_iova);
 void
 free_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	struct iova *iova = find_iova(iovad, pfn);
-
-	if (iova)
-		__free_iova(iovad, iova);
+	unsigned long flags;
+	struct iova *iova;
 
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = __find_iova(iovad, pfn);
+	if (iova) {
+		__cached_rbnode_delete_update(iovad, iova);
+		rb_erase(&iova->node, &iovad->rbroot);
+		free_iova_mem(iova);
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
-- 
2.4.6


* [PATCH V2 3/4] intel-iommu: remove find_iova in unmap path
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
  2016-01-09  0:44   ` [PATCH V2 1/4] IOMMU: add a percpu cache for " Shaohua Li
  2016-01-09  0:44   ` [PATCH V2 2/4] iommu: free_iova doesn't need lock twice Shaohua Li
@ 2016-01-09  0:44   ` Shaohua Li
  2016-01-09  0:44   ` [PATCH V2 4/4] intel-iommu: do batch iova free Shaohua Li
  2016-01-10 22:56   ` [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation Adam Morrison
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2016-01-09  0:44 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

The unmap path does find_iova followed by __free_iova, which takes the lock
twice. The find_iova isn't really required, so we can avoid that lock
overhead. The next patch also depends on this to reduce locking further.

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/intel-iommu.c | 62 ++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index ac73876..b06a901 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -81,6 +81,7 @@
 #define IOVA_START_PFN		(1)
 
 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
+#define IOVA_PFN_ROUNDUP(addr)	(((addr) + PAGE_SIZE - 1) >> PAGE_SHIFT)
 #define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
 #define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
 
@@ -461,7 +462,8 @@ static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 #define HIGH_WATER_MARK 250
 struct deferred_flush_tables {
 	int next;
-	struct iova *iova[HIGH_WATER_MARK];
+	unsigned long iova_pfn[HIGH_WATER_MARK];
+	unsigned long pages[HIGH_WATER_MARK];
 	struct dmar_domain *domain[HIGH_WATER_MARK];
 	struct page *freelist[HIGH_WATER_MARK];
 };
@@ -3518,20 +3520,24 @@ static void flush_unmaps(void)
 					 DMA_TLB_GLOBAL_FLUSH);
 		for (j = 0; j < deferred_flush[i].next; j++) {
 			unsigned long mask;
-			struct iova *iova = deferred_flush[i].iova[j];
+			unsigned long iova_pfn = deferred_flush[i].iova_pfn[j];
+			size_t pages = deferred_flush[i].pages[j];
 			struct dmar_domain *domain = deferred_flush[i].domain[j];
+			struct iova_domain *iovad = &domain->iovad;
 
 			/* On real hardware multiple invalidations are expensive */
 			if (cap_caching_mode(iommu->cap))
 				iommu_flush_iotlb_psi(iommu, domain,
-					iova->pfn_lo, iova_size(iova),
+					mm_to_dma_pfn(iova_pfn),
+					mm_to_dma_pfn(pages),
 					!deferred_flush[i].freelist[j], 0);
 			else {
-				mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
-				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+				mask = ilog2(mm_to_dma_pfn(pages));
+				iommu_flush_dev_iotlb(domain,
+					(uint64_t)iova_pfn << PAGE_SHIFT, mask);
 			}
-			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+
+			free_iova(iovad, iova_pfn);
 			if (deferred_flush[i].freelist[j])
 				dma_free_pagelist(deferred_flush[i].freelist[j]);
 		}
@@ -3550,7 +3556,8 @@ static void flush_unmaps_timeout(unsigned long data)
 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
+static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
+	size_t pages, struct page *freelist)
 {
 	unsigned long flags;
 	int next, iommu_id;
@@ -3565,7 +3572,8 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *f
 
 	next = deferred_flush[iommu_id].next;
 	deferred_flush[iommu_id].domain[next] = dom;
-	deferred_flush[iommu_id].iova[next] = iova;
+	deferred_flush[iommu_id].iova_pfn[next] = iova_pfn;
+	deferred_flush[iommu_id].pages[next] = pages;
 	deferred_flush[iommu_id].freelist[next] = freelist;
 	deferred_flush[iommu_id].next++;
 
@@ -3577,11 +3585,10 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *f
 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
+static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t pages)
 {
 	struct dmar_domain *domain;
 	unsigned long start_pfn, last_pfn;
-	struct iova *iova;
 	struct intel_iommu *iommu;
 	struct page *freelist;
 
@@ -3592,14 +3599,11 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 	BUG_ON(!domain);
 
 	iommu = domain_get_iommu(domain);
+	/* intel_alloc_iova does the roundup */
+	pages = __roundup_pow_of_two(pages);
 
-	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
-		      (unsigned long long)dev_addr))
-		return;
-
-	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+	start_pfn = mm_to_dma_pfn(IOVA_PFN(dev_addr));
+	last_pfn = mm_to_dma_pfn(IOVA_PFN(dev_addr) + pages) - 1;
 
 	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
 		 dev_name(dev), start_pfn, last_pfn);
@@ -3610,10 +3614,10 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
 				      last_pfn - start_pfn + 1, !freelist, 0);
 		/* free iova */
-		__free_iova(&domain->iovad, iova);
+		free_iova(&domain->iovad, IOVA_PFN(dev_addr));
 		dma_free_pagelist(freelist);
 	} else {
-		add_unmap(domain, iova, freelist);
+		add_unmap(domain, IOVA_PFN(dev_addr), pages, freelist);
 		/*
 		 * queue up the release of the unmap to save the 1/6th of the
 		 * cpu used up by the iotlb flush operation...
@@ -3625,7 +3629,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
 			     size_t size, enum dma_data_direction dir,
 			     struct dma_attrs *attrs)
 {
-	intel_unmap(dev, dev_addr);
+	intel_unmap(dev, dev_addr, IOVA_PFN_ROUNDUP(dev_addr + size) -
+		IOVA_PFN(dev_addr));
 }
 
 static void *intel_alloc_coherent(struct device *dev, size_t size,
@@ -3684,7 +3689,7 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
 	size = PAGE_ALIGN(size);
 	order = get_order(size);
 
-	intel_unmap(dev, dma_handle);
+	intel_unmap(dev, dma_handle, size >> PAGE_SHIFT);
 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
 		__free_pages(page, order);
 }
@@ -3693,7 +3698,18 @@ static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			   int nelems, enum dma_data_direction dir,
 			   struct dma_attrs *attrs)
 {
-	intel_unmap(dev, sglist[0].dma_address);
+	struct scatterlist *sg;
+	size_t size = 0;
+	int i;
+
+	for_each_sg(sglist, sg, nelems, i) {
+		dma_addr_t dma_addr = sg_dma_address(sg);
+		unsigned int dma_len = sg_dma_len(sg);
+		size += IOVA_PFN_ROUNDUP(dma_addr + dma_len) -
+			IOVA_PFN(dma_addr);
+	}
+
+	intel_unmap(dev, sglist[0].dma_address, size);
 }
 
 static int intel_nontranslate_map_sg(struct device *hddev,
-- 
2.4.6


* [PATCH V2 4/4] intel-iommu: do batch iova free
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
                     ` (2 preceding siblings ...)
  2016-01-09  0:44   ` [PATCH V2 3/4] intel-iommu: remove find_iova in unmap path Shaohua Li
@ 2016-01-09  0:44   ` Shaohua Li
  2016-01-10 22:56   ` [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation Adam Morrison
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2016-01-09  0:44 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

iova frees can easily be batched, which removes the lock contention in the
unmap path.

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/intel-iommu.c | 17 ++++++++++++++++-
 drivers/iommu/iova.c        | 27 +++++++++++++++++++++++++++
 include/linux/iova.h        |  1 +
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index b06a901..c1e0702 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3499,9 +3499,13 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
 				  dir, *dev->dma_mask);
 }
 
+#define MAX_FREE_IOVA 16
 static void flush_unmaps(void)
 {
 	int i, j;
+	unsigned long free_iovas[MAX_FREE_IOVA];
+	int iova_cnt;
+	struct iova_domain *free_iovad;
 
 	timer_on = 0;
 
@@ -3518,6 +3522,8 @@ static void flush_unmaps(void)
 		if (!cap_caching_mode(iommu->cap))
 			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 					 DMA_TLB_GLOBAL_FLUSH);
+		iova_cnt = 0;
+		free_iovad = NULL;
 		for (j = 0; j < deferred_flush[i].next; j++) {
 			unsigned long mask;
 			unsigned long iova_pfn = deferred_flush[i].iova_pfn[j];
@@ -3537,10 +3543,19 @@ static void flush_unmaps(void)
 					(uint64_t)iova_pfn << PAGE_SHIFT, mask);
 			}
 
-			free_iova(iovad, iova_pfn);
+			if (free_iovad && (free_iovad != iovad ||
+					   iova_cnt >= MAX_FREE_IOVA)) {
+				free_iova_array(free_iovad, free_iovas, iova_cnt);
+				iova_cnt = 0;
+			}
+
+			free_iovad = iovad;
+			free_iovas[iova_cnt++] = iova_pfn;
 			if (deferred_flush[i].freelist[j])
 				dma_free_pagelist(deferred_flush[i].freelist[j]);
 		}
+		if (iova_cnt)
+			free_iova_array(free_iovad, free_iovas, iova_cnt);
 		deferred_flush[i].next = 0;
 	}
 
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index bb8a328..a4959e2 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -453,6 +453,33 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 EXPORT_SYMBOL_GPL(free_iova);
 
 /**
+ * free_iova_array - finds and frees iovas for pfn array
+ * @iovad: - iova domain in question.
+ * @pfns: - pfn array
+ * @size: - array size
+ * This function frees the iovas of the given pfn array
+ */
+void
+free_iova_array(struct iova_domain *iovad, unsigned long *pfns, int size)
+{
+	unsigned long flags;
+	struct iova *iova;
+	int i;
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	for (i = 0; i < size; i++) {
+		iova = __find_iova(iovad, pfns[i]);
+		if (iova) {
+			__cached_rbnode_delete_update(iovad, iova);
+			rb_erase(&iova->node, &iovad->rbroot);
+			free_iova_mem(iova);
+		}
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+}
+EXPORT_SYMBOL_GPL(free_iova_array);
+
+/**
  * put_iova_domain - destroys the iova doamin
  * @iovad: - iova domain in question.
  * All the iova's in that domain are destroyed.
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 1386a7b..5a11304 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -83,6 +83,7 @@ void iova_cache_put(void);
 struct iova *alloc_iova_mem(void);
 void free_iova_mem(struct iova *iova);
 void free_iova(struct iova_domain *iovad, unsigned long pfn);
+void free_iova_array(struct iova_domain *iovad, unsigned long *pfns, int size);
 void __free_iova(struct iova_domain *iovad, struct iova *iova);
 struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
-- 
2.4.6


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found] ` <cover.1452297604.git.shli-b10kYP2dOMg@public.gmane.org>
                     ` (3 preceding siblings ...)
  2016-01-09  0:44   ` [PATCH V2 4/4] intel-iommu: do batch iova free Shaohua Li
@ 2016-01-10 22:56   ` Adam Morrison
       [not found]     ` <20160110225610.GA16778-FrESSTt7Abv7r6psnUbsSmZHpeb/A1Y/@public.gmane.org>
  4 siblings, 1 reply; 14+ messages in thread
From: Adam Morrison @ 2016-01-10 22:56 UTC (permalink / raw)
  To: Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Kernel-team-b10kYP2dOMg

Hi,

> iova alloc/free causes big lock contention, which could be easily demonstrated
> with iperf workload. Previously I posted a patchset:
> http://lists.linuxfoundation.org/pipermail/iommu/2015-November/014984.html
> 
> the concern is it's not generic. This is another try for the issue. This
> version implements a per-cpu iova cache for small size DMA allocation (<= 64k),
> which should be generic enough and so we can do batch allocation. iova free
> could be easily batched too. With batch alloc/free, iova lock contention
> disappears. Performance result with this patchset is nearly the same as the
> previous one in the same test.
> 
> After this patchset, async_umap_flush_lock becomes the hotest lock in
> intel-iommu, but not very bad. That will be something we need work on in the
> future.

There can still be significant spinlock contention with this patchset.
For example, here's the throughput obtained when accessing 16 memcached
instances running on a 16-core Sandy Bridge with an Intel XL710 NIC.
The client machine has iommu=off and runs memslap with the default
config (64-byte keys, 1024-byte values, and 10%/90% SET/GET ops):

  stock (4.4.0-rc5) iommu=off:
   1,088,996 memcached transactions/sec (=100%, median of 10 runs).

  this patch, iommu=on:
   313,161 memcached transactions/sec (=29%).
   perf: 21.87%    0.57%  memcached       [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
                    |
                    ---_raw_spin_lock_irqsave
                       |
                       |--67.84%-- intel_unmap  (async_umap_flush_lock)
                       |--17.54%-- alloc_iova
                       |--12.85%-- free_iova_array

For reference, the patchset I posted two weeks ago gets almost the same
throughput as with iommu=off:

  http://lists.linuxfoundation.org/pipermail/iommu/2015-December/015271.html:
   1,067,586 memcached transactions/sec (=98%).
   perf: 0.75%     0.75%  memcached       [kernel.kallsyms]      [k] _raw_spin_lock_irqsave]


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]     ` <20160110225610.GA16778-FrESSTt7Abv7r6psnUbsSmZHpeb/A1Y/@public.gmane.org>
@ 2016-01-11  3:37       ` Shaohua Li
       [not found]         ` <20160111033749.GA1811285-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2016-01-11  3:37 UTC (permalink / raw)
  To: Adam Morrison
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Kernel-team-b10kYP2dOMg

On Mon, Jan 11, 2016 at 12:56:12AM +0200, Adam Morrison wrote:
> Hi,
> 
> > iova alloc/free causes big lock contention, which could be easily demonstrated
> > with iperf workload. Previously I posted a patchset:
> > http://lists.linuxfoundation.org/pipermail/iommu/2015-November/014984.html
> > 
> > the concern is it's not generic. This is another try for the issue. This
> > version implements a per-cpu iova cache for small size DMA allocation (<= 64k),
> > which should be generic enough and so we can do batch allocation. iova free
> > could be easily batched too. With batch alloc/free, iova lock contention
> > disappears. Performance result with this patchset is nearly the same as the
> > previous one in the same test.
> > 
> > After this patchset, async_umap_flush_lock becomes the hotest lock in
> > intel-iommu, but not very bad. That will be something we need work on in the
> > future.
> 
> There can still be significant spinlock contention with this patchset.
> For example, here's the throughput obtained when accessing 16 memcached
> instances running on a 16-core Sandy Bridge with an Intel XL710 NIC.
> The client machine has iommu=off and runs memslap with the default
> config (64-byte keys, 1024-byte values, and 10%/90% SET/GET ops):
> 
>   stock (4.4.0-rc5) iommu=off:
>    1,088,996 memcached transactions/sec (=100%, median of 10 runs).
> 
>   this patch, iommu=on:
>    313,161 memcached transactions/sec (=29%).
>    perf: 21.87%    0.57%  memcached       [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>                     |
>                     ---_raw_spin_lock_irqsave
>                        |
>                        |--67.84%-- intel_unmap  (async_umap_flush_lock)
>                        |--17.54%-- alloc_iova
>                        |--12.85%-- free_iova_array

Yes, mine doesn't remove the async_umap_flush_lock contention yet. That
should be easy with per-cpu structures or atomic-based ring management.
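
A very rough sketch of the per-cpu direction, for illustration only: the fields
mirror the existing struct deferred_flush_tables in intel-iommu.c, but the
per-cpu queue itself and flush_unmaps_cpu() are hypothetical, and a real
version would still have to group entries by IOMMU at flush time.

/* Sketch only: give each CPU its own deferred-unmap queue so that
 * add_unmap() no longer serializes on the global async_umap_flush_lock.
 * flush_unmaps_cpu() is hypothetical; the flush timer would walk all CPUs.
 * Each queue's lock is initialized at boot and is only shared with the timer. */
struct deferred_flush_queue {
	spinlock_t lock;
	int next;
	unsigned long iova_pfn[HIGH_WATER_MARK];
	unsigned long pages[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
	struct page *freelist[HIGH_WATER_MARK];
};
static DEFINE_PER_CPU(struct deferred_flush_queue, deferred_queue);

static void add_unmap_percpu(struct dmar_domain *dom, unsigned long iova_pfn,
			     size_t pages, struct page *freelist)
{
	struct deferred_flush_queue *q = get_cpu_ptr(&deferred_queue);
	unsigned long flags;
	int next;

	spin_lock_irqsave(&q->lock, flags);
	if (q->next == HIGH_WATER_MARK)
		flush_unmaps_cpu(q);	/* hypothetical: drain only this queue */
	next = q->next++;
	q->domain[next] = dom;
	q->iova_pfn[next] = iova_pfn;
	q->pages[next] = pages;
	q->freelist[next] = freelist;
	spin_unlock_irqrestore(&q->lock, flags);
	put_cpu_ptr(&deferred_queue);
	/* timer arming elided */
}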
 
> For reference, the patchset I posted two weeks ago gets almost the same
> throughput as with iommu=off:
> 
>   http://lists.linuxfoundation.org/pipermail/iommu/2015-December/015271.html:
>    1,067,586 memcached transactions/sec (=98%).
>    perf: 0.75%     0.75%  memcached       [kernel.kallsyms]      [k] _raw_spin_lock_irqsave]

I didn't know you had already posted one. I took a rough look at the patches;
we are using exactly the same idea, and I'm happy to pursue your patches. At
first look, the per-cpu allocation in your patch doesn't check pfn_limit,
which could be wrong, but that should be easy to fix. I'll take a closer look
tomorrow.

Thanks,
Shaohua


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]         ` <20160111033749.GA1811285-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2016-01-20 12:21           ` Joerg Roedel
       [not found]             ` <20160120122103.GE18805-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Joerg Roedel @ 2016-01-20 12:21 UTC (permalink / raw)
  To: Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Adam Morrison,
	Kernel-team-b10kYP2dOMg

On Sun, Jan 10, 2016 at 07:37:59PM -0800, Shaohua Li wrote:
> I don't know you already posted one. Roughly looked at the patches. We
> are using exactly the same idea. I'm happy we pursue your patches. At
> the first look, the per-cpu allocation in your patch doesn't check
> pfn_limit, that could be wrong, but should be easy to fix. I'll take a
> close look tomorrow.

Sounds great. Your per-cpu approaches are pretty similar, so I would
appreciate it if you two could come up with a combined patch-set?


	Joerg


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]             ` <20160120122103.GE18805-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
@ 2016-01-20 18:10               ` Shaohua Li
       [not found]                 ` <20160120180956.GA230160-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2016-01-20 18:10 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Adam Morrison,
	Kernel-team-b10kYP2dOMg

On Wed, Jan 20, 2016 at 01:21:03PM +0100, Joerg Roedel wrote:
> On Sun, Jan 10, 2016 at 07:37:59PM -0800, Shaohua Li wrote:
> > I don't know you already posted one. Roughly looked at the patches. We
> > are using exactly the same idea. I'm happy we pursue your patches. At
> > the first look, the per-cpu allocation in your patch doesn't check
> > pfn_limit, that could be wrong, but should be easy to fix. I'll take a
> > close look tomorrow.
> 
> Sounds great. Your approaches with the per-cpu are pretty similar, so I
> would appreciate if you two come up with a combined patch-set?

I'll do this if Adam doesn't object.

Thanks,
Shaohua


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]                 ` <20160120180956.GA230160-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2016-01-20 20:47                   ` Adam Morrison
       [not found]                     ` <CAHMfzJmAUWT8X4492_84smUgQMW9pW1yNfOxC6e7ur9TitA2cA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Adam Morrison @ 2016-01-20 20:47 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Kernel-team-b10kYP2dOMg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Wed, Jan 20, 2016 at 8:10 PM, Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
> On Wed, Jan 20, 2016 at 01:21:03PM +0100, Joerg Roedel wrote:
>> On Sun, Jan 10, 2016 at 07:37:59PM -0800, Shaohua Li wrote:
>> > I don't know you already posted one. Roughly looked at the patches. We
>> > are using exactly the same idea. I'm happy we pursue your patches. At
>> > the first look, the per-cpu allocation in your patch doesn't check
>> > pfn_limit, that could be wrong, but should be easy to fix. I'll take a
>> > close look tomorrow.
>>
>> Sounds great. Your approaches with the per-cpu are pretty similar, so I
>> would appreciate if you two come up with a combined patch-set?
>
> I'll do this if Adam doesn't object.

My understanding from the above is that the only issue with our
patchset was not dealing with pfn_limit.  I can just fix that and
repost, sounds good?


Thanks,
Adam


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]                     ` <CAHMfzJmAUWT8X4492_84smUgQMW9pW1yNfOxC6e7ur9TitA2cA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2016-01-20 21:14                       ` Shaohua Li
       [not found]                         ` <20160120211421.GA684235-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2016-01-20 21:14 UTC (permalink / raw)
  To: Adam Morrison
  Cc: Kernel-team-b10kYP2dOMg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Wed, Jan 20, 2016 at 10:47:45PM +0200, Adam Morrison wrote:
> On Wed, Jan 20, 2016 at 8:10 PM, Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:
> > On Wed, Jan 20, 2016 at 01:21:03PM +0100, Joerg Roedel wrote:
> >> On Sun, Jan 10, 2016 at 07:37:59PM -0800, Shaohua Li wrote:
> >> > I don't know you already posted one. Roughly looked at the patches. We
> >> > are using exactly the same idea. I'm happy we pursue your patches. At
> >> > the first look, the per-cpu allocation in your patch doesn't check
> >> > pfn_limit, that could be wrong, but should be easy to fix. I'll take a
> >> > close look tomorrow.
> >>
> >> Sounds great. Your approaches with the per-cpu are pretty similar, so I
> >> would appreciate if you two come up with a combined patch-set?
> >
> > I'll do this if Adam doesn't object.
> 
> My understanding from the above is that the only issue with our
> patchset was not dealing with pfn_limit.  I can just fix that and
> repost, sounds good?

Sure, please do it. Regarding the patches, I'm not comfortable with the
per-cpu deferred invalidation. One important benefit of the IOMMU is
isolation. Deferred invalidation already loosens that isolation, and per-cpu
invalidation loosens it further. It would be better if we flushed all per-cpu
invalidation entries once a cpu hits its per-cpu limit. You should also look
at CPU hotplug: we don't want to lose invalidation entries when a cpu is
hot-removed.
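
For the hotplug part, a rough sketch of what would be needed with the
old-style hotcpu notifier; iova_cpu_dead() here is a hypothetical drain
helper, not code from either patchset:

/* Sketch only: drain a dead CPU's deferred invalidations / cached iovas
 * so nothing is lost on hot-remove.  iova_cpu_dead() is made up; it would
 * flush the CPU's pending invalidations and return cached entries to the
 * global pool. */
static int iova_cpu_notify(struct notifier_block *nb, unsigned long action,
			   void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	if ((action & ~CPU_TASKS_FROZEN) == CPU_DEAD)
		iova_cpu_dead(cpu);
	return NOTIFY_OK;
}

static struct notifier_block iova_cpu_nb = {
	.notifier_call = iova_cpu_notify,
};

/* registered once at init time, e.g.:
 *	register_hotcpu_notifier(&iova_cpu_nb);
 */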

The per-cpu iova implementation looks unnecessarily complicated. I know you
are referring to the paper, but the whole point is batched allocation and
free.

Please cc me if you post the patches again; I don't subscribe to the iommu
list.

Thanks,
Shaohua


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]                         ` <20160120211421.GA684235-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2016-01-21 23:14                           ` Adam Morrison
       [not found]                             ` <20160121231326.GA16905-FrESSTt7Abv7r6psnUbsSmZHpeb/A1Y/@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Adam Morrison @ 2016-01-21 23:14 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Kernel-team-b10kYP2dOMg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Wed, Jan 20, 2016 at 01:14:35PM -0800, Shaohua Li wrote:

> > My understanding from the above is that the only issue with our
> > patchset was not dealing with pfn_limit.  I can just fix that and
> > repost, sounds good?
> 
> Sure, please do it. For the patches, I'm not comformatable about the
> per-cpu deferred invalidation. One important benefit of IOMMU is
> isolation. Deferred invalidation already loose the isolation, per-cpu
> invalidation loose further. It would be better we can flush all per-cpu
> invalidation entries if one cpu hits its per-cpu limit. Also you'd
> better look at CPU hotplug. We don't want to lose the invalidation
> entries if one cpu is hot removed.

I'll look into these.

> The per-cpu iova implementation looks unnecessary complicated. I know
> you are referring the paper, but the whole point is batch
> allocation/free.

Batched allocation/free isn't enough.  It still creates spinlock
contention, even if there is per-cpu invalidation (that gets rid of
async_umap_flush_lock).  Here are sample results from our memcached
test (throughput of querying 16 memcached instances on a 16-core box
with an Intel XL710 NIC):

      batched alloc/free, iommu=on:
      313,161 memcached transactions/sec (= 29% of iommu=off)

      batched alloc/free + per-cpu invalidations, iommu=on:
      434,590 memcached transactions/sec (= 40% of iommu=off)

      perf report:
      61.15%     0.33%  swapper         [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
                     |
                     ---_raw_spin_lock_irqsave
                        |
                        |--87.81%-- free_iova_array
                        |--11.71%-- alloc_iova

In contrast, the per-cpu magazine cache in our patchset enables iova
allocation/free to complete without accessing the iova allocator at
all.  So we don't touch the rbtree spinlock, and also complete iova
allocation in constant time, which avoids the linear-time allocations
that the iova allocator suffers from.  (These were described in the
paper "Efficient intra-operating system protection against harmful
DMAs", presented at the USENIX FAST 2015 conference.)  The end result:

      magazines cache + per-cpu invalidations, iommu=on:
      1,067,586 memcached transactions/sec (= 98% of iommu=off)


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]                             ` <20160121231326.GA16905-FrESSTt7Abv7r6psnUbsSmZHpeb/A1Y/@public.gmane.org>
@ 2016-01-22  0:18                               ` Shaohua Li
       [not found]                                 ` <20160122001801.GA4114061-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2016-01-22  0:18 UTC (permalink / raw)
  To: Adam Morrison
  Cc: Kernel-team-b10kYP2dOMg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Fri, Jan 22, 2016 at 01:14:56AM +0200, Adam Morrison wrote:
> On Wed, Jan 20, 2016 at 01:14:35PM -0800, Shaohua Li wrote:
> 
> > > My understanding from the above is that the only issue with our
> > > patchset was not dealing with pfn_limit.  I can just fix that and
> > > repost, sounds good?
> > 
> > Sure, please do it. For the patches, I'm not comformatable about the
> > per-cpu deferred invalidation. One important benefit of IOMMU is
> > isolation. Deferred invalidation already loose the isolation, per-cpu
> > invalidation loose further. It would be better we can flush all per-cpu
> > invalidation entries if one cpu hits its per-cpu limit. Also you'd
> > better look at CPU hotplug. We don't want to lose the invalidation
> > entries if one cpu is hot removed.
> 
> I'll look into these.
> 
> > The per-cpu iova implementation looks unnecessary complicated. I know
> > you are referring the paper, but the whole point is batch
> > allocation/free.
> 
> Batched allocation/free isn't enough.  It still creates spinlock
> contention, even if there is per-cpu invalidation (that gets rid of
> async_umap_flush_lock).  Here are sample results from our memcached
> test (throughput of querying 16 memcached instances on a 16-core box
> with an Intel XL710 NIC):
> 
>       batched alloc/free, iommu=on:
>       313,161 memcached transactions/sec (= 29% of iommu=off)
> 
>       batched alloc/free + per-cpu invalidations, iommu=on:
>       434,590 memcached transactions/sec (= 40% of iommu=off)
> 
>       perf report:
>       61.15%     0.33%  swapper         [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>                      |
>                      ---_raw_spin_lock_irqsave
>                         |
>                         |--87.81%-- free_iova_array
>                         |--11.71%-- alloc_iova
> 
> In contrast, the per-cpu magazine cache in our patchset enables iova
> allocation/free to complete without accessing the iova allocator at
> all.  So we don't touch the rbtree spinlock, and also complete iova
> allocation in constant time, which avoids the linear-time allocations
> that the iova allocator suffers from.  (These were described in the
> paper "Efficient intra-operating system protection against harmful
> DMAs", presented at the USENIX FAST 2015 conference.)  The end result:
> 
>       magazines cache + per-cpu invalidations, iommu=on:
>       1,067,586 memcached transactions/sec (= 98% of iommu=off)

Yes, I know a typical per-cpu cache algorithm frees entries to the per-cpu
cache first. I didn't do it because it wasn't worthwhile in my test, though we
definitely should do it if it is. My point is that we don't need to implement
the per-cpu cache in such a complicated way. It could simply be (see the
sketch after this outline):

alloc:
 - refill per-cpu cache if it's empty
 - alloc from per-cpu cache

free:
 - free to per-cpu cache
 - if per-cpu cache hits threshold, free to global
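
As a minimal sketch of the free side of that outline (the alloc side is
essentially alloc_percpu_iova() from patch 1): the nr_cached[] counter in
struct iova_cache, CACHE_SPILL_THRESHOLD and spill_percpu_iova() are
hypothetical additions for illustration only.

/* Sketch only, not part of the posted patches.  Cached entries stay
 * reserved in the rbtree, exactly like patch 1's pre-filled cache, so
 * freeing into the cache needs no lock; spilling back to global does. */
static void cached_free_iova(struct iova_domain *iovad, struct iova *iova)
{
	unsigned long size = iova_size(iova);
	struct iova_cache *cache;
	int index;

	/* Only cache-sized, 32-bit-limited allocations go back into the cache. */
	if (size > MAX_CACHE_SIZE || iova->pfn_hi > iovad->dma_32bit_pfn) {
		__free_iova(iovad, iova);
		return;
	}
	index = order_base_2(size);

	preempt_disable();
	cache = this_cpu_ptr(iovad->percpu_cache);
	list_add(&iova->sibling, &cache->cache_lists[index]);
	if (++cache->nr_cached[index] >= CACHE_SPILL_THRESHOLD)
		spill_percpu_iova(iovad, cache, index);	/* rb_erase extras under the rbtree lock */
	preempt_enable();
}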

And please don't cache too much; DMA address space below 4G is still
precious.

Thanks,
Shaohua


* Re: [PATCH V2 0/4]IOMMU: avoid lock contention in iova allocation
       [not found]                                 ` <20160122001801.GA4114061-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2016-01-22 11:35                                   ` Adam Morrison
  0 siblings, 0 replies; 14+ messages in thread
From: Adam Morrison @ 2016-01-22 11:35 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Kernel-team-b10kYP2dOMg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA

On Fri, Jan 22, 2016 at 2:18 AM, Shaohua Li <shli-b10kYP2dOMg@public.gmane.org> wrote:

> Yes, I know a typical per-cpu cache algorithm will free entry to the
> per-cpu cache first. I didn't do it because it's not worthy in my test.
> We definitely should do it if it's worthy though. My point is we don't
> need implement the per-cpu that complicated. It could be simply:
>
> alloc:
>  - refill per-cpu cache if it's empty
>  - alloc from per-cpu cache
>
> free
>  - free to per-cpu cache
>  - if per-cpu cache hits threshold, free to global

It seems that in a workload where a cpu does an alloc followed by a free,
this will access the global allocator every time, which makes the cache
useless.  The magazines algorithm we use solves these thrashing problems;
that's why it's slightly more complicated.  See Section 3 in
https://www.usenix.org/legacy/event/usenix01/bonwick.html for details.
for details.

My point is that per-cpu caching of a global allocator is a well-known
problem with standard solutions, which our patch adopts.  We could
keep iterating here, but to solve all the issues that magazines solve,
we'll probably end up with similar complexity.

