* [PATCH 0/4] iommu: remove lock contention in iova allocation
@ 2015-11-24 21:54 Shaohua Li
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 21:54 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA; +Cc: Kernel-team-b10kYP2dOMg

Hi,

The lock contention in iova allocation is very significant. In my iperf test
with an Intel 10Gbps NIC and 48 threads, I observed that 99% of the cpu time
is spent on the lock contention. In the test:

CPU utilization 100%, iperf reports ~1Gbps

The patches introduce a bitmap based approach for dma address allocation. It
completely avoids the lock contention. Running the same test:

CPU utilization 1%, iperf reports 9.3Gbps

I also tried the test with the patches applied but the bitmap disabled. The
result is the same as without the patches.

The patches only handle DMA sizes up to 8k, which is the most common case and
the one with the highest request (DMA) rate. Bigger DMA sizes still fall back
to the rbtree based allocation. But if required, and once DAC is enabled by
default, it's easy to support bigger DMA sizes in the bitmap allocator.
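
A rough sketch of the resulting allocation flow (illustrative only; field and
macro names are taken from the patches below, and rbtree_alloc_pfn() is a
made-up stand-in for the existing rbtree path):

	/* Fast path for small DMA: grab one bit (tag) from a percpu pool and
	 * turn it into a pfn inside the pre-reserved bitmap region.
	 */
	unsigned long alloc_iova_sketch(struct iova_domain *iovad,
					unsigned long size,
					unsigned long limit_pfn,
					bool size_aligned)
	{
		if (size <= IOVA_BITMAP_UNIT && !iovad->disable_bitmap) {
			int tag = percpu_ida_alloc(&iovad->bitmap, TASK_RUNNING);

			if (tag >= 0)	/* per-cpu path, no shared rbtree lock */
				return iovad->bitmap_iova->pfn_lo +
				       tag * IOVA_BITMAP_UNIT;
			/* no free tag: fall through to the rbtree allocator */
		}
		return rbtree_alloc_pfn(iovad, size, limit_pfn, size_aligned);
	}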

Thanks,
Shaohua


Shaohua Li (4):
  iommu: alloc_iova returns a pfn
  iommu: add a bitmap based dma address allocator
  iommu: enable bitmap allocation for intel iommu
  iommu: free_iova doesn't need lock twice

 drivers/iommu/dma-iommu.c   |  32 ++++-----
 drivers/iommu/intel-iommu.c |  96 ++++++++++++++++-----------
 drivers/iommu/iova.c        | 157 +++++++++++++++++++++++++++++++++-----------
 include/linux/iova.h        |  21 ++++--
 4 files changed, 210 insertions(+), 96 deletions(-)

-- 
2.4.6

* [PATCH 1/4] iommu: alloc_iova returns a pfn
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
@ 2015-11-24 21:54   ` Shaohua Li
  2015-11-24 21:54   ` [PATCH 2/4] iommu: add a bitmap based dma address allocator Shaohua Li
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 21:54 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

Make alloc_iova() return a pfn instead of a struct iova pointer. This is a
preparation patch.
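
In short, callers stop dealing with struct iova pointers and work with pfns.
A rough before/after, condensed from the hunks below:

	/*
	 * before:
	 *	struct iova *iova = alloc_iova(iovad, size, limit_pfn, true);
	 *	dma_addr = iova_dma_addr(iovad, iova);
	 *	__free_iova(iovad, iova);
	 *
	 * after:
	 *	unsigned long pfn = alloc_iova(iovad, size, limit_pfn, true);
	 *	dma_addr = iova_dma_addr(iovad, pfn);
	 *	free_iova(iovad, pfn);
	 */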

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/dma-iommu.c   | 30 ++++++++++----------
 drivers/iommu/intel-iommu.c | 67 ++++++++++++++++++++++++++-------------------
 drivers/iommu/iova.c        | 11 ++++----
 include/linux/iova.h        |  6 ++--
 4 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3a20db4..a86291e 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -148,7 +148,7 @@ int dma_direction_to_prot(enum dma_data_direction dir, bool coherent)
 	}
 }
 
-static struct iova *__alloc_iova(struct iova_domain *iovad, size_t size,
+static unsigned long __alloc_iova(struct iova_domain *iovad, size_t size,
 		dma_addr_t dma_limit)
 {
 	unsigned long shift = iova_shift(iovad);
@@ -279,7 +279,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size,
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iova_domain *iovad = domain->iova_cookie;
-	struct iova *iova;
+	unsigned long iova_pfn;
 	struct page **pages;
 	struct sg_table sgt;
 	dma_addr_t dma_addr;
@@ -291,8 +291,8 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size,
 	if (!pages)
 		return NULL;
 
-	iova = __alloc_iova(iovad, size, dev->coherent_dma_mask);
-	if (!iova)
+	iova_pfn = __alloc_iova(iovad, size, dev->coherent_dma_mask);
+	if (!iova_pfn)
 		goto out_free_pages;
 
 	size = iova_align(iovad, size);
@@ -311,7 +311,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size,
 		sg_miter_stop(&miter);
 	}
 
-	dma_addr = iova_dma_addr(iovad, iova);
+	dma_addr = iova_dma_addr(iovad, iova_pfn);
 	if (iommu_map_sg(domain, dma_addr, sgt.sgl, sgt.orig_nents, prot)
 			< size)
 		goto out_free_sg;
@@ -323,7 +323,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t size,
 out_free_sg:
 	sg_free_table(&sgt);
 out_free_iova:
-	__free_iova(iovad, iova);
+	free_iova(iovad, iova_pfn);
 out_free_pages:
 	__iommu_dma_free_pages(pages, count);
 	return NULL;
@@ -363,14 +363,14 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 	phys_addr_t phys = page_to_phys(page) + offset;
 	size_t iova_off = iova_offset(iovad, phys);
 	size_t len = iova_align(iovad, size + iova_off);
-	struct iova *iova = __alloc_iova(iovad, len, dma_get_mask(dev));
+	unsigned long iova_pfn = __alloc_iova(iovad, len, dma_get_mask(dev));
 
-	if (!iova)
+	if (!iova_pfn)
 		return DMA_ERROR_CODE;
 
-	dma_addr = iova_dma_addr(iovad, iova);
+	dma_addr = iova_dma_addr(iovad, iova_pfn);
 	if (iommu_map(domain, dma_addr, phys - iova_off, len, prot)) {
-		__free_iova(iovad, iova);
+		free_iova(iovad, iova_pfn);
 		return DMA_ERROR_CODE;
 	}
 	return dma_addr + iova_off;
@@ -437,7 +437,7 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 	struct iova_domain *iovad = domain->iova_cookie;
-	struct iova *iova;
+	unsigned long iova_pfn;
 	struct scatterlist *s, *prev = NULL;
 	dma_addr_t dma_addr;
 	size_t iova_len = 0;
@@ -477,22 +477,22 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		prev = s;
 	}
 
-	iova = __alloc_iova(iovad, iova_len, dma_get_mask(dev));
-	if (!iova)
+	iova_pfn = __alloc_iova(iovad, iova_len, dma_get_mask(dev));
+	if (!iova_pfn)
 		goto out_restore_sg;
 
 	/*
 	 * We'll leave any physical concatenation to the IOMMU driver's
 	 * implementation - it knows better than we do.
 	 */
-	dma_addr = iova_dma_addr(iovad, iova);
+	dma_addr = iova_dma_addr(iovad, iova_pfn);
 	if (iommu_map_sg(domain, dma_addr, sg, nents, prot) < iova_len)
 		goto out_free_iova;
 
 	return __finalise_sg(dev, sg, nents, dma_addr);
 
 out_free_iova:
-	__free_iova(iovad, iova);
+	free_iova(iovad, iova_pfn);
 out_restore_sg:
 	__invalidate_sg(sg, nents);
 	return 0;
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 24aa0b3..5c57b9a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -462,6 +462,8 @@ static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 struct deferred_flush_tables {
 	int next;
 	struct iova *iova[HIGH_WATER_MARK];
+	unsigned long iova_pfn[HIGH_WATER_MARK];
+	unsigned long iova_size[HIGH_WATER_MARK];
 	struct dmar_domain *domain[HIGH_WATER_MARK];
 	struct page *freelist[HIGH_WATER_MARK];
 };
@@ -3303,11 +3305,11 @@ static int __init init_dmars(void)
 }
 
 /* This takes a number of _MM_ pages, not VTD pages */
-static struct iova *intel_alloc_iova(struct device *dev,
+static unsigned long intel_alloc_iova(struct device *dev,
 				     struct dmar_domain *domain,
 				     unsigned long nrpages, uint64_t dma_mask)
 {
-	struct iova *iova = NULL;
+	unsigned long iova_pfn = 0;
 
 	/* Restrict dma_mask to the width that the iommu can handle */
 	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
@@ -3320,19 +3322,19 @@ static struct iova *intel_alloc_iova(struct device *dev,
 		 * DMA_BIT_MASK(32) and if that fails then try allocating
 		 * from higher range
 		 */
-		iova = alloc_iova(&domain->iovad, nrpages,
+		iova_pfn = alloc_iova(&domain->iovad, nrpages,
 				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
-		if (iova)
-			return iova;
+		if (iova_pfn)
+			return iova_pfn;
 	}
-	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
-	if (unlikely(!iova)) {
+	iova_pfn = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
+	if (unlikely(!iova_pfn)) {
 		pr_err("Allocating %ld-page iova for %s failed",
 		       nrpages, dev_name(dev));
-		return NULL;
+		return 0;
 	}
 
-	return iova;
+	return iova_pfn;
 }
 
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
@@ -3430,7 +3432,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 {
 	struct dmar_domain *domain;
 	phys_addr_t start_paddr;
-	struct iova *iova;
+	unsigned long iova_pfn;
 	int prot = 0;
 	int ret;
 	struct intel_iommu *iommu;
@@ -3448,8 +3450,8 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 	iommu = domain_get_iommu(domain);
 	size = aligned_nrpages(paddr, size);
 
-	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
-	if (!iova)
+	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
+	if (!iova_pfn)
 		goto error;
 
 	/*
@@ -3467,7 +3469,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 	 * might have two guest_addr mapping to the same host paddr, but this
 	 * is not a big problem
 	 */
-	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
+	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
 				 mm_to_dma_pfn(paddr_pfn), size, prot);
 	if (ret)
 		goto error;
@@ -3475,18 +3477,18 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 	/* it's a non-present to present mapping. Only flush if caching mode */
 	if (cap_caching_mode(iommu->cap))
 		iommu_flush_iotlb_psi(iommu, domain,
-				      mm_to_dma_pfn(iova->pfn_lo),
+				      mm_to_dma_pfn(iova_pfn),
 				      size, 0, 1);
 	else
 		iommu_flush_write_buffer(iommu);
 
-	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
+	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
 	start_paddr += paddr & ~PAGE_MASK;
 	return start_paddr;
 
 error:
-	if (iova)
-		__free_iova(&domain->iovad, iova);
+	if (iova_pfn)
+		free_iova(&domain->iovad, iova_pfn);
 	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
 		dev_name(dev), size, (unsigned long long)paddr, dir);
 	return 0;
@@ -3523,19 +3525,24 @@ static void flush_unmaps(void)
 		for (j = 0; j < deferred_flush[i].next; j++) {
 			unsigned long mask;
 			struct iova *iova = deferred_flush[i].iova[j];
+			unsigned long iova_pfn = deferred_flush[i].iova_pfn[j];
+			unsigned long iova_size = deferred_flush[i].iova_size[j];
 			struct dmar_domain *domain = deferred_flush[i].domain[j];
 
 			/* On real hardware multiple invalidations are expensive */
 			if (cap_caching_mode(iommu->cap))
 				iommu_flush_iotlb_psi(iommu, domain,
-					iova->pfn_lo, iova_size(iova),
+					iova_pfn, iova_size,
 					!deferred_flush[i].freelist[j], 0);
 			else {
-				mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
+				mask = ilog2(mm_to_dma_pfn(iova_size));
 				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
-						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
+						(uint64_t)iova_pfn << PAGE_SHIFT, mask);
 			}
-			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+			if (iova)
+				__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+			else
+				free_iova(&deferred_flush[i].domain[j]->iovad, iova_pfn);
 			if (deferred_flush[i].freelist[j])
 				dma_free_pagelist(deferred_flush[i].freelist[j]);
 		}
@@ -3554,7 +3561,8 @@ static void flush_unmaps_timeout(unsigned long data)
 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
+static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
+	unsigned long iova_size, struct page *freelist, struct iova *iova)
 {
 	unsigned long flags;
 	int next, iommu_id;
@@ -3570,6 +3578,8 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *f
 	next = deferred_flush[iommu_id].next;
 	deferred_flush[iommu_id].domain[next] = dom;
 	deferred_flush[iommu_id].iova[next] = iova;
+	deferred_flush[iommu_id].iova_pfn[next] = iova_pfn;
+	deferred_flush[iommu_id].iova_size[next] = iova_size;
 	deferred_flush[iommu_id].freelist[next] = freelist;
 	deferred_flush[iommu_id].next++;
 
@@ -3617,7 +3627,8 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 		__free_iova(&domain->iovad, iova);
 		dma_free_pagelist(freelist);
 	} else {
-		add_unmap(domain, iova, freelist);
+		add_unmap(domain, IOVA_PFN(dev_addr), last_pfn - start_pfn + 1,
+			freelist, iova);
 		/*
 		 * queue up the release of the unmap to save the 1/6th of the
 		 * cpu used up by the iotlb flush operation...
@@ -3721,7 +3732,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
 	struct dmar_domain *domain;
 	size_t size = 0;
 	int prot = 0;
-	struct iova *iova = NULL;
+	unsigned long iova_pfn;
 	int ret;
 	struct scatterlist *sg;
 	unsigned long start_vpfn;
@@ -3740,9 +3751,9 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
 	for_each_sg(sglist, sg, nelems, i)
 		size += aligned_nrpages(sg->offset, sg->length);
 
-	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
+	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
 				*dev->dma_mask);
-	if (!iova) {
+	if (!iova_pfn) {
 		sglist->dma_length = 0;
 		return 0;
 	}
@@ -3757,13 +3768,13 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
 		prot |= DMA_PTE_WRITE;
 
-	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
+	start_vpfn = mm_to_dma_pfn(iova_pfn);
 
 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
 	if (unlikely(ret)) {
 		dma_pte_free_pagetable(domain, start_vpfn,
 				       start_vpfn + size - 1);
-		__free_iova(&domain->iovad, iova);
+		free_iova(&domain->iovad, iova_pfn);
 		return 0;
 	}
 
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index fa0adef..2c5e197 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -36,6 +36,8 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->rbroot = RB_ROOT;
 	iovad->cached32_node = NULL;
 	iovad->granule = granule;
+	if (start_pfn == 0)
+		start_pfn = 1;
 	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
 }
@@ -267,7 +269,7 @@ EXPORT_SYMBOL_GPL(iova_cache_put);
  * flag is set then the allocated address iova->pfn_lo will be naturally
  * aligned on roundup_power_of_two(size).
  */
-struct iova *
+unsigned long
 alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
 	bool size_aligned)
@@ -277,17 +279,17 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 
 	new_iova = alloc_iova_mem();
 	if (!new_iova)
-		return NULL;
+		return 0;
 
 	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
 			new_iova, size_aligned);
 
 	if (ret) {
 		free_iova_mem(new_iova);
-		return NULL;
+		return 0;
 	}
 
-	return new_iova;
+	return new_iova->pfn_lo;
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
@@ -365,7 +367,6 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 
 	if (iova)
 		__free_iova(iovad, iova);
-
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 92f7177..cfe5ee9 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -58,9 +58,9 @@ static inline size_t iova_align(struct iova_domain *iovad, size_t size)
 	return ALIGN(size, iovad->granule);
 }
 
-static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova)
+static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, unsigned long iova_pfn)
 {
-	return (dma_addr_t)iova->pfn_lo << iova_shift(iovad);
+	return (dma_addr_t)iova_pfn << iova_shift(iovad);
 }
 
 static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
@@ -75,7 +75,7 @@ struct iova *alloc_iova_mem(void);
 void free_iova_mem(struct iova *iova);
 void free_iova(struct iova_domain *iovad, unsigned long pfn);
 void __free_iova(struct iova_domain *iovad, struct iova *iova);
-struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
+unsigned long alloc_iova(struct iova_domain *iovad, unsigned long size,
 	unsigned long limit_pfn,
 	bool size_aligned);
 struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
-- 
2.4.6

* [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
  2015-11-24 21:54   ` [PATCH 1/4] iommu: alloc_iova returns a pfn Shaohua Li
@ 2015-11-24 21:54   ` Shaohua Li
       [not found]     ` <4c0804c85404be81acfe81fcd402f1af484e9d5f.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
  2015-11-24 21:54   ` [PATCH 3/4] iommu: enable bitmap allocation for intel iommu Shaohua Li
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 21:54 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

The iovad rbtree spinlock contention is very significant. In a multithreaded
netperf workload with a 10Gb NIC, we see 100% cpu utilization (99% of cpu
time spent on the lock) and total throughput below 1Gbps.

This patch introduces a bitmap based allocator. We allocate a big chunk of
the DMA range and divide it into 8k chunks. Each bit in the bitmap represents
one 8k chunk. Any allocation of 8k or less gets a full 8k chunk, so DMA
address allocation becomes allocating one bit from the bitmap. We use a
percpu bitmap to speed up the bit allocation further.

With the bitmap allocation, the lock contention is completely avoided.
In the workload above, the throughput is around 9.28Gbps and cpu
utilization drops to 1%.

This only works for DMA sizes of 8k or less, but that is the case with the
heaviest lock contention. If DAC is enabled by default in the future, we can
allocate a bigger DMA range and a bigger chunk size.
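
A minimal sketch of the pfn <-> tag mapping this relies on (helper names here
are illustrative; the real code is in the hunks below):

	/* One large IOVA region (iovad->bitmap_iova) is reserved up front;
	 * each percpu_ida tag addresses one IOVA_BITMAP_UNIT-sized slot of it.
	 */
	static unsigned long bitmap_tag_to_pfn(struct iova_domain *iovad, int tag)
	{
		return iovad->bitmap_iova->pfn_lo + tag * IOVA_BITMAP_UNIT;
	}

	static void bitmap_free_pfn(struct iova_domain *iovad, unsigned long pfn)
	{
		int tag = (pfn - iovad->bitmap_iova->pfn_lo) >>
				IOVA_BITMAP_UNIT_LOG;

		percpu_ida_free(&iovad->bitmap, tag);
	}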

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/intel-iommu.c | 23 +++++++----
 drivers/iommu/iova.c        | 93 +++++++++++++++++++++++++++++++++++++++------
 include/linux/iova.h        | 13 +++++++
 3 files changed, 109 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 5c57b9a..6412297 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -3595,7 +3595,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 {
 	struct dmar_domain *domain;
 	unsigned long start_pfn, last_pfn;
-	struct iova *iova;
+	struct iova *iova = NULL;
 	struct intel_iommu *iommu;
 	struct page *freelist;
 
@@ -3607,13 +3607,17 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 
 	iommu = domain_get_iommu(domain);
 
-	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
-	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
+	if (iova_pfn_in_bitmap(&domain->iovad, IOVA_PFN(dev_addr))) {
+		start_pfn = IOVA_PFN(dev_addr);
+		last_pfn = start_pfn + IOVA_BITMAP_UNIT - 1;
+	} else {
+		iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
+		if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
 		      (unsigned long long)dev_addr))
-		return;
-
-	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
-	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+			return;
+		start_pfn = mm_to_dma_pfn(iova->pfn_lo);
+		last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
+	}
 
 	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
 		 dev_name(dev), start_pfn, last_pfn);
@@ -3624,7 +3628,10 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
 		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
 				      last_pfn - start_pfn + 1, !freelist, 0);
 		/* free iova */
-		__free_iova(&domain->iovad, iova);
+		if (iova)
+			__free_iova(&domain->iovad, iova);
+		else
+			free_iova(&domain->iovad, start_pfn);
 		dma_free_pagelist(freelist);
 	} else {
 		add_unmap(domain, IOVA_PFN(dev_addr), last_pfn - start_pfn + 1,
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 2c5e197..6d11caf 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -15,6 +15,7 @@
  * Place - Suite 330, Boston, MA 02111-1307 USA.
  *
  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
+ * Bitmap based allocation: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
  */
 
 #include <linux/iova.h>
@@ -40,6 +41,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 		start_pfn = 1;
 	iovad->start_pfn = start_pfn;
 	iovad->dma_32bit_pfn = pfn_32bit;
+	percpu_ida_init(&iovad->bitmap, IOVA_BITMAP_SIZE / IOVA_BITMAP_UNIT);
+	iovad->bitmap_iova = NULL;
+	iovad->disable_bitmap = true;
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
@@ -258,6 +262,54 @@ void iova_cache_put(void)
 }
 EXPORT_SYMBOL_GPL(iova_cache_put);
 
+static struct iova *__alloc_iova(struct iova_domain *iovad,
+	unsigned long size, unsigned long limit_pfn, bool size_aligned)
+{
+	struct iova *new_iova;
+	int ret;
+
+	new_iova = alloc_iova_mem();
+	if (!new_iova)
+		return NULL;
+
+	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
+			new_iova, size_aligned);
+
+	if (ret) {
+		free_iova_mem(new_iova);
+		return NULL;
+	}
+	return new_iova;
+}
+
+static int __init_iova_bitmap(struct iova_domain *iovad)
+{
+	struct iova *new_iova;
+	unsigned long flags;
+
+	new_iova = __alloc_iova(iovad, IOVA_BITMAP_SIZE,
+		iovad->dma_32bit_pfn, false);
+
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	if (!new_iova) {
+		if (!iovad->bitmap_iova)
+			iovad->disable_bitmap = true;
+		spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+		return iovad->bitmap_iova ? 0 : -ENOMEM;
+	}
+
+	if (!iovad->bitmap_iova)
+		iovad->bitmap_iova = new_iova;
+	else {
+		__cached_rbnode_delete_update(iovad, new_iova);
+		rb_erase(&new_iova->node, &iovad->rbroot);
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	if (iovad->bitmap_iova != new_iova)
+		free_iova_mem(new_iova);
+	return 0;
+}
+
 /**
  * alloc_iova - allocates an iova
  * @iovad: - iova domain in question
@@ -275,20 +327,23 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 	bool size_aligned)
 {
 	struct iova *new_iova;
-	int ret;
-
-	new_iova = alloc_iova_mem();
+	int tag;
+
+	if (size <= IOVA_BITMAP_UNIT && !iovad->disable_bitmap) {
+		if (!(iovad->bitmap_iova) && __init_iova_bitmap(iovad))
+			goto fallback;
+		if (limit_pfn < iovad->bitmap_iova->pfn_hi)
+			goto fallback;
+		tag = percpu_ida_alloc(&iovad->bitmap, TASK_RUNNING);
+		if (tag < 0)
+			goto fallback;
+		return iovad->bitmap_iova->pfn_lo + tag * IOVA_BITMAP_UNIT;
+	}
+fallback:
+	new_iova = __alloc_iova(iovad, size, limit_pfn, size_aligned);
 	if (!new_iova)
 		return 0;
 
-	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn,
-			new_iova, size_aligned);
-
-	if (ret) {
-		free_iova_mem(new_iova);
-		return 0;
-	}
-
 	return new_iova->pfn_lo;
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
@@ -345,6 +400,8 @@ __free_iova(struct iova_domain *iovad, struct iova *iova)
 {
 	unsigned long flags;
 
+	BUG_ON(iova_pfn_in_bitmap(iovad, iova->pfn_lo));
+
 	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	__cached_rbnode_delete_update(iovad, iova);
 	rb_erase(&iova->node, &iovad->rbroot);
@@ -363,8 +420,18 @@ EXPORT_SYMBOL_GPL(__free_iova);
 void
 free_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	struct iova *iova = find_iova(iovad, pfn);
+	struct iova *iova;
+
+	if (iova_pfn_in_bitmap(iovad, pfn)) {
+		int tag;
+
+		tag = (pfn - iovad->bitmap_iova->pfn_lo) >>
+			IOVA_BITMAP_UNIT_LOG;
+		percpu_ida_free(&iovad->bitmap, tag);
+		return;
+	}
 
+	iova = find_iova(iovad, pfn);
 	if (iova)
 		__free_iova(iovad, iova);
 }
@@ -381,6 +448,7 @@ void put_iova_domain(struct iova_domain *iovad)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iovad->bitmap_iova = NULL;
 	node = rb_first(&iovad->rbroot);
 	while (node) {
 		struct iova *iova = container_of(node, struct iova, node);
@@ -390,6 +458,7 @@ void put_iova_domain(struct iova_domain *iovad)
 		node = rb_first(&iovad->rbroot);
 	}
 	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	percpu_ida_destroy(&iovad->bitmap);
 }
 EXPORT_SYMBOL_GPL(put_iova_domain);
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index cfe5ee9..63a81ef 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h>
+#include <linux/percpu_ida.h>
 
 /* iova structure */
 struct iova {
@@ -23,6 +24,9 @@ struct iova {
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
 };
 
+#define IOVA_BITMAP_UNIT ((8 * 1024) >> PAGE_SHIFT)
+#define IOVA_BITMAP_UNIT_LOG (ilog2(IOVA_BITMAP_UNIT))
+#define IOVA_BITMAP_SIZE ((1L * 1024 * 1024 * 1024) >> PAGE_SHIFT)
 /* holds all the iova translations for a domain */
 struct iova_domain {
 	spinlock_t	iova_rbtree_lock; /* Lock to protect update of rbtree */
@@ -31,6 +35,9 @@ struct iova_domain {
 	unsigned long	granule;	/* pfn granularity for this domain */
 	unsigned long	start_pfn;	/* Lower limit for this domain */
 	unsigned long	dma_32bit_pfn;
+	struct percpu_ida bitmap;
+	struct iova	*bitmap_iova;
+	bool		disable_bitmap;
 };
 
 static inline unsigned long iova_size(struct iova *iova)
@@ -68,6 +75,12 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova)
 	return iova >> iova_shift(iovad);
 }
 
+static inline bool iova_pfn_in_bitmap(struct iova_domain *iovad, unsigned long pfn)
+{
+	return iovad->bitmap_iova && pfn >= iovad->bitmap_iova->pfn_lo &&
+	    pfn <= iovad->bitmap_iova->pfn_hi;
+}
+
 int iova_cache_get(void);
 void iova_cache_put(void);
 
-- 
2.4.6

* [PATCH 3/4] iommu: enable bitmap allocation for intel iommu
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
  2015-11-24 21:54   ` [PATCH 1/4] iommu: alloc_iova returns a pfn Shaohua Li
  2015-11-24 21:54   ` [PATCH 2/4] iommu: add a bitmap based dma address allocator Shaohua Li
@ 2015-11-24 21:54   ` Shaohua Li
  2015-11-24 21:54   ` [PATCH 4/4] iommu: free_iova doesn't need lock twice Shaohua Li
  2015-12-07 17:52   ` [PATCH 0/4] iommu: remove lock contention in iova allocation Shaohua Li
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 21:54 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

I can't test the bitmap allocation for dma-iommu, so just enable it for
intel iommu.

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/dma-iommu.c   | 2 +-
 drivers/iommu/intel-iommu.c | 6 +++---
 drivers/iommu/iova.c        | 4 ++--
 include/linux/iova.h        | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index a86291e..a42c2a8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -119,7 +119,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size
 		}
 		iovad->dma_32bit_pfn = end_pfn;
 	} else {
-		init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn);
+		init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn, true);
 	}
 	return 0;
 }
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6412297..d0ce3e4 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1808,7 +1808,7 @@ static int dmar_init_reserved_ranges(void)
 	int i;
 
 	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
-			DMA_32BIT_PFN);
+			DMA_32BIT_PFN, false);
 
 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
 		&reserved_rbtree_key);
@@ -1867,7 +1867,7 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
 	unsigned long sagaw;
 
 	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
-			DMA_32BIT_PFN);
+			DMA_32BIT_PFN, false);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
@@ -4749,7 +4749,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	int adjust_width;
 
 	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
-			DMA_32BIT_PFN);
+			DMA_32BIT_PFN, false);
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 6d11caf..0d71be7 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -24,7 +24,7 @@
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
-	unsigned long start_pfn, unsigned long pfn_32bit)
+	unsigned long start_pfn, unsigned long pfn_32bit, bool disable_bitmap)
 {
 	/*
 	 * IOVA granularity will normally be equal to the smallest
@@ -43,7 +43,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 	iovad->dma_32bit_pfn = pfn_32bit;
 	percpu_ida_init(&iovad->bitmap, IOVA_BITMAP_SIZE / IOVA_BITMAP_UNIT);
 	iovad->bitmap_iova = NULL;
-	iovad->disable_bitmap = true;
+	iovad->disable_bitmap = disable_bitmap;
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 63a81ef..457f93b 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -95,7 +95,7 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
 	unsigned long pfn_hi);
 void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
 void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
-	unsigned long start_pfn, unsigned long pfn_32bit);
+	unsigned long start_pfn, unsigned long pfn_32bit, bool disable_bitmap);
 struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
 void put_iova_domain(struct iova_domain *iovad);
 struct iova *split_and_remove_iova(struct iova_domain *iovad,
-- 
2.4.6

* [PATCH 4/4] iommu: free_iova doesn't need lock twice
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
                     ` (2 preceding siblings ...)
  2015-11-24 21:54   ` [PATCH 3/4] iommu: enable bitmap allocation for intel iommu Shaohua Li
@ 2015-11-24 21:54   ` Shaohua Li
  2015-12-07 17:52   ` [PATCH 0/4] iommu: remove lock contention in iova allocation Shaohua Li
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 21:54 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: Kernel-team-b10kYP2dOMg, Joerg Roedel, David Woodhouse

Clean up the code a bit: free_iova() doesn't need to take the rbtree lock
twice (once in find_iova() and once more in __free_iova()).
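
After this patch the lookup and erase happen under a single lock hold,
roughly (condensed from the hunk below):

	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
	iova = __find_iova(iovad, pfn);
	if (iova) {
		__cached_rbnode_delete_update(iovad, iova);
		rb_erase(&iova->node, &iovad->rbroot);
	}
	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
	if (iova)
		free_iova_mem(iova);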

Cc: Joerg Roedel <jroedel-l3A5Bk7waGM@public.gmane.org>
Cc: David Woodhouse <David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Shaohua Li <shli-b10kYP2dOMg@public.gmane.org>
---
 drivers/iommu/iova.c | 57 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0d71be7..5f03c42 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -348,33 +348,16 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(alloc_iova);
 
-/**
- * find_iova - find's an iova for a given pfn
- * @iovad: - iova domain in question.
- * @pfn: - page frame number
- * This function finds and returns an iova belonging to the
- * given doamin which matches the given pfn.
- */
-struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+static struct iova *__find_iova(struct iova_domain *iovad, unsigned long pfn)
 {
-	unsigned long flags;
 	struct rb_node *node;
 
-	/* Take the lock so that no other thread is manipulating the rbtree */
-	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
 	node = iovad->rbroot.rb_node;
 	while (node) {
 		struct iova *iova = container_of(node, struct iova, node);
 
 		/* If pfn falls within iova's range, return iova */
 		if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-			spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-			/* We are not holding the lock while this iova
-			 * is referenced by the caller as the same thread
-			 * which called this function also calls __free_iova()
-			 * and it is by design that only one thread can possibly
-			 * reference a particular iova and hence no conflict.
-			 */
 			return iova;
 		}
 
@@ -384,9 +367,33 @@ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
 			node = node->rb_right;
 	}
 
-	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 	return NULL;
 }
+
+/**
+ * find_iova - find's an iova for a given pfn
+ * @iovad: - iova domain in question.
+ * @pfn: - page frame number
+ * This function finds and returns an iova belonging to the
+ * given doamin which matches the given pfn.
+ */
+struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn)
+{
+	unsigned long flags;
+	struct iova *iova;
+
+	/* Take the lock so that no other thread is manipulating the rbtree */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = __find_iova(iovad, pfn);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	/* We are not holding the lock while this iova
+	 * is referenced by the caller as the same thread
+	 * which called this function also calls __free_iova()
+	 * and it is by design that only one thread can possibly
+	 * reference a particular iova and hence no conflict.
+	 */
+	return iova;
+}
 EXPORT_SYMBOL_GPL(find_iova);
 
 /**
@@ -421,6 +428,7 @@ void
 free_iova(struct iova_domain *iovad, unsigned long pfn)
 {
 	struct iova *iova;
+	unsigned long flags;
 
 	if (iova_pfn_in_bitmap(iovad, pfn)) {
 		int tag;
@@ -431,9 +439,14 @@ free_iova(struct iova_domain *iovad, unsigned long pfn)
 		return;
 	}
 
-	iova = find_iova(iovad, pfn);
-	if (iova)
-		__free_iova(iovad, iova);
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	iova = __find_iova(iovad, pfn);
+	if (iova) {
+		__cached_rbnode_delete_update(iovad, iova);
+		rb_erase(&iova->node, &iovad->rbroot);
+	}
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	free_iova_mem(iova);
 }
 EXPORT_SYMBOL_GPL(free_iova);
 
-- 
2.4.6

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]     ` <4c0804c85404be81acfe81fcd402f1af484e9d5f.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
@ 2015-11-24 21:56       ` Woodhouse, David
       [not found]         ` <1448402209.44976.109.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Woodhouse, David @ 2015-11-24 21:56 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, shli-b10kYP2dOMg
  Cc: Kernel-team-b10kYP2dOMg, jroedel-l3A5Bk7waGM


On Tue, 2015-11-24 at 13:54 -0800, Shaohua Li wrote:
> 
> This patch introduces a bitmap based allocator.

Why not just use the one in lib/iommu-common.c? I'd like to see
intel-iommu completely drop the iova.c code.

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org                              Intel Corporation


* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]         ` <1448402209.44976.109.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2015-11-24 22:05           ` Shaohua Li
       [not found]             ` <20151124220511.GA2106568-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2015-11-24 22:05 UTC (permalink / raw)
  To: Woodhouse, David
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	jroedel-l3A5Bk7waGM, Kernel-team-b10kYP2dOMg

On Tue, Nov 24, 2015 at 09:56:50PM +0000, Woodhouse, David wrote:
> On Tue, 2015-11-24 at 13:54 -0800, Shaohua Li wrote:
> > 
> > This patch introduces a bitmap based allocator.
> 
> Why not just use the one in lib/iommu-common.c? I'd like to see
> intel-iommu completely drop the iova.c code.

The lib/iommu-common.c uses a bitmap and a lock. This implementation
actually uses a percpu_ida which completely avoids locking. It would be
possible to make lib/iommu-common.c use percpu_ida too if somebody wants
to do it, but I think this shouldn't be a blocker for these patches
giving it has huge performance gain.

Thanks,
Shaohua

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]             ` <20151124220511.GA2106568-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2015-11-25 14:46               ` jroedel-l3A5Bk7waGM
       [not found]                 ` <20151125144613.GA3763-l3A5Bk7waGM@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: jroedel-l3A5Bk7waGM @ 2015-11-25 14:46 UTC (permalink / raw)
  To: Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Kernel-team-b10kYP2dOMg, Woodhouse, David

On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
> The lib/iommu-common.c uses a bitmap and a lock. This implementation
> actually uses a percpu_ida which completely avoids locking. It would be
> possible to make lib/iommu-common.c use percpu_ida too if somebody wants
> to do it, but I think this shouldn't be a blocker for these patches
> giving it has huge performance gain.

It doesn't "completely avoids locking", the percpu_ida code uses a lock
internally too. Also, what is the memory and device address space
overhead per cpu?


	Joerg

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]                 ` <20151125144613.GA3763-l3A5Bk7waGM@public.gmane.org>
@ 2015-11-25 16:40                   ` Woodhouse, David
       [not found]                     ` <1448469599.44976.134.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Woodhouse, David @ 2015-11-25 16:40 UTC (permalink / raw)
  To: jroedel-l3A5Bk7waGM, Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Kernel-team-b10kYP2dOMg


On Wed, 2015-11-25 at 15:46 +0100, jroedel-l3A5Bk7waGM@public.gmane.org wrote:
> On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
> > The lib/iommu-common.c uses a bitmap and a lock. This implementation
> > actually uses a percpu_ida which completely avoids locking. It would be
> > possible to make lib/iommu-common.c use percpu_ida too if somebody wants
> > to do it, but I think this shouldn't be a blocker for these patches
> > giving it has huge performance gain.
> 
> It doesn't "completely avoids locking", the percpu_ida code uses a lock
> internally too. Also, what is the memory and device address space
> overhead per cpu?

A percpu lock doesn't bounce cachelines between CPUs very much, so from
that point of view it might as well not exist :)

-- 
                  Sent with Evolution's ActiveSync support.

David Woodhouse                            Open Source Technology Centre
David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org                              Intel Corporation



* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]                     ` <1448469599.44976.134.camel-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2015-11-25 16:43                       ` Jens Axboe
       [not found]                         ` <5655E545.1010509-b10kYP2dOMg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2015-11-25 16:43 UTC (permalink / raw)
  To: Woodhouse, David, jroedel-l3A5Bk7waGM, Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Kernel-team-b10kYP2dOMg

On 11/25/2015 09:40 AM, Woodhouse, David wrote:
> On Wed, 2015-11-25 at 15:46 +0100, jroedel-l3A5Bk7waGM@public.gmane.org wrote:
>> On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
>>> The lib/iommu-common.c uses a bitmap and a lock. This implementation
>>> actually uses a percpu_ida which completely avoids locking. It would be
>>> possible to make lib/iommu-common.c use percpu_ida too if somebody wants
>>> to do it, but I think this shouldn't be a blocker for these patches
>>> giving it has huge performance gain.
>>
>> It doesn't "completely avoids locking", the percpu_ida code uses a lock
>> internally too. Also, what is the memory and device address space
>> overhead per cpu?
>
> A percpu lock doesn't bounce cachelines between CPUs very much, so from
> that point of view it might as well not exist :)

As long as the address space can remain more than ~50% empty, it is 
indeed practically lockless. Are we ever worried about higher 
utilization? If so, in my experience percpu_ida fails miserable near or 
at exhaustion.

-- 
Jens Axboe

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]                         ` <5655E545.1010509-b10kYP2dOMg@public.gmane.org>
@ 2015-11-25 17:33                           ` Shaohua Li
       [not found]                             ` <20151125173320.GA92074-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2015-11-25 17:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	jroedel-l3A5Bk7waGM, Woodhouse, David, Kernel-team-b10kYP2dOMg

On Wed, Nov 25, 2015 at 09:43:49AM -0700, Jens Axboe wrote:
> On 11/25/2015 09:40 AM, Woodhouse, David wrote:
> >On Wed, 2015-11-25 at 15:46 +0100, jroedel-l3A5Bk7waGM@public.gmane.org wrote:
> >>On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
> >>>The lib/iommu-common.c uses a bitmap and a lock. This implementation
> >>>actually uses a percpu_ida which completely avoids locking. It would be
> >>>possible to make lib/iommu-common.c use percpu_ida too if somebody wants
> >>>to do it, but I think this shouldn't be a blocker for these patches
> >>>giving it has huge performance gain.
> >>
> >>It doesn't "completely avoids locking", the percpu_ida code uses a lock
> >>internally too. Also, what is the memory and device address space
> >>overhead per cpu?
> >
> >A percpu lock doesn't bounce cachelines between CPUs very much, so from
> >that point of view it might as well not exist :)
> 
> As long as the address space can remain more than ~50% empty, it is
> indeed practically lockless. Are we ever worried about higher
> utilization? If so, in my experience percpu_ida fails miserable near
> or at exhaustion.

The patch uses TASK_RUNNING for tag allocation, so it doesn't wait. I
thought it's ok, no?

Thanks,
Shaohua

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]                             ` <20151125173320.GA92074-tb7CFzD8y5b7E6g3fPdp/g2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2015-11-25 17:38                               ` Jens Axboe
       [not found]                                 ` <5655F1F8.9070203-b10kYP2dOMg@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2015-11-25 17:38 UTC (permalink / raw)
  To: Shaohua Li
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	jroedel-l3A5Bk7waGM, Woodhouse, David, Kernel-team-b10kYP2dOMg

On 11/25/2015 10:33 AM, Shaohua Li wrote:
> On Wed, Nov 25, 2015 at 09:43:49AM -0700, Jens Axboe wrote:
>> On 11/25/2015 09:40 AM, Woodhouse, David wrote:
>>> On Wed, 2015-11-25 at 15:46 +0100, jroedel-l3A5Bk7waGM@public.gmane.org wrote:
>>>> On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
>>>>> The lib/iommu-common.c uses a bitmap and a lock. This implementation
>>>>> actually uses a percpu_ida which completely avoids locking. It would be
>>>>> possible to make lib/iommu-common.c use percpu_ida too if somebody wants
>>>>> to do it, but I think this shouldn't be a blocker for these patches
>>>>> giving it has huge performance gain.
>>>>
>>>> It doesn't "completely avoids locking", the percpu_ida code uses a lock
>>>> internally too. Also, what is the memory and device address space
>>>> overhead per cpu?
>>>
>>> A percpu lock doesn't bounce cachelines between CPUs very much, so from
>>> that point of view it might as well not exist :)
>>
>> As long as the address space can remain more than ~50% empty, it is
>> indeed practically lockless. Are we ever worried about higher
>> utilization? If so, in my experience percpu_ida fails miserable near
>> or at exhaustion.
>
> The patch uses TASK_RUNNING for tag allocation, so it doesn't wait. I
> thought it's ok, no?

Even without the waiting it can end up sucking. If you are near 
exhaustion, multiple tasks allocating will end up stealing from each other.

Maybe it's not a concern here, if the space is big enough?

In any case, the pathological case isn't any worse than the normal case 
for the current code...

-- 
Jens Axboe

* Re: [PATCH 2/4] iommu: add a bitmap based dma address allocator
       [not found]                                 ` <5655F1F8.9070203-b10kYP2dOMg@public.gmane.org>
@ 2015-11-25 18:57                                   ` Shaohua Li
  0 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2015-11-25 18:57 UTC (permalink / raw)
  To: Jens Axboe
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	jroedel-l3A5Bk7waGM, Woodhouse, David, Kernel-team-b10kYP2dOMg

On Wed, Nov 25, 2015 at 10:38:00AM -0700, Jens Axboe wrote:
> On 11/25/2015 10:33 AM, Shaohua Li wrote:
> >On Wed, Nov 25, 2015 at 09:43:49AM -0700, Jens Axboe wrote:
> >>On 11/25/2015 09:40 AM, Woodhouse, David wrote:
> >>>On Wed, 2015-11-25 at 15:46 +0100, jroedel-l3A5Bk7waGM@public.gmane.org wrote:
> >>>>On Tue, Nov 24, 2015 at 02:05:12PM -0800, Shaohua Li wrote:
> >>>>>The lib/iommu-common.c uses a bitmap and a lock. This implementation
> >>>>>actually uses a percpu_ida which completely avoids locking. It would be
> >>>>>possible to make lib/iommu-common.c use percpu_ida too if somebody wants
> >>>>>to do it, but I think this shouldn't be a blocker for these patches
> >>>>>giving it has huge performance gain.
> >>>>
> >>>>It doesn't "completely avoids locking", the percpu_ida code uses a lock
> >>>>internally too. Also, what is the memory and device address space
> >>>>overhead per cpu?
> >>>
> >>>A percpu lock doesn't bounce cachelines between CPUs very much, so from
> >>>that point of view it might as well not exist :)
> >>
> >>As long as the address space can remain more than ~50% empty, it is
> >>indeed practically lockless. Are we ever worried about higher
> >>utilization? If so, in my experience percpu_ida fails miserable near
> >>or at exhaustion.
> >
> >The patch uses TASK_RUNNING for tag allocation, so it doesn't wait. I
> >thought it's ok, no?
> 
> Even without the waiting it can end up sucking. If you are near
> exhaustion, multiple tasks allocating will end up stealing from each
> other.
> 
> Maybe it's not a concern here, if the space is big enough?

There are 128k tags. I think we can double it without problems within
32-bit DMA address space if necessary. But if driver holds the DMA
address space, we will have space shortage. If 64-bit DMA address is on
by default in the future, this will not be a concern.
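
(For reference, a sketch of the arithmetic behind the 128k figure, assuming
4KB pages, derived from the IOVA_BITMAP_* macros in patch 2:)

	/* PAGE_SHIFT == 12 (4KB pages) assumed:
	 *   IOVA_BITMAP_UNIT = (8 * 1024) >> 12        =      2 pfns (8KB)
	 *   IOVA_BITMAP_SIZE = (1L << 30) >> 12        = 262144 pfns (1GB)
	 *   tags = IOVA_BITMAP_SIZE / IOVA_BITMAP_UNIT = 131072 (128k)
	 */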

> In any case, the pathological case isn't any worse than the normal
> case for the current code...

Yep, it should be not worse than current spinlock.

Thanks,
Shaohua

* Re: [PATCH 0/4] iommu: remove lock contention in iova allocation
       [not found] ` <cover.1448401089.git.shli-b10kYP2dOMg@public.gmane.org>
                     ` (3 preceding siblings ...)
  2015-11-24 21:54   ` [PATCH 4/4] iommu: free_iova doesn't need lock twice Shaohua Li
@ 2015-12-07 17:52   ` Shaohua Li
  4 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2015-12-07 17:52 UTC (permalink / raw)
  To: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	jroedel-l3A5Bk7waGM, David.Woodhouse-ral2JQCrhuEAvxtiuMwx3w

Ping!

On Tue, Nov 24, 2015 at 01:54:06PM -0800, Shaohua Li wrote:
> Hi,
> 
> The lock contention in iova allocation is very significant. In my iperf test
> with an Intel 10Gbps NIC and 48 threads, I observed that 99% of the cpu time
> is spent on the lock contention. In the test:
> 
> CPU utilization 100%, iperf reports ~1Gbps
> 
> The patches introduce a bitmap based approach for dma address allocation. It
> completely avoids the lock contention. Running the same test:
> 
> CPU utilization 1%, iperf reports 9.3Gbps
> 
> I also tried the test with the patches applied but the bitmap disabled. The
> result is the same as without the patches.
> 
> The patches only handle DMA sizes up to 8k, which is the most common case and
> the one with the highest request (DMA) rate. Bigger DMA sizes still fall back
> to the rbtree based allocation. But if required, and once DAC is enabled by
> default, it's easy to support bigger DMA sizes in the bitmap allocator.
> 
> Thanks,
> Shaohua
> 
> 
> Shaohua Li (4):
>   iommu: alloc_iova returns a pfn
>   iommu: add a bitmap based dma address allocator
>   iommu: enable bitmap allocation for intel iommu
>   iommu: free_iova doesn't need lock twice
> 
>  drivers/iommu/dma-iommu.c   |  32 ++++-----
>  drivers/iommu/intel-iommu.c |  96 ++++++++++++++++-----------
>  drivers/iommu/iova.c        | 157 +++++++++++++++++++++++++++++++++-----------
>  include/linux/iova.h        |  21 ++++--
>  4 files changed, 210 insertions(+), 96 deletions(-)
> 
> -- 
> 2.4.6
> 
