linux-kernel.vger.kernel.org archive mirror
* [PATCH v2 0/3] arm-smmu: performance optimization
@ 2017-09-12 13:00 Zhen Lei
  2017-09-12 13:00 ` [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Zhen Lei @ 2017-09-12 13:00 UTC (permalink / raw)
  To: Will Deacon, Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel
  Cc: Hanjun Guo, Libin, Zhen Lei, Jinyue Li, Kefeng Wang

v1 -> v2:
based on (add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")

Zhen Lei (3):
  iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock
    confliction
  iommu/arm-smmu-v3: add support for unmap an iova range with only one
    tlb sync
  iommu/arm-smmu: add support for unmap a memory range with only one tlb
    sync

 drivers/iommu/arm-smmu-v3.c        | 52 ++++++++++++++++++++++++++++++++++----
 drivers/iommu/arm-smmu.c           | 10 ++++++++
 drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++--------
 drivers/iommu/io-pgtable-arm.c     | 30 ++++++++++++++--------
 drivers/iommu/io-pgtable.h         |  1 +
 5 files changed, 99 insertions(+), 26 deletions(-)

-- 
2.5.0


* [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-09-12 13:00 [PATCH v2 0/3] arm-smmu: performance optimization Zhen Lei
@ 2017-09-12 13:00 ` Zhen Lei
  2017-10-18 12:58   ` Will Deacon
  2017-09-12 13:00 ` [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Zhen Lei @ 2017-09-12 13:00 UTC (permalink / raw)
  To: Will Deacon, Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel
  Cc: Hanjun Guo, Libin, Zhen Lei, Jinyue Li, Kefeng Wang

All TLBI commands should be followed by a SYNC command, to make sure the
TLB invalidation has completely finished. So we can simply add TLBI
commands to the queue and put off their execution until a SYNC or other
command arrives. To prevent a subsequent SYNC command from waiting too
long because too many commands have been delayed, restrict the maximum
number of delayed commands.

In my tests, this change gave the same performance as replacing writel
with writel_relaxed in queue_inc_prod.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
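To illustrate the idea outside of the driver, here is a minimal standalone
C sketch of the split between a software prod pointer and the prod value
published to the hardware. All names here are illustrative, not the
driver's; the real code operates on the SMMU command queue registers:

#include <stdint.h>
#include <stdio.h>

#define MAX_DELAYED	32

struct queue {
	uint32_t sw_prod;	/* entries written into the ring */
	uint32_t hw_prod;	/* value last published to the hardware */
	uint32_t nr_delay;	/* written but not yet published */
};

/* Stand-in for the dsb + MMIO doorbell write done by queue_inc_prod(). */
static void publish(struct queue *q)
{
	q->hw_prod = q->sw_prod;
	q->nr_delay = 0;
	printf("doorbell: prod=%u\n", (unsigned)q->hw_prod);
}

static void insert_cmd(struct queue *q, int is_tlbi)
{
	q->sw_prod++;			/* queue_write() elided */

	/* TLBI commands are deferred, up to a bounded number. */
	if (is_tlbi && ++q->nr_delay < MAX_DELAYED)
		return;
	publish(q);
}

int main(void)
{
	struct queue q = { 0, 0, 0 };
	int i;

	for (i = 0; i < 3; i++)
		insert_cmd(&q, 1);	/* three TLBIs: no doorbell yet */
	insert_cmd(&q, 0);		/* CMD_SYNC: one doorbell for all */
	return 0;
}
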
 drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index e67ba6c..ef42c4b 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -337,6 +337,7 @@
 /* Command queue */
 #define CMDQ_ENT_DWORDS			2
 #define CMDQ_MAX_SZ_SHIFT		8
+#define CMDQ_MAX_DELAYED		32
 
 #define CMDQ_ERR_SHIFT			24
 #define CMDQ_ERR_MASK			0x7f
@@ -482,6 +483,7 @@ struct arm_smmu_cmdq_ent {
 			};
 		} cfgi;
 
+		#define CMDQ_OP_TLBI_NH_ALL	0x10
 		#define CMDQ_OP_TLBI_NH_ASID	0x11
 		#define CMDQ_OP_TLBI_NH_VA	0x12
 		#define CMDQ_OP_TLBI_EL2_ALL	0x20
@@ -509,6 +511,7 @@ struct arm_smmu_cmdq_ent {
 
 struct arm_smmu_queue {
 	int				irq; /* Wired interrupt */
+	u32				nr_delay;
 
 	__le64				*base;
 	dma_addr_t			base_dma;
@@ -745,11 +748,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
 	return ret;
 }
 
-static void queue_inc_prod(struct arm_smmu_queue *q)
+static void queue_inc_swprod(struct arm_smmu_queue *q)
 {
-	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
+	u32 prod = q->prod + 1;
 
 	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+}
+
+static void queue_inc_prod(struct arm_smmu_queue *q)
+{
+	queue_inc_swprod(q);
 	writel(q->prod, q->prod_reg);
 }
 
@@ -791,13 +799,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
 		*dst++ = cpu_to_le64(*src++);
 }
 
-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
+static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
 {
 	if (queue_full(q))
 		return -ENOSPC;
 
 	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
-	queue_inc_prod(q);
+
+	/*
 +	 * Don't delay too many commands, otherwise the subsequent sync
 +	 * command may have to wait a long time.
+	 */
+	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
+		queue_inc_swprod(q);
+	} else {
+		queue_inc_prod(q);
+		q->nr_delay = 0;
+	}
+
 	return 0;
 }
 
@@ -939,6 +958,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_ent *ent)
 {
+	int optimize = 0;
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
@@ -950,8 +970,17 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return;
 	}
 
+	/*
+	 * All TLBI commands should be followed by a sync command later.
 +	 * The CFGI commands are the same, but they are rarely executed.
 +	 * So only optimize TLBI commands for now, to keep this check cheap.
+	 */
+	if ((ent->opcode >= CMDQ_OP_TLBI_NH_ALL) &&
+	    (ent->opcode <= CMDQ_OP_TLBI_NSNH_ALL))
+		optimize = 1;
+
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	while (queue_insert_raw(q, cmd) == -ENOSPC) {
+	while (queue_insert_raw(q, cmd, optimize) == -ENOSPC) {
 		if (queue_poll_cons(q, false, wfe))
 			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 	}
@@ -2001,6 +2030,8 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 		     << Q_BASE_LOG2SIZE_SHIFT;
 
 	q->prod = q->cons = 0;
+	q->nr_delay = 0;
+
 	return 0;
 }
 
@@ -2584,6 +2615,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 		dev_err(smmu->dev, "unit-length command queue not supported\n");
 		return -ENXIO;
 	}
+	BUILD_BUG_ON(CMDQ_MAX_DELAYED >= (1 << CMDQ_MAX_SZ_SHIFT));
 
 	smmu->evtq.q.max_n_shift = min((u32)EVTQ_MAX_SZ_SHIFT,
 				       reg >> IDR1_EVTQ_SHIFT & IDR1_EVTQ_MASK);
-- 
2.5.0


* [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync
  2017-09-12 13:00 [PATCH v2 0/3] arm-smmu: performance optimization Zhen Lei
  2017-09-12 13:00 ` [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
@ 2017-09-12 13:00 ` Zhen Lei
  2017-10-18 13:00   ` Will Deacon
  2017-09-12 13:00 ` [PATCH v2 3/3] iommu/arm-smmu: add support for unmap a memory " Zhen Lei
  2017-09-19  4:31 ` [PATCH v2 0/3] arm-smmu: performance optimization Nate Watterson
  3 siblings, 1 reply; 11+ messages in thread
From: Zhen Lei @ 2017-09-12 13:00 UTC (permalink / raw)
  To: Will Deacon, Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel
  Cc: Hanjun Guo, Libin, Zhen Lei, Jinyue Li, Kefeng Wang

This patch is based on:
(add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")

Because iotlb_sync has been moved out of ".unmap = arm_smmu_unmap", some
internal ".unmap" calls must now be explicitly followed by an iotlb_sync
operation.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
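For callers, the resulting pattern looks like the sketch below. The ops
structure and the iotlb_sync callback are the ones added by this patch;
the helper itself is only an illustration, not part of the patch:

/*
 * Illustrative only: batch several unmaps and issue a single TLB sync
 * at the end, instead of one sync per unmap call.
 */
static size_t unmap_range_once(struct io_pgtable_ops *ops,
			       unsigned long iova, size_t granule, int n)
{
	size_t unmapped = 0;
	int i;

	for (i = 0; i < n; i++)
		unmapped += ops->unmap(ops, iova + i * granule, granule);

	if (ops->iotlb_sync)
		ops->iotlb_sync(ops);	/* one sync for the whole range */

	return unmapped;
}
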
 drivers/iommu/arm-smmu-v3.c    | 10 ++++++++++
 drivers/iommu/io-pgtable-arm.c | 30 ++++++++++++++++++++----------
 drivers/iommu/io-pgtable.h     |  1 +
 3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index ef42c4b..e92828e 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1772,6 +1772,15 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 	return ops->unmap(ops, iova, size);
 }
 
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (ops && ops->iotlb_sync)
+		ops->iotlb_sync(ops);
+}
+
 static phys_addr_t
 arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
@@ -1991,6 +2000,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.attach_dev		= arm_smmu_attach_dev,
 	.map			= arm_smmu_map,
 	.unmap			= arm_smmu_unmap,
+	.iotlb_sync		= arm_smmu_iotlb_sync,
 	.map_sg			= default_iommu_map_sg,
 	.iova_to_phys		= arm_smmu_iova_to_phys,
 	.add_device		= arm_smmu_add_device,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index e8018a3..805efc9 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -304,6 +304,8 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 		WARN_ON(!selftest_running);
 		return -EEXIST;
 	} else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+		size_t unmapped;
+
 		/*
 		 * We need to unmap and free the old table before
 		 * overwriting it with a block entry.
@@ -312,7 +314,9 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
 
 		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
-		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+		unmapped = __arm_lpae_unmap(data, iova, sz, lvl, tblp);
+		io_pgtable_tlb_sync(&data->iop);
+		if (WARN_ON(unmapped != sz))
 			return -EINVAL;
 	}
 
@@ -584,7 +588,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			/* Also flush any partial walks */
 			io_pgtable_tlb_add_flush(iop, iova, size,
 						ARM_LPAE_GRANULE(data), false);
-			io_pgtable_tlb_sync(iop);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
 		} else {
@@ -609,7 +612,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 			  size_t size)
 {
-	size_t unmapped;
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	arm_lpae_iopte *ptep = data->pgd;
 	int lvl = ARM_LPAE_START_LVL(data);
@@ -617,11 +619,14 @@ static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
 		return 0;
 
-	unmapped = __arm_lpae_unmap(data, iova, size, lvl, ptep);
-	if (unmapped)
-		io_pgtable_tlb_sync(&data->iop);
+	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
+}
+
+static void arm_lpae_iotlb_sync(struct io_pgtable_ops *ops)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 
-	return unmapped;
+	io_pgtable_tlb_sync(&data->iop);
 }
 
 static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
@@ -734,6 +739,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 	data->iop.ops = (struct io_pgtable_ops) {
 		.map		= arm_lpae_map,
 		.unmap		= arm_lpae_unmap,
+		.iotlb_sync	= arm_lpae_iotlb_sync,
 		.iova_to_phys	= arm_lpae_iova_to_phys,
 	};
 
@@ -1030,7 +1036,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 
 	int i, j;
 	unsigned long iova;
-	size_t size;
+	size_t size, unmapped;
 	struct io_pgtable_ops *ops;
 
 	selftest_running = true;
@@ -1082,7 +1088,9 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 
 		/* Partial unmap */
 		size = 1UL << __ffs(cfg->pgsize_bitmap);
-		if (ops->unmap(ops, SZ_1G + size, size) != size)
+		unmapped = ops->unmap(ops, SZ_1G + size, size);
+		ops->iotlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops, i);
 
 		/* Remap of partial unmap */
@@ -1098,7 +1106,9 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 		while (j != BITS_PER_LONG) {
 			size = 1UL << j;
 
-			if (ops->unmap(ops, iova, size) != size)
+			unmapped = ops->unmap(ops, iova, size);
+			ops->iotlb_sync(ops);
+			if (unmapped != size)
 				return __FAIL(ops, i);
 
 			if (ops->iova_to_phys(ops, iova + 42))
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index a3e6670..3a72e08 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -120,6 +120,7 @@ struct io_pgtable_ops {
 		   phys_addr_t paddr, size_t size, int prot);
 	int (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
 		     size_t size);
+	void (*iotlb_sync)(struct io_pgtable_ops *ops);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
 				    unsigned long iova);
 };
-- 
2.5.0


* [PATCH v2 3/3] iommu/arm-smmu: add support for unmap a memory range with only one tlb sync
  2017-09-12 13:00 [PATCH v2 0/3] arm-smmu: performance optimization Zhen Lei
  2017-09-12 13:00 ` [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
  2017-09-12 13:00 ` [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
@ 2017-09-12 13:00 ` Zhen Lei
  2017-09-19  4:31 ` [PATCH v2 0/3] arm-smmu: performance optimization Nate Watterson
  3 siblings, 0 replies; 11+ messages in thread
From: Zhen Lei @ 2017-09-12 13:00 UTC (permalink / raw)
  To: Will Deacon, Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel
  Cc: Hanjun Guo, Libin, Zhen Lei, Jinyue Li, Kefeng Wang

This patch is based on:
(add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")

Because iotlb_sync has been moved out of ".unmap = arm_smmu_unmap", some
internal ".unmap" calls must now be explicitly followed by an iotlb_sync
operation.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm-smmu.c           | 10 ++++++++++
 drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 3bdb799..bb57d67 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1259,6 +1259,15 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 	return ops->unmap(ops, iova, size);
 }
 
+static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+
+	if (ops && ops->iotlb_sync)
+		ops->iotlb_sync(ops);
+}
+
 static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
 					      dma_addr_t iova)
 {
@@ -1561,6 +1570,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.attach_dev		= arm_smmu_attach_dev,
 	.map			= arm_smmu_map,
 	.unmap			= arm_smmu_unmap,
+	.iotlb_sync		= arm_smmu_iotlb_sync,
 	.map_sg			= default_iommu_map_sg,
 	.iova_to_phys		= arm_smmu_iova_to_phys,
 	.add_device		= arm_smmu_add_device,
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index d665d0d..457ad29 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -370,6 +370,8 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
 
 	for (i = 0; i < num_entries; i++)
 		if (ARM_V7S_PTE_IS_TABLE(ptep[i], lvl)) {
+			size_t unmapped;
+
 			/*
 			 * We need to unmap and free the old table before
 			 * overwriting it with a block entry.
@@ -378,8 +380,10 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
 			size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
 
 			tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl);
-			if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz,
-						    sz, lvl, tblp) != sz))
+			unmapped = __arm_v7s_unmap(data, iova + i * sz,
+						    sz, lvl, tblp);
+			io_pgtable_tlb_sync(&data->iop);
+			if (WARN_ON(unmapped != sz))
 				return -EINVAL;
 		} else if (ptep[i]) {
 			/* We require an unmap first */
@@ -633,7 +637,6 @@ static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
 				/* Also flush any partial walks */
 				io_pgtable_tlb_add_flush(iop, iova, blk_size,
 					ARM_V7S_BLOCK_SIZE(lvl + 1), false);
-				io_pgtable_tlb_sync(iop);
 				ptep = iopte_deref(pte[i], lvl);
 				__arm_v7s_free_table(ptep, lvl + 1, data);
 			} else {
@@ -660,16 +663,18 @@ static int arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 			 size_t size)
 {
 	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
-	size_t unmapped;
 
 	if (WARN_ON(upper_32_bits(iova)))
 		return 0;
 
-	unmapped = __arm_v7s_unmap(data, iova, size, 1, data->pgd);
-	if (unmapped)
-		io_pgtable_tlb_sync(&data->iop);
+	return __arm_v7s_unmap(data, iova, size, 1, data->pgd);
+}
+
+static void arm_v7s_iotlb_sync(struct io_pgtable_ops *ops)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
 
-	return unmapped;
+	io_pgtable_tlb_sync(&data->iop);
 }
 
 static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
@@ -734,6 +739,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	data->iop.ops = (struct io_pgtable_ops) {
 		.map		= arm_v7s_map,
 		.unmap		= arm_v7s_unmap,
+		.iotlb_sync	= arm_v7s_iotlb_sync,
 		.iova_to_phys	= arm_v7s_iova_to_phys,
 	};
 
@@ -832,7 +838,7 @@ static int __init arm_v7s_do_selftests(void)
 		.quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA,
 		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
 	};
-	unsigned int iova, size, iova_start;
+	unsigned int iova, size, unmapped, iova_start;
 	unsigned int i, loopnr = 0;
 
 	selftest_running = true;
@@ -887,7 +893,9 @@ static int __init arm_v7s_do_selftests(void)
 	size = 1UL << __ffs(cfg.pgsize_bitmap);
 	while (i < loopnr) {
 		iova_start = i * SZ_16M;
-		if (ops->unmap(ops, iova_start + size, size) != size)
+		unmapped = ops->unmap(ops, iova_start + size, size);
+		ops->iotlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops);
 
 		/* Remap of partial unmap */
@@ -906,7 +914,9 @@ static int __init arm_v7s_do_selftests(void)
 	while (i != BITS_PER_LONG) {
 		size = 1UL << i;
 
-		if (ops->unmap(ops, iova, size) != size)
+		unmapped = ops->unmap(ops, iova, size);
+		ops->iotlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops);
 
 		if (ops->iova_to_phys(ops, iova + 42))
-- 
2.5.0


* Re: [PATCH v2 0/3] arm-smmu: performance optimization
  2017-09-12 13:00 [PATCH v2 0/3] arm-smmu: performance optimization Zhen Lei
                   ` (2 preceding siblings ...)
  2017-09-12 13:00 ` [PATCH v2 3/3] iommu/arm-smmu: add support for unmap a memory " Zhen Lei
@ 2017-09-19  4:31 ` Nate Watterson
  2017-09-19  6:26   ` Leizhen (ThunderTown)
  3 siblings, 1 reply; 11+ messages in thread
From: Nate Watterson @ 2017-09-19  4:31 UTC (permalink / raw)
  To: Zhen Lei, Will Deacon, Joerg Roedel, linux-arm-kernel, iommu,
	Robin Murphy, linux-kernel
  Cc: Jinyue Li, Kefeng Wang, Libin, Hanjun Guo

Hi Leizhen,

On 9/12/2017 9:00 AM, Zhen Lei wrote:
> v1 -> v2:
> based on (add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")
> 
> Zhen Lei (3):
>    iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock
>      confliction
>    iommu/arm-smmu-v3: add support for unmap an iova range with only one
>      tlb sync

I tested these two patches on QDF2400 hardware and saw performance
improvements in line with those I reported when testing the original
series. I don't have any hardware close at hand to test the third patch
in the series, so that will have to come from someone else.

Tested-by: Nate Watterson <nwatters@codeaurora.org>

Thanks,
Nate

>    iommu/arm-smmu: add support for unmap a memory range with only one tlb
>      sync
> 
>   drivers/iommu/arm-smmu-v3.c        | 52 ++++++++++++++++++++++++++++++++++----
>   drivers/iommu/arm-smmu.c           | 10 ++++++++
>   drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++--------
>   drivers/iommu/io-pgtable-arm.c     | 30 ++++++++++++++--------
>   drivers/iommu/io-pgtable.h         |  1 +
>   5 files changed, 99 insertions(+), 26 deletions(-)
> 

-- 
Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.


* Re: [PATCH v2 0/3] arm-smmu: performance optimization
  2017-09-19  4:31 ` [PATCH v2 0/3] arm-smmu: performance optimization Nate Watterson
@ 2017-09-19  6:26   ` Leizhen (ThunderTown)
  0 siblings, 0 replies; 11+ messages in thread
From: Leizhen (ThunderTown) @ 2017-09-19  6:26 UTC (permalink / raw)
  To: Nate Watterson, Will Deacon, Joerg Roedel, linux-arm-kernel,
	iommu, Robin Murphy, linux-kernel
  Cc: Jinyue Li, Kefeng Wang, Libin, Hanjun Guo



On 2017/9/19 12:31, Nate Watterson wrote:
> Hi Leizhen,
> 
> On 9/12/2017 9:00 AM, Zhen Lei wrote:
>> v1 -> v2:
>> based on (add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")
>>
>> Zhen Lei (3):
>>    iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock
>>      confliction
>>    iommu/arm-smmu-v3: add support for unmap an iova range with only one
>>      tlb sync
> 
> I tested these two patches on QDF2400 hardware and saw performance
> improvements in line with those I reported when testing the original
> series. I don't have any hardware close at hand to test the third patch
> in the series, so that will have to come from someone else.
Thanks a lot.

> 
> Tested-by: Nate Watterson <nwatters@codeaurora.org>
> 
> Thanks,
> Nate
> 
>>    iommu/arm-smmu: add support for unmap a memory range with only one tlb
>>      sync
>>
>>   drivers/iommu/arm-smmu-v3.c        | 52 ++++++++++++++++++++++++++++++++++----
>>   drivers/iommu/arm-smmu.c           | 10 ++++++++
>>   drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++--------
>>   drivers/iommu/io-pgtable-arm.c     | 30 ++++++++++++++--------
>>   drivers/iommu/io-pgtable.h         |  1 +
>>   5 files changed, 99 insertions(+), 26 deletions(-)
>>
> 

-- 
Thanks!
Best regards


* Re: [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-09-12 13:00 ` [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
@ 2017-10-18 12:58   ` Will Deacon
  2017-10-19  3:00     ` Leizhen (ThunderTown)
  0 siblings, 1 reply; 11+ messages in thread
From: Will Deacon @ 2017-10-18 12:58 UTC (permalink / raw)
  To: Zhen Lei
  Cc: Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel, Hanjun Guo, Libin, Jinyue Li, Kefeng Wang

Hi Thunder,

On Tue, Sep 12, 2017 at 09:00:36PM +0800, Zhen Lei wrote:
> All TLBI commands should be followed by a SYNC command, to make sure the
> TLB invalidation has completely finished. So we can simply add TLBI
> commands to the queue and put off their execution until a SYNC or other
> command arrives. To prevent a subsequent SYNC command from waiting too
> long because too many commands have been delayed, restrict the maximum
> number of delayed commands.
> 
> In my tests, this change gave the same performance as replacing writel
> with writel_relaxed in queue_inc_prod.
> 
> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 37 insertions(+), 5 deletions(-)

If we want to go down the route of explicit command batching, I'd much
rather do it by implementing the iotlb_range_add callback in the driver,
and have a fixed-length array of batched ranges on the domain. We could
potentially toggle this function pointer based on the compatible string too,
if it turns out to benefit only some systems.
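
A rough standalone sketch of what that could look like follows. None of
these names exist in the driver; the struct, the fixed-length array and
the overflow fallback are all assumptions about such a design:

#include <stdbool.h>
#include <stddef.h>

#define NR_BATCHED_RANGES	16

struct batched_range {
	unsigned long	iova;
	size_t		size;
};

struct flush_batch {
	struct batched_range	ranges[NR_BATCHED_RANGES];
	int			nr;
	bool			overflowed;	/* flush everything on sync */
};

/* Hypothetical stubs standing in for TLBI_NH_VA / TLBI_NH_ASID emission. */
static void flush_range(unsigned long iova, size_t size) { (void)iova; (void)size; }
static void flush_all(void) { }

static void batch_range_add(struct flush_batch *b,
			    unsigned long iova, size_t size)
{
	if (b->overflowed)
		return;
	if (b->nr == NR_BATCHED_RANGES) {
		b->overflowed = true;	/* too many ranges: widen the flush */
		return;
	}
	b->ranges[b->nr].iova = iova;
	b->ranges[b->nr].size = size;
	b->nr++;
}

static void batch_sync(struct flush_batch *b)
{
	int i;

	if (b->overflowed)
		flush_all();
	else
		for (i = 0; i < b->nr; i++)
			flush_range(b->ranges[i].iova, b->ranges[i].size);

	b->nr = 0;
	b->overflowed = false;
	/* ...followed by a single CMD_SYNC in a real implementation. */
}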

Will


* Re: [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync
  2017-09-12 13:00 ` [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
@ 2017-10-18 13:00   ` Will Deacon
  2017-10-19  3:17     ` Leizhen (ThunderTown)
  0 siblings, 1 reply; 11+ messages in thread
From: Will Deacon @ 2017-10-18 13:00 UTC (permalink / raw)
  To: Zhen Lei
  Cc: Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel, Hanjun Guo, Libin, Jinyue Li, Kefeng Wang

On Tue, Sep 12, 2017 at 09:00:37PM +0800, Zhen Lei wrote:
> This patch is based on:
> (add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")
> 
> Because iotlb_sync has been moved out of ".unmap = arm_smmu_unmap", some
> internal ".unmap" calls must now be explicitly followed by an iotlb_sync
> operation.
> 
> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> ---
>  drivers/iommu/arm-smmu-v3.c    | 10 ++++++++++
>  drivers/iommu/io-pgtable-arm.c | 30 ++++++++++++++++++++----------
>  drivers/iommu/io-pgtable.h     |  1 +
>  3 files changed, 31 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index ef42c4b..e92828e 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -1772,6 +1772,15 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
>  	return ops->unmap(ops, iova, size);
>  }
>  
> +static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
> +{
> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> +	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> +
> +	if (ops && ops->iotlb_sync)
> +		ops->iotlb_sync(ops);
> +}
> +
>  static phys_addr_t
>  arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
>  {
> @@ -1991,6 +2000,7 @@ static struct iommu_ops arm_smmu_ops = {
>  	.attach_dev		= arm_smmu_attach_dev,
>  	.map			= arm_smmu_map,
>  	.unmap			= arm_smmu_unmap,
> +	.iotlb_sync		= arm_smmu_iotlb_sync,
>  	.map_sg			= default_iommu_map_sg,
>  	.iova_to_phys		= arm_smmu_iova_to_phys,
>  	.add_device		= arm_smmu_add_device,
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index e8018a3..805efc9 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -304,6 +304,8 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>  		WARN_ON(!selftest_running);
>  		return -EEXIST;
>  	} else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
> +		size_t unmapped;
> +
>  		/*
>  		 * We need to unmap and free the old table before
>  		 * overwriting it with a block entry.
> @@ -312,7 +314,9 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>  		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
>  
>  		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
> -		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
> +		unmapped = __arm_lpae_unmap(data, iova, sz, lvl, tblp);
> +		io_pgtable_tlb_sync(&data->iop);
> +		if (WARN_ON(unmapped != sz))
>  			return -EINVAL;
>  	}
>  
> @@ -584,7 +588,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  			/* Also flush any partial walks */
>  			io_pgtable_tlb_add_flush(iop, iova, size,
>  						ARM_LPAE_GRANULE(data), false);
> -			io_pgtable_tlb_sync(iop);
>  			ptep = iopte_deref(pte, data);
>  			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
>  		} else {
> @@ -609,7 +612,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>  static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
>  			  size_t size)
>  {
> -	size_t unmapped;
>  	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
>  	arm_lpae_iopte *ptep = data->pgd;
>  	int lvl = ARM_LPAE_START_LVL(data);
> @@ -617,11 +619,14 @@ static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
>  	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
>  		return 0;
>  
> -	unmapped = __arm_lpae_unmap(data, iova, size, lvl, ptep);
> -	if (unmapped)
> -		io_pgtable_tlb_sync(&data->iop);
> +	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
> +}

This change is already queued in Joerg's tree, due to a patch from Robin.

Will


* Re: [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-10-18 12:58   ` Will Deacon
@ 2017-10-19  3:00     ` Leizhen (ThunderTown)
  2017-10-19  9:12       ` Will Deacon
  0 siblings, 1 reply; 11+ messages in thread
From: Leizhen (ThunderTown) @ 2017-10-19  3:00 UTC (permalink / raw)
  To: Will Deacon
  Cc: Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel, Hanjun Guo, Libin, Jinyue Li, Kefeng Wang



On 2017/10/18 20:58, Will Deacon wrote:
> Hi Thunder,
> 
> On Tue, Sep 12, 2017 at 09:00:36PM +0800, Zhen Lei wrote:
>> All TLBI commands should be followed by a SYNC command, to make sure the
>> TLB invalidation has completely finished. So we can simply add TLBI
>> commands to the queue and put off their execution until a SYNC or other
>> command arrives. To prevent a subsequent SYNC command from waiting too
>> long because too many commands have been delayed, restrict the maximum
>> number of delayed commands.
>>
>> In my tests, this change gave the same performance as replacing writel
>> with writel_relaxed in queue_inc_prod.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> ---
>>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 37 insertions(+), 5 deletions(-)
> 
> If we want to go down the route of explicit command batching, I'd much
> rather do it by implementing the iotlb_range_add callback in the driver,
> and have a fixed-length array of batched ranges on the domain. We could
I think this patch is still valuable even if the iotlb_range_add callback
is implemented. The main purpose of this patch is to reduce dsb
operations. In the scenario with iotlb_range_add implemented:

.iotlb_range_add:
	spin_lock_irqsave(&smmu->cmdq.lock, flags);
	...
	add tlbi range-1 to cmd-queue
	...
	add tlbi range-n to cmd-queue		// n entries
	dsb
	...
	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

.iotlb_sync:
	spin_lock_irqsave(&smmu->cmdq.lock, flags);
	...
	add cmd_sync to cmd-queue
	dsb
	...
	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

Although iotlb_range_add can eliminate n-1 dsb operations, one still
remains. If n is not large enough, this patch is still helpful.
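
As a back-of-envelope check of that argument (standalone arithmetic, not
driver code), count the dsb/doorbell operations for an unmap that queues
n TLBI commands plus one trailing CMD_SYNC, assuming n stays below the
delay limit:

#include <stdio.h>

int main(void)
{
	int n;

	for (n = 1; n <= 8; n *= 2) {
		int per_tlbi  = n + 1;	/* one dsb per TLBI + one for sync */
		int range_add = 1 + 1;	/* one dsb in range_add + one in sync */
		int deferred  = 1;	/* TLBIs deferred; only sync does dsb */

		printf("n=%d: per-TLBI=%d range_add=%d deferred=%d\n",
		       n, per_tlbi, range_add, deferred);
	}
	return 0;
}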


> potentially toggle this function pointer based on the compatible string too,
> if it turns out to benefit only some systems.
[
On 2017/9/19 12:31, Nate Watterson wrote:
I tested these two patches on QDF2400 hardware and saw performance
improvements in line with those I reported when testing the original
series.
]

I'm not sure whether this particular patch improves performance on QDF2400,
because two patches were tested together. But at least it seems harmless,
and other hardware platforms are probably the same.

> 
> Will
> 
> .
> 

-- 
Thanks!
Best regards


* Re: [PATCH v2 2/3] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync
  2017-10-18 13:00   ` Will Deacon
@ 2017-10-19  3:17     ` Leizhen (ThunderTown)
  0 siblings, 0 replies; 11+ messages in thread
From: Leizhen (ThunderTown) @ 2017-10-19  3:17 UTC (permalink / raw)
  To: Will Deacon
  Cc: Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel, Hanjun Guo, Libin, Jinyue Li, Kefeng Wang



On 2017/10/18 21:00, Will Deacon wrote:
> On Tue, Sep 12, 2017 at 09:00:37PM +0800, Zhen Lei wrote:
>> This patch is based on:
>> (add02cfdc9bc2 "iommu: Introduce Interface for IOMMU TLB Flushing")
>>
>> Because iotlb_sync has been moved out of ".unmap = arm_smmu_unmap", some
>> internal ".unmap" calls must now be explicitly followed by an iotlb_sync
>> operation.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> ---
>>  drivers/iommu/arm-smmu-v3.c    | 10 ++++++++++
>>  drivers/iommu/io-pgtable-arm.c | 30 ++++++++++++++++++++----------
>>  drivers/iommu/io-pgtable.h     |  1 +
>>  3 files changed, 31 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index ef42c4b..e92828e 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -1772,6 +1772,15 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
>>  	return ops->unmap(ops, iova, size);
>>  }
>>  
>> +static void arm_smmu_iotlb_sync(struct iommu_domain *domain)
>> +{
>> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> +	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> +
>> +	if (ops && ops->iotlb_sync)
>> +		ops->iotlb_sync(ops);
>> +}
>> +
>>  static phys_addr_t
>>  arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
>>  {
>> @@ -1991,6 +2000,7 @@ static struct iommu_ops arm_smmu_ops = {
>>  	.attach_dev		= arm_smmu_attach_dev,
>>  	.map			= arm_smmu_map,
>>  	.unmap			= arm_smmu_unmap,
>> +	.iotlb_sync		= arm_smmu_iotlb_sync,
>>  	.map_sg			= default_iommu_map_sg,
>>  	.iova_to_phys		= arm_smmu_iova_to_phys,
>>  	.add_device		= arm_smmu_add_device,
>> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
>> index e8018a3..805efc9 100644
>> --- a/drivers/iommu/io-pgtable-arm.c
>> +++ b/drivers/iommu/io-pgtable-arm.c
>> @@ -304,6 +304,8 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>>  		WARN_ON(!selftest_running);
>>  		return -EEXIST;
>>  	} else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
>> +		size_t unmapped;
>> +
>>  		/*
>>  		 * We need to unmap and free the old table before
>>  		 * overwriting it with a block entry.
>> @@ -312,7 +314,9 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
>>  		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
>>  
>>  		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
>> -		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
>> +		unmapped = __arm_lpae_unmap(data, iova, sz, lvl, tblp);
>> +		io_pgtable_tlb_sync(&data->iop);
>> +		if (WARN_ON(unmapped != sz))
>>  			return -EINVAL;
>>  	}
>>  
>> @@ -584,7 +588,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>  			/* Also flush any partial walks */
>>  			io_pgtable_tlb_add_flush(iop, iova, size,
>>  						ARM_LPAE_GRANULE(data), false);
>> -			io_pgtable_tlb_sync(iop);
>>  			ptep = iopte_deref(pte, data);
>>  			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
>>  		} else {
>> @@ -609,7 +612,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
>>  static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
>>  			  size_t size)
>>  {
>> -	size_t unmapped;
>>  	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
>>  	arm_lpae_iopte *ptep = data->pgd;
>>  	int lvl = ARM_LPAE_START_LVL(data);
>> @@ -617,11 +619,14 @@ static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
>>  	if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias)))
>>  		return 0;
>>  
>> -	unmapped = __arm_lpae_unmap(data, iova, size, lvl, ptep);
>> -	if (unmapped)
>> -		io_pgtable_tlb_sync(&data->iop);
>> +	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
>> +}
> 
> This change is already queued in Joerg's tree, due to a patch from Robin.
Yes, I see. So this one can be skipped.

> 
> Will
> 
> .
> 

-- 
Thanks!
Best regards


* Re: [PATCH v2 1/3] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-10-19  3:00     ` Leizhen (ThunderTown)
@ 2017-10-19  9:12       ` Will Deacon
  0 siblings, 0 replies; 11+ messages in thread
From: Will Deacon @ 2017-10-19  9:12 UTC (permalink / raw)
  To: Leizhen (ThunderTown)
  Cc: Joerg Roedel, linux-arm-kernel, iommu, Robin Murphy,
	linux-kernel, Hanjun Guo, Libin, Jinyue Li, Kefeng Wang

On Thu, Oct 19, 2017 at 11:00:45AM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2017/10/18 20:58, Will Deacon wrote:
> > Hi Thunder,
> > 
> > On Tue, Sep 12, 2017 at 09:00:36PM +0800, Zhen Lei wrote:
> >> All TLBI commands should be followed by a SYNC command, to make sure the
> >> TLB invalidation has completely finished. So we can simply add TLBI
> >> commands to the queue and put off their execution until a SYNC or other
> >> command arrives. To prevent a subsequent SYNC command from waiting too
> >> long because too many commands have been delayed, restrict the maximum
> >> number of delayed commands.
> >>
> >> In my tests, this change gave the same performance as replacing writel
> >> with writel_relaxed in queue_inc_prod.
> >>
> >> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >> ---
> >>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
> >>  1 file changed, 37 insertions(+), 5 deletions(-)
> > 
> > If we want to go down the route of explicit command batching, I'd much
> > rather do it by implementing the iotlb_range_add callback in the driver,
> > and have a fixed-length array of batched ranges on the domain. We could
> I think this patch is still valuable even if the iotlb_range_add callback
> is implemented. The main purpose of this patch is to reduce dsb
> operations. In the scenario with iotlb_range_add implemented:
> 
> .iotlb_range_add:
> 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> 	...
> 	add tlbi range-1 to cmd-queue
> 	...
> 	add tlbi range-n to cmd-queue		// n entries
> 	dsb
> 	...
> 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
> 
> .iotlb_sync:
> 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> 	...
> 	add cmd_sync to cmd-queue
> 	dsb
> 	...
> 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
> 
> Although iotlb_range_add can eliminate n-1 dsb operations, one still
> remains. If n is not large enough, this patch is still helpful.

Then pick an n that is large enough, based on the compatible string.

Will

