* [PATCH v2 0/4] ARM SMMU TLB sync improvements
From: Robin Murphy @ 2017-03-30 16:56 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Hi Will,

Here's a quick v2 to address your comments and drop the needless meddling
(whaddaya know, it makes the whole lot look simpler!)

I'll put it on my list to take a look at SMMUv3 queue polling as suggested.

Robin.

Robin Murphy (4):
  iommu/arm-smmu: Simplify ASID/VMID handling
  iommu/arm-smmu: Tidy up context bank indexing
  iommu/arm-smmu: Use per-context TLB sync as appropriate
  iommu/arm-smmu: Poll for TLB sync completion more effectively

 drivers/iommu/arm-smmu.c | 182 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 116 insertions(+), 66 deletions(-)

-- 
2.11.0.dirty


* [PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling
From: Robin Murphy @ 2017-03-30 16:56 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Calculating ASIDs/VMIDs dynamically from arm_smmu_cfg was a neat trick,
but the global uniqueness workaround makes it somewhat more awkward, and
means we end up having to pass extra state around in certain cases just
to keep a handle on the offset.

We already have 16 bits going spare in arm_smmu_cfg; let's just
precalculate an ASID/VMID, plop it in there, and tidy up the users
accordingly. We'd also need something like this anyway if we ever get
near to thinking about SVM, so it's no bad thing.
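
Condensed, the idea looks like this (just a sketch - the diff below has
the real details, including where the global-uniqueness offset comes in):

	struct arm_smmu_cfg {
		u8	cbndx;
		u8	irptndx;
		union {		/* a context bank is either stage 1 or stage 2 */
			u16	asid;
			u16	vmid;
		};
		/* ... */
	};

	/* Filled in once when the domain's context bank is allocated... */
	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
		cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
	else
		cfg->asid = cfg->cbndx + smmu->cavium_id_base;

	/* ...so the TLB maintenance paths just read the stored value */
	writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);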

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: No change

 drivers/iommu/arm-smmu.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index abf6496843a6..34b745bf59f2 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -404,14 +404,15 @@ enum arm_smmu_context_fmt {
 struct arm_smmu_cfg {
 	u8				cbndx;
 	u8				irptndx;
+	union {
+		u16			asid;
+		u16			vmid;
+	};
 	u32				cbar;
 	enum arm_smmu_context_fmt	fmt;
 };
 #define INVALID_IRPTNDX			0xff
 
-#define ARM_SMMU_CB_ASID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx)
-#define ARM_SMMU_CB_VMID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx + 1)
-
 enum arm_smmu_domain_stage {
 	ARM_SMMU_DOMAIN_S1 = 0,
 	ARM_SMMU_DOMAIN_S2,
@@ -603,12 +604,10 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 
 	if (stage1) {
 		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
-		writel_relaxed(ARM_SMMU_CB_ASID(smmu, cfg),
-			       base + ARM_SMMU_CB_S1_TLBIASID);
+		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
 	} else {
 		base = ARM_SMMU_GR0(smmu);
-		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg),
-			       base + ARM_SMMU_GR0_TLBIVMID);
+		writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
 	}
 
 	__arm_smmu_tlb_sync(smmu);
@@ -629,14 +628,14 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 
 		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
 			iova &= ~12UL;
-			iova |= ARM_SMMU_CB_ASID(smmu, cfg);
+			iova |= cfg->asid;
 			do {
 				writel_relaxed(iova, reg);
 				iova += granule;
 			} while (size -= granule);
 		} else {
 			iova >>= 12;
-			iova |= (u64)ARM_SMMU_CB_ASID(smmu, cfg) << 48;
+			iova |= (u64)cfg->asid << 48;
 			do {
 				writeq_relaxed(iova, reg);
 				iova += granule >> 12;
@@ -653,7 +652,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 		} while (size -= granule);
 	} else {
 		reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
-		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg), reg);
+		writel_relaxed(cfg->vmid, reg);
 	}
 }
 
@@ -735,7 +734,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 			reg = CBA2R_RW64_32BIT;
 		/* 16-bit VMIDs live in CBA2R */
 		if (smmu->features & ARM_SMMU_FEAT_VMID16)
-			reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBA2R_VMID_SHIFT;
+			reg |= cfg->vmid << CBA2R_VMID_SHIFT;
 
 		writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
 	}
@@ -754,26 +753,24 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 			(CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
 	} else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
 		/* 8-bit VMIDs live in CBAR */
-		reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBAR_VMID_SHIFT;
+		reg |= cfg->vmid << CBAR_VMID_SHIFT;
 	}
 	writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
 
 	/* TTBRs */
 	if (stage1) {
-		u16 asid = ARM_SMMU_CB_ASID(smmu, cfg);
-
 		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
 			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
 			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0);
 			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
 			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1);
-			writel_relaxed(asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
+			writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
 		} else {
 			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
-			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
+			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
 			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR0);
 			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
-			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
+			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
 			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR1);
 		}
 	} else {
@@ -941,6 +938,11 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		cfg->irptndx = cfg->cbndx;
 	}
 
+	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
+		cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
+	else
+		cfg->asid = cfg->cbndx + smmu->cavium_id_base;
+
 	pgtbl_cfg = (struct io_pgtable_cfg) {
 		.pgsize_bitmap	= smmu->pgsize_bitmap,
 		.ias		= ias,
-- 
2.11.0.dirty


* [PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing
From: Robin Murphy @ 2017-03-30 16:56 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

ARM_SMMU_CB() is calculated relative to ARM_SMMU_CB_BASE(), but the
latter is never of use on its own, and what we end up with is the same
ARM_SMMU_CB_BASE() + ARM_SMMU_CB() expression being duplicated at every
callsite. Folding the two together gives us a self-contained context
bank accessor which is much more pleasant to work with.

Secondly, we might as well simplify CB_BASE itself at the same time.
We use the address space size for its own sake precisely once, at probe
time, and every other usage is to dynamically calculate CB_BASE over
and over and over again. Let's flip things around so that we just
maintain the CB_BASE address directly.
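
In other words (sketch only; the exact expressions are in the diff below):

	/* Before: the context bank base is recomputed at every access */
	#define ARM_SMMU_CB_BASE(smmu)	((smmu)->base + ((smmu)->size >> 1))
	#define ARM_SMMU_CB(smmu, n)	((n) * (1 << (smmu)->pgshift))
	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);

	/* After: cb_base is derived once at probe time... */
	smmu->cb_base = smmu->base + resource_size(res) / 2;
	/* ...and the accessor is self-contained */
	#define ARM_SMMU_CB(smmu, n)	((smmu)->cb_base + ((n) << (smmu)->pgshift))
	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);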

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: No change

 drivers/iommu/arm-smmu.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 34b745bf59f2..e79b623f9165 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -216,8 +216,7 @@ enum arm_smmu_s2cr_privcfg {
 #define CBA2R_VMID_MASK			0xffff
 
 /* Translation context bank */
-#define ARM_SMMU_CB_BASE(smmu)		((smmu)->base + ((smmu)->size >> 1))
-#define ARM_SMMU_CB(smmu, n)		((n) * (1 << (smmu)->pgshift))
+#define ARM_SMMU_CB(smmu, n)	((smmu)->cb_base + ((n) << (smmu)->pgshift))
 
 #define ARM_SMMU_CB_SCTLR		0x0
 #define ARM_SMMU_CB_ACTLR		0x4
@@ -344,7 +343,7 @@ struct arm_smmu_device {
 	struct device			*dev;
 
 	void __iomem			*base;
-	unsigned long			size;
+	void __iomem			*cb_base;
 	unsigned long			pgshift;
 
 #define ARM_SMMU_FEAT_COHERENT_WALK	(1 << 0)
@@ -603,7 +602,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
 	void __iomem *base;
 
 	if (stage1) {
-		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+		base = ARM_SMMU_CB(smmu, cfg->cbndx);
 		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
 	} else {
 		base = ARM_SMMU_GR0(smmu);
@@ -623,7 +622,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 	void __iomem *reg;
 
 	if (stage1) {
-		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
 		reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 
 		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
@@ -642,7 +641,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 			} while (size -= granule);
 		}
 	} else if (smmu->version == ARM_SMMU_V2) {
-		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
 		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
 			      ARM_SMMU_CB_S2_TLBIIPAS2;
 		iova >>= 12;
@@ -672,7 +671,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	void __iomem *cb_base;
 
-	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 	fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
 
 	if (!(fsr & FSR_FAULT))
@@ -725,7 +724,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 
 	gr1_base = ARM_SMMU_GR1(smmu);
 	stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 
 	if (smmu->version > ARM_SMMU_V1) {
 		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
@@ -1007,7 +1006,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
 	 * Disable the context bank and free the page tables before freeing
 	 * it.
 	 */
-	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 	writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
 
 	if (cfg->irptndx != INVALID_IRPTNDX) {
@@ -1358,7 +1357,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
 	u64 phys;
 	unsigned long va;
 
-	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
+	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
 
 	/* ATS1 registers can only be written atomically */
 	va = iova & ~0xfffUL;
@@ -1685,7 +1684,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
 
 	/* Make sure all context banks are disabled and clear CB_FSR  */
 	for (i = 0; i < smmu->num_context_banks; ++i) {
-		cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
+		cb_base = ARM_SMMU_CB(smmu, i);
 		writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
 		writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
 		/*
@@ -1865,11 +1864,11 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
 
 	/* Check for size mismatch of SMMU address space from mapped region */
 	size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
-	size *= 2 << smmu->pgshift;
-	if (smmu->size != size)
+	size <<= smmu->pgshift;
+	if (smmu->cb_base != gr0_base + size)
 		dev_warn(smmu->dev,
 			"SMMU address space size (0x%lx) differs from mapped region size (0x%lx)!\n",
-			size, smmu->size);
+			size * 2, (smmu->cb_base - gr0_base) * 2);
 
 	smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK;
 	smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
@@ -2105,7 +2104,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
 	smmu->base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(smmu->base))
 		return PTR_ERR(smmu->base);
-	smmu->size = resource_size(res);
+	smmu->cb_base = smmu->base + resource_size(res) / 2;
 
 	num_irqs = 0;
 	while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
-- 
2.11.0.dirty


* [PATCH v2 3/4] iommu/arm-smmu: Use per-context TLB sync as appropriate
From: Robin Murphy @ 2017-03-30 16:56 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

TLB synchronisation typically involves the SMMU blocking all incoming
transactions until the TLBs report completion of all outstanding
operations. In the common SMMUv2 configuration of a single distributed
SMMU serving multiple peripherals, that means that a single unmap
request has the potential to bring the hammer down on the entire system
if synchronised globally. Since stage 1 contexts, and stage 2 contexts
under SMMUv2, offer local sync operations, let's make use of those
wherever we can in the hope of minimising global disruption.

To that end, rather than add any more branches to the already unwieldy
monolithic TLB maintenance ops, break them up into smaller, neater
functions which we can then mix and match as appropriate.
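
Concretely, each domain now picks its ops at init time, roughly along
these lines (a sketch of the selection logic added to
arm_smmu_init_domain_context() in the diff below):

	/* Stage 1 can always sync on its own context bank */
	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1)
		tlb_ops = &arm_smmu_s1_tlb_ops;
	/* Stage 2 under SMMUv2 also has per-context TLBSYNC/TLBSTATUS */
	else if (smmu->version == ARM_SMMU_V2)
		tlb_ops = &arm_smmu_s2_tlb_ops_v2;
	/* SMMUv1 stage 2 is stuck with TLBIVMID and the global sync */
	else
		tlb_ops = &arm_smmu_s2_tlb_ops_v1;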

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: Undo unnecessary refactoring of arm_smmu_tlb_inv_range_nosync()
    (and forget about further attempts, since this turns out to be
    all of 3 lines longer than v1)

 drivers/iommu/arm-smmu.c | 115 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 81 insertions(+), 34 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index e79b623f9165..759d5f261160 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -237,6 +237,8 @@ enum arm_smmu_s2cr_privcfg {
 #define ARM_SMMU_CB_S1_TLBIVAL		0x620
 #define ARM_SMMU_CB_S2_TLBIIPAS2	0x630
 #define ARM_SMMU_CB_S2_TLBIIPAS2L	0x638
+#define ARM_SMMU_CB_TLBSYNC		0x7f0
+#define ARM_SMMU_CB_TLBSTATUS		0x7f4
 #define ARM_SMMU_CB_ATS1PR		0x800
 #define ARM_SMMU_CB_ATSR		0x8f0
 
@@ -569,14 +571,13 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
 }
 
 /* Wait for any pending TLB invalidations to complete */
-static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
+static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
+				void __iomem *sync, void __iomem *status)
 {
 	int count = 0;
-	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 
-	writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
-	while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS)
-	       & sTLBGSTATUS_GSACTIVE) {
+	writel_relaxed(0, sync);
+	while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
 		cpu_relax();
 		if (++count == TLB_LOOP_TIMEOUT) {
 			dev_err_ratelimited(smmu->dev,
@@ -587,29 +588,49 @@ static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu)
 	}
 }
 
-static void arm_smmu_tlb_sync(void *cookie)
+static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
 {
-	struct arm_smmu_domain *smmu_domain = cookie;
-	__arm_smmu_tlb_sync(smmu_domain->smmu);
+	void __iomem *base = ARM_SMMU_GR0(smmu);
+
+	__arm_smmu_tlb_sync(smmu, base + ARM_SMMU_GR0_sTLBGSYNC,
+			    base + ARM_SMMU_GR0_sTLBGSTATUS);
 }
 
-static void arm_smmu_tlb_inv_context(void *cookie)
+static void arm_smmu_tlb_sync_context(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	struct arm_smmu_device *smmu = smmu_domain->smmu;
+	void __iomem *base = ARM_SMMU_CB(smmu, smmu_domain->cfg.cbndx);
+
+	__arm_smmu_tlb_sync(smmu, base + ARM_SMMU_CB_TLBSYNC,
+			    base + ARM_SMMU_CB_TLBSTATUS);
+}
+
+static void arm_smmu_tlb_sync_vmid(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+
+	arm_smmu_tlb_sync_global(smmu_domain->smmu);
+}
+
+static void arm_smmu_tlb_inv_context_s1(void *cookie)
 {
 	struct arm_smmu_domain *smmu_domain = cookie;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	void __iomem *base = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
+
+	writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
+	arm_smmu_tlb_sync_context(cookie);
+}
+
+static void arm_smmu_tlb_inv_context_s2(void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-	void __iomem *base;
+	void __iomem *base = ARM_SMMU_GR0(smmu);
 
-	if (stage1) {
-		base = ARM_SMMU_CB(smmu, cfg->cbndx);
-		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
-	} else {
-		base = ARM_SMMU_GR0(smmu);
-		writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
-	}
-
-	__arm_smmu_tlb_sync(smmu);
+	writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+	arm_smmu_tlb_sync_global(smmu);
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
@@ -617,12 +638,10 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 {
 	struct arm_smmu_domain *smmu_domain = cookie;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
-	void __iomem *reg;
+	void __iomem *reg = ARM_SMMU_CB(smmu_domain->smmu, cfg->cbndx);
 
 	if (stage1) {
-		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
 		reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
 
 		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
@@ -640,8 +659,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 				iova += granule >> 12;
 			} while (size -= granule);
 		}
-	} else if (smmu->version == ARM_SMMU_V2) {
-		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
+	} else {
 		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
 			      ARM_SMMU_CB_S2_TLBIIPAS2;
 		iova >>= 12;
@@ -649,16 +667,40 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
 			smmu_write_atomic_lq(iova, reg);
 			iova += granule >> 12;
 		} while (size -= granule);
-	} else {
-		reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
-		writel_relaxed(cfg->vmid, reg);
 	}
 }
 
-static const struct iommu_gather_ops arm_smmu_gather_ops = {
-	.tlb_flush_all	= arm_smmu_tlb_inv_context,
+/*
+ * On MMU-401 at least, the cost of firing off multiple TLBIVMIDs appears
+ * almost negligible, but the benefit of getting the first one in as far ahead
+ * of the sync as possible is significant, hence we don't just make this a
+ * no-op and set .tlb_sync to arm_smmu_inv_context_s2() as you might think.
+ */
+static void arm_smmu_tlb_inv_vmid_nosync(unsigned long iova, size_t size,
+					 size_t granule, bool leaf, void *cookie)
+{
+	struct arm_smmu_domain *smmu_domain = cookie;
+	void __iomem *base = ARM_SMMU_GR0(smmu_domain->smmu);
+
+	writel_relaxed(smmu_domain->cfg.vmid, base + ARM_SMMU_GR0_TLBIVMID);
+}
+
+static const struct iommu_gather_ops arm_smmu_s1_tlb_ops = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s1,
 	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
-	.tlb_sync	= arm_smmu_tlb_sync,
+	.tlb_sync	= arm_smmu_tlb_sync_context,
+};
+
+static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v2 = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s2,
+	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync_context,
+};
+
+static const struct iommu_gather_ops arm_smmu_s2_tlb_ops_v1 = {
+	.tlb_flush_all	= arm_smmu_tlb_inv_context_s2,
+	.tlb_add_flush	= arm_smmu_tlb_inv_vmid_nosync,
+	.tlb_sync	= arm_smmu_tlb_sync_vmid,
 };
 
 static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
@@ -829,6 +871,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 	enum io_pgtable_fmt fmt;
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+	const struct iommu_gather_ops *tlb_ops;
 
 	mutex_lock(&smmu_domain->init_mutex);
 	if (smmu_domain->smmu)
@@ -900,6 +943,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 			ias = min(ias, 32UL);
 			oas = min(oas, 32UL);
 		}
+		tlb_ops = &arm_smmu_s1_tlb_ops;
 		break;
 	case ARM_SMMU_DOMAIN_NESTED:
 		/*
@@ -918,12 +962,15 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 			ias = min(ias, 40UL);
 			oas = min(oas, 40UL);
 		}
+		if (smmu->version == ARM_SMMU_V2)
+			tlb_ops = &arm_smmu_s2_tlb_ops_v2;
+		else
+			tlb_ops = &arm_smmu_s2_tlb_ops_v1;
 		break;
 	default:
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-
 	ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
 				      smmu->num_context_banks);
 	if (ret < 0)
@@ -946,7 +993,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		.pgsize_bitmap	= smmu->pgsize_bitmap,
 		.ias		= ias,
 		.oas		= oas,
-		.tlb		= &arm_smmu_gather_ops,
+		.tlb		= tlb_ops,
 		.iommu_dev	= smmu->dev,
 	};
 
@@ -1730,7 +1777,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
 		reg |= sCR0_EXIDENABLE;
 
 	/* Push the button */
-	__arm_smmu_tlb_sync(smmu);
+	arm_smmu_tlb_sync_global(smmu);
 	writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
 }
 
-- 
2.11.0.dirty


* [PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively
From: Robin Murphy @ 2017-03-30 16:56 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

On relatively slow development platforms and software models, the
inefficiency of our TLB sync loop tends not to show up - for instance on
a Juno r1 board I typically see that the TLBI has completed of its own
accord by the time we get to the sync, such that the latter finishes
instantly.

However, on larger systems doing real I/O, it's less realistic for the
TLBs to go idle immediately, and at that point falling into the 1MHz
polling loop turns out to throw away performance drastically. Let's
strike a balance by polling more than once between pauses, such that we
have much more chance of catching normal operations completing before
committing to the fixed delay, but also backing off exponentially, since
if a sync really hasn't completed within one or two "reasonable time"
periods, it becomes increasingly unlikely that it ever will.
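
The resulting wait loop thus has two phases (sketch; see the diff below):

	writel_relaxed(0, sync);
	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
		/* Phase 1: spin briefly to catch syncs that complete quickly */
		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
			if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
				return;
			cpu_relax();
		}
		/* Phase 2: back off exponentially before polling again */
		udelay(delay);
	}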

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v2: Restored the cpu_relax() to the inner loop

 drivers/iommu/arm-smmu.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 759d5f261160..a15ca86e9703 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -162,6 +162,7 @@
 #define ARM_SMMU_GR0_sTLBGSTATUS	0x74
 #define sTLBGSTATUS_GSACTIVE		(1 << 0)
 #define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
+#define TLB_SPIN_COUNT			10
 
 /* Stream mapping registers */
 #define ARM_SMMU_GR0_SMR(n)		(0x800 + ((n) << 2))
@@ -574,18 +575,19 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
 static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
 				void __iomem *sync, void __iomem *status)
 {
-	int count = 0;
+	unsigned int spin_cnt, delay;
 
 	writel_relaxed(0, sync);
-	while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
-		cpu_relax();
-		if (++count == TLB_LOOP_TIMEOUT) {
-			dev_err_ratelimited(smmu->dev,
-			"TLB sync timed out -- SMMU may be deadlocked\n");
-			return;
+	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
+		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
+			if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
+				return;
+			cpu_relax();
 		}
-		udelay(1);
+		udelay(delay);
 	}
+	dev_err_ratelimited(smmu->dev,
+			    "TLB sync timed out -- SMMU may be deadlocked\n");
 }
 
 static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
-- 
2.11.0.dirty


* Re: [PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling
From: Jordan Crouse @ 2017-03-30 18:37 UTC
  To: Robin Murphy
  Cc: iommu@lists.linux-foundation.org, will.deacon@arm.com,
	linux-arm-kernel@lists.infradead.org

On Thu, Mar 30, 2017 at 05:56:29PM +0100, Robin Murphy wrote:
> Calculating ASIDs/VMIDs dynamically from arm_smmu_cfg was a neat trick,
> but the global uniqueness workaround makes it somewhat more awkward, and
> means we end up having to pass extra state around in certain cases just
> to keep a handle on the offset.
> 
> We already have 16 bits going spare in arm_smmu_cfg; let's just
> precalculate an ASID/VMID, plop it in there, and tidy up the users
> accordingly. We'd also need something like this anyway if we ever get
> near to thinking about SVM, so it's no bad thing.

If it helps:

Reviewed-by: Jordan Crouse <jcrouse@codeaurora.org>

> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index abf6496843a6..34b745bf59f2 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -404,14 +404,15 @@ enum arm_smmu_context_fmt {
>  struct arm_smmu_cfg {
>  	u8				cbndx;
>  	u8				irptndx;
> +	union {
> +		u16			asid;
> +		u16			vmid;
> +	};
>  	u32				cbar;
>  	enum arm_smmu_context_fmt	fmt;
>  };
>  #define INVALID_IRPTNDX			0xff
>  
> -#define ARM_SMMU_CB_ASID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx)
> -#define ARM_SMMU_CB_VMID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx + 1)
> -
>  enum arm_smmu_domain_stage {
>  	ARM_SMMU_DOMAIN_S1 = 0,
>  	ARM_SMMU_DOMAIN_S2,
> @@ -603,12 +604,10 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  
>  	if (stage1) {
>  		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> -		writel_relaxed(ARM_SMMU_CB_ASID(smmu, cfg),
> -			       base + ARM_SMMU_CB_S1_TLBIASID);
> +		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>  	} else {
>  		base = ARM_SMMU_GR0(smmu);
> -		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg),
> -			       base + ARM_SMMU_GR0_TLBIVMID);
> +		writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
>  	}
>  
>  	__arm_smmu_tlb_sync(smmu);
> @@ -629,14 +628,14 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  
>  		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>  			iova &= ~12UL;
> -			iova |= ARM_SMMU_CB_ASID(smmu, cfg);
> +			iova |= cfg->asid;
>  			do {
>  				writel_relaxed(iova, reg);
>  				iova += granule;
>  			} while (size -= granule);
>  		} else {
>  			iova >>= 12;
> -			iova |= (u64)ARM_SMMU_CB_ASID(smmu, cfg) << 48;
> +			iova |= (u64)cfg->asid << 48;
>  			do {
>  				writeq_relaxed(iova, reg);
>  				iova += granule >> 12;
> @@ -653,7 +652,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  		} while (size -= granule);
>  	} else {
>  		reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
> -		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg), reg);
> +		writel_relaxed(cfg->vmid, reg);
>  	}
>  }
>  
> @@ -735,7 +734,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  			reg = CBA2R_RW64_32BIT;
>  		/* 16-bit VMIDs live in CBA2R */
>  		if (smmu->features & ARM_SMMU_FEAT_VMID16)
> -			reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBA2R_VMID_SHIFT;
> +			reg |= cfg->vmid << CBA2R_VMID_SHIFT;
>  
>  		writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
>  	}
> @@ -754,26 +753,24 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  			(CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
>  	} else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
>  		/* 8-bit VMIDs live in CBAR */
> -		reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBAR_VMID_SHIFT;
> +		reg |= cfg->vmid << CBAR_VMID_SHIFT;
>  	}
>  	writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
>  
>  	/* TTBRs */
>  	if (stage1) {
> -		u16 asid = ARM_SMMU_CB_ASID(smmu, cfg);
> -
>  		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
>  			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
>  			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0);
>  			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
>  			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1);
> -			writel_relaxed(asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
> +			writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
>  		} else {
>  			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
> -			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
> +			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
>  			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR0);
>  			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
> -			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
> +			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
>  			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR1);
>  		}
>  	} else {
> @@ -941,6 +938,11 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
>  		cfg->irptndx = cfg->cbndx;
>  	}
>  
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
> +		cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
> +	else
> +		cfg->asid = cfg->cbndx + smmu->cavium_id_base;
> +
>  	pgtbl_cfg = (struct io_pgtable_cfg) {
>  		.pgsize_bitmap	= smmu->pgsize_bitmap,
>  		.ias		= ias,
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling
@ 2017-03-30 18:37         ` Jordan Crouse
  0 siblings, 0 replies; 18+ messages in thread
From: Jordan Crouse @ 2017-03-30 18:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Mar 30, 2017 at 05:56:29PM +0100, Robin Murphy wrote:
> Calculating ASIDs/VMIDs dynamically from arm_smmu_cfg was a neat trick,
> but the global uniqueness workaround makes it somewhat more awkward, and
> means we end up having to pass extra state around in certain cases just
> to keep a handle on the offset.
> 
> We already have 16 bits going spare in arm_smmu_cfg; let's just
> precalculate an ASID/VMID, plop it in there, and tidy up the users
> accordingly. We'd also need something like this anyway if we ever get
> near to thinking about SVM, so it's no bad thing.

If it helps:

Reviewed-by: Jordan Crouse <jcrouse@codeaurora.org>

> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 36 +++++++++++++++++++-----------------
>  1 file changed, 19 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index abf6496843a6..34b745bf59f2 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -404,14 +404,15 @@ enum arm_smmu_context_fmt {
>  struct arm_smmu_cfg {
>  	u8				cbndx;
>  	u8				irptndx;
> +	union {
> +		u16			asid;
> +		u16			vmid;
> +	};
>  	u32				cbar;
>  	enum arm_smmu_context_fmt	fmt;
>  };
>  #define INVALID_IRPTNDX			0xff
>  
> -#define ARM_SMMU_CB_ASID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx)
> -#define ARM_SMMU_CB_VMID(smmu, cfg) ((u16)(smmu)->cavium_id_base + (cfg)->cbndx + 1)
> -
>  enum arm_smmu_domain_stage {
>  	ARM_SMMU_DOMAIN_S1 = 0,
>  	ARM_SMMU_DOMAIN_S2,
> @@ -603,12 +604,10 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  
>  	if (stage1) {
>  		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> -		writel_relaxed(ARM_SMMU_CB_ASID(smmu, cfg),
> -			       base + ARM_SMMU_CB_S1_TLBIASID);
> +		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>  	} else {
>  		base = ARM_SMMU_GR0(smmu);
> -		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg),
> -			       base + ARM_SMMU_GR0_TLBIVMID);
> +		writel_relaxed(cfg->vmid, base + ARM_SMMU_GR0_TLBIVMID);
>  	}
>  
>  	__arm_smmu_tlb_sync(smmu);
> @@ -629,14 +628,14 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  
>  		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
>  			iova &= ~12UL;
> -			iova |= ARM_SMMU_CB_ASID(smmu, cfg);
> +			iova |= cfg->asid;
>  			do {
>  				writel_relaxed(iova, reg);
>  				iova += granule;
>  			} while (size -= granule);
>  		} else {
>  			iova >>= 12;
> -			iova |= (u64)ARM_SMMU_CB_ASID(smmu, cfg) << 48;
> +			iova |= (u64)cfg->asid << 48;
>  			do {
>  				writeq_relaxed(iova, reg);
>  				iova += granule >> 12;
> @@ -653,7 +652,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  		} while (size -= granule);
>  	} else {
>  		reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
> -		writel_relaxed(ARM_SMMU_CB_VMID(smmu, cfg), reg);
> +		writel_relaxed(cfg->vmid, reg);
>  	}
>  }
>  
> @@ -735,7 +734,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  			reg = CBA2R_RW64_32BIT;
>  		/* 16-bit VMIDs live in CBA2R */
>  		if (smmu->features & ARM_SMMU_FEAT_VMID16)
> -			reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBA2R_VMID_SHIFT;
> +			reg |= cfg->vmid << CBA2R_VMID_SHIFT;
>  
>  		writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
>  	}
> @@ -754,26 +753,24 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  			(CBAR_S1_MEMATTR_WB << CBAR_S1_MEMATTR_SHIFT);
>  	} else if (!(smmu->features & ARM_SMMU_FEAT_VMID16)) {
>  		/* 8-bit VMIDs live in CBAR */
> -		reg |= ARM_SMMU_CB_VMID(smmu, cfg) << CBAR_VMID_SHIFT;
> +		reg |= cfg->vmid << CBAR_VMID_SHIFT;
>  	}
>  	writel_relaxed(reg, gr1_base + ARM_SMMU_GR1_CBAR(cfg->cbndx));
>  
>  	/* TTBRs */
>  	if (stage1) {
> -		u16 asid = ARM_SMMU_CB_ASID(smmu, cfg);
> -
>  		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH32_S) {
>  			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[0];
>  			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR0);
>  			reg = pgtbl_cfg->arm_v7s_cfg.ttbr[1];
>  			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBR1);
> -			writel_relaxed(asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
> +			writel_relaxed(cfg->asid, cb_base + ARM_SMMU_CB_CONTEXTIDR);
>  		} else {
>  			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[0];
> -			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
> +			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
>  			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR0);
>  			reg64 = pgtbl_cfg->arm_lpae_s1_cfg.ttbr[1];
> -			reg64 |= (u64)asid << TTBRn_ASID_SHIFT;
> +			reg64 |= (u64)cfg->asid << TTBRn_ASID_SHIFT;
>  			writeq_relaxed(reg64, cb_base + ARM_SMMU_CB_TTBR1);
>  		}
>  	} else {
> @@ -941,6 +938,11 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
>  		cfg->irptndx = cfg->cbndx;
>  	}
>  
> +	if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
> +		cfg->vmid = cfg->cbndx + 1 + smmu->cavium_id_base;
> +	else
> +		cfg->asid = cfg->cbndx + smmu->cavium_id_base;
> +
>  	pgtbl_cfg = (struct io_pgtable_cfg) {
>  		.pgsize_bitmap	= smmu->pgsize_bitmap,
>  		.ias		= ias,
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu at lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread
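
For anyone skimming the thread without the driver source to hand, the idea boils
down to computing the ID once, when the context bank is allocated, and storing it
in the config instead of re-deriving it at every TLB operation. Below is a
minimal, self-contained sketch of that scheme; struct cfg, init_id and the id_base
values are hypothetical stand-ins rather than the driver's arm_smmu_cfg or
cavium_id_base, and the authoritative change is the hunk quoted above.

#include <stdint.h>
#include <stdio.h>

/* Stripped-down stand-in for struct arm_smmu_cfg (hypothetical). */
struct cfg {
	uint8_t cbndx;		/* context bank index within one SMMU */
	union {			/* a context is stage 1 *or* stage 2, never both */
		uint16_t asid;	/* stage 1 */
		uint16_t vmid;	/* stage 2 */
	};
};

/*
 * Precompute the ID once, folding in a per-SMMU base so that IDs stay
 * globally unique on systems that need the workaround.
 */
static void init_id(struct cfg *cfg, uint16_t id_base, int stage2)
{
	if (stage2)
		cfg->vmid = cfg->cbndx + 1 + id_base;	/* +1 keeps VMID 0 out of use, as in the old macro */
	else
		cfg->asid = cfg->cbndx + id_base;
}

int main(void)
{
	/* Two SMMU instances given disjoint ID ranges via their bases. */
	struct cfg a = { .cbndx = 3 }, b = { .cbndx = 3 };

	init_id(&a, 0, 0);
	init_id(&b, 8, 0);
	printf("asid a=%d b=%d\n", a.asid, b.asid);	/* 3 and 11: no clash */
	return 0;
}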

* Re: [PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing
  2017-03-30 16:56     ` Robin Murphy
@ 2017-03-30 18:40         ` Jordan Crouse
  -1 siblings, 0 replies; 18+ messages in thread
From: Jordan Crouse @ 2017-03-30 18:40 UTC (permalink / raw)
  To: Robin Murphy
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	will.deacon-5wv7dgnIgG8,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On Thu, Mar 30, 2017 at 05:56:30PM +0100, Robin Murphy wrote:
> ARM_SMMU_CB() is calculated relative to ARM_SMMU_CB_BASE(), but the
> latter is never of use on its own, and what we end up with is the same
> ARM_SMMU_CB_BASE() + ARM_SMMU_CB() expression being duplicated at every
> callsite. Folding the two together gives us a self-contained context
> bank accessor which is much more pleasant to work with.
> 
> Secondly, we might as well simplify CB_BASE itself at the same time.
> We use the address space size for its own sake precisely once, at probe
> time, and every other usage is to dynamically calculate CB_BASE over
> and over and over again. Let's flip things around so that we just
> maintain the CB_BASE address directly.

Reviewed-by: Jordan Crouse <jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>

> Signed-off-by: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 29 ++++++++++++++---------------
>  1 file changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 34b745bf59f2..e79b623f9165 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -216,8 +216,7 @@ enum arm_smmu_s2cr_privcfg {
>  #define CBA2R_VMID_MASK			0xffff
>  
>  /* Translation context bank */
> -#define ARM_SMMU_CB_BASE(smmu)		((smmu)->base + ((smmu)->size >> 1))
> -#define ARM_SMMU_CB(smmu, n)		((n) * (1 << (smmu)->pgshift))
> +#define ARM_SMMU_CB(smmu, n)	((smmu)->cb_base + ((n) << (smmu)->pgshift))
>  
>  #define ARM_SMMU_CB_SCTLR		0x0
>  #define ARM_SMMU_CB_ACTLR		0x4
> @@ -344,7 +343,7 @@ struct arm_smmu_device {
>  	struct device			*dev;
>  
>  	void __iomem			*base;
> -	unsigned long			size;
> +	void __iomem			*cb_base;
>  	unsigned long			pgshift;
>  
>  #define ARM_SMMU_FEAT_COHERENT_WALK	(1 << 0)
> @@ -603,7 +602,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  	void __iomem *base;
>  
>  	if (stage1) {
> -		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>  	} else {
>  		base = ARM_SMMU_GR0(smmu);
> @@ -623,7 +622,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  	void __iomem *reg;
>  
>  	if (stage1) {
> -		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
>  
>  		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> @@ -642,7 +641,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  			} while (size -= granule);
>  		}
>  	} else if (smmu->version == ARM_SMMU_V2) {
> -		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
>  			      ARM_SMMU_CB_S2_TLBIIPAS2;
>  		iova >>= 12;
> @@ -672,7 +671,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
>  	struct arm_smmu_device *smmu = smmu_domain->smmu;
>  	void __iomem *cb_base;
>  
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  	fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
>  
>  	if (!(fsr & FSR_FAULT))
> @@ -725,7 +724,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  
>  	gr1_base = ARM_SMMU_GR1(smmu);
>  	stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>  	if (smmu->version > ARM_SMMU_V1) {
>  		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
> @@ -1007,7 +1006,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
>  	 * Disable the context bank and free the page tables before freeing
>  	 * it.
>  	 */
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  	writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
>  
>  	if (cfg->irptndx != INVALID_IRPTNDX) {
> @@ -1358,7 +1357,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
>  	u64 phys;
>  	unsigned long va;
>  
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>  	/* ATS1 registers can only be written atomically */
>  	va = iova & ~0xfffUL;
> @@ -1685,7 +1684,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
>  
>  	/* Make sure all context banks are disabled and clear CB_FSR  */
>  	for (i = 0; i < smmu->num_context_banks; ++i) {
> -		cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
> +		cb_base = ARM_SMMU_CB(smmu, i);
>  		writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
>  		writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
>  		/*
> @@ -1865,11 +1864,11 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
>  
>  	/* Check for size mismatch of SMMU address space from mapped region */
>  	size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
> -	size *= 2 << smmu->pgshift;
> -	if (smmu->size != size)
> +	size <<= smmu->pgshift;
> +	if (smmu->cb_base != gr0_base + size)
>  		dev_warn(smmu->dev,
>  			"SMMU address space size (0x%lx) differs from mapped region size (0x%lx)!\n",
> -			size, smmu->size);
> +			size * 2, (smmu->cb_base - gr0_base) * 2);
>  
>  	smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK;
>  	smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
> @@ -2105,7 +2104,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>  	smmu->base = devm_ioremap_resource(dev, res);
>  	if (IS_ERR(smmu->base))
>  		return PTR_ERR(smmu->base);
> -	smmu->size = resource_size(res);
> +	smmu->cb_base = smmu->base + resource_size(res) / 2;
>  
>  	num_irqs = 0;
>  	while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread
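
The shape of the change, boiled down: compute the context-bank base pointer once
at probe time, then let a single accessor do the indexing. The sketch below uses
made-up names (struct dev, CB_BASE, CB_OFF, CB) purely to contrast the before and
after forms; the real macros and fields are in the hunks quoted above.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical mini-device; only the fields the macros need. */
struct dev {
	void *base;		/* start of the mapped register region */
	void *cb_base;		/* start of the context bank half, set once at probe */
	unsigned long pgshift;
	size_t size;		/* total mapped size */
};

/* Before: every call site pairs a "base of CBs" macro with an offset macro. */
#define CB_BASE(d)	((char *)(d)->base + ((d)->size >> 1))
#define CB_OFF(d, n)	((n) * (1UL << (d)->pgshift))

/* After: cb_base is computed once at probe time, and one macro does it all. */
#define CB(d, n)	((char *)(d)->cb_base + ((n) << (d)->pgshift))

int main(void)
{
	static char regs[8192];
	struct dev d = {
		.base = regs,
		.size = sizeof(regs),
		.pgshift = 12,
	};

	d.cb_base = (char *)d.base + d.size / 2;	/* probe-time setup */

	/* Both forms name the same address for context bank 0. */
	return (CB_BASE(&d) + CB_OFF(&d, 0)) != CB(&d, 0);
}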

* [PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing
@ 2017-03-30 18:40         ` Jordan Crouse
  0 siblings, 0 replies; 18+ messages in thread
From: Jordan Crouse @ 2017-03-30 18:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Mar 30, 2017 at 05:56:30PM +0100, Robin Murphy wrote:
> ARM_SMMU_CB() is calculated relative to ARM_SMMU_CB_BASE(), but the
> latter is never of use on its own, and what we end up with is the same
> ARM_SMMU_CB_BASE() + ARM_SMMU_CB() expression being duplicated at every
> callsite. Folding the two together gives us a self-contained context
> bank accessor which is much more pleasant to work with.
> 
> Secondly, we might as well simplify CB_BASE itself at the same time.
> We use the address space size for its own sake precisely once, at probe
> time, and every other usage is to dynamically calculate CB_BASE over
> and over and over again. Let's flip things around so that we just
> maintain the CB_BASE address directly.

Reviewed-by: Jordan Crouse <jcrouse@codeaurora.org>

> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: No change
> 
>  drivers/iommu/arm-smmu.c | 29 ++++++++++++++---------------
>  1 file changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 34b745bf59f2..e79b623f9165 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -216,8 +216,7 @@ enum arm_smmu_s2cr_privcfg {
>  #define CBA2R_VMID_MASK			0xffff
>  
>  /* Translation context bank */
> -#define ARM_SMMU_CB_BASE(smmu)		((smmu)->base + ((smmu)->size >> 1))
> -#define ARM_SMMU_CB(smmu, n)		((n) * (1 << (smmu)->pgshift))
> +#define ARM_SMMU_CB(smmu, n)	((smmu)->cb_base + ((n) << (smmu)->pgshift))
>  
>  #define ARM_SMMU_CB_SCTLR		0x0
>  #define ARM_SMMU_CB_ACTLR		0x4
> @@ -344,7 +343,7 @@ struct arm_smmu_device {
>  	struct device			*dev;
>  
>  	void __iomem			*base;
> -	unsigned long			size;
> +	void __iomem			*cb_base;
>  	unsigned long			pgshift;
>  
>  #define ARM_SMMU_FEAT_COHERENT_WALK	(1 << 0)
> @@ -603,7 +602,7 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>  	void __iomem *base;
>  
>  	if (stage1) {
> -		base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		writel_relaxed(cfg->asid, base + ARM_SMMU_CB_S1_TLBIASID);
>  	} else {
>  		base = ARM_SMMU_GR0(smmu);
> @@ -623,7 +622,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  	void __iomem *reg;
>  
>  	if (stage1) {
> -		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		reg += leaf ? ARM_SMMU_CB_S1_TLBIVAL : ARM_SMMU_CB_S1_TLBIVA;
>  
>  		if (cfg->fmt != ARM_SMMU_CTX_FMT_AARCH64) {
> @@ -642,7 +641,7 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>  			} while (size -= granule);
>  		}
>  	} else if (smmu->version == ARM_SMMU_V2) {
> -		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +		reg = ARM_SMMU_CB(smmu, cfg->cbndx);
>  		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
>  			      ARM_SMMU_CB_S2_TLBIIPAS2;
>  		iova >>= 12;
> @@ -672,7 +671,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
>  	struct arm_smmu_device *smmu = smmu_domain->smmu;
>  	void __iomem *cb_base;
>  
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  	fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR);
>  
>  	if (!(fsr & FSR_FAULT))
> @@ -725,7 +724,7 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
>  
>  	gr1_base = ARM_SMMU_GR1(smmu);
>  	stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>  	if (smmu->version > ARM_SMMU_V1) {
>  		if (cfg->fmt == ARM_SMMU_CTX_FMT_AARCH64)
> @@ -1007,7 +1006,7 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
>  	 * Disable the context bank and free the page tables before freeing
>  	 * it.
>  	 */
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  	writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
>  
>  	if (cfg->irptndx != INVALID_IRPTNDX) {
> @@ -1358,7 +1357,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
>  	u64 phys;
>  	unsigned long va;
>  
> -	cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
> +	cb_base = ARM_SMMU_CB(smmu, cfg->cbndx);
>  
>  	/* ATS1 registers can only be written atomically */
>  	va = iova & ~0xfffUL;
> @@ -1685,7 +1684,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
>  
>  	/* Make sure all context banks are disabled and clear CB_FSR  */
>  	for (i = 0; i < smmu->num_context_banks; ++i) {
> -		cb_base = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, i);
> +		cb_base = ARM_SMMU_CB(smmu, i);
>  		writel_relaxed(0, cb_base + ARM_SMMU_CB_SCTLR);
>  		writel_relaxed(FSR_FAULT, cb_base + ARM_SMMU_CB_FSR);
>  		/*
> @@ -1865,11 +1864,11 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
>  
>  	/* Check for size mismatch of SMMU address space from mapped region */
>  	size = 1 << (((id >> ID1_NUMPAGENDXB_SHIFT) & ID1_NUMPAGENDXB_MASK) + 1);
> -	size *= 2 << smmu->pgshift;
> -	if (smmu->size != size)
> +	size <<= smmu->pgshift;
> +	if (smmu->cb_base != gr0_base + size)
>  		dev_warn(smmu->dev,
>  			"SMMU address space size (0x%lx) differs from mapped region size (0x%lx)!\n",
> -			size, smmu->size);
> +			size * 2, (smmu->cb_base - gr0_base) * 2);
>  
>  	smmu->num_s2_context_banks = (id >> ID1_NUMS2CB_SHIFT) & ID1_NUMS2CB_MASK;
>  	smmu->num_context_banks = (id >> ID1_NUMCB_SHIFT) & ID1_NUMCB_MASK;
> @@ -2105,7 +2104,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev)
>  	smmu->base = devm_ioremap_resource(dev, res);
>  	if (IS_ERR(smmu->base))
>  		return PTR_ERR(smmu->base);
> -	smmu->size = resource_size(res);
> +	smmu->cb_base = smmu->base + resource_size(res) / 2;
>  
>  	num_irqs = 0;
>  	while ((res = platform_get_resource(pdev, IORESOURCE_IRQ, num_irqs))) {
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu at lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively
  2017-03-30 16:56     ` Robin Murphy
@ 2017-03-30 18:51         ` Jordan Crouse
  -1 siblings, 0 replies; 18+ messages in thread
From: Jordan Crouse @ 2017-03-30 18:51 UTC (permalink / raw)
  To: Robin Murphy
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	will.deacon-5wv7dgnIgG8,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On Thu, Mar 30, 2017 at 05:56:32PM +0100, Robin Murphy wrote:
> On relatively slow development platforms and software models, the
> inefficiency of our TLB sync loop tends not to show up - for instance on
> a Juno r1 board I typically see the TLBI has completed of its own accord
> by the time we get to the sync, such that the latter finishes instantly.
> 
> However, on larger systems doing real I/O, it's less realistic for the
> TLBs to go idle immediately, and at that point falling into the 1MHz
> polling loop turns out to throw away performance drastically. Let's
> strike a balance by polling more than once between pauses, such that we
> have much more chance of catching normal operations completing before
> committing to the fixed delay, but also backing off exponentially, since
> if a sync really hasn't completed within one or two "reasonable time"
> periods, it becomes increasingly unlikely that it ever will.

I really really like this.

Reviewed-by: Jordan Crouse <jcrouse-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>

> Signed-off-by: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>
> ---
> 
> v2: Restored the cpu_relax() to the inner loop
> 
>  drivers/iommu/arm-smmu.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 759d5f261160..a15ca86e9703 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -162,6 +162,7 @@
>  #define ARM_SMMU_GR0_sTLBGSTATUS	0x74
>  #define sTLBGSTATUS_GSACTIVE		(1 << 0)
>  #define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
> +#define TLB_SPIN_COUNT			10
>  
>  /* Stream mapping registers */
>  #define ARM_SMMU_GR0_SMR(n)		(0x800 + ((n) << 2))
> @@ -574,18 +575,19 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
>  static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>  				void __iomem *sync, void __iomem *status)
>  {
> -	int count = 0;
> +	unsigned int spin_cnt, delay;
>  
>  	writel_relaxed(0, sync);
> -	while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
> -		cpu_relax();
> -		if (++count == TLB_LOOP_TIMEOUT) {
> -			dev_err_ratelimited(smmu->dev,
> -			"TLB sync timed out -- SMMU may be deadlocked\n");
> -			return;
> +	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> +		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> +			if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
> +				return;
> +			cpu_relax();
>  		}
> -		udelay(1);
> +		udelay(delay);
>  	}
> +	dev_err_ratelimited(smmu->dev,
> +			    "TLB sync timed out -- SMMU may be deadlocked\n");
>  }
>  
>  static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread
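
For readers who want to experiment with the numbers, here is the polling strategy
in isolation as a userspace sketch: done(), relax() and delay_us() are hypothetical
stand-ins for the status register read, cpu_relax() and udelay(), and the real code
is the __arm_smmu_tlb_sync() hunk quoted above.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define LOOP_TIMEOUT	1000000		/* cap on any single delay (us); the doublings sum to ~1s */
#define SPIN_COUNT	10		/* cheap polls between each delay */

/* Hypothetical stand-ins for the real hardware poll and busy-wait primitives. */
static bool done(void) { static int n; return ++n > 25; }
static void relax(void) { }
static void delay_us(unsigned int us) { usleep(us); }

/*
 * Spin a handful of times first -- most syncs complete almost immediately --
 * then fall back to exponentially growing delays, so a sync that really is
 * slow costs progressively fewer wakeups instead of a fixed 1MHz poll.
 */
static int poll_sync(void)
{
	unsigned int spin_cnt, delay;

	for (delay = 1; delay < LOOP_TIMEOUT; delay *= 2) {
		for (spin_cnt = SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
			if (done())
				return 0;
			relax();
		}
		delay_us(delay);
	}
	fprintf(stderr, "sync timed out\n");
	return -1;
}

int main(void)
{
	return poll_sync();
}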

* [PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively
@ 2017-03-30 18:51         ` Jordan Crouse
  0 siblings, 0 replies; 18+ messages in thread
From: Jordan Crouse @ 2017-03-30 18:51 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Mar 30, 2017 at 05:56:32PM +0100, Robin Murphy wrote:
> On relatively slow development platforms and software models, the
> inefficiency of our TLB sync loop tends not to show up - for instance on
> a Juno r1 board I typically see the TLBI has completed of its own accord
> by the time we get to the sync, such that the latter finishes instantly.
> 
> However, on larger systems doing real I/O, it's less realistic for the
> TLBs to go idle immediately, and at that point falling into the 1MHz
> polling loop turns out to throw away performance drastically. Let's
> strike a balance by polling more than once between pauses, such that we
> have much more chance of catching normal operations completing before
> committing to the fixed delay, but also backing off exponentially, since
> if a sync really hasn't completed within one or two "reasonable time"
> periods, it becomes increasingly unlikely that it ever will.

I really really like this.

Reviewed-by: Jordan Crouse <jcrouse@codeaurora.org>

> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v2: Restored the cpu_relax() to the inner loop
> 
>  drivers/iommu/arm-smmu.c | 18 ++++++++++--------
>  1 file changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 759d5f261160..a15ca86e9703 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -162,6 +162,7 @@
>  #define ARM_SMMU_GR0_sTLBGSTATUS	0x74
>  #define sTLBGSTATUS_GSACTIVE		(1 << 0)
>  #define TLB_LOOP_TIMEOUT		1000000	/* 1s! */
> +#define TLB_SPIN_COUNT			10
>  
>  /* Stream mapping registers */
>  #define ARM_SMMU_GR0_SMR(n)		(0x800 + ((n) << 2))
> @@ -574,18 +575,19 @@ static void __arm_smmu_free_bitmap(unsigned long *map, int idx)
>  static void __arm_smmu_tlb_sync(struct arm_smmu_device *smmu,
>  				void __iomem *sync, void __iomem *status)
>  {
> -	int count = 0;
> +	unsigned int spin_cnt, delay;
>  
>  	writel_relaxed(0, sync);
> -	while (readl_relaxed(status) & sTLBGSTATUS_GSACTIVE) {
> -		cpu_relax();
> -		if (++count == TLB_LOOP_TIMEOUT) {
> -			dev_err_ratelimited(smmu->dev,
> -			"TLB sync timed out -- SMMU may be deadlocked\n");
> -			return;
> +	for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) {
> +		for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) {
> +			if (!(readl_relaxed(status) & sTLBGSTATUS_GSACTIVE))
> +				return;
> +			cpu_relax();
>  		}
> -		udelay(1);
> +		udelay(delay);
>  	}
> +	dev_err_ratelimited(smmu->dev,
> +			    "TLB sync timed out -- SMMU may be deadlocked\n");
>  }
>  
>  static void arm_smmu_tlb_sync_global(struct arm_smmu_device *smmu)
> -- 
> 2.11.0.dirty
> 
> _______________________________________________
> iommu mailing list
> iommu at lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 0/4] ARM SMMU TLB sync improvements
  2017-03-30 16:56 ` Robin Murphy
@ 2017-03-31 12:38     ` Will Deacon
  -1 siblings, 0 replies; 18+ messages in thread
From: Will Deacon @ 2017-03-31 12:38 UTC (permalink / raw)
  To: Robin Murphy
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On Thu, Mar 30, 2017 at 05:56:28PM +0100, Robin Murphy wrote:
> Here's a quick v2 to address your comments and drop the needless meddling
> (whaddaya know, it makes the whole lot look simpler!)
> 
> I'll put it on my list to take a look at SMMUv3 queue polling as suggested.

Thanks, I've queued this with Jordan's tags.

Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v2 0/4] ARM SMMU TLB sync improvements
@ 2017-03-31 12:38     ` Will Deacon
  0 siblings, 0 replies; 18+ messages in thread
From: Will Deacon @ 2017-03-31 12:38 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, Mar 30, 2017 at 05:56:28PM +0100, Robin Murphy wrote:
> Here's a quick v2 to address your comments and drop the needless meddling
> (whaddaya know, it makes the whole lot look simpler!)
> 
> I'll put it on my list to take a look at SMMUv3 queue polling as suggested.

Thanks, I've queued this with Jordan's tags.

Will

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-03-31 12:38 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-30 16:56 [PATCH v2 0/4] ARM SMMU TLB sync improvements Robin Murphy
2017-03-30 16:56 ` Robin Murphy
     [not found] ` <cover.1490890890.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2017-03-30 16:56   ` [PATCH v2 1/4] iommu/arm-smmu: Simplify ASID/VMID handling Robin Murphy
2017-03-30 16:56     ` Robin Murphy
     [not found]     ` <0ba73e166470855948841ccb7fed7763dfaf023e.1490890890.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2017-03-30 18:37       ` Jordan Crouse
2017-03-30 18:37         ` Jordan Crouse
2017-03-30 16:56   ` [PATCH v2 2/4] iommu/arm-smmu: Tidy up context bank indexing Robin Murphy
2017-03-30 16:56     ` Robin Murphy
     [not found]     ` <331119ede82f893a5f032a7b322ad5bae0c73548.1490890890.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2017-03-30 18:40       ` Jordan Crouse
2017-03-30 18:40         ` Jordan Crouse
2017-03-30 16:56   ` [PATCH v2 3/4] iommu/arm-smmu: Use per-context TLB sync as appropriate Robin Murphy
2017-03-30 16:56     ` Robin Murphy
2017-03-30 16:56   ` [PATCH v2 4/4] iommu/arm-smmu: Poll for TLB sync completion more effectively Robin Murphy
2017-03-30 16:56     ` Robin Murphy
     [not found]     ` <7c24c93137bc48f69225e6463d6d242b7d89d15c.1490890890.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2017-03-30 18:51       ` Jordan Crouse
2017-03-30 18:51         ` Jordan Crouse
2017-03-31 12:38   ` [PATCH v2 0/4] ARM SMMU TLB sync improvements Will Deacon
2017-03-31 12:38     ` Will Deacon
