* [PATCH v3 0/3] SMMUv3 CMD_SYNC optimisation
From: Robin Murphy @ 2017-10-18 14:04 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Here's a quick v3 to address enough comments to move the cons-polling
discussion forward, at least ;)

Since the two preparatory patches are now queued, I'm not reposting them
here.

Robin.


Robin Murphy (3):
  iommu/arm-smmu-v3: Use CMD_SYNC completion MSI
  iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  iommu/arm-smmu-v3: Use burst-polling for sync completion

 drivers/iommu/arm-smmu-v3.c | 142 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 119 insertions(+), 23 deletions(-)

-- 
2.13.4.dirty


* [PATCH v3 1/3] iommu/arm-smmu-v3: Use CMD_SYNC completion MSI
From: Robin Murphy @ 2017-10-18 14:04 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

As an IRQ, the CMD_SYNC interrupt is not particularly useful, not least
because we often need to wait for sync completion within someone else's
IRQ handler anyway. However, when the SMMU is both coherent and supports
MSIs, we can have a lot more fun by not using it as an interrupt at all.
Following the example suggested in the architecture and using a write
targeting normal memory, we can let callers wait on a status variable
outside the lock instead of having to stall the entire queue or even
touch MMIO registers. Since multiple sync commands are guaranteed to
complete in order, a simple incrementing sequence count is all we need
to unambiguously support any realistic number of overlapping waiters.
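
For illustration only (this helper is not part of the patch, and the names
are made up), the wrap-safe check the waiters rely on boils down to:

#include <stdbool.h>
#include <stdint.h>

/*
 * True once the observed completion count has reached or passed the
 * sequence number assigned to our sync. Unsigned subtraction followed by
 * a signed comparison stays correct across counter wraparound, provided
 * no waiter lags by 2^31 or more - trivially true when the command queue
 * holds at most 2^20 outstanding entries.
 */
static bool sync_seq_complete(uint32_t completion_count, uint32_t my_seq)
{
	return (int32_t)(completion_count - my_seq) >= 0;
}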

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v3:
 - Give sync_nr and sync_count some spatial separation
 - Document the non-overflow assumption
 - Use relaxed atomic increment

 drivers/iommu/arm-smmu-v3.c | 51 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index d8c2759ba9f2..83b76404e882 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -377,7 +377,16 @@
 
 #define CMDQ_SYNC_0_CS_SHIFT		12
 #define CMDQ_SYNC_0_CS_NONE		(0UL << CMDQ_SYNC_0_CS_SHIFT)
+#define CMDQ_SYNC_0_CS_IRQ		(1UL << CMDQ_SYNC_0_CS_SHIFT)
 #define CMDQ_SYNC_0_CS_SEV		(2UL << CMDQ_SYNC_0_CS_SHIFT)
+#define CMDQ_SYNC_0_MSH_SHIFT		22
+#define CMDQ_SYNC_0_MSH_ISH		(3UL << CMDQ_SYNC_0_MSH_SHIFT)
+#define CMDQ_SYNC_0_MSIATTR_SHIFT	24
+#define CMDQ_SYNC_0_MSIATTR_OIWB	(0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT)
+#define CMDQ_SYNC_0_MSIDATA_SHIFT	32
+#define CMDQ_SYNC_0_MSIDATA_MASK	0xffffffffUL
+#define CMDQ_SYNC_1_MSIADDR_SHIFT	0
+#define CMDQ_SYNC_1_MSIADDR_MASK	0xffffffffffffcUL
 
 /* Event queue */
 #define EVTQ_ENT_DWORDS			4
@@ -409,6 +418,7 @@
 /* High-level queue structures */
 #define ARM_SMMU_POLL_TIMEOUT_US	100
 #define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
+#define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
 
 #define MSI_IOVA_BASE			0x8000000
 #define MSI_IOVA_LENGTH			0x100000
@@ -504,6 +514,10 @@ struct arm_smmu_cmdq_ent {
 		} pri;
 
 		#define CMDQ_OP_CMD_SYNC	0x46
+		struct {
+			u32			msidata;
+			u64			msiaddr;
+		} sync;
 	};
 };
 
@@ -616,6 +630,7 @@ struct arm_smmu_device {
 
 	int				gerr_irq;
 	int				combined_irq;
+	atomic_t			sync_nr;
 
 	unsigned long			ias; /* IPA */
 	unsigned long			oas; /* PA */
@@ -634,6 +649,8 @@ struct arm_smmu_device {
 
 	struct arm_smmu_strtab_cfg	strtab_cfg;
 
+	u32				sync_count;
+
 	/* IOMMU core code handle */
 	struct iommu_device		iommu;
 };
@@ -878,7 +895,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 		}
 		break;
 	case CMDQ_OP_CMD_SYNC:
-		cmd[0] |= CMDQ_SYNC_0_CS_SEV;
+		if (ent->sync.msiaddr)
+			cmd[0] |= CMDQ_SYNC_0_CS_IRQ;
+		else
+			cmd[0] |= CMDQ_SYNC_0_CS_SEV;
+		cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB;
+		cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT;
+		cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
 		break;
 	default:
 		return -ENOENT;
@@ -964,21 +987,44 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 }
 
+/*
+ * The difference between val and sync_idx is bounded by the maximum size of
+ * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
+ */
+static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+{
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
+	u32 val = smp_cond_load_acquire(&smmu->sync_count,
+					(int)(VAL - sync_idx) >= 0 ||
+					!ktime_before(ktime_get(), timeout));
+
+	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
+}
+
 static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
+		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
 	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
 	int ret;
 
+	if (msi) {
+		ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr);
+		ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
+	}
 	arm_smmu_cmdq_build_cmd(cmd, &ent);
 
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
 	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+	if (!msi)
+		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 
+	if (msi)
+		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
 	if (ret)
 		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
 }
@@ -2156,6 +2202,7 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
 {
 	int ret;
 
+	atomic_set(&smmu->sync_nr, 0);
 	ret = arm_smmu_init_queues(smmu);
 	if (ret)
 		return ret;
-- 
2.13.4.dirty


* [PATCH v3 2/3] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
From: Robin Murphy @ 2017-10-18 14:04 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Even without the MSI trick, we can still do a lot better than hogging
the entire queue while it drains. All we actually need to do for the
necessary guarantee of completion is wait for our particular command to
have been consumed, and as long as we keep track of where it is there is
no need to block other CPUs from adding further commands in the
meantime. There is one theoretical (but incredibly unlikely) edge case
to avoid, where cons has wrapped twice to still appear 'behind' the sync
position - this is easily disambiguated by adding a generation count to
the queue to indicate when prod wraps, since cons cannot wrap twice
without prod having wrapped at least once.
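
As a rough sketch (not part of the patch; names and types are illustrative),
the completion test against a snapshot taken when the sync was inserted is:

#include <stdbool.h>
#include <stdint.h>

/*
 * idx/wrp are the index and wrap bit decoded from a queue pointer, much as
 * the driver's Q_IDX()/Q_WRP() macros do; generation counts prod wraps.
 * The sync has certainly been consumed once cons has reached its slot
 * without wrapping past it, or once cons appears behind the slot but prod
 * has since moved to a later generation (so cons must have swept past it).
 */
static bool sync_consumed(uint32_t cons_idx, uint32_t cons_wrp,
			  uint32_t sync_idx, uint32_t sync_wrp,
			  int generation, int gen_at_insert)
{
	if (cons_idx >= sync_idx && cons_wrp == sync_wrp)
		return true;	/* ahead, not wrapped */
	if (cons_idx < sync_idx && generation != gen_at_insert)
		return true;	/* behind, but a generation has elapsed */
	return false;
}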

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v3:
 - Move queue checks into helpers
 - Avoid race by updating generation before prod (after some
   deliberation I've concluded that it doesn't actually need any
   special handling for the timeout failure case either)

 drivers/iommu/arm-smmu-v3.c | 91 +++++++++++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 83b76404e882..3130e7182dde 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -417,7 +417,6 @@
 
 /* High-level queue structures */
 #define ARM_SMMU_POLL_TIMEOUT_US	100
-#define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
 #define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
 
 #define MSI_IOVA_BASE			0x8000000
@@ -540,6 +539,7 @@ struct arm_smmu_queue {
 struct arm_smmu_cmdq {
 	struct arm_smmu_queue		q;
 	spinlock_t			lock;
+	int				generation;
 };
 
 struct arm_smmu_evtq {
@@ -737,6 +737,17 @@ static bool queue_empty(struct arm_smmu_queue *q)
 	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
 }
 
+static bool queue_behind(struct arm_smmu_queue *q, u32 idx)
+{
+	return Q_IDX(q, q->cons) < Q_IDX(q, idx);
+}
+
+static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx)
+{
+	return Q_IDX(q, q->cons) >= Q_IDX(q, idx) &&
+	       Q_WRP(q, q->cons) == Q_WRP(q, idx);
+}
+
 static void queue_sync_cons(struct arm_smmu_queue *q)
 {
 	q->cons = readl_relaxed(q->cons_reg);
@@ -770,21 +781,12 @@ static void queue_inc_prod(struct arm_smmu_queue *q)
 	writel(q->prod, q->prod_reg);
 }
 
-/*
- * Wait for the SMMU to consume items. If drain is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe)
+/* Wait for the SMMU to consume items, until there is at least one free slot */
+static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe)
 {
-	ktime_t timeout;
-	unsigned int delay = 1;
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
 
-	/* Wait longer if it's queue drain */
-	timeout = ktime_add_us(ktime_get(), drain ?
-					    ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US :
-					    ARM_SMMU_POLL_TIMEOUT_US);
-
-	while (queue_sync_cons(q), (drain ? !queue_empty(q) : queue_full(q))) {
+	while (queue_sync_cons(q), queue_full(q)) {
 		if (ktime_compare(ktime_get(), timeout) > 0)
 			return -ETIMEDOUT;
 
@@ -792,8 +794,7 @@ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe)
 			wfe();
 		} else {
 			cpu_relax();
-			udelay(delay);
-			delay *= 2;
+			udelay(1);
 		}
 	}
 
@@ -959,15 +960,20 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 	queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
 }
 
-static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
+static u32 arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
 {
 	struct arm_smmu_queue *q = &smmu->cmdq.q;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 
+	if (Q_IDX(q, q->prod + 1) == 0)
+		smmu->cmdq.generation++;
+
 	while (queue_insert_raw(q, cmd) == -ENOSPC) {
-		if (queue_poll_cons(q, false, wfe))
+		if (queue_poll_cons(q, wfe))
 			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 	}
+
+	return q->prod;
 }
 
 static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -1001,15 +1007,53 @@ static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
 	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
 }
 
+static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
+				   int sync_gen)
+{
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	unsigned int delay = 1;
+
+	do {
+		queue_sync_cons(q);
+		/*
+		 * If we see updates quickly enough, cons has passed sync_idx,
+		 * but not yet wrapped. At worst, cons might have actually
+		 * wrapped an even number of times, but that still guarantees
+		 * the original sync must have been consumed.
+		 */
+		if (queue_ahead_not_wrapped(q, sync_idx))
+			return 0;
+		/*
+		 * Otherwise, cons may have passed sync_idx and wrapped one or
+		 * more times to appear behind it again, but in that case prod
+		 * must also be one or more generations ahead.
+		 */
+		if (queue_behind(q, sync_idx) &&
+		    READ_ONCE(smmu->cmdq.generation) != sync_gen)
+			return 0;
+
+		if (wfe) {
+			wfe();
+		} else {
+			cpu_relax();
+			udelay(delay);
+			delay *= 2;
+		}
+	} while (ktime_before(ktime_get(), timeout));
+
+	return -ETIMEDOUT;
+}
+
 static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
-	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
 		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
 	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
-	int ret;
+	int ret, sync_idx, sync_gen;
 
 	if (msi) {
 		ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr);
@@ -1018,13 +1062,14 @@ static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 	arm_smmu_cmdq_build_cmd(cmd, &ent);
 
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	if (!msi)
-		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+	sync_idx = arm_smmu_cmdq_insert_cmd(smmu, cmd);
+	sync_gen = READ_ONCE(smmu->cmdq.generation);
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 
 	if (msi)
 		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
+	else
+		ret = arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
 	if (ret)
 		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
 }
-- 
2.13.4.dirty


* [RFT v3 3/3] iommu/arm-smmu-v3: Use burst-polling for sync completion
From: Robin Murphy @ 2017-10-18 14:04 UTC
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

While CMD_SYNC is unlikely to complete immediately such that we never go
round the polling loop, with a lightly-loaded queue it may still do so
long before the delay period is up. If we have no better completion
notifier, use similar logic as we have for SMMUv2 to spin a number of
times before each backoff, so that we have more chance of catching syncs
which complete relatively quickly and avoid delaying unnecessarily.
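
Purely as a sketch of the loop shape (not the driver code; the constant and
helper are placeholders, assuming the kernel's cpu_relax()/udelay()):

#include <linux/delay.h>	/* udelay() */
#include <asm/processor.h>	/* cpu_relax() */

#define SYNC_SPIN_COUNT	10	/* stand-in for ARM_SMMU_SYNC_SPIN_COUNT */

/* One back-off step: spin cheaply a few times before falling back to an
 * exponentially growing udelay(). */
static void sync_poll_backoff(unsigned int *delay_us, unsigned int *spin_cnt)
{
	if (++(*spin_cnt) < SYNC_SPIN_COUNT) {
		cpu_relax();		/* catch syncs that complete quickly */
	} else {
		udelay(*delay_us);	/* nothing yet: back off */
		*delay_us *= 2;
		*spin_cnt = 0;
	}
}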

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

I still have no idea whether this patch has any real merit or not.

 drivers/iommu/arm-smmu-v3.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 3130e7182dde..994b3a38f222 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -418,6 +418,7 @@
 /* High-level queue structures */
 #define ARM_SMMU_POLL_TIMEOUT_US	100
 #define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
+#define ARM_SMMU_SYNC_SPIN_COUNT	10
 
 #define MSI_IOVA_BASE			0x8000000
 #define MSI_IOVA_LENGTH			0x100000
@@ -1013,7 +1014,7 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
 	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
 	struct arm_smmu_queue *q = &smmu->cmdq.q;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
-	unsigned int delay = 1;
+	unsigned int delay = 1, spin_cnt = 0;
 
 	do {
 		queue_sync_cons(q);
@@ -1036,10 +1037,13 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
 
 		if (wfe) {
 			wfe();
-		} else {
+		} else if (++spin_cnt < ARM_SMMU_SYNC_SPIN_COUNT) {
 			cpu_relax();
+			continue;
+		} else {
 			udelay(delay);
 			delay *= 2;
+			spin_cnt = 0;
 		}
 	} while (ktime_before(ktime_get(), timeout));
 
-- 
2.13.4.dirty


* Re: [PATCH v3 2/3] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
From: Leizhen (ThunderTown) @ 2018-07-19  7:22 UTC
  To: Robin Murphy, will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org



On 2017/10/18 22:04, Robin Murphy wrote:
> Even without the MSI trick, we can still do a lot better than hogging
> the entire queue while it drains. All we actually need to do for the
> necessary guarantee of completion is wait for our particular command to
> have been consumed, and as long as we keep track of where it is there is
> no need to block other CPUs from adding further commands in the
> meantime. There is one theoretical (but incredibly unlikely) edge case
> to avoid, where cons has wrapped twice to still appear 'behind' the sync
> position - this is easily disambiguated by adding a generation count to
> the queue to indicate when prod wraps, since cons cannot wrap twice
> without prod having wrapped at least once.

Hi Robin,

I applied your patch and saw the improvement below for an NVMe device.

Random Read  IOPS: 146K --> 214K
Random Write IOPS: 143K --> 212K

Tested-by: Zhen Lei <thunder.leizhen@huawei.com>

> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v3:
>  - Move queue checks into helpers
>  - Avoid race by updating generation before prod (after some
>    deliberation I've concluded that it doesn't actually need any
>    special handling for the timeout failure case either)
> 
>  drivers/iommu/arm-smmu-v3.c | 91 +++++++++++++++++++++++++++++++++------------
>  1 file changed, 68 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 83b76404e882..3130e7182dde 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -417,7 +417,6 @@
>  
>  /* High-level queue structures */
>  #define ARM_SMMU_POLL_TIMEOUT_US	100
> -#define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US	1000000 /* 1s! */
>  #define ARM_SMMU_SYNC_TIMEOUT_US	1000000 /* 1s! */
>  
>  #define MSI_IOVA_BASE			0x8000000
> @@ -540,6 +539,7 @@ struct arm_smmu_queue {
>  struct arm_smmu_cmdq {
>  	struct arm_smmu_queue		q;
>  	spinlock_t			lock;
> +	int				generation;
>  };
>  
>  struct arm_smmu_evtq {
> @@ -737,6 +737,17 @@ static bool queue_empty(struct arm_smmu_queue *q)
>  	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
>  }
>  
> +static bool queue_behind(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) < Q_IDX(q, idx);
> +}
> +
> +static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) >= Q_IDX(q, idx) &&
> +	       Q_WRP(q, q->cons) == Q_WRP(q, idx);
> +}
> +
>  static void queue_sync_cons(struct arm_smmu_queue *q)
>  {
>  	q->cons = readl_relaxed(q->cons_reg);
> @@ -770,21 +781,12 @@ static void queue_inc_prod(struct arm_smmu_queue *q)
>  	writel(q->prod, q->prod_reg);
>  }
>  
> -/*
> - * Wait for the SMMU to consume items. If drain is true, wait until the queue
> - * is empty. Otherwise, wait until there is at least one free slot.
> - */
> -static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe)
> +/* Wait for the SMMU to consume items, until there is at least one free slot */
> +static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe)
>  {
> -	ktime_t timeout;
> -	unsigned int delay = 1;
> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
>  
> -	/* Wait longer if it's queue drain */
> -	timeout = ktime_add_us(ktime_get(), drain ?
> -					    ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US :
> -					    ARM_SMMU_POLL_TIMEOUT_US);
> -
> -	while (queue_sync_cons(q), (drain ? !queue_empty(q) : queue_full(q))) {
> +	while (queue_sync_cons(q), queue_full(q)) {
>  		if (ktime_compare(ktime_get(), timeout) > 0)
>  			return -ETIMEDOUT;
>  
> @@ -792,8 +794,7 @@ static int queue_poll_cons(struct arm_smmu_queue *q, bool drain, bool wfe)
>  			wfe();
>  		} else {
>  			cpu_relax();
> -			udelay(delay);
> -			delay *= 2;
> +			udelay(1);
>  		}
>  	}
>  
> @@ -959,15 +960,20 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
>  	queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
>  }
>  
> -static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
> +static u32 arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
>  {
>  	struct arm_smmu_queue *q = &smmu->cmdq.q;
>  	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>  
> +	if (Q_IDX(q, q->prod + 1) == 0)
> +		smmu->cmdq.generation++;
> +
>  	while (queue_insert_raw(q, cmd) == -ENOSPC) {
> -		if (queue_poll_cons(q, false, wfe))
> +		if (queue_poll_cons(q, wfe))
>  			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>  	}
> +
> +	return q->prod;
>  }
>  
>  static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> @@ -1001,15 +1007,53 @@ static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
>  }
>  
> +static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
> +				   int sync_gen)
> +{
> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
> +	struct arm_smmu_queue *q = &smmu->cmdq.q;
> +	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +	unsigned int delay = 1;
> +
> +	do {
> +		queue_sync_cons(q);
> +		/*
> +		 * If we see updates quickly enough, cons has passed sync_idx,
> +		 * but not yet wrapped. At worst, cons might have actually
> +		 * wrapped an even number of times, but that still guarantees
> +		 * the original sync must have been consumed.
> +		 */
> +		if (queue_ahead_not_wrapped(q, sync_idx))
> +			return 0;
> +		/*
> +		 * Otherwise, cons may have passed sync_idx and wrapped one or
> +		 * more times to appear behind it again, but in that case prod
> +		 * must also be one or more generations ahead.
> +		 */
> +		if (queue_behind(q, sync_idx) &&
> +		    READ_ONCE(smmu->cmdq.generation) != sync_gen)
> +			return 0;
> +
> +		if (wfe) {
> +			wfe();
> +		} else {
> +			cpu_relax();
> +			udelay(delay);
> +			delay *= 2;
> +		}
> +	} while (ktime_before(ktime_get(), timeout));
> +
> +	return -ETIMEDOUT;
> +}
> +
>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>  	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>  		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>  	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int ret;
> +	int ret, sync_idx, sync_gen;
>  
>  	if (msi) {
>  		ent.sync.msidata = atomic_inc_return_relaxed(&smmu->sync_nr);
> @@ -1018,13 +1062,14 @@ static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  	arm_smmu_cmdq_build_cmd(cmd, &ent);
>  
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	arm_smmu_cmdq_insert_cmd(smmu, cmd);
> -	if (!msi)
> -		ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +	sync_idx = arm_smmu_cmdq_insert_cmd(smmu, cmd);
> +	sync_gen = READ_ONCE(smmu->cmdq.generation);
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
>  	if (msi)
>  		ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
> +	else
> +		ret = arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  	if (ret)
>  		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>  }
> 

-- 
Thanks!
Best regards

