* [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
@ 2018-10-17 13:56 ` Robin Murphy
  0 siblings, 0 replies; 24+ messages in thread
From: Robin Murphy @ 2018-10-17 13:56 UTC (permalink / raw)
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Even without the MSI trick, we can still do a lot better than hogging
the entire queue while it drains. All we actually need to do for the
necessary guarantee of completion is wait for our particular command to
have been consumed - as long as we keep track of where we inserted it,
there is no need to block other CPUs from adding further commands in the
meantime. There is one theoretical (but incredibly unlikely) edge case
to avoid, where cons has wrapped twice to still appear 'behind' the sync
position - this is easily disambiguated by adding a generation count to
the queue to indicate when prod wraps, since cons cannot wrap twice
without prod having wrapped at least once.

This also makes it reasonable to separate the two conceptually different
modes of polling such that command insertion - which really wants to be
fair and have minimal latency - is not subject to exponential backoff,
and returns to its original implementation.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v5:
 - Rework to incorporate the back-to-back sync elision.
 - Refactor the generation count slightly to preemptively help with
   the HiSilicon MSI workaround.
 - Split the cleanup into a separate patch for ease of review (it could
   happily be squashed when applied).

The fundamental logic is copied directly from v4, but I've dropped the
previous tested-by since there are a fair few subtle changes in how it's
integrated. Patches are based on Will's iommu/devel branch plus my "Fix
big-endian CMD_SYNC writes" patch.

Robin.
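
To make the wrap disambiguation described above a bit more concrete, here
is a minimal standalone sketch (illustration only: a toy 16-entry queue
with simplified names, not the driver code itself):

#include <stdbool.h>
#include <stdint.h>

#define IDX_BITS	4			/* toy queue: 16 entries */
#define IDX_MASK	((1u << IDX_BITS) - 1)
#define WRP_BIT		(1u << IDX_BITS)	/* toggles on every wrap */

#define Q_IDX(p)	((p) & IDX_MASK)
#define Q_WRP(p)	((p) & WRP_BIT)

/* Has the sync recorded at sync_idx (generation sync_gen) been consumed? */
static bool sync_consumed(uint32_t cons, uint32_t sync_idx,
			  int sync_gen, int cmdq_generation)
{
	/* cons has reached sync_idx without wrapping past it */
	if (Q_IDX(cons) >= Q_IDX(sync_idx) && Q_WRP(cons) == Q_WRP(sync_idx))
		return true;
	/*
	 * cons appears behind sync_idx: if prod is still in the same
	 * generation then it genuinely is behind, so keep waiting; if the
	 * generation has moved on, cons can only look 'behind' because it
	 * has already passed sync_idx and wrapped.
	 */
	return Q_IDX(cons) < Q_IDX(sync_idx) && cmdq_generation != sync_gen;
}

int main(void)
{
	/* A sync inserted at index 2 of generation 0... */
	uint32_t sync_idx = 2;
	/*
	 * ...and cons later observed at index 1 of the next lap: it looks
	 * 'behind', but the bumped generation proves the sync was consumed.
	 */
	uint32_t cons = WRP_BIT | 1;

	return !sync_consumed(cons, sync_idx, 0, 1);
}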

 drivers/iommu/arm-smmu-v3.c | 94 +++++++++++++++++++++++++++----------
 1 file changed, 69 insertions(+), 25 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 867ba548c2cc..da8a91d116bf 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -588,6 +588,7 @@ struct arm_smmu_device {
 	struct arm_smmu_strtab_cfg	strtab_cfg;
 
 	u32				sync_count;
+	int				cmdq_generation;
 
 	/* IOMMU core code handle */
 	struct iommu_device		iommu;
@@ -676,6 +677,17 @@ static bool queue_empty(struct arm_smmu_queue *q)
 	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
 }
 
+static bool queue_behind(struct arm_smmu_queue *q, u32 idx)
+{
+	return Q_IDX(q, q->cons) < Q_IDX(q, idx);
+}
+
+static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx)
+{
+	return Q_IDX(q, q->cons) >= Q_IDX(q, idx) &&
+	       Q_WRP(q, q->cons) == Q_WRP(q, idx);
+}
+
 static void queue_sync_cons(struct arm_smmu_queue *q)
 {
 	q->cons = readl_relaxed(q->cons_reg);
@@ -709,33 +721,19 @@ static void queue_inc_prod(struct arm_smmu_queue *q)
 	writel(q->prod, q->prod_reg);
 }
 
-/*
- * Wait for the SMMU to consume items. If sync is true, wait until the queue
- * is empty. Otherwise, wait until there is at least one free slot.
- */
-static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
+static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe)
 {
-	ktime_t timeout;
-	unsigned int delay = 1, spin_cnt = 0;
+	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
 
-	/* Wait longer if it's a CMD_SYNC */
-	timeout = ktime_add_us(ktime_get(), sync ?
-					    ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
-					    ARM_SMMU_POLL_TIMEOUT_US);
-
-	while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
+	while (queue_sync_cons(q), queue_full(q)) {
 		if (ktime_compare(ktime_get(), timeout) > 0)
 			return -ETIMEDOUT;
 
 		if (wfe) {
 			wfe();
-		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
-			cpu_relax();
-			continue;
 		} else {
-			udelay(delay);
-			delay *= 2;
-			spin_cnt = 0;
+			cpu_relax();
+			udelay(1);
 		}
 	}
 
@@ -905,8 +903,11 @@ static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
 
 	smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
 
+	if (Q_IDX(q, q->prod + 1) == 0)
+		WRITE_ONCE(smmu->cmdq_generation, smmu->cmdq_generation + 1);
+
 	while (queue_insert_raw(q, cmd) == -ENOSPC) {
-		if (queue_poll_cons(q, false, wfe))
+		if (queue_poll_cons(q, wfe))
 			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 	}
 }
@@ -945,6 +946,48 @@ static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
 	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
 }
 
+static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
+				   int sync_gen)
+{
+	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
+	unsigned int delay = 1, spin_cnt = 0;
+	ktime_t timeout;
+
+	timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
+	do {
+		queue_sync_cons(q);
+		/*
+		 * If we see updates quickly enough, cons has passed sync_idx,
+		 * but not yet wrapped. At worst, cons might have actually
+		 * wrapped an even number of times, but that still guarantees
+		 * the original sync must have been consumed.
+		 */
+		if (queue_ahead_not_wrapped(q, sync_idx))
+			return 0;
+		/*
+		 * Otherwise, cons may have passed sync_idx and wrapped one or
+		 * more times to appear behind it again, but in that case prod
+		 * must also be one or more generations ahead.
+		 */
+		if (queue_behind(q, sync_idx) &&
+		    READ_ONCE(smmu->cmdq_generation) != sync_gen)
+			return 0;
+
+		if (wfe) {
+			wfe();
+		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
+			cpu_relax();
+		} else {
+			udelay(delay);
+			delay *= 2;
+			spin_cnt = 0;
+		}
+	} while (ktime_before(ktime_get(), timeout));
+
+	return -ETIMEDOUT;
+}
+
 static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
@@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
-	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
 	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
-	int ret;
+	int sync_idx, sync_gen;
 
 	arm_smmu_cmdq_build_cmd(cmd, &ent);
 
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	arm_smmu_cmdq_insert_cmd(smmu, cmd);
-	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
+	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
+		arm_smmu_cmdq_insert_cmd(smmu, cmd);
+	sync_idx = smmu->cmdq.q.prod;
+	sync_gen = READ_ONCE(smmu->cmdq_generation);
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 
-	return ret;
+	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
 }
 
 static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
-- 
2.19.1.dirty

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-17 13:56 ` Robin Murphy
@ 2018-10-17 13:56     ` Robin Murphy
  -1 siblings, 0 replies; 24+ messages in thread
From: Robin Murphy @ 2018-10-17 13:56 UTC (permalink / raw)
  To: will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org,
	linux-arm-kernel@lists.infradead.org

Now that both sync methods are more or less the same shape, we can save
some code and levels of indirection by rolling them up together again,
with just a couple of simple conditionals to discriminate the MSI and
queue-polling specifics.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
 1 file changed, 12 insertions(+), 37 deletions(-)
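
For a rough picture of the shape this ends up with, here is a condensed
sketch using toy stand-ins (illustration only; the real code is in the
diff below):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for arm_smmu_sync_poll_msi()/_cons(); 0 means "completed" */
static int poll_msi(unsigned int msidata) { (void)msidata; return 0; }
static int poll_cons(unsigned int sync_idx, int sync_gen)
{
	(void)sync_idx; (void)sync_gen;
	return 0;
}

static void issue_sync(bool msi)
{
	unsigned int msidata = 0, sync_idx = 0;
	int sync_gen = 0, ret;

	/* conditional #1: only the MSI flavour needs a write-back target */
	if (msi) {
		/* point the CMD_SYNC completion MSI at the sync counter */
	}

	/*
	 * Under the cmdq lock: build and insert the CMD_SYNC (unless it can
	 * be elided), then record sync_idx/sync_gen for the poll below.
	 */

	/* conditional #2: pick the matching completion-wait mechanism */
	ret = msi ? poll_msi(msidata) : poll_cons(sync_idx, sync_gen);
	if (ret)
		fprintf(stderr, "CMD_SYNC timeout\n");
}

int main(void)
{
	issue_sync(true);
	issue_sync(false);
	return 0;
}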

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index da8a91d116bf..36db63e3afcf 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
  * The difference between val and sync_idx is bounded by the maximum size of
  * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
  */
-static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
+static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
 {
 	ktime_t timeout;
 	u32 val;
@@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
 	return -ETIMEDOUT;
 }
 
-static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
+static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
-	struct arm_smmu_cmdq_ent ent = {
-		.opcode = CMDQ_OP_CMD_SYNC,
-		.sync	= {
-			.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
-		},
-	};
+	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
+		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
+	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
+	int ret, sync_idx, sync_gen;
+
+	if (msi)
+		ent.sync.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count));
 
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
 
@@ -1009,39 +1010,13 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
 		arm_smmu_cmdq_build_cmd(cmd, &ent);
 		arm_smmu_cmdq_insert_cmd(smmu, cmd);
 	}
-
-	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
-
-	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
-}
-
-static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
-{
-	u64 cmd[CMDQ_ENT_DWORDS];
-	unsigned long flags;
-	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
-	int sync_idx, sync_gen;
-
-	arm_smmu_cmdq_build_cmd(cmd, &ent);
-
-	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
-		arm_smmu_cmdq_insert_cmd(smmu, cmd);
 	sync_idx = smmu->cmdq.q.prod;
 	sync_gen = READ_ONCE(smmu->cmdq_generation);
+
 	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
 
-	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
-}
-
-static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
-{
-	int ret;
-	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
-		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
-
-	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
-		  : __arm_smmu_cmdq_issue_sync(smmu);
+	ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
+		  : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
 	if (ret)
 		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
 }
-- 
2.19.1.dirty
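
As an aside on the comment kept above arm_smmu_sync_poll_msi() in the
first hunk ("32 bits is plenty for wrap-safe arithmetic"), here is a
minimal standalone demonstration of the signed-difference idiom it refers
to (illustration only, outside the driver):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* msidata counter just before wrapping... */
	uint32_t sync_idx = 0xfffffff0u;
	/* ...an observed value that has since wrapped past it... */
	uint32_t val = 0x00000010u;
	/* ...and one that genuinely lags behind */
	uint32_t lagging = sync_idx - 5;

	/*
	 * Unsigned subtraction wraps modulo 2^32, so the signed result stays
	 * correct as long as the two values never drift apart by 2^31 or
	 * more - far beyond the 2^20 maximum queue size.
	 */
	assert((int32_t)(val - sync_idx) >= 0);		/* "caught up" */
	assert((int32_t)(lagging - sync_idx) < 0);	/* "not yet" */

	return 0;
}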

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-17 13:56     ` Robin Murphy
@ 2018-10-17 14:38         ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-17 14:38 UTC (permalink / raw)
  To: Robin Murphy, will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org, Linuxarm,
	linux-arm-kernel@lists.infradead.org

On 17/10/2018 14:56, Robin Murphy wrote:
> Now that both sync methods are more or less the same shape, we can save
> some code and levels of indirection by rolling them up together again,
> with just a couple of simple conditionals to discriminate the MSI and
> queue-polling specifics.

Hi Robin, Will,

I had been thinking of this other patch previously:

iommu/arm-smmu-v3: Stop rebuilding non-MSI CMD_SYNC commands

The contents of the non-MSI CMD_SYNC command are fixed. This patch
offers a small optimisation by keeping a pre-built copy of the command
in static storage for re-use, thereby avoiding unnecessary re-building.

Signed-off-by: John Garry <john.garry@huawei.com>

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 6947ccf..9d86c29 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -963,14 +963,16 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)

  static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
  {
-       u64 cmd[CMDQ_ENT_DWORDS];
+       static u64 cmd[CMDQ_ENT_DWORDS] = {
+              FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
+              FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV) |
+              FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH) |
+              FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB),
+           0};
         unsigned long flags;
         bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
-       struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
         int ret;

-       arm_smmu_cmdq_build_cmd(cmd, &ent);
-
         spin_lock_irqsave(&smmu->cmdq.lock, flags);
         arm_smmu_cmdq_insert_cmd(smmu, cmd);
         ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);

But it seems that combining the MSI and non-MSI methods would block this.

How do you feel about this?

Thanks,
John

>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
>  1 file changed, 12 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index da8a91d116bf..36db63e3afcf 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>   * The difference between val and sync_idx is bounded by the maximum size of
>   * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
>   */
> -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  {
>  	ktime_t timeout;
>  	u32 val;
> @@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
>  	return -ETIMEDOUT;
>  }
>
> -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
> +static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = {
> -		.opcode = CMDQ_OP_CMD_SYNC,
> -		.sync	= {
> -			.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
> -		},
> -	};
> +	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> +	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> +	int ret, sync_idx, sync_gen;
> +
> +	if (msi)
> +		ent.sync.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count));
>
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>
> @@ -1009,39 +1010,13 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  		arm_smmu_cmdq_build_cmd(cmd, &ent);
>  		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	}
> -
> -	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
> -
> -	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
> -}
> -
> -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	u64 cmd[CMDQ_ENT_DWORDS];
> -	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int sync_idx, sync_gen;
> -
> -	arm_smmu_cmdq_build_cmd(cmd, &ent);
> -
> -	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
> -		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	sync_idx = smmu->cmdq.q.prod;
>  	sync_gen = READ_ONCE(smmu->cmdq_generation);
> +
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>
> -	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
> -}
> -
> -static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	int ret;
> -	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> -		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> -
> -	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
> -		  : __arm_smmu_cmdq_issue_sync(smmu);
> +	ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
> +		  : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  	if (ret)
>  		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>  }
>

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  2018-10-17 13:56 ` Robin Murphy
@ 2018-10-18  8:56     ` Andrew Murray
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrew Murray @ 2018-10-18  8:56 UTC (permalink / raw)
  To: Robin Murphy
  Cc: iommu@lists.linux-foundation.org, will.deacon@arm.com,
	linux-arm-kernel@lists.infradead.org

On Wed, Oct 17, 2018 at 02:56:06PM +0100, Robin Murphy wrote:
> Even without the MSI trick, we can still do a lot better than hogging
> the entire queue while it drains. All we actually need to do for the
> necessary guarantee of completion is wait for our particular command to
> have been consumed - as long as we keep track of where we inserted it,
> there is no need to block other CPUs from adding further commands in the
> meantime. There is one theoretical (but incredibly unlikely) edge case
> to avoid, where cons has wrapped twice to still appear 'behind' the sync
> position - this is easily disambiguated by adding a generation count to
> the queue to indicate when prod wraps, since cons cannot wrap twice
> without prod having wrapped at least once.
> 
> This also makes it reasonable to separate the two conceptually different
> modes of polling such that command insertion - which really wants to be
> fair and have minimal latency - is not subject to exponential backoff,
> and returns to its original implementation.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
> 
> v5:
>  - Rework to incorporate the back-to-back sync elision.
>  - Refactor the generation count slightly to preemptively help with
>    the HiSilicon MSI workaround.
>  - Split the cleanup into a separate patch for ease of review (it could
>    happily be squashed when applied).
> 
> The fundamental logic is copied directly from v4, but I've dropped the
> previous tested-by since there are a fair few subtle changes in how it's
> integrated. Patches are based on Will's iommu/devel branch plus my "Fix
> big-endian CMD_SYNC writes" patch.

Reviewed-by: Andrew Murray <andrew.murray@arm.com>

Thanks,

Andrew Murray

> 
> Robin.
> 
>  drivers/iommu/arm-smmu-v3.c | 94 +++++++++++++++++++++++++++----------
>  1 file changed, 69 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 867ba548c2cc..da8a91d116bf 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -588,6 +588,7 @@ struct arm_smmu_device {
>  	struct arm_smmu_strtab_cfg	strtab_cfg;
>  
>  	u32				sync_count;
> +	int				cmdq_generation;
>  
>  	/* IOMMU core code handle */
>  	struct iommu_device		iommu;
> @@ -676,6 +677,17 @@ static bool queue_empty(struct arm_smmu_queue *q)
>  	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
>  }
>  
> +static bool queue_behind(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) < Q_IDX(q, idx);
> +}
> +
> +static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) >= Q_IDX(q, idx) &&
> +	       Q_WRP(q, q->cons) == Q_WRP(q, idx);
> +}
> +
>  static void queue_sync_cons(struct arm_smmu_queue *q)
>  {
>  	q->cons = readl_relaxed(q->cons_reg);
> @@ -709,33 +721,19 @@ static void queue_inc_prod(struct arm_smmu_queue *q)
>  	writel(q->prod, q->prod_reg);
>  }
>  
> -/*
> - * Wait for the SMMU to consume items. If sync is true, wait until the queue
> - * is empty. Otherwise, wait until there is at least one free slot.
> - */
> -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
> +static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe)
>  {
> -	ktime_t timeout;
> -	unsigned int delay = 1, spin_cnt = 0;
> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
>  
> -	/* Wait longer if it's a CMD_SYNC */
> -	timeout = ktime_add_us(ktime_get(), sync ?
> -					    ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
> -					    ARM_SMMU_POLL_TIMEOUT_US);
> -
> -	while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
> +	while (queue_sync_cons(q), queue_full(q)) {
>  		if (ktime_compare(ktime_get(), timeout) > 0)
>  			return -ETIMEDOUT;
>  
>  		if (wfe) {
>  			wfe();
> -		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
> -			cpu_relax();
> -			continue;
>  		} else {
> -			udelay(delay);
> -			delay *= 2;
> -			spin_cnt = 0;
> +			cpu_relax();
> +			udelay(1);
>  		}
>  	}
>  
> @@ -905,8 +903,11 @@ static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
>  
>  	smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
>  
> +	if (Q_IDX(q, q->prod + 1) == 0)
> +		WRITE_ONCE(smmu->cmdq_generation, smmu->cmdq_generation + 1);
> +
>  	while (queue_insert_raw(q, cmd) == -ENOSPC) {
> -		if (queue_poll_cons(q, false, wfe))
> +		if (queue_poll_cons(q, wfe))
>  			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>  	}
>  }
> @@ -945,6 +946,48 @@ static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
>  }
>  
> +static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
> +				   int sync_gen)
> +{
> +	struct arm_smmu_queue *q = &smmu->cmdq.q;
> +	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +	unsigned int delay = 1, spin_cnt = 0;
> +	ktime_t timeout;
> +
> +	timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
> +	do {
> +		queue_sync_cons(q);
> +		/*
> +		 * If we see updates quickly enough, cons has passed sync_idx,
> +		 * but not yet wrapped. At worst, cons might have actually
> +		 * wrapped an even number of times, but that still guarantees
> +		 * the original sync must have been consumed.
> +		 */
> +		if (queue_ahead_not_wrapped(q, sync_idx))
> +			return 0;
> +		/*
> +		 * Otherwise, cons may have passed sync_idx and wrapped one or
> +		 * more times to appear behind it again, but in that case prod
> +		 * must also be one or more generations ahead.
> +		 */
> +		if (queue_behind(q, sync_idx) &&
> +		    READ_ONCE(smmu->cmdq_generation) != sync_gen)
> +			return 0;
> +
> +		if (wfe) {
> +			wfe();
> +		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
> +			cpu_relax();
> +		} else {
> +			udelay(delay);
> +			delay *= 2;
> +			spin_cnt = 0;
> +		}
> +	} while (ktime_before(ktime_get(), timeout));
> +
> +	return -ETIMEDOUT;
> +}
> +
>  static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
> @@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>  	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int ret;
> +	int sync_idx, sync_gen;
>  
>  	arm_smmu_cmdq_build_cmd(cmd, &ent);
>  
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	arm_smmu_cmdq_insert_cmd(smmu, cmd);
> -	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
> +		arm_smmu_cmdq_insert_cmd(smmu, cmd);
> +	sync_idx = smmu->cmdq.q.prod;
> +	sync_gen = READ_ONCE(smmu->cmdq_generation);
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
> -	return ret;
> +	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  }
>  
>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -- 
> 2.19.1.dirty
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-17 13:56     ` Robin Murphy
@ 2018-10-18  8:58         ` Andrew Murray
  -1 siblings, 0 replies; 24+ messages in thread
From: Andrew Murray @ 2018-10-18  8:58 UTC (permalink / raw)
  To: Robin Murphy
  Cc: iommu@lists.linux-foundation.org, will.deacon@arm.com,
	linux-arm-kernel@lists.infradead.org

On Wed, Oct 17, 2018 at 02:56:07PM +0100, Robin Murphy wrote:
> Now that both sync methods are more or less the same shape, we can save
> some code and levels of indirection by rolling them up together again,
> with just a couple of simple conditionals to discriminate the MSI and
> queue-polling specifics.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
>  1 file changed, 12 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index da8a91d116bf..36db63e3afcf 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>   * The difference between val and sync_idx is bounded by the maximum size of
>   * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
>   */
> -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  {
>  	ktime_t timeout;
>  	u32 val;
> @@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
>  	return -ETIMEDOUT;
>  }
>  
> -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
> +static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = {
> -		.opcode = CMDQ_OP_CMD_SYNC,
> -		.sync	= {
> -			.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
> -		},

You indicated that "Patches are based on Will's iommu/devel branch plus my
"Fix big-endian CMD_SYNC writes" patch." - However your v2 of that patch didn't
include this cpu_to_le32 hunk. 

Reviewed-by: Andrew Murray <andrew.murray@arm.com>

Thanks,

Andrew Murray

> -	};
> +	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> +	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> +	int ret, sync_idx, sync_gen;
> +
> +	if (msi)
> +		ent.sync.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count));
>  
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>  
> @@ -1009,39 +1010,13 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  		arm_smmu_cmdq_build_cmd(cmd, &ent);
>  		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	}
> -
> -	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
> -
> -	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
> -}
> -
> -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	u64 cmd[CMDQ_ENT_DWORDS];
> -	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int sync_idx, sync_gen;
> -
> -	arm_smmu_cmdq_build_cmd(cmd, &ent);
> -
> -	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
> -		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	sync_idx = smmu->cmdq.q.prod;
>  	sync_gen = READ_ONCE(smmu->cmdq_generation);
> +
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
> -	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
> -}
> -
> -static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	int ret;
> -	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> -		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> -
> -	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
> -		  : __arm_smmu_cmdq_issue_sync(smmu);
> +	ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
> +		  : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  	if (ret)
>  		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>  }
> -- 
> 2.19.1.dirty
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  2018-10-17 13:56 ` Robin Murphy
@ 2018-10-18 10:55     ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-18 10:55 UTC (permalink / raw)
  To: Robin Murphy, will.deacon@arm.com
  Cc: iommu@lists.linux-foundation.org, Linuxarm,
	linux-arm-kernel@lists.infradead.org

On 17/10/2018 14:56, Robin Murphy wrote:
> Even without the MSI trick, we can still do a lot better than hogging
> the entire queue while it drains. All we actually need to do for the
> necessary guarantee of completion is wait for our particular command to
> have been consumed - as long as we keep track of where we inserted it,
> there is no need to block other CPUs from adding further commands in the
> meantime. There is one theoretical (but incredibly unlikely) edge case
> to avoid, where cons has wrapped twice to still appear 'behind' the sync
> position - this is easily disambiguated by adding a generation count to
> the queue to indicate when prod wraps, since cons cannot wrap twice
> without prod having wrapped at least once.
>
> This also makes it reasonable to separate the two conceptually different
> modes of polling such that command insertion - which really wants to be
> fair and have minimal latency - is not subject to exponential backoff,
> and returns to its original implementation.
>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>
> v5:
>  - Rework to incorporate the back-to-back sync elision.
>  - Refactor the generation count slightly to preemptively help with
>    the HiSilicon MSI workaround.
>  - Split the cleanup into a separate patch for ease of review (it could
>    happily be squashed when applied).
>
> The fundamental logic is copied directly from v4, but I've dropped the
> previous tested-by since there are a fair few subtle changes in how it's
> integrated. Patches are based on Will's iommu/devel branch plus my "Fix
> big-endian CMD_SYNC writes" patch.
>
> Robin.
>
>  drivers/iommu/arm-smmu-v3.c | 94 +++++++++++++++++++++++++++----------
>  1 file changed, 69 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 867ba548c2cc..da8a91d116bf 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -588,6 +588,7 @@ struct arm_smmu_device {
>  	struct arm_smmu_strtab_cfg	strtab_cfg;
>
>  	u32				sync_count;
> +	int				cmdq_generation;
>
>  	/* IOMMU core code handle */
>  	struct iommu_device		iommu;
> @@ -676,6 +677,17 @@ static bool queue_empty(struct arm_smmu_queue *q)
>  	       Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
>  }
>
> +static bool queue_behind(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) < Q_IDX(q, idx);
> +}
> +
> +static bool queue_ahead_not_wrapped(struct arm_smmu_queue *q, u32 idx)
> +{
> +	return Q_IDX(q, q->cons) >= Q_IDX(q, idx) &&
> +	       Q_WRP(q, q->cons) == Q_WRP(q, idx);
> +}
> +
>  static void queue_sync_cons(struct arm_smmu_queue *q)
>  {
>  	q->cons = readl_relaxed(q->cons_reg);
> @@ -709,33 +721,19 @@ static void queue_inc_prod(struct arm_smmu_queue *q)
>  	writel(q->prod, q->prod_reg);
>  }
>
> -/*
> - * Wait for the SMMU to consume items. If sync is true, wait until the queue
> - * is empty. Otherwise, wait until there is at least one free slot.
> - */
> -static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
> +static int queue_poll_cons(struct arm_smmu_queue *q, bool wfe)
>  {
> -	ktime_t timeout;
> -	unsigned int delay = 1, spin_cnt = 0;
> +	ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
>
> -	/* Wait longer if it's a CMD_SYNC */
> -	timeout = ktime_add_us(ktime_get(), sync ?
> -					    ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
> -					    ARM_SMMU_POLL_TIMEOUT_US);
> -
> -	while (queue_sync_cons(q), (sync ? !queue_empty(q) : queue_full(q))) {
> +	while (queue_sync_cons(q), queue_full(q)) {
>  		if (ktime_compare(ktime_get(), timeout) > 0)
>  			return -ETIMEDOUT;
>
>  		if (wfe) {
>  			wfe();
> -		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
> -			cpu_relax();
> -			continue;
>  		} else {
> -			udelay(delay);
> -			delay *= 2;
> -			spin_cnt = 0;
> +			cpu_relax();
> +			udelay(1);
>  		}
>  	}
>
> @@ -905,8 +903,11 @@ static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
>
>  	smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
>
> +	if (Q_IDX(q, q->prod + 1) == 0)
> +		WRITE_ONCE(smmu->cmdq_generation, smmu->cmdq_generation + 1);
> +
>  	while (queue_insert_raw(q, cmd) == -ENOSPC) {
> -		if (queue_poll_cons(q, false, wfe))
> +		if (queue_poll_cons(q, wfe))
>  			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>  	}
>  }
> @@ -945,6 +946,48 @@ static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  	return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
>  }
>
> +static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
> +				   int sync_gen)
> +{
> +	struct arm_smmu_queue *q = &smmu->cmdq.q;
> +	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +	unsigned int delay = 1, spin_cnt = 0;
> +	ktime_t timeout;
> +
> +	timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
> +	do {
> +		queue_sync_cons(q);
> +		/*
> +		 * If we see updates quickly enough, cons has passed sync_idx,
> +		 * but not yet wrapped. At worst, cons might have actually
> +		 * wrapped an even number of times, but that still guarantees
> +		 * the original sync must have been consumed.
> +		 */
> +		if (queue_ahead_not_wrapped(q, sync_idx))
> +			return 0;
> +		/*
> +		 * Otherwise, cons may have passed sync_idx and wrapped one or
> +		 * more times to appear behind it again, but in that case prod
> +		 * must also be one or more generations ahead.
> +		 */
> +		if (queue_behind(q, sync_idx) &&
> +		    READ_ONCE(smmu->cmdq_generation) != sync_gen)
> +			return 0;
> +
> +		if (wfe) {
> +			wfe();
> +		} else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
> +			cpu_relax();
> +		} else {
> +			udelay(delay);
> +			delay *= 2;
> +			spin_cnt = 0;
> +		}
> +	} while (ktime_before(ktime_get(), timeout));
> +
> +	return -ETIMEDOUT;
> +}
> +
>  static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
> @@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>  	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int ret;
> +	int sync_idx, sync_gen;
>
>  	arm_smmu_cmdq_build_cmd(cmd, &ent);
>
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	arm_smmu_cmdq_insert_cmd(smmu, cmd);
> -	ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
> +		arm_smmu_cmdq_insert_cmd(smmu, cmd);

Hi Robin,

If we did stop rebuilding the non-MSI command as I suggested, then we 
would not have the case of building the command and then discarding it, 
right?

Thanks,
John

> +	sync_idx = smmu->cmdq.q.prod;
> +	sync_gen = READ_ONCE(smmu->cmdq_generation);
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>
> -	return ret;
> +	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  }
>
>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-17 14:38         ` John Garry
@ 2018-10-18 11:18             ` Robin Murphy
  -1 siblings, 0 replies; 24+ messages in thread
From: Robin Murphy @ 2018-10-18 11:18 UTC (permalink / raw)
  To: John Garry, will.deacon-5wv7dgnIgG8
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Linuxarm,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

Hi John,

On 17/10/18 15:38, John Garry wrote:
> On 17/10/2018 14:56, Robin Murphy wrote:
>> Now that both sync methods are more or less the same shape, we can save
>> some code and levels of indirection by rolling them up together again,
>> with just a couple of simple conditionals to discriminate the MSI and
>> queue-polling specifics.
> 
> Hi Robin, Will,
> 
> I had been thinking of this other patch previously:
> 
> iommu/arm-smmu-v3: Stop rebuilding non-MSI CMD_SYNC commands
> 
> The contents of the non-MSI CMD_SYNC command are fixed. This patch 
> offers a small optimisation by keeping a copy of this command in static 
> storage for re-use, thereby avoiding unnecessary re-building.

As far as I can tell, not counting the general call overhead which can 
be solved in less-specific ways, this essentially saves two MOVs, a 
MOVK, and an STP which in practice might not even reach L1 before it 
gets forwarded back out of the store buffer. Do you have any numbers for 
what difference this makes in terms of I/O performance or cache traffic?
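
For concreteness, the whole non-MSI CMD_SYNC build boils down to something 
like the sketch below (illustrative only: the helper name is made up, the 
FIELD_PREP()/CMDQ_*/ARM_SMMU_* macros are the ones visible in the quoted 
patch, and the real path remains arm_smmu_cmdq_build_cmd()):

/* Sketch: a plain (polling) CMD_SYNC is one 64-bit word plus a zero. */
static void build_plain_cmd_sync(u64 cmd[CMDQ_ENT_DWORDS])
{
	cmd[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
		 FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV) |
		 FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH) |
		 FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB);
	cmd[1] = 0;	/* no MSI address in the polling case */
}

Since all of those are compile-time constants, the compiler folds cmd[0] 
into an immediate, which is presumably where the instruction count above 
comes from.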

Robin.

> Signed-off-by: John Garry <john.garry-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 6947ccf..9d86c29 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -963,14 +963,16 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct 
> arm_smmu_device *smmu)
> 
>   static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>   {
> -       u64 cmd[CMDQ_ENT_DWORDS];
> +       static u64 cmd[CMDQ_ENT_DWORDS] = {
> +              FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
> +              FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV) |
> +              FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH) |
> +              FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB),
> +           0};
>          unsigned long flags;
>          bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> -       struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>          int ret;
> 
> -       arm_smmu_cmdq_build_cmd(cmd, &ent);
> -
>          spin_lock_irqsave(&smmu->cmdq.lock, flags);
>          arm_smmu_cmdq_insert_cmd(smmu, cmd);
>          ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> 
> But it seems that combining the MSI and non-MSI methods would block 
> this.
> 
> How do you feel about this?
> 
> Thanks,
> John
> 
>>
>> Signed-off-by: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>
>> ---
>>  drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
>>  1 file changed, 12 insertions(+), 37 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index da8a91d116bf..36db63e3afcf 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct 
>> arm_smmu_device *smmu,
>>   * The difference between val and sync_idx is bounded by the maximum 
>> size of
>>   * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe 
>> arithmetic.
>>   */
>> -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 
>> sync_idx)
>> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 
>> sync_idx)
>>  {
>>      ktime_t timeout;
>>      u32 val;
>> @@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct 
>> arm_smmu_device *smmu, u32 sync_idx,
>>      return -ETIMEDOUT;
>>  }
>>
>> -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>> +static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>  {
>>      u64 cmd[CMDQ_ENT_DWORDS];
>>      unsigned long flags;
>> -    struct arm_smmu_cmdq_ent ent = {
>> -        .opcode = CMDQ_OP_CMD_SYNC,
>> -        .sync    = {
>> -            .msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
>> -        },
>> -    };
>> +    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>> +           (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>> +    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>> +    int ret, sync_idx, sync_gen;
>> +
>> +    if (msi)
>> +        ent.sync.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count));
>>
>>      spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>
>> @@ -1009,39 +1010,13 @@ static int 
>> __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>>          arm_smmu_cmdq_build_cmd(cmd, &ent);
>>          arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>      }
>> -
>> -    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>> -
>> -    return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
>> -}
>> -
>> -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> -{
>> -    u64 cmd[CMDQ_ENT_DWORDS];
>> -    unsigned long flags;
>> -    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>> -    int sync_idx, sync_gen;
>> -
>> -    arm_smmu_cmdq_build_cmd(cmd, &ent);
>> -
>> -    spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> -    if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
>> -        arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>      sync_idx = smmu->cmdq.q.prod;
>>      sync_gen = READ_ONCE(smmu->cmdq_generation);
>> +
>>      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>
>> -    return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>> -}
>> -
>> -static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> -{
>> -    int ret;
>> -    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>> -           (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>> -
>> -    ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
>> -          : __arm_smmu_cmdq_issue_sync(smmu);
>> +    ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
>> +          : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>>      if (ret)
>>          dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>>  }
>>
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  2018-10-18 10:55     ` John Garry
@ 2018-10-18 11:19         ` Robin Murphy
  -1 siblings, 0 replies; 24+ messages in thread
From: Robin Murphy @ 2018-10-18 11:19 UTC (permalink / raw)
  To: John Garry, will.deacon-5wv7dgnIgG8
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Linuxarm,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On 18/10/18 11:55, John Garry wrote:
[...]
>> @@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct 
>> arm_smmu_device *smmu)
>>  {
>>      u64 cmd[CMDQ_ENT_DWORDS];
>>      unsigned long flags;
>> -    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>>      struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>> -    int ret;
>> +    int sync_idx, sync_gen;
>>
>>      arm_smmu_cmdq_build_cmd(cmd, &ent);
>>
>>      spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> -    arm_smmu_cmdq_insert_cmd(smmu, cmd);
>> -    ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>> +    if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
>> +        arm_smmu_cmdq_insert_cmd(smmu, cmd);
> 
> Hi Robin,
> 
> If we did stop rebuilding the non-MSI command as I suggested, then we 
> would not have the case of building the command and then discarding it, 
> right?

I suppose so. But that build/discard case can also be avoided by 
applying patch 2/2 of this series ;)

Robin.

> 
> Thanks,
> John
> 
>> +    sync_idx = smmu->cmdq.q.prod;
>> +    sync_gen = READ_ONCE(smmu->cmdq_generation);
>>      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>
>> -    return ret;
>> +    return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>>  }
>>
>>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>
> 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-18 11:18             ` Robin Murphy
@ 2018-10-18 11:29               ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-18 11:29 UTC (permalink / raw)
  To: Robin Murphy, will.deacon
  Cc: iommu, Linuxarm, linux-arm-kernel, thunder.leizhen

On 18/10/2018 12:18, Robin Murphy wrote:
> Hi John,
>
> On 17/10/18 15:38, John Garry wrote:
>> On 17/10/2018 14:56, Robin Murphy wrote:
>>> Now that both sync methods are more or less the same shape, we can save
>>> some code and levels of indirection by rolling them up together again,
>>> with just a couple of simple conditionals to discriminate the MSI and
>>> queue-polling specifics.
>>
>> Hi Robin, Will,
>>
>> I had been thinking of this other patch previously:
>>
>> iommu/arm-smmu-v3: Stop rebuilding non-MSI CMD_SYNC commands
>>
>> The contents of the non-MSI CMD_SYNC command are fixed. This patch
>> offers a small optimisation by keeping a copy of this command in static
>> storage for re-use, thereby avoiding unnecessary re-building.
>

Hi Robin,

> As far as I can tell, not counting the general call overhead which can
> be solved in less-specific ways, this essentially saves two MOVs, a
> MOVK, and an STP which in practice might not even reach L1 before it
> gets forwarded back out of the store buffer. Do you have any numbers for
> what difference this makes in terms of I/O performance or cache traffic?

I'll try to get some numbers. I'm not expecting anything big, but this 
just seemed like a cheap optimisation, maybe at the expense of slightly 
inconsistent code.

Thanks,
John

>
> Robin.
>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index 6947ccf..9d86c29 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -963,14 +963,16 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct
>> arm_smmu_device *smmu)
>>
>>   static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>   {
>> -       u64 cmd[CMDQ_ENT_DWORDS];
>> +       static u64 cmd[CMDQ_ENT_DWORDS] = {
>> +              FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
>> +              FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV) |
>> +              FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH) |
>> +              FIELD_PREP(CMDQ_SYNC_0_MSIATTR, ARM_SMMU_MEMATTR_OIWB),
>> +           0};
>>          unsigned long flags;
>>          bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> -       struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>          int ret;
>>
>> -       arm_smmu_cmdq_build_cmd(cmd, &ent);
>> -
>>          spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>          arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>          ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>>
>> But it seems that combining the MSI and non-MSI methods would
>> block this.
>>
>> How do you feel about this?
>>
>> Thanks,
>> John
>>
>>>
>>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>>> ---
>>>  drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
>>>  1 file changed, 12 insertions(+), 37 deletions(-)
>>>
>>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>>> index da8a91d116bf..36db63e3afcf 100644
>>> --- a/drivers/iommu/arm-smmu-v3.c
>>> +++ b/drivers/iommu/arm-smmu-v3.c
>>> @@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct
>>> arm_smmu_device *smmu,
>>>   * The difference between val and sync_idx is bounded by the maximum
>>> size of
>>>   * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe
>>> arithmetic.
>>>   */
>>> -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu,
>>> u32 sync_idx)
>>> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32
>>> sync_idx)
>>>  {
>>>      ktime_t timeout;
>>>      u32 val;
>>> @@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct
>>> arm_smmu_device *smmu, u32 sync_idx,
>>>      return -ETIMEDOUT;
>>>  }
>>>
>>> -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>>> +static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>  {
>>>      u64 cmd[CMDQ_ENT_DWORDS];
>>>      unsigned long flags;
>>> -    struct arm_smmu_cmdq_ent ent = {
>>> -        .opcode = CMDQ_OP_CMD_SYNC,
>>> -        .sync    = {
>>> -            .msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
>>> -        },
>>> -    };
>>> +    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>>> +           (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>>> +    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>> +    int ret, sync_idx, sync_gen;
>>> +
>>> +    if (msi)
>>> +        ent.sync.msiaddr =
>>> cpu_to_le32(virt_to_phys(&smmu->sync_count));
>>>
>>>      spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>>
>>> @@ -1009,39 +1010,13 @@ static int
>>> __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>>>          arm_smmu_cmdq_build_cmd(cmd, &ent);
>>>          arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>>      }
>>> -
>>> -    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>> -
>>> -    return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
>>> -}
>>> -
>>> -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>> -{
>>> -    u64 cmd[CMDQ_ENT_DWORDS];
>>> -    unsigned long flags;
>>> -    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>> -    int sync_idx, sync_gen;
>>> -
>>> -    arm_smmu_cmdq_build_cmd(cmd, &ent);
>>> -
>>> -    spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>> -    if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
>>> -        arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>>      sync_idx = smmu->cmdq.q.prod;
>>>      sync_gen = READ_ONCE(smmu->cmdq_generation);
>>> +
>>>      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>>
>>> -    return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>>> -}
>>> -
>>> -static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>> -{
>>> -    int ret;
>>> -    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>>> -           (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>>> -
>>> -    ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
>>> -          : __arm_smmu_cmdq_issue_sync(smmu);
>>> +    ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
>>> +          : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>>>      if (ret)
>>>          dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>>>  }
>>>
>>
>>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  2018-10-18 11:19         ` Robin Murphy
@ 2018-10-18 11:48           ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-18 11:48 UTC (permalink / raw)
  To: Robin Murphy, will.deacon
  Cc: iommu, Linuxarm, linux-arm-kernel, thunder.leizhen

On 18/10/2018 12:19, Robin Murphy wrote:
> On 18/10/18 11:55, John Garry wrote:
> [...]
>>> @@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct
>>> arm_smmu_device *smmu)
>>>  {
>>>      u64 cmd[CMDQ_ENT_DWORDS];
>>>      unsigned long flags;
>>> -    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>>>      struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>> -    int ret;
>>> +    int sync_idx, sync_gen;
>>>
>>>      arm_smmu_cmdq_build_cmd(cmd, &ent);
>>>
>>>      spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>> -    arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>> -    ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>>> +    if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
>>> +        arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>
>> Hi Robin,
>>
>> If we did stop rebuilding the non-MSI command as I suggested, then we
>> would not have the case of building the command and then discarding
>> it, right?
>
> I suppose so. But that build/discard case can also be avoided by
> applying patch 2/2 of this series ;)

OK, I'll check it in more detail. TBH, I found 2/2 hard to follow from just 
the diff.

Thanks,
John

>
> Robin.
>
>>
>> Thanks,
>> John
>>
>>> +    sync_idx = smmu->cmdq.q.prod;
>>> +    sync_gen = READ_ONCE(smmu->cmdq_generation);
>>>      spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>>>
>>> -    return ret;
>>> +    return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>>>  }
>>>
>>>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>>>
>>
>>
>
> .
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock
  2018-10-18 11:48           ` John Garry
@ 2018-10-19 14:30               ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-19 14:30 UTC (permalink / raw)
  To: Robin Murphy, will.deacon-5wv7dgnIgG8
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Linuxarm,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On 18/10/2018 12:48, John Garry wrote:
> On 18/10/2018 12:19, Robin Murphy wrote:
>> On 18/10/18 11:55, John Garry wrote:
>> [...]
>>>> @@ -976,18 +1019,19 @@ static int __arm_smmu_cmdq_issue_sync(struct
>>>> arm_smmu_device *smmu)
>>>>  {
>>>>      u64 cmd[CMDQ_ENT_DWORDS];
>>>>      unsigned long flags;
>>>> -    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>>>>      struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>>>> -    int ret;
>>>> +    int sync_idx, sync_gen;
>>>>
>>>>      arm_smmu_cmdq_build_cmd(cmd, &ent);
>>>>
>>>>      spin_lock_irqsave(&smmu->cmdq.lock, flags);
>>>> -    arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>>> -    ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>>>> +    if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
>>>> +        arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>>
>>> Hi Robin,
>>>
>>> If we did stop rebuilding the non-MSI command as I suggested, then we
>>> would not have the case of building the command and then discarding
>>> it, right?
>>
>> I suppose so. But that build/discard case can also be avoided by
>> applying patch 2/2 of this series ;)
>

So we can avoid the build/discard, but the command is now built under the 
lock. Hopefully it's worth it.

However, there is still scope (and, considering the locking, maybe more 
need) for a static non-MSI CMD_SYNC command :)
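
One way to get that without a function-local static would be to build the 
fixed command once at probe time and keep it in the device structure; a 
hypothetical sketch only, where sync_cmd is an invented field and 
arm_smmu_init_sync_cmd() an invented helper:

/* Assumes a new field in struct arm_smmu_device: u64 sync_cmd[CMDQ_ENT_DWORDS]; */
static void arm_smmu_init_sync_cmd(struct arm_smmu_device *smmu)
{
	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };

	/* Build the non-MSI CMD_SYNC once; issue paths just reuse it. */
	arm_smmu_cmdq_build_cmd(smmu->sync_cmd, &ent);
}

The issue path could then hand smmu->sync_cmd straight to 
arm_smmu_cmdq_insert_cmd() under the lock, with no per-call build at all.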

Anyway, we'll look to get some numbers.

Cheers,

> OK, I'll check it in more detail. TBH, I found 2/2 hard to follow from just
> the diff.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync()
  2018-10-17 13:56     ` Robin Murphy
@ 2018-10-22 12:29         ` John Garry
  -1 siblings, 0 replies; 24+ messages in thread
From: John Garry @ 2018-10-22 12:29 UTC (permalink / raw)
  To: Robin Murphy, will.deacon-5wv7dgnIgG8
  Cc: iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r

On 17/10/2018 14:56, Robin Murphy wrote:
> Now that both sync methods are more or less the same shape, we can save
> some code and levels of indirection by rolling them up together again,
> with just a couple of simple conditionals to discriminate the MSI and
> queue-polling specifics.
>
> Signed-off-by: Robin Murphy <robin.murphy-5wv7dgnIgG8@public.gmane.org>

Tested-by: John Garry <john.garry-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>

I seem to be getting some boost in the scenarios I tested:
Storage controller: 746K IOPS (with) vs 730K IOPS (without)
NVMe disk:  471K IOPS (with) vs 420K IOPS (without)

Note that this is with strict mode set and without the CMD_SYNC 
optimisation I punted. And this is on D05, so no MSI-based CMD_SYNC support.

Thanks,
John

Ps. for anyone testing, use the v1 smmu "Fix Endian" patch to get this 
series to apply.

> ---
>  drivers/iommu/arm-smmu-v3.c | 49 +++++++++----------------------------
>  1 file changed, 12 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index da8a91d116bf..36db63e3afcf 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -933,7 +933,7 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>   * The difference between val and sync_idx is bounded by the maximum size of
>   * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
>   */
> -static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
>  {
>  	ktime_t timeout;
>  	u32 val;
> @@ -988,16 +988,17 @@ static int arm_smmu_sync_poll_cons(struct arm_smmu_device *smmu, u32 sync_idx,
>  	return -ETIMEDOUT;
>  }
>
> -static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
> +static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = {
> -		.opcode = CMDQ_OP_CMD_SYNC,
> -		.sync	= {
> -			.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count)),
> -		},
> -	};
> +	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> +	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> +	int ret, sync_idx, sync_gen;
> +
> +	if (msi)
> +		ent.sync.msiaddr = cpu_to_le32(virt_to_phys(&smmu->sync_count));
>
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>
> @@ -1009,39 +1010,13 @@ static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  		arm_smmu_cmdq_build_cmd(cmd, &ent);
>  		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	}
> -
> -	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
> -
> -	return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
> -}
> -
> -static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	u64 cmd[CMDQ_ENT_DWORDS];
> -	unsigned long flags;
> -	struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> -	int sync_idx, sync_gen;
> -
> -	arm_smmu_cmdq_build_cmd(cmd, &ent);
> -
> -	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	if (smmu->prev_cmd_opcode != CMDQ_OP_CMD_SYNC)
> -		arm_smmu_cmdq_insert_cmd(smmu, cmd);
>  	sync_idx = smmu->cmdq.q.prod;
>  	sync_gen = READ_ONCE(smmu->cmdq_generation);
> +
>  	spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>
> -	return arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
> -}
> -
> -static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -	int ret;
> -	bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> -		   (smmu->features & ARM_SMMU_FEAT_COHERENCY);
> -
> -	ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
> -		  : __arm_smmu_cmdq_issue_sync(smmu);
> +	ret = msi ? arm_smmu_sync_poll_msi(smmu, ent.sync.msidata)
> +		  : arm_smmu_sync_poll_cons(smmu, sync_idx, sync_gen);
>  	if (ret)
>  		dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>  }
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2018-10-22 12:29 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-17 13:56 [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock Robin Murphy
2018-10-17 13:56 ` Robin Murphy
     [not found] ` <61b4c3e5f1322dfe96ca2062a7fe058298340996.1539782799.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2018-10-17 13:56   ` [PATCH v5 2/2] iommu/arm-smmu-v3: Reunify arm_smmu_cmdq_issue_sync() Robin Murphy
2018-10-17 13:56     ` Robin Murphy
     [not found]     ` <fe1b23d1980dd110eb2e8ffc01c2dd68632566d1.1539782799.git.robin.murphy-5wv7dgnIgG8@public.gmane.org>
2018-10-17 14:38       ` John Garry
2018-10-17 14:38         ` John Garry
     [not found]         ` <b35bb171-013b-3cb7-9a99-78505b9fdd18-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2018-10-18 11:18           ` Robin Murphy
2018-10-18 11:18             ` Robin Murphy
2018-10-18 11:29             ` John Garry
2018-10-18 11:29               ` John Garry
2018-10-18  8:58       ` Andrew Murray
2018-10-18  8:58         ` Andrew Murray
2018-10-22 12:29       ` John Garry
2018-10-22 12:29         ` John Garry
2018-10-18  8:56   ` [PATCH v5 1/2] iommu/arm-smmu-v3: Poll for CMD_SYNC outside cmdq lock Andrew Murray
2018-10-18  8:56     ` Andrew Murray
2018-10-18 10:55   ` John Garry
2018-10-18 10:55     ` John Garry
     [not found]     ` <2c4e00a2-cd53-a6f5-8561-97379dcf9c02-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2018-10-18 11:19       ` Robin Murphy
2018-10-18 11:19         ` Robin Murphy
2018-10-18 11:48         ` John Garry
2018-10-18 11:48           ` John Garry
     [not found]           ` <3e52b993-a850-301e-85e1-71597df45dc5-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2018-10-19 14:30             ` John Garry
2018-10-19 14:30               ` John Garry
