From: Nicolin Chen <nicolinc@nvidia.com>
To: <will@kernel.org>, <robin.murphy@arm.com>, <joro@8bytes.org>
Cc: <nicoleotsuka@gmail.com>, <vdumpa@nvidia.com>,
	<thierry.reding@gmail.com>, <linux-tegra@vger.kernel.org>,
	<nwatterson@nvidia.com>, <Jonathan.Cameron@huawei.com>,
	<jean-philippe@linaro.org>, <song.bao.hua@hisilicon.com>,
	<eric.auger@redhat.com>, <thunder.leizhen@huawei.com>,
	<yuzenghui@huawei.com>, <linux-kernel@vger.kernel.org>,
	<linux-arm-kernel@lists.infradead.org>,
	<iommu@lists.linux-foundation.org>
Subject: [RFC][Patch v1 2/2] iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw
Date: Fri, 23 Jul 2021 12:31:40 -0700	[thread overview]
Message-ID: <20210723193140.9690-3-nicolinc@nvidia.com> (raw)
In-Reply-To: <20210723193140.9690-1-nicolinc@nvidia.com>

From: Nate Watterson <nwatterson@nvidia.com>

NVIDIA's Grace SoC includes custom CMDQ-Virtualization (CMDQV)
hardware, which adds multiple VCMDQ interfaces to supplement
the architected SMMU_CMDQ in an effort to reduce contention.

To make use of these supplemental CMDQs in the arm-smmu-v3 driver,
we borrow the "implementation infrastructure" design from the
arm-smmu driver and add support for implementation-defined
issue_cmdlist methods.

Signed-off-by: Nate Watterson <nwatterson@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 MAINTAINERS                                   |   2 +
 drivers/iommu/arm/arm-smmu-v3/Makefile        |   2 +-
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c  |   7 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  67 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  11 +
 .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c    | 425 ++++++++++++++++++
 6 files changed, 487 insertions(+), 27 deletions(-)
 create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
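
As a quick reference for the design described above, here is a minimal
standalone sketch (not part of the patch, with simplified stand-in struct
and function names) of the dispatch pattern being added: a generic entry
point prefers an implementation-provided issue_cmdlist hook and falls back
to the architected SMMU_CMDQ path, mirroring what arm_smmu_issue_cmdlist()
does in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct smmu_device;

struct smmu_cmdq {
	const char *name;
};

struct smmu_impl {
	int (*issue_cmdlist)(struct smmu_device *smmu, uint64_t *cmds, int n, bool sync);
};

struct smmu_device {
	const struct smmu_impl *impl;	/* NULL when no vendor hooks are registered */
	struct smmu_cmdq cmdq;		/* architected SMMU_CMDQ */
};

/* Default path: issue the command list on the architected queue */
static int default_issue_cmdlist(struct smmu_device *smmu, uint64_t *cmds,
				 int n, bool sync, struct smmu_cmdq *cmdq)
{
	printf("issuing %d command(s) on %s (sync=%d)\n", n, cmdq->name, sync);
	(void)smmu; (void)cmds;
	return 0;
}

/* Generic entry point: use the implementation hook when one is registered */
static int issue_cmdlist(struct smmu_device *smmu, uint64_t *cmds, int n, bool sync)
{
	if (smmu->impl && smmu->impl->issue_cmdlist)
		return smmu->impl->issue_cmdlist(smmu, cmds, n, sync);

	return default_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
}

/* Example vendor hook; a real implementation would pick one of its own VCMDQs */
static int vendor_issue_cmdlist(struct smmu_device *smmu, uint64_t *cmds, int n, bool sync)
{
	return default_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
}

int main(void)
{
	const struct smmu_impl vendor_impl = { .issue_cmdlist = vendor_issue_cmdlist };
	struct smmu_device smmu = { .impl = NULL, .cmdq = { .name = "SMMU_CMDQ" } };
	uint64_t cmds[2] = { 0, 0 };

	issue_cmdlist(&smmu, cmds, 2, true);	/* default path */

	smmu.impl = &vendor_impl;
	issue_cmdlist(&smmu, cmds, 2, true);	/* implementation hook */

	return 0;
}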

diff --git a/MAINTAINERS b/MAINTAINERS
index d69b2d4646be..e72e3459c9be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18240,8 +18240,10 @@ F:	drivers/i2c/busses/i2c-tegra.c
 TEGRA IOMMU DRIVERS
 M:	Thierry Reding <thierry.reding@gmail.com>
 R:	Krishna Reddy <vdumpa@nvidia.com>
+R:	Nicolin Chen <nicoleotsuka@gmail.com>
 L:	linux-tegra@vger.kernel.org
 S:	Supported
+F:	drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
 F:	drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
 F:	drivers/iommu/tegra*
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index 1f5838d3351b..0aa84c0a50ea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
-arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o
+arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o nvidia-smmu-v3.o
 arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
 arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
index 6947d28067a8..37d062e40eb5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
@@ -4,5 +4,12 @@
 
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu)
 {
+	/*
+	 * The NVIDIA implementation supports ACPI only, so call its init()
+	 * unconditionally to walk the ACPI tables and probe the device.
+	 * It keeps the smmu pointer intact if it fails.
+	 */
+	smmu = nvidia_smmu_v3_impl_init(smmu);
+
 	return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b2d23de2b207..439809e1acd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -336,9 +336,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 }
 
 static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
-					 u32 prod)
+					 u32 prod, struct arm_smmu_cmdq *cmdq)
 {
-	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	struct arm_smmu_queue *q = &cmdq->q;
 	struct arm_smmu_cmdq_ent ent = {
 		.opcode = CMDQ_OP_CMD_SYNC,
 	};
@@ -575,11 +575,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
 
 /* Wait for the command queue to become non-full */
 static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
-					     struct arm_smmu_ll_queue *llq)
+					     struct arm_smmu_ll_queue *llq,
+					     struct arm_smmu_cmdq *cmdq)
 {
 	unsigned long flags;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	int ret = 0;
 
 	/*
@@ -595,7 +595,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
 
 	queue_poll_init(smmu, &qp);
 	do {
-		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+		llq->val = READ_ONCE(cmdq->q.llq.val);
 		if (!queue_full(llq))
 			break;
 
@@ -610,11 +610,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
-					  struct arm_smmu_ll_queue *llq)
+					  struct arm_smmu_ll_queue *llq,
+					  struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
 
 	queue_poll_init(smmu, &qp);
@@ -634,15 +634,15 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
-					       struct arm_smmu_ll_queue *llq)
+					       struct arm_smmu_ll_queue *llq,
+					       struct arm_smmu_cmdq *cmdq)
 {
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 prod = llq->prod;
 	int ret = 0;
 
 	queue_poll_init(smmu, &qp);
-	llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+	llq->val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		if (queue_consumed(llq, prod))
 			break;
@@ -684,12 +684,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 }
 
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
-					 struct arm_smmu_ll_queue *llq)
+					 struct arm_smmu_ll_queue *llq,
+					 struct arm_smmu_cmdq *cmdq)
 {
 	if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
-		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+		return __arm_smmu_cmdq_poll_until_msi(smmu, llq, cmdq);
 
-	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq, cmdq);
 }
 
 static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
@@ -709,6 +710,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
 	}
 }
 
+static int arm_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+	if (smmu->impl && smmu->impl->issue_cmdlist)
+		return smmu->impl->issue_cmdlist(smmu, cmds, n, sync);
+
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
+}
+
 /*
  * This is the actual insertion function, and provides the following
  * ordering guarantees to callers:
@@ -725,14 +734,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   insert their own list of commands then all of the commands from one
  *   CPU will appear before any of the commands from the other CPU.
  */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
 	}, head = llq;
@@ -746,7 +754,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 		while (!queue_has_space(&llq, n + sync)) {
 			local_irq_restore(flags);
-			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq, cmdq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
 		}
@@ -772,7 +780,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
 	if (sync) {
 		prod = queue_inc_prod_n(&llq, n);
-		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod, cmdq);
 		queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
 
 		/*
@@ -822,7 +830,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
 	if (sync) {
 		llq.prod = queue_inc_prod_n(&llq, n);
-		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq, cmdq);
 		if (ret) {
 			dev_err_ratelimited(smmu->dev,
 					    "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
@@ -856,12 +864,12 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+	return arm_smmu_issue_cmdlist(smmu, cmd, 1, false);
 }
 
 static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+	return arm_smmu_issue_cmdlist(smmu, NULL, 0, true);
 }
 
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -869,7 +877,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_ent *cmd)
 {
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
 		cmds->num = 0;
 	}
 	arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
@@ -879,7 +887,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
 }
 
 static int arm_smmu_page_response(struct device *dev,
@@ -2899,10 +2907,9 @@ static void arm_smmu_cmdq_free_bitmap(void *data)
 	bitmap_free(bitmap);
 }
 
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
 	atomic_long_t *bitmap;
 
@@ -2932,7 +2939,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
-	ret = arm_smmu_cmdq_init(smmu);
+	ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq);
 	if (ret)
 		return ret;
 
@@ -3416,6 +3423,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
 		return ret;
 	}
 
+	if (smmu->impl && smmu->impl->device_reset) {
+		ret = smmu->impl->device_reset(smmu);
+		if (ret) {
+			dev_err(smmu->dev, "failed at implementation specific device_reset\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 4c60ba14221b..baec2d3a46f9 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -647,6 +647,8 @@ struct arm_smmu_device {
 #define ARM_SMMU_OPT_MSIPOLL		(1 << 2)
 	u32				options;
 
+	const struct arm_smmu_impl	*impl;
+
 	struct arm_smmu_cmdq		cmdq;
 	struct arm_smmu_evtq		evtq;
 	struct arm_smmu_priq		priq;
@@ -807,7 +809,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle)
 static inline void arm_smmu_sva_notifier_synchronize(void) {}
 #endif /* CONFIG_ARM_SMMU_V3_SVA */
 
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq);
+
 /* Implementation details */
+struct arm_smmu_impl {
+	int (*device_reset)(struct arm_smmu_device *smmu);
+	int (*issue_cmdlist)(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync);
+};
+
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu);
 
 #endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
new file mode 100644
index 000000000000..ceec2a24057f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define dev_fmt(fmt) "nvidia_smmu_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+#define NVIDIA_SMMU_CMDQV_HID		"NVDA0600"
+
+/* CMDQV global config regs */
+#define NVIDIA_CMDQV_CONFIG		0x0000
+#define  CMDQV_EN			BIT(0)
+
+#define NVIDIA_CMDQV_PARAM		0x0004
+#define  CMDQV_NUM_VINTF_LOG2		GENMASK(11, 8)
+#define  CMDQV_NUM_VCMDQ_LOG2		GENMASK(7, 4)
+
+#define NVIDIA_CMDQV_STATUS		0x0008
+#define  CMDQV_STATUS			GENMASK(2, 1)
+#define  CMDQV_ENABLED			BIT(0)
+
+#define NVIDIA_CMDQV_VINTF_ERR_MAP	0x000C
+#define NVIDIA_CMDQV_VINTF_INT_MASK	0x0014
+#define NVIDIA_CMDQV_VCMDQ_ERR_MAP	0x001C
+
+#define NVIDIA_CMDQV_CMDQ_ALLOC(q)	(0x0200 + 0x4*(q))
+#define  CMDQV_CMDQ_ALLOC_VINTF		GENMASK(20, 15)
+#define  CMDQV_CMDQ_ALLOC_LVCMDQ	GENMASK(7, 1)
+#define  CMDQV_CMDQ_ALLOCATED		BIT(0)
+
+/* VINTF config regs */
+#define NVIDIA_CMDQV_VINTF(v)		(0x1000 + 0x100*(v))
+
+#define NVIDIA_VINTF_CONFIG		0x0000
+#define  VINTF_HYP_OWN			BIT(17)
+#define  VINTF_VMID			GENMASK(16, 1)
+#define  VINTF_EN			BIT(0)
+
+#define NVIDIA_VINTF_STATUS		0x0004
+#define  VINTF_STATUS			GENMASK(3, 1)
+#define  VINTF_ENABLED			BIT(0)
+
+/* VCMDQ config regs */
+#define NVIDIA_CMDQV_VCMDQ(q)		(0x10000 + 0x80*(q))
+
+#define NVIDIA_VCMDQ_CONS		0x00000
+#define  VCMDQ_CONS_ERR			GENMASK(30, 24)
+
+#define NVIDIA_VCMDQ_PROD		0x00004
+
+#define NVIDIA_VCMDQ_CONFIG		0x00008
+#define  VCMDQ_EN			BIT(0)
+
+#define NVIDIA_VCMDQ_STATUS		0x0000C
+#define  VCMDQ_ENABLED			BIT(0)
+
+#define NVIDIA_VCMDQ_GERROR		0x00010
+#define NVIDIA_VCMDQ_GERRORN		0x00014
+
+#define NVIDIA_VCMDQ_BASE		0x10000
+#define  VCMDQ_ADDR			GENMASK(63, 5)
+#define  VCMDQ_LOG2SIZE			GENMASK(4, 0)
+
+struct nvidia_smmu_vintf {
+	u16			idx;
+	u32			cfg;
+	u32			status;
+
+	void __iomem		*base;
+	struct arm_smmu_cmdq	*vcmdqs;
+};
+
+struct nvidia_smmu {
+	struct arm_smmu_device	smmu;
+
+	struct device		*cmdqv_dev;
+	void __iomem		*cmdqv_base;
+	int			cmdqv_irq;
+
+	/* CMDQV Hardware Params */
+	u16			num_total_vintfs;
+	u16			num_total_vcmdqs;
+	u16			num_vcmdqs_per_vintf;
+
+	/* CMDQV_VINTF(0) reserved for host kernel use */
+	struct nvidia_smmu_vintf vintf0;
+};
+
+static irqreturn_t nvidia_smmu_cmdqv_isr(int irq, void *devid)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)devid;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 vintf_err_map[2];
+	u32 vcmdq_err_map[4];
+
+	vintf_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP);
+	vintf_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP + 0x4);
+
+	vcmdq_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP);
+	vcmdq_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x4);
+	vcmdq_err_map[2] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x8);
+	vcmdq_err_map[3] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0xC);
+
+	dev_warn(nsmmu->cmdqv_dev,
+		 "Unexpected cmdqv error reported: vintf_map %08X %08X, vcmdq_map %08X %08X %08X %08X\n",
+		 vintf_err_map[0], vintf_err_map[1], vcmdq_err_map[0], vcmdq_err_map[1],
+		 vcmdq_err_map[2], vcmdq_err_map[3]);
+
+	/* If the error was reported by vintf0, avoid using any of its VCMDQs */
+	if (vintf_err_map[vintf0->idx / 32] & (1 << (vintf0->idx % 32))) {
+		vintf0->status = readl_relaxed(vintf0->base + NVIDIA_VINTF_STATUS);
+
+		dev_warn(nsmmu->cmdqv_dev, "error (0x%lX) reported by host vintf0 - disabling its vcmdqs\n",
+			 FIELD_GET(VINTF_STATUS, vintf0->status));
+	} else if (vintf_err_map[0] || vintf_err_map[1]) {
+		dev_err(nsmmu->cmdqv_dev, "cmdqv error interrupt triggered by unassigned vintf!\n");
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Adapt struct arm_smmu_cmdq init sequences from arm-smmu-v3.c for VCMDQs */
+static int nvidia_smmu_init_one_arm_smmu_cmdq(struct nvidia_smmu *nsmmu,
+					      struct arm_smmu_cmdq *cmdq,
+					      void __iomem *vcmdq_base,
+					      u16 idx)
+{
+	struct arm_smmu_queue *q = &cmdq->q;
+	size_t qsz;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_device_hw_probe() */
+	q->llq.max_n_shift = ilog2(SZ_64K >> CMDQ_ENT_SZ_SHIFT);
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_init_one_queue() */
+	qsz = (1 << q->llq.max_n_shift) << CMDQ_ENT_SZ_SHIFT;
+	q->base = dmam_alloc_coherent(nsmmu->cmdqv_dev, qsz, &q->base_dma, GFP_KERNEL);
+	if (!q->base) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate 0x%zX bytes for VCMDQ%u\n",
+			qsz, idx);
+		return -ENOMEM;
+	}
+	dev_dbg(nsmmu->cmdqv_dev, "allocated %u entries for VCMDQ%u @ 0x%llX [%pad] ++ %zX",
+		1 << q->llq.max_n_shift, idx, (u64)q->base, &q->base_dma, qsz);
+
+	q->prod_reg = vcmdq_base + NVIDIA_VCMDQ_PROD;
+	q->cons_reg = vcmdq_base + NVIDIA_VCMDQ_CONS;
+	q->ent_dwords = CMDQ_ENT_DWORDS;
+
+	q->q_base  = q->base_dma & VCMDQ_ADDR;
+	q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+	q->llq.prod = q->llq.cons = 0;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_cmdq_init() */
+	atomic_set(&cmdq->owner_prod, 0);
+	atomic_set(&cmdq->lock, 0);
+
+	cmdq->valid_map = (atomic_long_t *)bitmap_zalloc(1 << q->llq.max_n_shift, GFP_KERNEL);
+	if (!cmdq->valid_map) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate valid_map for VCMDQ%u\n", idx);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_cmdqv_init(struct nvidia_smmu *nsmmu)
+{
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 regval;
+	u16 idx;
+	int ret;
+
+	/* Setup vintf0 for host kernel */
+	vintf0->idx = 0;
+	vintf0->base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF(0);
+
+	regval = FIELD_PREP(VINTF_HYP_OWN, nsmmu->num_total_vintfs > 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	regval |= FIELD_PREP(VINTF_EN, 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	vintf0->cfg = regval;
+
+	ret = readl_relaxed_poll_timeout(vintf0->base + NVIDIA_VINTF_STATUS,
+					 regval, regval == VINTF_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+	vintf0->status = regval;
+	if (ret) {
+		dev_err(nsmmu->cmdqv_dev, "failed to enable VINTF[%u]: STATUS = 0x%08X\n",
+			vintf0->idx, regval);
+		return ret;
+	}
+
+	/* Allocate vcmdqs to vintf0 */
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		regval  = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, vintf0->idx);
+		regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, idx);
+		regval |= CMDQV_CMDQ_ALLOCATED;
+		writel_relaxed(regval, nsmmu->cmdqv_base + NVIDIA_CMDQV_CMDQ_ALLOC(idx));
+	}
+
+	/* Build an arm_smmu_cmdq for each vcmdq allocated to vintf0 */
+	vintf0->vcmdqs = devm_kcalloc(nsmmu->cmdqv_dev, nsmmu->num_vcmdqs_per_vintf,
+				      sizeof(*vintf0->vcmdqs), GFP_KERNEL);
+	if (!vintf0->vcmdqs)
+		return -ENOMEM;
+
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		void __iomem *vcmdq_base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ(idx);
+		struct arm_smmu_cmdq *cmdq = &vintf0->vcmdqs[idx];
+
+		/* Setup struct arm_smmu_cmdq data members */
+		nvidia_smmu_init_one_arm_smmu_cmdq(nsmmu, cmdq, vcmdq_base, idx);
+
+		/* Configure and enable the vcmdq */
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_PROD);
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+		writeq_relaxed(cmdq->q.q_base, vcmdq_base + NVIDIA_VCMDQ_BASE);
+
+		writel_relaxed(VCMDQ_EN, vcmdq_base + NVIDIA_VCMDQ_CONFIG);
+		ret = readl_poll_timeout(vcmdq_base + NVIDIA_VCMDQ_STATUS,
+					 regval, regval == VCMDQ_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+		if (ret) {
+			u32 gerror = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERROR);
+			u32 gerrorn = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERRORN);
+			u32 cons = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+			dev_err(nsmmu->cmdqv_dev,
+				"failed to enable VCMDQ[%u]: GERROR=0x%X, GERRORN=0x%X, CONS=0x%X\n",
+				idx, gerror, gerrorn, cons);
+			return ret;
+		}
+
+		dev_info(nsmmu->cmdqv_dev, "VCMDQ%u allocated to VINTF%u as CMDQ%u\n",
+			 idx, vintf0->idx, idx);
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_probe(struct nvidia_smmu *nsmmu)
+{
+	struct platform_device *cmdqv_pdev = to_platform_device(nsmmu->cmdqv_dev);
+	struct resource *res;
+	u32 regval;
+
+	/* Base address */
+	res = platform_get_resource(cmdqv_pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	nsmmu->cmdqv_base = devm_ioremap_resource(nsmmu->cmdqv_dev, res);
+	if (IS_ERR(nsmmu->cmdqv_base))
+		return PTR_ERR(nsmmu->cmdqv_base);
+
+	/* Interrupt */
+	nsmmu->cmdqv_irq = platform_get_irq(cmdqv_pdev, 0);
+	if (nsmmu->cmdqv_irq < 0) {
+		dev_warn(nsmmu->cmdqv_dev, "no cmdqv interrupt - errors will not be reported\n");
+		nsmmu->cmdqv_irq = 0;
+	}
+
+	/* Probe the h/w */
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_CONFIG);
+	if (!FIELD_GET(CMDQV_EN, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w is disabled: CMDQV_CONFIG=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_STATUS);
+	if (!FIELD_GET(CMDQV_ENABLED, regval) || FIELD_GET(CMDQV_STATUS, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w not ready: CMDQV_STATUS=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_PARAM);
+	nsmmu->num_total_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+	nsmmu->num_total_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+	nsmmu->num_vcmdqs_per_vintf = nsmmu->num_total_vcmdqs / nsmmu->num_total_vintfs;
+
+	return 0;
+}
+
+static int nvidia_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	u16 idx;
+
+	/* Make sure vintf0 is enabled and healthy */
+	if (vintf0->status != VINTF_ENABLED)
+		goto issue_cmdlist;
+
+	/* Check for illegal CMDs */
+	if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+		u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : CMDQ_OP_CMD_SYNC;
+
+		switch (opcode) {
+		case CMDQ_OP_TLBI_NH_ASID:
+		case CMDQ_OP_TLBI_NH_VA:
+		case CMDQ_OP_TLBI_S12_VMALL:
+		case CMDQ_OP_TLBI_S2_IPA:
+		case CMDQ_OP_ATC_INV:
+			break;
+		default:
+			goto issue_cmdlist;
+		}
+	}
+
+	/*
+	 * Select a vcmdq to use. Here we use a temporary solution to
+	 * balance out traffic on cmdq issuing: each cmdq has its own
+	 * lock, so if all CPUs issue cmdlists using the same cmdq,
+	 * only one CPU at a time can enter the process, while the
+	 * others spin on the same lock.
+	 */
+	idx = smp_processor_id() % nsmmu->num_vcmdqs_per_vintf;
+	cmdq = &vintf0->vcmdqs[idx];
+
+issue_cmdlist:
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, cmdq);
+}
+
+static int nvidia_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	int ret;
+
+	ret = nvidia_smmu_cmdqv_init(nsmmu);
+	if (ret)
+		return ret;
+
+	if (nsmmu->cmdqv_irq) {
+		ret = devm_request_irq(nsmmu->cmdqv_dev, nsmmu->cmdqv_irq, nvidia_smmu_cmdqv_isr,
+				       IRQF_SHARED, "nvidia-smmu-cmdqv", nsmmu);
+		if (ret) {
+			dev_err(nsmmu->cmdqv_dev, "failed to claim irq (%d): %d\n",
+				nsmmu->cmdqv_irq, ret);
+			return ret;
+		}
+	}
+
+	/* Disable FEAT_MSI and OPT_MSIPOLL since VCMDQs only support CMD_SYNC w/CS_NONE */
+	smmu->features &= ~ARM_SMMU_FEAT_MSI;
+	smmu->options &= ~ARM_SMMU_OPT_MSIPOLL;
+
+	return 0;
+}
+
+const struct arm_smmu_impl nvidia_smmu_impl = {
+	.device_reset = nvidia_smmu_device_reset,
+	.issue_cmdlist = nvidia_smmu_issue_cmdlist,
+};
+
+#ifdef CONFIG_ACPI
+struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = NULL;
+	struct acpi_iort_node *node;
+	struct acpi_device *adev;
+	struct device *cmdqv_dev;
+	const char *match_uid;
+
+	if (acpi_disabled)
+		return NULL;
+
+	/* Look for a device in the DSDT whose _UID matches the SMMU's iort_node identifier */
+	node = *(struct acpi_iort_node **)dev_get_platdata(smmu->dev);
+	match_uid = kasprintf(GFP_KERNEL, "%u", node->identifier);
+	adev = acpi_dev_get_first_match_dev(NVIDIA_SMMU_CMDQV_HID, match_uid, -1);
+	kfree(match_uid);
+
+	if (!adev)
+		return NULL;
+
+	cmdqv_dev = bus_find_device_by_acpi_dev(&platform_bus_type, adev);
+	if (!cmdqv_dev)
+		return NULL;
+
+	dev_info(smmu->dev, "found companion CMDQV device, %s", dev_name(cmdqv_dev));
+
+	nsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*nsmmu), GFP_KERNEL);
+	if (!nsmmu)
+		return ERR_PTR(-ENOMEM);
+
+	nsmmu->cmdqv_dev = cmdqv_dev;
+
+	return nsmmu;
+}
+#else
+struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	return NULL;
+}
+#endif
+
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu;
+	int ret;
+
+	nsmmu = nvidia_smmu_create(smmu);
+	if (!nsmmu)
+		return smmu;
+
+	ret = nvidia_smmu_probe(nsmmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	nsmmu->smmu.impl = &nvidia_smmu_impl;
+
+	return &nsmmu->smmu;
+}
-- 
2.17.1


WARNING: multiple messages have this Message-ID (diff)
From: Nicolin Chen via iommu <iommu@lists.linux-foundation.org>
To: <will@kernel.org>, <robin.murphy@arm.com>, <joro@8bytes.org>
Cc: jean-philippe@linaro.org, linux-kernel@vger.kernel.org,
	iommu@lists.linux-foundation.org, thierry.reding@gmail.com,
	linux-tegra@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org
Subject: [RFC][Patch v1 2/2] iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw
Date: Fri, 23 Jul 2021 12:31:40 -0700	[thread overview]
Message-ID: <20210723193140.9690-3-nicolinc@nvidia.com> (raw)
In-Reply-To: <20210723193140.9690-1-nicolinc@nvidia.com>

From: Nate Watterson <nwatterson@nvidia.com>

NVIDIA's Grace SoC includes custom CMDQ-Virtualization (CMDQV)
hardware, which adds multiple VCMDQ interfaces to supplement
the architected SMMU_CMDQ in an effort to reduce contention.

To make use of these supplemental CMDQs in arm-smmu-v3 driver,
we borrow the "implementation infrastructure" design from the
arm-smmu driver, and add support for implementation defined
issue_cmdlist methods.

Signed-off-by: Nate Watterson <nwatterson@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 MAINTAINERS                                   |   2 +
 drivers/iommu/arm/arm-smmu-v3/Makefile        |   2 +-
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c  |   7 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  67 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  11 +
 .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c    | 425 ++++++++++++++++++
 6 files changed, 487 insertions(+), 27 deletions(-)
 create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index d69b2d4646be..e72e3459c9be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18240,8 +18240,10 @@ F:	drivers/i2c/busses/i2c-tegra.c
 TEGRA IOMMU DRIVERS
 M:	Thierry Reding <thierry.reding@gmail.com>
 R:	Krishna Reddy <vdumpa@nvidia.com>
+R:	Nicolin Chen <nicoleotsuka@gmail.com>
 L:	linux-tegra@vger.kernel.org
 S:	Supported
+F:	drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
 F:	drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
 F:	drivers/iommu/tegra*
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index 1f5838d3351b..0aa84c0a50ea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
-arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o
+arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o nvidia-smmu-v3.o
 arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
 arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
index 6947d28067a8..37d062e40eb5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
@@ -4,5 +4,12 @@
 
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu)
 {
+	/*
+	 * Nvidia implementation supports ACPI only, so calling its init()
+	 * unconditionally to walk through ACPI tables to probe the device.
+	 * It will keep the smmu pointer intact, if it fails.
+	 */
+	smmu = nvidia_smmu_v3_impl_init(smmu);
+
 	return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b2d23de2b207..439809e1acd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -336,9 +336,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 }
 
 static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
-					 u32 prod)
+					 u32 prod, struct arm_smmu_cmdq *cmdq)
 {
-	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	struct arm_smmu_queue *q = &cmdq->q;
 	struct arm_smmu_cmdq_ent ent = {
 		.opcode = CMDQ_OP_CMD_SYNC,
 	};
@@ -575,11 +575,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
 
 /* Wait for the command queue to become non-full */
 static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
-					     struct arm_smmu_ll_queue *llq)
+					     struct arm_smmu_ll_queue *llq,
+					     struct arm_smmu_cmdq *cmdq)
 {
 	unsigned long flags;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	int ret = 0;
 
 	/*
@@ -595,7 +595,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
 
 	queue_poll_init(smmu, &qp);
 	do {
-		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+		llq->val = READ_ONCE(cmdq->q.llq.val);
 		if (!queue_full(llq))
 			break;
 
@@ -610,11 +610,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
-					  struct arm_smmu_ll_queue *llq)
+					  struct arm_smmu_ll_queue *llq,
+					  struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
 
 	queue_poll_init(smmu, &qp);
@@ -634,15 +634,15 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
-					       struct arm_smmu_ll_queue *llq)
+					       struct arm_smmu_ll_queue *llq,
+					       struct arm_smmu_cmdq *cmdq)
 {
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 prod = llq->prod;
 	int ret = 0;
 
 	queue_poll_init(smmu, &qp);
-	llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+	llq->val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		if (queue_consumed(llq, prod))
 			break;
@@ -684,12 +684,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 }
 
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
-					 struct arm_smmu_ll_queue *llq)
+					 struct arm_smmu_ll_queue *llq,
+					 struct arm_smmu_cmdq *cmdq)
 {
 	if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
-		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+		return __arm_smmu_cmdq_poll_until_msi(smmu, llq, cmdq);
 
-	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq, cmdq);
 }
 
 static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
@@ -709,6 +710,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
 	}
 }
 
+static int arm_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+	if (smmu->impl && smmu->impl->issue_cmdlist)
+		return smmu->impl->issue_cmdlist(smmu, cmds, n, sync);
+
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
+}
+
 /*
  * This is the actual insertion function, and provides the following
  * ordering guarantees to callers:
@@ -725,14 +734,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   insert their own list of commands then all of the commands from one
  *   CPU will appear before any of the commands from the other CPU.
  */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
 	}, head = llq;
@@ -746,7 +754,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 		while (!queue_has_space(&llq, n + sync)) {
 			local_irq_restore(flags);
-			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq, cmdq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
 		}
@@ -772,7 +780,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
 	if (sync) {
 		prod = queue_inc_prod_n(&llq, n);
-		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod, cmdq);
 		queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
 
 		/*
@@ -822,7 +830,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
 	if (sync) {
 		llq.prod = queue_inc_prod_n(&llq, n);
-		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq, cmdq);
 		if (ret) {
 			dev_err_ratelimited(smmu->dev,
 					    "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
@@ -856,12 +864,12 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+	return arm_smmu_issue_cmdlist(smmu, cmd, 1, false);
 }
 
 static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+	return arm_smmu_issue_cmdlist(smmu, NULL, 0, true);
 }
 
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -869,7 +877,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_ent *cmd)
 {
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
 		cmds->num = 0;
 	}
 	arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
@@ -879,7 +887,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
 }
 
 static int arm_smmu_page_response(struct device *dev,
@@ -2899,10 +2907,9 @@ static void arm_smmu_cmdq_free_bitmap(void *data)
 	bitmap_free(bitmap);
 }
 
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
 	atomic_long_t *bitmap;
 
@@ -2932,7 +2939,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
-	ret = arm_smmu_cmdq_init(smmu);
+	ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq);
 	if (ret)
 		return ret;
 
@@ -3416,6 +3423,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
 		return ret;
 	}
 
+	if (smmu->impl && smmu->impl->device_reset) {
+		ret = smmu->impl->device_reset(smmu);
+		if (ret) {
+			dev_err(smmu->dev, "failed at implementation specific device_reset\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 4c60ba14221b..baec2d3a46f9 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -647,6 +647,8 @@ struct arm_smmu_device {
 #define ARM_SMMU_OPT_MSIPOLL		(1 << 2)
 	u32				options;
 
+	const struct arm_smmu_impl	*impl;
+
 	struct arm_smmu_cmdq		cmdq;
 	struct arm_smmu_evtq		evtq;
 	struct arm_smmu_priq		priq;
@@ -807,7 +809,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle)
 static inline void arm_smmu_sva_notifier_synchronize(void) {}
 #endif /* CONFIG_ARM_SMMU_V3_SVA */
 
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq);
+
 /* Implementation details */
+struct arm_smmu_impl {
+	int (*device_reset)(struct arm_smmu_device *smmu);
+	int (*issue_cmdlist)(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync);
+};
+
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu);
 
 #endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
new file mode 100644
index 000000000000..ceec2a24057f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define dev_fmt(fmt) "nvidia_smmu_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+#define NVIDIA_SMMU_CMDQV_HID		"NVDA0600"
+
+/* CMDQV global config regs */
+#define NVIDIA_CMDQV_CONFIG		0x0000
+#define  CMDQV_EN			BIT(0)
+
+#define NVIDIA_CMDQV_PARAM		0x0004
+#define  CMDQV_NUM_VINTF_LOG2		GENMASK(11, 8)
+#define  CMDQV_NUM_VCMDQ_LOG2		GENMASK(7, 4)
+
+#define NVIDIA_CMDQV_STATUS		0x0008
+#define  CMDQV_STATUS			GENMASK(2, 1)
+#define  CMDQV_ENABLED			BIT(0)
+
+#define NVIDIA_CMDQV_VINTF_ERR_MAP	0x000C
+#define NVIDIA_CMDQV_VINTF_INT_MASK	0x0014
+#define NVIDIA_CMDQV_VCMDQ_ERR_MAP	0x001C
+
+#define NVIDIA_CMDQV_CMDQ_ALLOC(q)	(0x0200 + 0x4*(q))
+#define  CMDQV_CMDQ_ALLOC_VINTF		GENMASK(20, 15)
+#define  CMDQV_CMDQ_ALLOC_LVCMDQ	GENMASK(7, 1)
+#define  CMDQV_CMDQ_ALLOCATED		BIT(0)
+
+/* VINTF config regs */
+#define NVIDIA_CMDQV_VINTF(v)		(0x1000 + 0x100*(v))
+
+#define NVIDIA_VINTF_CONFIG		0x0000
+#define  VINTF_HYP_OWN			BIT(17)
+#define  VINTF_VMID			GENMASK(16, 1)
+#define  VINTF_EN			BIT(0)
+
+#define NVIDIA_VINTF_STATUS		0x0004
+#define  VINTF_STATUS			GENMASK(3, 1)
+#define  VINTF_ENABLED			BIT(0)
+
+/* VCMDQ config regs */
+#define NVIDIA_CMDQV_VCMDQ(q)		(0x10000 + 0x80*(q))
+
+#define NVIDIA_VCMDQ_CONS		0x00000
+#define  VCMDQ_CONS_ERR			GENMASK(30, 24)
+
+#define NVIDIA_VCMDQ_PROD		0x00004
+
+#define NVIDIA_VCMDQ_CONFIG		0x00008
+#define  VCMDQ_EN			BIT(0)
+
+#define NVIDIA_VCMDQ_STATUS		0x0000C
+#define  VCMDQ_ENABLED			BIT(0)
+
+#define NVIDIA_VCMDQ_GERROR		0x00010
+#define NVIDIA_VCMDQ_GERRORN		0x00014
+
+#define NVIDIA_VCMDQ_BASE		0x10000
+#define  VCMDQ_ADDR			GENMASK(63, 5)
+#define  VCMDQ_LOG2SIZE			GENMASK(4, 0)
+
+struct nvidia_smmu_vintf {
+	u16			idx;
+	u32			cfg;
+	u32			status;
+
+	void __iomem		*base;
+	struct arm_smmu_cmdq	*vcmdqs;
+};
+
+struct nvidia_smmu {
+	struct arm_smmu_device	smmu;
+
+	struct device		*cmdqv_dev;
+	void __iomem		*cmdqv_base;
+	int			cmdqv_irq;
+
+	/* CMDQV Hardware Params */
+	u16			num_total_vintfs;
+	u16			num_total_vcmdqs;
+	u16			num_vcmdqs_per_vintf;
+
+	/* CMDQV_VINTF(0) reserved for host kernel use */
+	struct nvidia_smmu_vintf vintf0;
+};
+
+static irqreturn_t nvidia_smmu_cmdqv_isr(int irq, void *devid)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)devid;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 vintf_err_map[2];
+	u32 vcmdq_err_map[4];
+
+	vintf_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP);
+	vintf_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP + 0x4);
+
+	vcmdq_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP);
+	vcmdq_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x4);
+	vcmdq_err_map[2] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x8);
+	vcmdq_err_map[3] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0xC);
+
+	dev_warn(nsmmu->cmdqv_dev,
+		 "Unexpected cmdqv error reported: vintf_map %08X %08X, vcmdq_map %08X %08X %08X %08X\n",
+		 vintf_err_map[0], vintf_err_map[1], vcmdq_err_map[0], vcmdq_err_map[1],
+		 vcmdq_err_map[2], vcmdq_err_map[3]);
+
+	/* If the error was reported by vintf0, avoid using any of its VCMDQs */
+	if (vintf_err_map[vintf0->idx / 32] & (1 << (vintf0->idx % 32))) {
+		vintf0->status = readl_relaxed(vintf0->base + NVIDIA_VINTF_STATUS);
+
+		dev_warn(nsmmu->cmdqv_dev, "error (0x%lX) reported by host vintf0 - disabling its vcmdqs\n",
+			 FIELD_GET(VINTF_STATUS, vintf0->status));
+	} else if (vintf_err_map[0] || vintf_err_map[1]) {
+		dev_err(nsmmu->cmdqv_dev, "cmdqv error interrupt triggered by unassigned vintf!\n");
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Adapt struct arm_smmu_cmdq init sequences from arm-smmu-v3.c for VCMDQs */
+static int nvidia_smmu_init_one_arm_smmu_cmdq(struct nvidia_smmu *nsmmu,
+					      struct arm_smmu_cmdq *cmdq,
+					      void __iomem *vcmdq_base,
+					      u16 idx)
+{
+	struct arm_smmu_queue *q = &cmdq->q;
+	size_t qsz;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_device_hw_probe() */
+	q->llq.max_n_shift = ilog2(SZ_64K >> CMDQ_ENT_SZ_SHIFT);
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_init_one_queue() */
+	qsz = (1 << q->llq.max_n_shift) << CMDQ_ENT_SZ_SHIFT;
+	q->base = dmam_alloc_coherent(nsmmu->cmdqv_dev, qsz, &q->base_dma, GFP_KERNEL);
+	if (!q->base) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate 0x%zX bytes for VCMDQ%u\n",
+			qsz, idx);
+		return -ENOMEM;
+	}
+	dev_dbg(nsmmu->cmdqv_dev, "allocated %u entries for VCMDQ%u @ 0x%llX [%pad] ++ %zX",
+		1 << q->llq.max_n_shift, idx, (u64)q->base, &q->base_dma, qsz);
+
+	q->prod_reg = vcmdq_base + NVIDIA_VCMDQ_PROD;
+	q->cons_reg = vcmdq_base + NVIDIA_VCMDQ_CONS;
+	q->ent_dwords = CMDQ_ENT_DWORDS;
+
+	q->q_base  = q->base_dma & VCMDQ_ADDR;
+	q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+	q->llq.prod = q->llq.cons = 0;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_cmdq_init() */
+	atomic_set(&cmdq->owner_prod, 0);
+	atomic_set(&cmdq->lock, 0);
+
+	cmdq->valid_map = (atomic_long_t *)bitmap_zalloc(1 << q->llq.max_n_shift, GFP_KERNEL);
+	if (!cmdq->valid_map) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate valid_map for VCMDQ%u\n", idx);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_cmdqv_init(struct nvidia_smmu *nsmmu)
+{
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 regval;
+	u16 idx;
+	int ret;
+
+	/* Setup vintf0 for host kernel */
+	vintf0->idx = 0;
+	vintf0->base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF(0);
+
+	regval = FIELD_PREP(VINTF_HYP_OWN, nsmmu->num_total_vintfs > 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	regval |= FIELD_PREP(VINTF_EN, 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	vintf0->cfg = regval;
+
+	ret = readl_relaxed_poll_timeout(vintf0->base + NVIDIA_VINTF_STATUS,
+					 regval, regval == VINTF_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+	vintf0->status = regval;
+	if (ret) {
+		dev_err(nsmmu->cmdqv_dev, "failed to enable VINTF[%u]: STATUS = 0x%08X\n",
+			vintf0->idx, regval);
+		return ret;
+	}
+
+	/* Allocate vcmdqs to vintf0 */
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		regval  = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, vintf0->idx);
+		regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, idx);
+		regval |= CMDQV_CMDQ_ALLOCATED;
+		writel_relaxed(regval, nsmmu->cmdqv_base + NVIDIA_CMDQV_CMDQ_ALLOC(idx));
+	}
+
+	/* Build an arm_smmu_cmdq for each vcmdq allocated to vintf0 */
+	vintf0->vcmdqs = devm_kcalloc(nsmmu->cmdqv_dev, nsmmu->num_vcmdqs_per_vintf,
+				      sizeof(*vintf0->vcmdqs), GFP_KERNEL);
+	if (!vintf0->vcmdqs)
+		return -ENOMEM;
+
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		void __iomem *vcmdq_base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ(idx);
+		struct arm_smmu_cmdq *cmdq = &vintf0->vcmdqs[idx];
+
+		/* Setup struct arm_smmu_cmdq data members */
+		nvidia_smmu_init_one_arm_smmu_cmdq(nsmmu, cmdq, vcmdq_base, idx);
+
+		/* Configure and enable the vcmdq */
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_PROD);
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+		writeq_relaxed(cmdq->q.q_base, vcmdq_base + NVIDIA_VCMDQ_BASE);
+
+		writel_relaxed(VCMDQ_EN, vcmdq_base + NVIDIA_VCMDQ_CONFIG);
+		ret = readl_poll_timeout(vcmdq_base + NVIDIA_VCMDQ_STATUS,
+					 regval, regval == VCMDQ_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+		if (ret) {
+			u32 gerror = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERROR);
+			u32 gerrorn = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERRORN);
+			u32 cons = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+			dev_err(nsmmu->cmdqv_dev,
+				"failed to enable VCMDQ[%u]: GERROR=0x%X, GERRORN=0x%X, CONS=0x%X\n",
+				idx, gerror, gerrorn, cons);
+			return ret;
+		}
+
+		dev_info(nsmmu->cmdqv_dev, "VCMDQ%u allocated to VINTF%u as CMDQ%u\n",
+			 idx, vintf0->idx, idx);
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_probe(struct nvidia_smmu *nsmmu)
+{
+	struct platform_device *cmdqv_pdev = to_platform_device(nsmmu->cmdqv_dev);
+	struct resource *res;
+	u32 regval;
+
+	/* Base address */
+	res = platform_get_resource(cmdqv_pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	nsmmu->cmdqv_base = devm_ioremap_resource(nsmmu->cmdqv_dev, res);
+	if (IS_ERR(nsmmu->cmdqv_base))
+		return PTR_ERR(nsmmu->cmdqv_base);
+
+	/* Interrupt */
+	nsmmu->cmdqv_irq = platform_get_irq(cmdqv_pdev, 0);
+	if (nsmmu->cmdqv_irq < 0) {
+		dev_warn(nsmmu->cmdqv_dev, "no cmdqv interrupt - errors will not be reported\n");
+		nsmmu->cmdqv_irq = 0;
+	}
+
+	/* Probe the h/w */
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_CONFIG);
+	if (!FIELD_GET(CMDQV_EN, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w is disabled: CMDQV_CONFIG=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_STATUS);
+	if (!FIELD_GET(CMDQV_ENABLED, regval) || FIELD_GET(CMDQV_STATUS, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w not ready: CMDQV_STATUS=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_PARAM);
+	nsmmu->num_total_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+	nsmmu->num_total_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+	nsmmu->num_vcmdqs_per_vintf = nsmmu->num_total_vcmdqs / nsmmu->num_total_vintfs;
+
+	return 0;
+}
+
+static int nvidia_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	u16 idx;
+
+	/* Make sure vintf0 is enabled and healthy */
+	if (vintf0->status != VINTF_ENABLED)
+		goto issue_cmdlist;
+
+	/* Check for illegal CMDs */
+	if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
+		u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : CMDQ_OP_CMD_SYNC;
+
+		switch (opcode) {
+		case CMDQ_OP_TLBI_NH_ASID:
+		case CMDQ_OP_TLBI_NH_VA:
+		case CMDQ_OP_TLBI_S12_VMALL:
+		case CMDQ_OP_TLBI_S2_IPA:
+		case CMDQ_OP_ATC_INV:
+			break;
+		default:
+			goto issue_cmdlist;
+		}
+	}
+
+	/*
+	 * Select a vcmdq to use. Here we use a temporal solution to
+	 * balance out traffic on cmdq issuing: each cmdq has its own
+	 * lock, if all cpus issue cmdlist using the same cmdq, only
+	 * one CPU at a time can enter the process, while the others
+	 * will be spinning at the same lock.
+	 */
+	idx = smp_processor_id() % nsmmu->num_vcmdqs_per_vintf;
+	cmdq = &vintf0->vcmdqs[idx];
+
+issue_cmdlist:
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, cmdq);
+}
+
+static int nvidia_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	int ret;
+
+	ret = nvidia_smmu_cmdqv_init(nsmmu);
+	if (ret)
+		return ret;
+
+	if (nsmmu->cmdqv_irq) {
+		ret = devm_request_irq(nsmmu->cmdqv_dev, nsmmu->cmdqv_irq, nvidia_smmu_cmdqv_isr,
+				       IRQF_SHARED, "nvidia-smmu-cmdqv", nsmmu);
+		if (ret) {
+			dev_err(nsmmu->cmdqv_dev, "failed to claim irq (%d): %d\n",
+				nsmmu->cmdqv_irq, ret);
+			return ret;
+		}
+	}
+
+	/* Disable FEAT_MSI and OPT_MSIPOLL since VCMDQs only support CMD_SYNC w/CS_NONE */
+	smmu->features &= ~ARM_SMMU_FEAT_MSI;
+	smmu->options &= ~ARM_SMMU_OPT_MSIPOLL;
+
+	return 0;
+}
+
+const struct arm_smmu_impl nvidia_smmu_impl = {
+	.device_reset = nvidia_smmu_device_reset,
+	.issue_cmdlist = nvidia_smmu_issue_cmdlist,
+};
+
+#ifdef CONFIG_ACPI
+struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = NULL;
+	struct acpi_iort_node *node;
+	struct acpi_device *adev;
+	struct device *cmdqv_dev;
+	const char *match_uid;
+
+	if (acpi_disabled)
+		return NULL;
+
+	/* Look for a device in the DSDT whose _UID matches the SMMU's iort_node identifier */
+	node = *(struct acpi_iort_node **)dev_get_platdata(smmu->dev);
+	match_uid = kasprintf(GFP_KERNEL, "%u", node->identifier);
+	adev = acpi_dev_get_first_match_dev(NVIDIA_SMMU_CMDQV_HID, match_uid, -1);
+	kfree(match_uid);
+
+	if (!adev)
+		return NULL;
+
+	cmdqv_dev = bus_find_device_by_acpi_dev(&platform_bus_type, adev);
+	if (!cmdqv_dev)
+		return NULL;
+
+	dev_info(smmu->dev, "found companion CMDQV device, %s", dev_name(cmdqv_dev));
+
+	nsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*nsmmu), GFP_KERNEL);
+	if (!nsmmu)
+		return ERR_PTR(-ENOMEM);
+
+	nsmmu->cmdqv_dev = cmdqv_dev;
+
+	return nsmmu;
+}
+#else
+struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	return NULL;
+}
+#endif
+
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu;
+	int ret;
+
+	nsmmu = nvidia_smmu_create(smmu);
+	if (!nsmmu)
+		return smmu;
+
+	ret = nvidia_smmu_probe(nsmmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	nsmmu->smmu.impl = &nvidia_smmu_impl;
+
+	return &nsmmu->smmu;
+}
-- 
2.17.1

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

WARNING: multiple messages have this Message-ID (diff)
From: Nicolin Chen <nicolinc@nvidia.com>
To: <will@kernel.org>, <robin.murphy@arm.com>, <joro@8bytes.org>
Cc: song.bao.hua@hisilicon.com, jean-philippe@linaro.org,
	nwatterson@nvidia.com, thunder.leizhen@huawei.com,
	linux-kernel@vger.kernel.org, iommu@lists.linux-foundation.org,
	yuzenghui@huawei.com, nicoleotsuka@gmail.com,
	eric.auger@redhat.com, thierry.reding@gmail.com,
	Jonathan.Cameron@huawei.com, linux-tegra@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org
Subject: [RFC][Patch v1 2/2] iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw
Date: Fri, 23 Jul 2021 12:31:40 -0700	[thread overview]
Message-ID: <20210723193140.9690-3-nicolinc@nvidia.com> (raw)
In-Reply-To: <20210723193140.9690-1-nicolinc@nvidia.com>

From: Nate Watterson <nwatterson@nvidia.com>

NVIDIA's Grace SoC includes custom CMDQ-Virtualization (CMDQV)
hardware, which adds multiple VCMDQ interfaces to supplement
the architected SMMU_CMDQ in an effort to reduce contention.

To make use of these supplemental CMDQs in arm-smmu-v3 driver,
we borrow the "implementation infrastructure" design from the
arm-smmu driver, and add support for implementation defined
issue_cmdlist methods.

Signed-off-by: Nate Watterson <nwatterson@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 MAINTAINERS                                   |   2 +
 drivers/iommu/arm/arm-smmu-v3/Makefile        |   2 +-
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c  |   7 +
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  67 +--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   |  11 +
 .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c    | 425 ++++++++++++++++++
 6 files changed, 487 insertions(+), 27 deletions(-)
 create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index d69b2d4646be..e72e3459c9be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18240,8 +18240,10 @@ F:	drivers/i2c/busses/i2c-tegra.c
 TEGRA IOMMU DRIVERS
 M:	Thierry Reding <thierry.reding@gmail.com>
 R:	Krishna Reddy <vdumpa@nvidia.com>
+R:	Nicolin Chen <nicoleotsuka@gmail.com>
 L:	linux-tegra@vger.kernel.org
 S:	Supported
+F:	drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
 F:	drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
 F:	drivers/iommu/tegra*
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile
index 1f5838d3351b..0aa84c0a50ea 100644
--- a/drivers/iommu/arm/arm-smmu-v3/Makefile
+++ b/drivers/iommu/arm/arm-smmu-v3/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o
-arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o
+arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o nvidia-smmu-v3.o
 arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o
 arm_smmu_v3-objs := $(arm_smmu_v3-objs-y)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
index 6947d28067a8..37d062e40eb5 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c
@@ -4,5 +4,12 @@
 
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu)
 {
+	/*
+	 * Nvidia implementation supports ACPI only, so calling its init()
+	 * unconditionally to walk through ACPI tables to probe the device.
+	 * It will keep the smmu pointer intact, if it fails.
+	 */
+	smmu = nvidia_smmu_v3_impl_init(smmu);
+
 	return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index b2d23de2b207..439809e1acd4 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -336,9 +336,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 }
 
 static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu,
-					 u32 prod)
+					 u32 prod, struct arm_smmu_cmdq *cmdq)
 {
-	struct arm_smmu_queue *q = &smmu->cmdq.q;
+	struct arm_smmu_queue *q = &cmdq->q;
 	struct arm_smmu_cmdq_ent ent = {
 		.opcode = CMDQ_OP_CMD_SYNC,
 	};
@@ -575,11 +575,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
 
 /* Wait for the command queue to become non-full */
 static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
-					     struct arm_smmu_ll_queue *llq)
+					     struct arm_smmu_ll_queue *llq,
+					     struct arm_smmu_cmdq *cmdq)
 {
 	unsigned long flags;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	int ret = 0;
 
 	/*
@@ -595,7 +595,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
 
 	queue_poll_init(smmu, &qp);
 	do {
-		llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+		llq->val = READ_ONCE(cmdq->q.llq.val);
 		if (!queue_full(llq))
 			break;
 
@@ -610,11 +610,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
-					  struct arm_smmu_ll_queue *llq)
+					  struct arm_smmu_ll_queue *llq,
+					  struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
 
 	queue_poll_init(smmu, &qp);
@@ -634,15 +634,15 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
  * Must be called with the cmdq lock held in some capacity.
  */
 static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
-					       struct arm_smmu_ll_queue *llq)
+					       struct arm_smmu_ll_queue *llq,
+					       struct arm_smmu_cmdq *cmdq)
 {
 	struct arm_smmu_queue_poll qp;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	u32 prod = llq->prod;
 	int ret = 0;
 
 	queue_poll_init(smmu, &qp);
-	llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
+	llq->val = READ_ONCE(cmdq->q.llq.val);
 	do {
 		if (queue_consumed(llq, prod))
 			break;
@@ -684,12 +684,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
 }
 
 static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
-					 struct arm_smmu_ll_queue *llq)
+					 struct arm_smmu_ll_queue *llq,
+					 struct arm_smmu_cmdq *cmdq)
 {
 	if (smmu->options & ARM_SMMU_OPT_MSIPOLL)
-		return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
+		return __arm_smmu_cmdq_poll_until_msi(smmu, llq, cmdq);
 
-	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
+	return __arm_smmu_cmdq_poll_until_consumed(smmu, llq, cmdq);
 }
 
 static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
@@ -709,6 +710,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
 	}
 }
 
+static int arm_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
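+	/* Let the implementation override submission; otherwise use the architected SMMU_CMDQ */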
+	if (smmu->impl && smmu->impl->issue_cmdlist)
+		return smmu->impl->issue_cmdlist(smmu, cmds, n, sync);
+
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
+}
+
 /*
  * This is the actual insertion function, and provides the following
  * ordering guarantees to callers:
@@ -725,14 +734,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  *   insert their own list of commands then all of the commands from one
  *   CPU will appear before any of the commands from the other CPU.
  */
-static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	struct arm_smmu_ll_queue llq = {
 		.max_n_shift = cmdq->q.llq.max_n_shift,
 	}, head = llq;
@@ -746,7 +754,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 
 		while (!queue_has_space(&llq, n + sync)) {
 			local_irq_restore(flags);
-			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
+			if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq, cmdq))
 				dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 			local_irq_save(flags);
 		}
@@ -772,7 +780,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
 	if (sync) {
 		prod = queue_inc_prod_n(&llq, n);
-		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
+		arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod, cmdq);
 		queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
 
 		/*
@@ -822,7 +830,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
 	/* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
 	if (sync) {
 		llq.prod = queue_inc_prod_n(&llq, n);
-		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
+		ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq, cmdq);
 		if (ret) {
 			dev_err_ratelimited(smmu->dev,
 					    "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n",
@@ -856,12 +864,12 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
+	return arm_smmu_issue_cmdlist(smmu, cmd, 1, false);
 }
 
 static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
+	return arm_smmu_issue_cmdlist(smmu, NULL, 0, true);
 }
 
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
@@ -869,7 +877,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_ent *cmd)
 {
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
 		cmds->num = 0;
 	}
 	arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
@@ -879,7 +887,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
 }
 
 static int arm_smmu_page_response(struct device *dev,
@@ -2899,10 +2907,9 @@ static void arm_smmu_cmdq_free_bitmap(void *data)
 	bitmap_free(bitmap);
 }
 
-static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
+static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq)
 {
 	int ret = 0;
-	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
 	unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
 	atomic_long_t *bitmap;
 
@@ -2932,7 +2939,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 	if (ret)
 		return ret;
 
-	ret = arm_smmu_cmdq_init(smmu);
+	ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq);
 	if (ret)
 		return ret;
 
@@ -3416,6 +3423,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass)
 		return ret;
 	}
 
+	if (smmu->impl && smmu->impl->device_reset) {
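+		/* e.g. the NVIDIA CMDQV back-end enables its VINTF/VCMDQs here */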
+		ret = smmu->impl->device_reset(smmu);
+		if (ret) {
+			dev_err(smmu->dev, "failed at implementation-specific device_reset\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 4c60ba14221b..baec2d3a46f9 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -647,6 +647,8 @@ struct arm_smmu_device {
 #define ARM_SMMU_OPT_MSIPOLL		(1 << 2)
 	u32				options;
 
+	const struct arm_smmu_impl	*impl;
+
 	struct arm_smmu_cmdq		cmdq;
 	struct arm_smmu_evtq		evtq;
 	struct arm_smmu_priq		priq;
@@ -807,7 +809,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle)
 static inline void arm_smmu_sva_notifier_synchronize(void) {}
 #endif /* CONFIG_ARM_SMMU_V3_SVA */
 
+int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync,
+				struct arm_smmu_cmdq *cmdq);
+
 /* Implementation details */
+struct arm_smmu_impl {
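+	/* Called at the end of arm_smmu_device_reset() for implementation-specific setup */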
+	int (*device_reset)(struct arm_smmu_device *smmu);
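+	/* Overrides command submission, e.g. to steer commands onto a supplemental queue */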
+	int (*issue_cmdlist)(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync);
+};
+
 struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu);
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu);
 
 #endif /* _ARM_SMMU_V3_H */
diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
new file mode 100644
index 000000000000..ceec2a24057f
--- /dev/null
+++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define dev_fmt(fmt) "nvidia_smmu_cmdqv: " fmt
+
+#include <linux/acpi.h>
+#include <linux/dma-mapping.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+
+#include <acpi/acpixf.h>
+
+#include "arm-smmu-v3.h"
+
+#define NVIDIA_SMMU_CMDQV_HID		"NVDA0600"
+
+/* CMDQV global config regs */
+#define NVIDIA_CMDQV_CONFIG		0x0000
+#define  CMDQV_EN			BIT(0)
+
+#define NVIDIA_CMDQV_PARAM		0x0004
+#define  CMDQV_NUM_VINTF_LOG2		GENMASK(11, 8)
+#define  CMDQV_NUM_VCMDQ_LOG2		GENMASK(7, 4)
+
+#define NVIDIA_CMDQV_STATUS		0x0008
+#define  CMDQV_STATUS			GENMASK(2, 1)
+#define  CMDQV_ENABLED			BIT(0)
+
+#define NVIDIA_CMDQV_VINTF_ERR_MAP	0x000C
+#define NVIDIA_CMDQV_VINTF_INT_MASK	0x0014
+#define NVIDIA_CMDQV_VCMDQ_ERR_MAP	0x001C
+
+#define NVIDIA_CMDQV_CMDQ_ALLOC(q)	(0x0200 + 0x4*(q))
+#define  CMDQV_CMDQ_ALLOC_VINTF		GENMASK(20, 15)
+#define  CMDQV_CMDQ_ALLOC_LVCMDQ	GENMASK(7, 1)
+#define  CMDQV_CMDQ_ALLOCATED		BIT(0)
+
+/* VINTF config regs */
+#define NVIDIA_CMDQV_VINTF(v)		(0x1000 + 0x100*(v))
+
+#define NVIDIA_VINTF_CONFIG		0x0000
+#define  VINTF_HYP_OWN			BIT(17)
+#define  VINTF_VMID			GENMASK(16, 1)
+#define  VINTF_EN			BIT(0)
+
+#define NVIDIA_VINTF_STATUS		0x0004
+#define  VINTF_STATUS			GENMASK(3, 1)
+#define  VINTF_ENABLED			BIT(0)
+
+/* VCMDQ config regs */
+#define NVIDIA_CMDQV_VCMDQ(q)		(0x10000 + 0x80*(q))
+
+#define NVIDIA_VCMDQ_CONS		0x00000
+#define  VCMDQ_CONS_ERR			GENMASK(30, 24)
+
+#define NVIDIA_VCMDQ_PROD		0x00004
+
+#define NVIDIA_VCMDQ_CONFIG		0x00008
+#define  VCMDQ_EN			BIT(0)
+
+#define NVIDIA_VCMDQ_STATUS		0x0000C
+#define  VCMDQ_ENABLED			BIT(0)
+
+#define NVIDIA_VCMDQ_GERROR		0x00010
+#define NVIDIA_VCMDQ_GERRORN		0x00014
+
+#define NVIDIA_VCMDQ_BASE		0x10000
+#define  VCMDQ_ADDR			GENMASK(63, 5)
+#define  VCMDQ_LOG2SIZE			GENMASK(4, 0)
+
+struct nvidia_smmu_vintf {
+	u16			idx;
+	u32			cfg;
+	u32			status;
+
+	void __iomem		*base;
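+	/* One arm_smmu_cmdq per VCMDQ allocated to this interface */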
+	struct arm_smmu_cmdq	*vcmdqs;
+};
+
+struct nvidia_smmu {
+	struct arm_smmu_device	smmu;
+
+	struct device		*cmdqv_dev;
+	void __iomem		*cmdqv_base;
+	int			cmdqv_irq;
+
+	/* CMDQV Hardware Params */
+	u16			num_total_vintfs;
+	u16			num_total_vcmdqs;
+	u16			num_vcmdqs_per_vintf;
+
+	/* CMDQV_VINTF(0) reserved for host kernel use */
+	struct nvidia_smmu_vintf vintf0;
+};
+
+static irqreturn_t nvidia_smmu_cmdqv_isr(int irq, void *devid)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)devid;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 vintf_err_map[2];
+	u32 vcmdq_err_map[4];
+
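+	/* Read the global error maps to find which VINTF(s) and VCMDQ(s) reported an error */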
+	vintf_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP);
+	vintf_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP + 0x4);
+
+	vcmdq_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP);
+	vcmdq_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x4);
+	vcmdq_err_map[2] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x8);
+	vcmdq_err_map[3] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0xC);
+
+	dev_warn(nsmmu->cmdqv_dev,
+		 "Unexpected cmdqv error reported: vintf_map %08X %08X, vcmdq_map %08X %08X %08X %08X\n",
+		 vintf_err_map[0], vintf_err_map[1], vcmdq_err_map[0], vcmdq_err_map[1],
+		 vcmdq_err_map[2], vcmdq_err_map[3]);
+
+	/* If the error was reported by vintf0, avoid using any of its VCMDQs */
+	if (vintf_err_map[vintf0->idx / 32] & (1 << (vintf0->idx % 32))) {
+		vintf0->status = readl_relaxed(vintf0->base + NVIDIA_VINTF_STATUS);
+
+		dev_warn(nsmmu->cmdqv_dev, "error (0x%lX) reported by host vintf0 - disabling its vcmdqs\n",
+			 FIELD_GET(VINTF_STATUS, vintf0->status));
+	} else if (vintf_err_map[0] || vintf_err_map[1]) {
+		dev_err(nsmmu->cmdqv_dev, "cmdqv error interrupt triggered by unassigned vintf!\n");
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* Adapt struct arm_smmu_cmdq init sequences from arm-smmu-v3.c for VCMDQs */
+static int nvidia_smmu_init_one_arm_smmu_cmdq(struct nvidia_smmu *nsmmu,
+					      struct arm_smmu_cmdq *cmdq,
+					      void __iomem *vcmdq_base,
+					      u16 idx)
+{
+	struct arm_smmu_queue *q = &cmdq->q;
+	size_t qsz;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_device_hw_probe() */
+	q->llq.max_n_shift = ilog2(SZ_64K >> CMDQ_ENT_SZ_SHIFT);
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_init_one_queue() */
+	qsz = (1 << q->llq.max_n_shift) << CMDQ_ENT_SZ_SHIFT;
+	q->base = dmam_alloc_coherent(nsmmu->cmdqv_dev, qsz, &q->base_dma, GFP_KERNEL);
+	if (!q->base) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate 0x%zX bytes for VCMDQ%u\n",
+			qsz, idx);
+		return -ENOMEM;
+	}
+	dev_dbg(nsmmu->cmdqv_dev, "allocated %u entries for VCMDQ%u @ 0x%llX [%pad] ++ %zX\n",
+		1 << q->llq.max_n_shift, idx, (u64)q->base, &q->base_dma, qsz);
+
+	q->prod_reg = vcmdq_base + NVIDIA_VCMDQ_PROD;
+	q->cons_reg = vcmdq_base + NVIDIA_VCMDQ_CONS;
+	q->ent_dwords = CMDQ_ENT_DWORDS;
+
+	q->q_base  = q->base_dma & VCMDQ_ADDR;
+	q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
+
+	q->llq.prod = q->llq.cons = 0;
+
+	/* struct arm_smmu_cmdq config normally done in arm_smmu_cmdq_init() */
+	atomic_set(&cmdq->owner_prod, 0);
+	atomic_set(&cmdq->lock, 0);
+
+	cmdq->valid_map = (atomic_long_t *)bitmap_zalloc(1 << q->llq.max_n_shift, GFP_KERNEL);
+	if (!cmdq->valid_map) {
+		dev_err(nsmmu->cmdqv_dev, "failed to allocate valid_map for VCMDQ%u\n", idx);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_cmdqv_init(struct nvidia_smmu *nsmmu)
+{
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	u32 regval;
+	u16 idx;
+	int ret;
+
+	/* Setup vintf0 for host kernel */
+	vintf0->idx = 0;
+	vintf0->base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF(0);
+
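+	/*
+	 * Claim hypervisor ownership of vintf0 when more than one VINTF is
+	 * implemented, then set the enable bit with a second write.
+	 */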
+	regval = FIELD_PREP(VINTF_HYP_OWN, nsmmu->num_total_vintfs > 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	regval |= FIELD_PREP(VINTF_EN, 1);
+	writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG);
+
+	vintf0->cfg = regval;
+
+	ret = readl_relaxed_poll_timeout(vintf0->base + NVIDIA_VINTF_STATUS,
+					 regval, regval == VINTF_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+	vintf0->status = regval;
+	if (ret) {
+		dev_err(nsmmu->cmdqv_dev, "failed to enable VINTF[%u]: STATUS = 0x%08X\n",
+			vintf0->idx, regval);
+		return ret;
+	}
+
+	/* Allocate vcmdqs to vintf0 */
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		regval  = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, vintf0->idx);
+		regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, idx);
+		regval |= CMDQV_CMDQ_ALLOCATED;
+		writel_relaxed(regval, nsmmu->cmdqv_base + NVIDIA_CMDQV_CMDQ_ALLOC(idx));
+	}
+
+	/* Build an arm_smmu_cmdq for each vcmdq allocated to vintf0 */
+	vintf0->vcmdqs = devm_kcalloc(nsmmu->cmdqv_dev, nsmmu->num_vcmdqs_per_vintf,
+				      sizeof(*vintf0->vcmdqs), GFP_KERNEL);
+	if (!vintf0->vcmdqs)
+		return -ENOMEM;
+
+	for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) {
+		void __iomem *vcmdq_base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ(idx);
+		struct arm_smmu_cmdq *cmdq = &vintf0->vcmdqs[idx];
+
+		/* Setup struct arm_smmu_cmdq data members */
+		ret = nvidia_smmu_init_one_arm_smmu_cmdq(nsmmu, cmdq, vcmdq_base, idx);
+		if (ret)
+			return ret;
+
+		/* Configure and enable the vcmdq */
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_PROD);
+		writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+		writeq_relaxed(cmdq->q.q_base, vcmdq_base + NVIDIA_VCMDQ_BASE);
+
+		writel_relaxed(VCMDQ_EN, vcmdq_base + NVIDIA_VCMDQ_CONFIG);
+		ret = readl_poll_timeout(vcmdq_base + NVIDIA_VCMDQ_STATUS,
+					 regval, regval == VCMDQ_ENABLED,
+					 1, ARM_SMMU_POLL_TIMEOUT_US);
+		if (ret) {
+			u32 gerror = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERROR);
+			u32 gerrorn = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERRORN);
+			u32 cons = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_CONS);
+
+			dev_err(nsmmu->cmdqv_dev,
+				"failed to enable VCMDQ[%u]: GERROR=0x%X, GERRORN=0x%X, CONS=0x%X\n",
+				idx, gerror, gerrorn, cons);
+			return ret;
+		}
+
+		dev_info(nsmmu->cmdqv_dev, "VCMDQ%u allocated to VINTF%u as CMDQ%u\n",
+			 idx, vintf0->idx, idx);
+	}
+
+	return 0;
+}
+
+static int nvidia_smmu_probe(struct nvidia_smmu *nsmmu)
+{
+	struct platform_device *cmdqv_pdev = to_platform_device(nsmmu->cmdqv_dev);
+	struct resource *res;
+	u32 regval;
+
+	/* Base address */
+	res = platform_get_resource(cmdqv_pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	nsmmu->cmdqv_base = devm_ioremap_resource(nsmmu->cmdqv_dev, res);
+	if (IS_ERR(nsmmu->cmdqv_base))
+		return PTR_ERR(nsmmu->cmdqv_base);
+
+	/* Interrupt */
+	nsmmu->cmdqv_irq = platform_get_irq(cmdqv_pdev, 0);
+	if (nsmmu->cmdqv_irq < 0) {
+		dev_warn(nsmmu->cmdqv_dev, "no cmdqv interrupt - errors will not be reported\n");
+		nsmmu->cmdqv_irq = 0;
+	}
+
+	/* Probe the h/w */
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_CONFIG);
+	if (!FIELD_GET(CMDQV_EN, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w is disabled: CMDQV_CONFIG=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_STATUS);
+	if (!FIELD_GET(CMDQV_ENABLED, regval) || FIELD_GET(CMDQV_STATUS, regval)) {
+		dev_err(nsmmu->cmdqv_dev, "CMDQV h/w not ready: CMDQV_STATUS=0x%08X\n", regval);
+		return -ENODEV;
+	}
+
+	regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_PARAM);
+	nsmmu->num_total_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
+	nsmmu->num_total_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
+	nsmmu->num_vcmdqs_per_vintf = nsmmu->num_total_vcmdqs / nsmmu->num_total_vintfs;
+
+	return 0;
+}
+
+static int nvidia_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0;
+	struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
+	u16 idx;
+
+	/* Make sure vintf0 is enabled and healthy */
+	if (vintf0->status != VINTF_ENABLED)
+		goto issue_cmdlist;
+
+	/* Check for illegal CMDs */
+	if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) {
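+		/*
+		 * Without hypervisor ownership, only invalidation commands may
+		 * be issued through a VCMDQ; anything else falls back to the
+		 * architected SMMU_CMDQ.
+		 */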
+		u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : CMDQ_OP_CMD_SYNC;
+
+		switch (opcode) {
+		case CMDQ_OP_TLBI_NH_ASID:
+		case CMDQ_OP_TLBI_NH_VA:
+		case CMDQ_OP_TLBI_S12_VMALL:
+		case CMDQ_OP_TLBI_S2_IPA:
+		case CMDQ_OP_ATC_INV:
+			break;
+		default:
+			goto issue_cmdlist;
+		}
+	}
+
+	/*
+	 * Select a vcmdq to use. This is a temporary solution to spread
+	 * out cmdq-issuing traffic: each cmdq has its own lock, so if
+	 * all CPUs issued cmdlists through the same cmdq, only one CPU
+	 * at a time could enter the critical section while the others
+	 * spin on the same lock.
+	 */
+	idx = smp_processor_id() % nsmmu->num_vcmdqs_per_vintf;
+	cmdq = &vintf0->vcmdqs[idx];
+
+issue_cmdlist:
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, cmdq);
+}
+
+static int nvidia_smmu_device_reset(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu;
+	int ret;
+
+	ret = nvidia_smmu_cmdqv_init(nsmmu);
+	if (ret)
+		return ret;
+
+	if (nsmmu->cmdqv_irq) {
+		ret = devm_request_irq(nsmmu->cmdqv_dev, nsmmu->cmdqv_irq, nvidia_smmu_cmdqv_isr,
+				       IRQF_SHARED, "nvidia-smmu-cmdqv", nsmmu);
+		if (ret) {
+			dev_err(nsmmu->cmdqv_dev, "failed to claim irq (%d): %d\n",
+				nsmmu->cmdqv_irq, ret);
+			return ret;
+		}
+	}
+
+	/* Disable FEAT_MSI and OPT_MSIPOLL since VCMDQs only support CMD_SYNC w/CS_NONE */
+	smmu->features &= ~ARM_SMMU_FEAT_MSI;
+	smmu->options &= ~ARM_SMMU_OPT_MSIPOLL;
+
+	return 0;
+}
+
+static const struct arm_smmu_impl nvidia_smmu_impl = {
+	.device_reset = nvidia_smmu_device_reset,
+	.issue_cmdlist = nvidia_smmu_issue_cmdlist,
+};
+
+#ifdef CONFIG_ACPI
+static struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu = NULL;
+	struct acpi_iort_node *node;
+	struct acpi_device *adev;
+	struct device *cmdqv_dev;
+	const char *match_uid;
+
+	if (acpi_disabled)
+		return NULL;
+
+	/* Look for a device in the DSDT whose _UID matches the SMMU's iort_node identifier */
+	node = *(struct acpi_iort_node **)dev_get_platdata(smmu->dev);
+	match_uid = kasprintf(GFP_KERNEL, "%u", node->identifier);
+	adev = acpi_dev_get_first_match_dev(NVIDIA_SMMU_CMDQV_HID, match_uid, -1);
+	kfree(match_uid);
+
+	if (!adev)
+		return NULL;
+
+	cmdqv_dev = bus_find_device_by_acpi_dev(&platform_bus_type, adev);
+	if (!cmdqv_dev)
+		return NULL;
+
+	dev_info(smmu->dev, "found companion CMDQV device %s\n", dev_name(cmdqv_dev));
+
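+	/* Grow the arm_smmu_device allocation in place to carry the NVIDIA-specific state */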
+	nsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*nsmmu), GFP_KERNEL);
+	if (!nsmmu)
+		return ERR_PTR(-ENOMEM);
+
+	nsmmu->cmdqv_dev = cmdqv_dev;
+
+	return nsmmu;
+}
+#else
+static struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu)
+{
+	return NULL;
+}
+#endif
+
+struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu)
+{
+	struct nvidia_smmu *nsmmu;
+	int ret;
+
+	nsmmu = nvidia_smmu_create(smmu);
+	if (!nsmmu)
+		return smmu;
+
+	ret = nvidia_smmu_probe(nsmmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	nsmmu->smmu.impl = &nvidia_smmu_impl;
+
+	return &nsmmu->smmu;
+}
-- 
2.17.1



Thread overview: 11+ messages
2021-07-23 19:31 [RFC][Patch v1 0/2] iommu/arm-smmu-v3: Add NVIDIA implementation Nicolin Chen
2021-07-23 19:31 ` Nicolin Chen
2021-07-23 19:31 ` Nicolin Chen via iommu
2021-07-23 19:31 ` [RFC][Patch v1 1/2] iommu/arm-smmu-v3: Add implementation infrastructure Nicolin Chen
2021-07-23 19:31   ` Nicolin Chen
2021-07-23 19:31   ` Nicolin Chen via iommu
2021-07-23 19:31 ` Nicolin Chen [this message]
2021-07-23 19:31   ` [RFC][Patch v1 2/2] iommu/arm-smmu-v3: Add support for NVIDIA CMDQ-Virtualization hw Nicolin Chen
2021-07-23 19:31   ` Nicolin Chen via iommu
2021-07-24  0:55   ` kernel test robot
2021-07-24  2:00   ` kernel test robot
