dmaengine Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
@ 2020-09-15 23:27 ` Dave Jiang
  2020-09-30 18:23   ` Thomas Gleixner
  2020-09-15 23:27 ` [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support Dave Jiang
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:27 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

From: Thomas Gleixner <tglx@linutronix.de>

Generic IMS irq chips and irq domain implementations for IMS based devices
which store the interrupt messages in an array in device memory.

Allocation and freeing of interrupts happens via the generic
msi_domain_alloc/free_irqs() interface. No special purpose IMS magic
required as long as the interrupt domain is stored in the underlying device
struct.

[Megha : Fixed compile time errors
	 Added necessary dependencies to IMS_MSI_ARRAY config
	 Fixed polarity of IMS_VECTOR_CTRL
	 Added missing read after write in write_msg
	 Tested the IMS infrastructure with the IDXD driver]

Reviewed-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/irqchip/Kconfig             |   14 +++
 drivers/irqchip/Makefile            |    1 
 drivers/irqchip/irq-ims-msi.c       |  181 +++++++++++++++++++++++++++++++++++
 include/linux/irqchip/irq-ims-msi.h |   45 +++++++++
 4 files changed, 241 insertions(+)
 create mode 100644 drivers/irqchip/irq-ims-msi.c
 create mode 100644 include/linux/irqchip/irq-ims-msi.h

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index bfc9719dbcdc..76479e9b1b4c 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -571,4 +571,18 @@ config LOONGSON_PCH_MSI
 	help
 	  Support for the Loongson PCH MSI Controller.
 
+config IMS_MSI
+	depends on PCI
+	select DEVICE_MSI
+	bool
+
+config IMS_MSI_ARRAY
+	bool "IMS Interrupt Message Store MSI controller for device memory storage arrays"
+	depends on PCI
+	select IMS_MSI
+	select GENERIC_MSI_IRQ_DOMAIN
+	help
+	  Support for IMS Interrupt Message Store MSI controller
+	  with IMS slot storage in a slot array in device memory
+
 endmenu
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 133f9c45744a..5ccd8336f8e3 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -111,3 +111,4 @@ obj-$(CONFIG_LOONGSON_HTPIC)		+= irq-loongson-htpic.o
 obj-$(CONFIG_LOONGSON_HTVEC)		+= irq-loongson-htvec.o
 obj-$(CONFIG_LOONGSON_PCH_PIC)		+= irq-loongson-pch-pic.o
 obj-$(CONFIG_LOONGSON_PCH_MSI)		+= irq-loongson-pch-msi.o
+obj-$(CONFIG_IMS_MSI)			+= irq-ims-msi.o
diff --git a/drivers/irqchip/irq-ims-msi.c b/drivers/irqchip/irq-ims-msi.c
new file mode 100644
index 000000000000..581febf2bca0
--- /dev/null
+++ b/drivers/irqchip/irq-ims-msi.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+// (C) Copyright 2020 Thomas Gleixner <tglx@linutronix.de>
+/*
+ * Shared interrupt chips and irq domains for IMS devices
+ */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/msi.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+
+#include <linux/irqchip/irq-ims-msi.h>
+
+#ifdef CONFIG_IMS_MSI_ARRAY
+
+struct ims_array_data {
+	struct ims_array_info	info;
+	unsigned long		map[0];
+};
+
+static void ims_array_mask_irq(struct irq_data *data)
+{
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
+	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
+	u32 __iomem *ctrl = &slot->ctrl;
+
+	iowrite32(ioread32(ctrl) | IMS_VECTOR_CTRL_MASK, ctrl);
+	ioread32(ctrl); /* Flush write to device */
+}
+
+static void ims_array_unmask_irq(struct irq_data *data)
+{
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
+	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
+	u32 __iomem *ctrl = &slot->ctrl;
+
+	iowrite32(ioread32(ctrl) & ~IMS_VECTOR_CTRL_MASK, ctrl);
+}
+
+static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
+	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
+
+	iowrite32(msg->address_lo, &slot->address_lo);
+	iowrite32(msg->address_hi, &slot->address_hi);
+	iowrite32(msg->data, &slot->data);
+	ioread32(slot);
+}
+
+static const struct irq_chip ims_array_msi_controller = {
+	.name			= "IMS",
+	.irq_mask		= ims_array_mask_irq,
+	.irq_unmask		= ims_array_unmask_irq,
+	.irq_write_msi_msg	= ims_array_write_msi_msg,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static void ims_array_reset_slot(struct ims_slot __iomem *slot)
+{
+	iowrite32(0, &slot->address_lo);
+	iowrite32(0, &slot->address_hi);
+	iowrite32(0, &slot->data);
+	iowrite32(0, &slot->ctrl);
+}
+
+static void ims_array_free_msi_store(struct irq_domain *domain,
+				     struct device *dev)
+{
+	struct msi_domain_info *info = domain->host_data;
+	struct ims_array_data *ims = info->data;
+	struct msi_desc *entry;
+
+	for_each_msi_entry(entry, dev) {
+		if (entry->device_msi.priv_iomem) {
+			clear_bit(entry->device_msi.hwirq, ims->map);
+			ims_array_reset_slot(entry->device_msi.priv_iomem);
+			entry->device_msi.priv_iomem = NULL;
+			entry->device_msi.hwirq = 0;
+		}
+	}
+}
+
+static int ims_array_alloc_msi_store(struct irq_domain *domain,
+				     struct device *dev, int nvec)
+{
+	struct msi_domain_info *info = domain->host_data;
+	struct ims_array_data *ims = info->data;
+	struct msi_desc *entry;
+
+	for_each_msi_entry(entry, dev) {
+		unsigned int idx;
+
+		idx = find_first_zero_bit(ims->map, ims->info.max_slots);
+		if (idx >= ims->info.max_slots)
+			goto fail;
+		set_bit(idx, ims->map);
+		entry->device_msi.priv_iomem = &ims->info.slots[idx];
+		entry->device_msi.hwirq = idx;
+	}
+	return 0;
+
+fail:
+	ims_array_free_msi_store(domain, dev);
+	return -ENOSPC;
+}
+
+struct ims_array_domain_template {
+	struct msi_domain_ops	ops;
+	struct msi_domain_info	info;
+};
+
+static const struct ims_array_domain_template ims_array_domain_template = {
+	.ops = {
+		.msi_alloc_store	= ims_array_alloc_msi_store,
+		.msi_free_store		= ims_array_free_msi_store,
+	},
+	.info = {
+		.flags		= MSI_FLAG_USE_DEF_DOM_OPS |
+				  MSI_FLAG_USE_DEF_CHIP_OPS,
+		.handler	= handle_edge_irq,
+		.handler_name	= "edge",
+	},
+};
+
+struct irq_domain *
+pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev,
+				    struct ims_array_info *ims_info)
+{
+	struct ims_array_domain_template *info;
+	struct ims_array_data *data;
+	struct irq_domain *domain;
+	struct irq_chip *chip;
+	unsigned int size;
+
+	/* Allocate new domain storage */
+	info = kmemdup(&ims_array_domain_template,
+		       sizeof(ims_array_domain_template), GFP_KERNEL);
+	if (!info)
+		return NULL;
+	/* Link the ops */
+	info->info.ops = &info->ops;
+
+	/* Allocate ims_info along with the bitmap */
+	size = sizeof(*data);
+	size += BITS_TO_LONGS(ims_info->max_slots) * sizeof(unsigned long);
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto err_info;
+
+	data->info = *ims_info;
+	info->info.data = data;
+
+	/*
+	 * Allocate an interrupt chip because the core needs to be able to
+	 * update it with default callbacks.
+	 */
+	chip = kmemdup(&ims_array_msi_controller,
+		       sizeof(ims_array_msi_controller), GFP_KERNEL);
+	if (!chip)
+		goto err_data;
+	info->info.chip = chip;
+
+	domain = pci_subdevice_msi_create_irq_domain(pdev, &info->info);
+	if (!domain)
+		goto err_chip;
+
+	return domain;
+
+err_chip:
+	kfree(chip);
+err_data:
+	kfree(data);
+err_info:
+	kfree(info);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(pci_ims_array_create_msi_irq_domain);
+
+#endif /* CONFIG_IMS_MSI_ARRAY */
diff --git a/include/linux/irqchip/irq-ims-msi.h b/include/linux/irqchip/irq-ims-msi.h
new file mode 100644
index 000000000000..2982ef3c0ed4
--- /dev/null
+++ b/include/linux/irqchip/irq-ims-msi.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* (C) Copyright 2020 Thomas Gleixner <tglx@linutronix.de> */
+
+#ifndef _LINUX_IRQCHIP_IRQ_IMS_MSI_H
+#define _LINUX_IRQCHIP_IRQ_IMS_MSI_H
+
+#include <linux/types.h>
+
+/**
+ * struct ims_slot - The hardware layout of an IMS based MSI message
+ * @address_lo:	Lower 32bit address
+ * @address_hi:	Upper 32bit address
+ * @data:	Message data
+ * @ctrl:	Control word
+ *
+ * This structure is used by both the device memory array and the queue
+ * memory variants of IMS.
+ */
+struct ims_slot {
+	u32	address_lo;
+	u32	address_hi;
+	u32	data;
+	u32	ctrl;
+} __packed;
+
+/* Bit to mask the interrupt in ims_slot::ctrl */
+#define IMS_VECTOR_CTRL_MASK	0x01
+
+/**
+ * struct ims_array_info - Information to create an IMS array domain
+ * @slots:	Pointer to the start of the array
+ * @max_slots:	Maximum number of slots in the array
+ */
+struct ims_array_info {
+	struct ims_slot		__iomem *slots;
+	unsigned int		max_slots;
+};
+
+struct pci_dev;
+struct irq_domain;
+
+struct irq_domain *pci_ims_array_create_msi_irq_domain(struct pci_dev *pdev,
+						       struct ims_array_info *ims_info);
+
+#endif


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
  2020-09-15 23:27 ` [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver Dave Jiang
@ 2020-09-15 23:27 ` Dave Jiang
  2020-09-30 18:32   ` Thomas Gleixner
  2020-09-15 23:28 ` [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver Dave Jiang
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:27 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

From: Megha Dey <megha.dey@intel.com>

Add required support in the interrupt remapping driver for devices
which generate dev-msi interrupts and use the intel remapping
domain as the parent domain.

Signed-off-by: Megha Dey <megha.dey@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
---
 drivers/iommu/intel/irq_remapping.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 0cfce1d3b7bb..75e388263b78 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1303,9 +1303,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
 	case X86_IRQ_ALLOC_TYPE_HPET:
 	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
 	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
+	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
 		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
 			set_hpet_sid(irte, info->devid);
-		else
+		else if (info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
 			set_msi_sid(irte, msi_desc_to_pci_dev(info->desc));
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
@@ -1358,7 +1359,8 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
 	if (!info || !iommu)
 		return -EINVAL;
 	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI &&
-	    info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX)
+	    info->type != X86_IRQ_ALLOC_TYPE_PCI_MSIX &&
+	    info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
 		return -EINVAL;
 
 	/*


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
  2020-09-15 23:27 ` [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver Dave Jiang
  2020-09-15 23:27 ` [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support Dave Jiang
@ 2020-09-15 23:28 ` Dave Jiang
  2020-09-30 18:47   ` Thomas Gleixner
  2020-09-15 23:28 ` [PATCH v3 07/18] dmaengine: idxd: add basic mdev registration and helper functions Dave Jiang
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:28 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

In preparation for support of VFIO mediated device for idxd driver, the
enabling for Interrupt Message Store (IMS) interrupts is added for the idxd
driver. With IMS support the idxd driver can dynamically allocate interrupts on a
per mdev basis based on how many IMS vectors that are mapped to the mdev
device. This commit only provides the support functions in the base driver
and not the VFIO mdev code utilization.

The commit has some portal related changes. A "portal" is a special
location within the MMIO BAR2 of the DSA device where descriptors are
submitted via the CPU command MOVDIR64B or ENQCMD(S). The offset for the
portal address determines whether the submitted descriptor is for MSI-X
or IMS notification.

See Intel SIOV spec for more details:
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/cdev.c   |    4 ++-
 drivers/dma/idxd/idxd.h   |   14 +++++++++---
 drivers/dma/idxd/init.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/idxd/submit.c |   11 ++++++++--
 drivers/dma/idxd/sysfs.c  |   11 ++++++++++
 5 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c3976156db2f..06fed39ff17f 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -157,8 +157,8 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
 		return rc;
 
 	vma->vm_flags |= VM_DONTCOPY;
-	pfn = (base + idxd_get_wq_portal_full_offset(wq->id,
-				IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT;
+	pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED,
+						     IDXD_IRQ_MSIX)) >> PAGE_SHIFT;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	vma->vm_private_data = ctx;
 
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 2cd541c5faab..acc444ad9db6 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -146,6 +146,7 @@ enum idxd_device_state {
 enum idxd_device_flag {
 	IDXD_FLAG_CONFIGURABLE = 0,
 	IDXD_FLAG_CMD_RUNNING,
+	IDXD_FLAG_SIOV_SUPPORTED,
 };
 
 struct idxd_device {
@@ -170,6 +171,7 @@ struct idxd_device {
 
 	int num_groups;
 
+	u32 ims_offset;
 	u32 msix_perm_offset;
 	u32 wqcfg_offset;
 	u32 grpcfg_offset;
@@ -177,6 +179,7 @@ struct idxd_device {
 
 	u64 max_xfer_bytes;
 	u32 max_batch_size;
+	int ims_size;
 	int max_groups;
 	int max_engines;
 	int max_tokens;
@@ -196,6 +199,7 @@ struct idxd_device {
 	struct work_struct work;
 
 	int *int_handles;
+	struct sbitmap ims_sbmap;
 };
 
 /* IDXD software descriptor */
@@ -233,15 +237,17 @@ enum idxd_interrupt_type {
 	IDXD_IRQ_IMS,
 };
 
-static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot)
+static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot,
+					    enum idxd_interrupt_type irq_type)
 {
-	return prot * 0x1000;
+	return prot * 0x1000 + irq_type * 0x2000;
 }
 
 static inline int idxd_get_wq_portal_full_offset(int wq_id,
-						 enum idxd_portal_prot prot)
+						 enum idxd_portal_prot prot,
+						 enum idxd_interrupt_type irq_type)
 {
-	return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot);
+	return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot, irq_type);
 }
 
 static inline void idxd_set_type(struct idxd_device *idxd)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index fb8b1001d35a..7d08f2b92de7 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -231,10 +231,51 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
 	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
 	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
 		idxd->msix_perm_offset);
+	idxd->ims_offset = offsets.ims * 0x100;
+	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
 	idxd->perfmon_offset = offsets.perfmon * 0x100;
 	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
 }
 
+#define PCI_DEVSEC_CAP		0x23
+#define SIOVDVSEC1(offset)	((offset) + 0x4)
+#define SIOVDVSEC2(offset)	((offset) + 0x8)
+#define DVSECID			0x5
+#define SIOVCAP(offset)		((offset) + 0x14)
+
+static void idxd_check_siov(struct idxd_device *idxd)
+{
+	struct pci_dev *pdev = idxd->pdev;
+	struct device *dev = &pdev->dev;
+	int dvsec;
+	u16 val16;
+	u32 val32;
+
+	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
+	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
+	if (val16 != PCI_VENDOR_ID_INTEL) {
+		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
+		return;
+	}
+
+	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
+	if (val16 != DVSECID) {
+		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
+		return;
+	}
+
+	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
+	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
+		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
+		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
+		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
+		dev_dbg(&pdev->dev, "IMS supported for device\n");
+		return;
+	}
+
+	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
+}
+
 static void idxd_read_caps(struct idxd_device *idxd)
 {
 	struct device *dev = &idxd->pdev->dev;
@@ -253,6 +294,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
 	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
 	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
 	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
+	idxd_check_siov(idxd);
 	if (idxd->hw.gen_cap.config_en)
 		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
 
@@ -347,9 +389,19 @@ static int idxd_probe(struct idxd_device *idxd)
 
 	idxd->major = idxd_cdev_get_major(idxd);
 
+	if (idxd->ims_size) {
+		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
+				       GFP_KERNEL, dev_to_node(dev));
+		if (rc < 0)
+			goto sbitmap_fail;
+	}
 	dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id);
 	return 0;
 
+ sbitmap_fail:
+	mutex_lock(&idxd_idr_lock);
+	idr_remove(&idxd_idrs[idxd->type], idxd->id);
+	mutex_unlock(&idxd_idr_lock);
  err_idr_fail:
 	idxd_mask_error_interrupts(idxd);
 	idxd_mask_msix_vectors(idxd);
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index 2b3c2f132af7..8dd72cf3c838 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -27,7 +27,13 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
 		desc->hw->int_handle = wq->vec_ptr;
 	} else {
 		desc->vec_ptr = wq->vec_ptr;
-		desc->hw->int_handle = idxd->int_handles[desc->vec_ptr];
+		/*
+		 * int_handles are only for descriptor completion. However for device
+		 * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even
+		 * though we are rotating through 1...N for descriptor interrupts, we
+		 * need to acquire the int_handles from 0..N-1.
+		 */
+		desc->hw->int_handle = idxd->int_handles[desc->vec_ptr - 1];
 	}
 
 	return desc;
@@ -87,7 +93,8 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
 	if (idxd->state != IDXD_DEV_ENABLED)
 		return -EIO;
 
-	portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_UNLIMITED);
+	portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_LIMITED, IDXD_IRQ_MSIX);
+
 	/*
 	 * The wmb() flushes writes to coherent DMA data before possibly
 	 * triggering a DMA read. The wmb() is necessary even on UP because
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 00d2347399bb..4914316ae493 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1237,6 +1237,16 @@ static ssize_t numa_node_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(numa_node);
 
+static ssize_t ims_size_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct idxd_device *idxd =
+		container_of(dev, struct idxd_device, conf_dev);
+
+	return sprintf(buf, "%u\n", idxd->ims_size);
+}
+static DEVICE_ATTR_RO(ims_size);
+
 static ssize_t max_batch_size_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -1422,6 +1432,7 @@ static struct attribute *idxd_device_attributes[] = {
 	&dev_attr_max_work_queues_size.attr,
 	&dev_attr_max_engines.attr,
 	&dev_attr_numa_node.attr,
+	&dev_attr_ims_size.attr,
 	&dev_attr_max_batch_size.attr,
 	&dev_attr_max_transfer_size.attr,
 	&dev_attr_op_cap.attr,


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 07/18] dmaengine: idxd: add basic mdev registration and helper functions
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (2 preceding siblings ...)
  2020-09-15 23:28 ` [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver Dave Jiang
@ 2020-09-15 23:28 ` Dave Jiang
  2020-09-15 23:28 ` [PATCH v3 10/18] dmaengine: idxd: virtual device commands emulation Dave Jiang
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:28 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Create a mediated device through the VFIO mediated device framework. The
mdev framework allows creation of a mediated device by the driver with a
portion of the device's resources. The driver will emulate the slow path
such as the PCI config space, MMIO bar, and the command registers. The
descriptor submission portal(s) will be mmaped to the guest in order to
submit descriptors directly by the guest kernel or apps. The mediated
device support code in the idxd will be referred to as the Virtual
Device Composition Module (vdcm). Add basic plumbing to fill out the
mdev_parent_ops struct that VFIO mdev requires to support a mediated
device.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/Kconfig       |    7 
 drivers/dma/idxd/Makefile |    2 
 drivers/dma/idxd/idxd.h   |   11 +
 drivers/dma/idxd/init.c   |   11 +
 drivers/dma/idxd/mdev.c   |  957 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/idxd/mdev.h   |  118 ++++++
 drivers/dma/idxd/vdev.c   |   87 ++++
 drivers/dma/idxd/vdev.h   |   21 +
 8 files changed, 1214 insertions(+)
 create mode 100644 drivers/dma/idxd/mdev.c
 create mode 100644 drivers/dma/idxd/mdev.h
 create mode 100644 drivers/dma/idxd/vdev.c
 create mode 100644 drivers/dma/idxd/vdev.h

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 518a1437862a..a39392157dc2 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -296,6 +296,13 @@ config INTEL_IDXD
 
 	  If unsure, say N.
 
+config INTEL_IDXD_MDEV
+	bool "IDXD VFIO Mediated Device Support"
+	depends on INTEL_IDXD
+	depends on VFIO_MDEV
+	depends on VFIO_MDEV_DEVICE
+	depends on IRQ_BYPASS_MANAGER
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86_64
diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile
index 8978b898d777..30cad704a95a 100644
--- a/drivers/dma/idxd/Makefile
+++ b/drivers/dma/idxd/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_INTEL_IDXD) += idxd.o
 idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o
+
+idxd-$(CONFIG_INTEL_IDXD_MDEV) += mdev.o vdev.o
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 22d00e980908..649b389f07e6 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -116,6 +116,7 @@ struct idxd_wq {
 	char name[WQ_NAME_SIZE + 1];
 	u64 max_xfer_bytes;
 	u32 max_batch_size;
+	struct list_head vdcm_list;
 };
 
 struct idxd_engine {
@@ -147,6 +148,7 @@ enum idxd_device_flag {
 	IDXD_FLAG_CONFIGURABLE = 0,
 	IDXD_FLAG_CMD_RUNNING,
 	IDXD_FLAG_SIOV_SUPPORTED,
+	IDXD_FLAG_MDEV_ENABLED,
 };
 
 struct idxd_device {
@@ -227,6 +229,11 @@ static inline bool wq_dedicated(struct idxd_wq *wq)
 	return test_bit(WQ_FLAG_DEDICATED, &wq->flags);
 }
 
+static inline bool device_mdev_enabled(struct idxd_device *idxd)
+{
+	return test_bit(IDXD_FLAG_MDEV_ENABLED, &idxd->flags);
+}
+
 enum idxd_portal_prot {
 	IDXD_PORTAL_UNLIMITED = 0,
 	IDXD_PORTAL_LIMITED,
@@ -345,4 +352,8 @@ int idxd_cdev_get_major(struct idxd_device *idxd);
 int idxd_wq_add_cdev(struct idxd_wq *wq);
 void idxd_wq_del_cdev(struct idxd_wq *wq);
 
+/* mdev */
+int idxd_mdev_host_init(struct idxd_device *idxd);
+void idxd_mdev_host_release(struct idxd_device *idxd);
+
 #endif
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 7d08f2b92de7..7c5bfa14c269 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -201,6 +201,7 @@ static int idxd_setup_internals(struct idxd_device *idxd)
 		wq->idxd_cdev.minor = -1;
 		wq->max_xfer_bytes = idxd->max_xfer_bytes;
 		wq->max_batch_size = idxd->max_batch_size;
+		INIT_LIST_HEAD(&wq->vdcm_list);
 	}
 
 	for (i = 0; i < idxd->max_engines; i++) {
@@ -462,6 +463,14 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return -ENODEV;
 	}
 
+	if (IS_ENABLED(CONFIG_INTEL_IDXD_MDEV)) {
+		rc = idxd_mdev_host_init(idxd);
+		if (rc < 0)
+			dev_warn(dev, "VFIO mdev not setup: %d\n", rc);
+		else
+			set_bit(IDXD_FLAG_MDEV_ENABLED, &idxd->flags);
+	}
+
 	rc = idxd_setup_sysfs(idxd);
 	if (rc) {
 		dev_err(dev, "IDXD sysfs setup failed\n");
@@ -536,6 +545,8 @@ static void idxd_remove(struct pci_dev *pdev)
 	dev_dbg(&pdev->dev, "%s called\n", __func__);
 	idxd_cleanup_sysfs(idxd);
 	idxd_shutdown(pdev);
+	if (IS_ENABLED(CONFIG_INTEL_IDXD_MDEV) && device_mdev_enabled(idxd))
+		idxd_mdev_host_release(idxd);
 	mutex_lock(&idxd_idr_lock);
 	idr_remove(&idxd_idrs[idxd->type], idxd->id);
 	mutex_unlock(&idxd_idr_lock);
diff --git a/drivers/dma/idxd/mdev.c b/drivers/dma/idxd/mdev.c
new file mode 100644
index 000000000000..8219e23b80b2
--- /dev/null
+++ b/drivers/dma/idxd/mdev.c
@@ -0,0 +1,957 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/sched/task.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+#include <linux/msi.h>
+#include <linux/intel-iommu.h>
+#include <linux/intel-svm.h>
+#include <linux/kvm_host.h>
+#include <linux/eventfd.h>
+#include <linux/circ_buf.h>
+#include <uapi/linux/idxd.h>
+#include "registers.h"
+#include "idxd.h"
+#include "../../vfio/pci/vfio_pci_private.h"
+#include "mdev.h"
+#include "vdev.h"
+
+static u64 idxd_pci_config[] = {
+	0x001000000b258086ULL,
+	0x0080000008800000ULL,
+	0x000000000000000cULL,
+	0x000000000000000cULL,
+	0x0000000000000000ULL,
+	0x2010808600000000ULL,
+	0x0000004000000000ULL,
+	0x000000ff00000000ULL,
+	0x0000060000015011ULL, /* MSI-X capability, hardcoded 2 entries, Encoded as N-1 */
+	0x0000070000000000ULL,
+	0x0000000000920010ULL, /* PCIe capability */
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+	0x0000000000000000ULL,
+};
+
+static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags, unsigned int index,
+			      unsigned int start, unsigned int count, void *data);
+
+static inline void reset_vconfig(struct vdcm_idxd *vidxd)
+{
+	memset(vidxd->cfg, 0, VIDXD_MAX_CFG_SPACE_SZ);
+	memcpy(vidxd->cfg, idxd_pci_config, sizeof(idxd_pci_config));
+}
+
+static inline void reset_vmmio(struct vdcm_idxd *vidxd)
+{
+	memset(&vidxd->bar0, 0, VIDXD_MAX_MMIO_SPACE_SZ);
+}
+
+static void idxd_vdcm_init(struct vdcm_idxd *vidxd)
+{
+	struct idxd_wq *wq = vidxd->wq;
+
+	reset_vconfig(vidxd);
+	reset_vmmio(vidxd);
+
+	vidxd->bar_size[0] = VIDXD_BAR0_SIZE;
+	vidxd->bar_size[1] = VIDXD_BAR2_SIZE;
+
+	vidxd_mmio_init(vidxd);
+
+	if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED)
+		idxd_wq_disable(wq);
+}
+
+static void idxd_vdcm_release(struct mdev_device *mdev)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	struct device *dev = mdev_dev(mdev);
+
+	dev_dbg(dev, "vdcm_idxd_release %d\n", vidxd->type->type);
+	mutex_lock(&vidxd->dev_lock);
+	if (!vidxd->refcount)
+		goto out;
+
+        idxd_vdcm_set_irqs(vidxd, VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+			   VFIO_PCI_MSIX_IRQ_INDEX, 0, 0, NULL);
+
+	vidxd_free_ims_entries(vidxd);
+
+	/* Re-initialize the VIDXD to a pristine state for re-use */
+	idxd_vdcm_init(vidxd);
+	vidxd->refcount--;
+
+ out:
+	mutex_unlock(&vidxd->dev_lock);
+}
+
+static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev_device *mdev,
+					   struct vdcm_idxd_type *type)
+{
+	struct vdcm_idxd *vidxd;
+	struct idxd_wq *wq = NULL;
+	int i;
+
+	/* PLACEHOLDER, wq matching comes later */
+
+	if (!wq)
+		return ERR_PTR(-ENODEV);
+
+	vidxd = kzalloc(sizeof(*vidxd), GFP_KERNEL);
+	if (!vidxd)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&vidxd->dev_lock);
+	vidxd->idxd = idxd;
+	vidxd->vdev.mdev = mdev;
+	vidxd->wq = wq;
+	mdev_set_drvdata(mdev, vidxd);
+	vidxd->type = type;
+	vidxd->num_wqs = VIDXD_MAX_WQS;
+
+	for (i = 0; i < VIDXD_MAX_MSIX_VECS - 1; i++)
+		vidxd->ims_index[i] = -1;
+
+	idxd_vdcm_init(vidxd);
+	mutex_lock(&wq->wq_lock);
+	idxd_wq_get(wq);
+	mutex_unlock(&wq->wq_lock);
+
+	return vidxd;
+}
+
+static struct vdcm_idxd_type idxd_mdev_types[IDXD_MDEV_TYPES];
+
+static struct vdcm_idxd_type *idxd_vdcm_find_vidxd_type(struct device *dev,
+							const char *name)
+{
+	int i;
+	char dev_name[IDXD_MDEV_NAME_LEN];
+
+	for (i = 0; i < IDXD_MDEV_TYPES; i++) {
+		snprintf(dev_name, IDXD_MDEV_NAME_LEN, "idxd-%s",
+			 idxd_mdev_types[i].name);
+
+		if (!strncmp(name, dev_name, IDXD_MDEV_NAME_LEN))
+			return &idxd_mdev_types[i];
+	}
+
+	return NULL;
+}
+
+static int idxd_vdcm_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+	struct vdcm_idxd *vidxd;
+	struct vdcm_idxd_type *type;
+	struct device *dev, *parent;
+	struct idxd_device *idxd;
+	struct idxd_wq *wq;
+
+	parent = mdev_parent_dev(mdev);
+	idxd = dev_get_drvdata(parent);
+	dev = mdev_dev(mdev);
+
+	mdev_set_iommu_device(dev, parent);
+	type = idxd_vdcm_find_vidxd_type(dev, kobject_name(kobj));
+	if (!type) {
+		dev_err(dev, "failed to find type %s to create\n",
+			kobject_name(kobj));
+		return -EINVAL;
+	}
+
+	vidxd = vdcm_vidxd_create(idxd, mdev, type);
+	if (IS_ERR(vidxd)) {
+		dev_err(dev, "failed to create vidxd: %ld\n", PTR_ERR(vidxd));
+		return PTR_ERR(vidxd);
+	}
+
+	wq = vidxd->wq;
+	mutex_lock(&wq->wq_lock);
+	list_add(&vidxd->list, &wq->vdcm_list);
+	mutex_unlock(&wq->wq_lock);
+	dev_dbg(dev, "mdev creation success: %s\n", dev_name(mdev_dev(mdev)));
+
+	return 0;
+}
+
+/*
+ * mdev core "remove" callback: unlink the vdev from its workqueue, drop
+ * the wq reference taken at creation time, and free the vdev.
+ */
+static int idxd_vdcm_remove(struct mdev_device *mdev)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	struct idxd_wq *wq = vidxd->wq;
+
+	dev_dbg(&vidxd->idxd->pdev->dev, "%s: removing for wq %d\n",
+		__func__, wq->id);
+
+	mutex_lock(&wq->wq_lock);
+	list_del(&vidxd->list);
+	idxd_wq_put(wq);
+	mutex_unlock(&wq->wq_lock);
+
+	kfree(vidxd);
+	return 0;
+}
+
+/*
+ * mdev core "open" callback: set up IMS vector backing on first open.
+ * Subsequent opens are no-ops and succeed.
+ *
+ * Fix: the original declared 'rc' uninitialized and jumped to the return
+ * path when vidxd->refcount was already set, returning garbage.
+ */
+static int idxd_vdcm_open(struct mdev_device *mdev)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	struct vdcm_idxd_type *type = vidxd->type;
+	struct device *dev = mdev_dev(mdev);
+	int rc = 0;
+
+	dev_dbg(dev, "%s: type: %d\n", __func__, type->type);
+
+	mutex_lock(&vidxd->dev_lock);
+	/* Already opened; nothing more to set up. */
+	if (vidxd->refcount)
+		goto out;
+
+	/* allocate and setup IMS entries */
+	rc = vidxd_setup_ims_entries(vidxd);
+	if (rc < 0)
+		goto out;
+
+	vidxd->refcount++;
+
+ out:
+	mutex_unlock(&vidxd->dev_lock);
+	return rc;
+}
+
+/*
+ * Dispatch one access to the proper emulation backend. The vfio-pci
+ * offset convention applies: the region index lives in the upper bits
+ * of *ppos and the offset within the region in the lower bits.
+ * Returns @count on success, negative errno on failure.
+ */
+static ssize_t idxd_vdcm_rw(struct mdev_device *mdev, char *buf, size_t count, loff_t *ppos,
+			    enum idxd_vdcm_rw mode)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	struct device *dev = mdev_dev(mdev);
+	int rc = -EINVAL;
+
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		dev_err(dev, "invalid index: %u\n", index);
+		return -EINVAL;
+	}
+
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		/* Emulated PCI config space */
+		if (mode == IDXD_VDCM_WRITE)
+			rc = vidxd_cfg_write(vidxd, pos, buf, count);
+		else
+			rc = vidxd_cfg_read(vidxd, pos, buf, count);
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX:
+	case VFIO_PCI_BAR1_REGION_INDEX:
+		/*
+		 * NOTE(review): both BAR0 and BAR1 accesses use bar_val[0];
+		 * presumably BAR1 is the upper half of a 64-bit BAR0 and has
+		 * no region of its own (see GET_REGION_INFO) - confirm.
+		 */
+		if (mode == IDXD_VDCM_WRITE)
+			rc = vidxd_mmio_write(vidxd, vidxd->bar_val[0] + pos, buf, count);
+		else
+			rc = vidxd_mmio_read(vidxd, vidxd->bar_val[0] + pos, buf, count);
+		break;
+	case VFIO_PCI_BAR2_REGION_INDEX:
+	case VFIO_PCI_BAR3_REGION_INDEX:
+	case VFIO_PCI_BAR4_REGION_INDEX:
+	case VFIO_PCI_BAR5_REGION_INDEX:
+	case VFIO_PCI_VGA_REGION_INDEX:
+	case VFIO_PCI_ROM_REGION_INDEX:
+	default:
+		dev_err(dev, "unsupported region: %u\n", index);
+	}
+
+	/* Backends return 0 on success; convert that to the byte count. */
+	return rc == 0 ? count : rc;
+}
+
+/*
+ * VFIO read entry point. Splits the user buffer into naturally aligned
+ * 4/2/1-byte accesses and forwards each to idxd_vdcm_rw(), bouncing the
+ * data through a kernel stack temporary for copy_to_user().
+ *
+ * NOTE(review): every failure returns -EFAULT, discarding both any
+ * partially completed byte count and the underlying error code from
+ * idxd_vdcm_rw() - consider propagating rc instead.
+ */
+static ssize_t idxd_vdcm_read(struct mdev_device *mdev, char __user *buf, size_t count,
+			      loff_t *ppos)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	unsigned int done = 0;
+	int rc;
+
+	mutex_lock(&vidxd->dev_lock);
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			rc = idxd_vdcm_rw(mdev, (char *)&val, sizeof(val),
+					  ppos, IDXD_VDCM_READ);
+			if (rc <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			rc = idxd_vdcm_rw(mdev, (char *)&val, sizeof(val),
+					  ppos, IDXD_VDCM_READ);
+			if (rc <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			rc = idxd_vdcm_rw(mdev, &val, sizeof(val), ppos,
+					  IDXD_VDCM_READ);
+			if (rc <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 1;
+		}
+
+		/* idxd_vdcm_rw() does not advance *ppos; do it here. */
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	mutex_unlock(&vidxd->dev_lock);
+	return done;
+
+ read_err:
+	mutex_unlock(&vidxd->dev_lock);
+	return -EFAULT;
+}
+
+/*
+ * VFIO write entry point; mirror of idxd_vdcm_read(). Copies each
+ * naturally aligned 4/2/1-byte chunk in from userspace, then hands it
+ * to idxd_vdcm_rw() for emulation.
+ *
+ * NOTE(review): as with the read path, all failures collapse to -EFAULT.
+ */
+static ssize_t idxd_vdcm_write(struct mdev_device *mdev, const char __user *buf, size_t count,
+			       loff_t *ppos)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	unsigned int done = 0;
+	int rc;
+
+	mutex_lock(&vidxd->dev_lock);
+	while (count) {
+		size_t filled;
+
+		if (count >= 4 && !(*ppos % 4)) {
+			u32 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			rc = idxd_vdcm_rw(mdev, (char *)&val, sizeof(val),
+					  ppos, IDXD_VDCM_WRITE);
+			if (rc <= 0)
+				goto write_err;
+
+			filled = 4;
+		} else if (count >= 2 && !(*ppos % 2)) {
+			u16 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			rc = idxd_vdcm_rw(mdev, (char *)&val,
+					  sizeof(val), ppos, IDXD_VDCM_WRITE);
+			if (rc <= 0)
+				goto write_err;
+
+			filled = 2;
+		} else {
+			u8 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			rc = idxd_vdcm_rw(mdev, &val, sizeof(val),
+					  ppos, IDXD_VDCM_WRITE);
+			if (rc <= 0)
+				goto write_err;
+
+			filled = 1;
+		}
+
+		/* idxd_vdcm_rw() does not advance *ppos; do it here. */
+		count -= filled;
+		done += filled;
+		*ppos += filled;
+		buf += filled;
+	}
+
+	mutex_unlock(&vidxd->dev_lock);
+	return done;
+
+write_err:
+	mutex_unlock(&vidxd->dev_lock);
+	return -EFAULT;
+}
+
+/*
+ * Sanity-check a vma before mapping a portal into it. Only shared
+ * mappings are allowed. (@wq is currently unused but kept for parity
+ * with future per-wq checks.)
+ */
+static int check_vma(struct idxd_wq *wq, struct vm_area_struct *vma)
+{
+	if (vma->vm_end < vma->vm_start || !(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * mdev "mmap" callback: map a WQ submission portal into the guest.
+ *
+ * Fixes vs. original:
+ *  - remap_pfn_range() was passed the pgprot saved *before*
+ *    pgprot_noncached() was applied, so the portal MMIO was mapped
+ *    cacheable; pass the updated vma->vm_page_prot instead.
+ *  - 'rc' was declared unsigned int although it carries negative errnos.
+ */
+static int idxd_vdcm_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+	unsigned long req_size, pgoff = 0, offset;
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	struct idxd_wq *wq = vidxd->wq;
+	struct idxd_device *idxd = vidxd->idxd;
+	enum idxd_portal_prot virt_portal, phys_portal;
+	phys_addr_t base = pci_resource_start(idxd->pdev, IDXD_WQ_BAR);
+	struct device *dev = mdev_dev(mdev);
+	unsigned int wq_idx;
+	int rc;
+
+	rc = check_vma(wq, vma);
+	if (rc)
+		return rc;
+
+	req_size = vma->vm_end - vma->vm_start;
+	vma->vm_flags |= VM_DONTCOPY;
+
+	offset = (vma->vm_pgoff << PAGE_SHIFT) &
+		 ((1ULL << VFIO_PCI_OFFSET_SHIFT) - 1);
+
+	/* Only one virtual WQ is exposed; anything else is a bad offset. */
+	wq_idx = offset >> (PAGE_SHIFT + 2);
+	if (wq_idx >= 1) {
+		dev_err(dev, "mapping invalid wq %d off %lx\n",
+			wq_idx, offset);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check and see if the guest wants to map to the limited or unlimited portal.
+	 * The driver will allow mapping to the unlimited portal only if the wq is a
+	 * dedicated wq. Otherwise, it goes to limited.
+	 */
+	virt_portal = ((offset >> PAGE_SHIFT) & 0x3) == 1;
+	phys_portal = IDXD_PORTAL_LIMITED;
+	if (virt_portal == IDXD_PORTAL_UNLIMITED && wq_dedicated(wq))
+		phys_portal = IDXD_PORTAL_UNLIMITED;
+
+	/* We always map IMS portals to the guest */
+	pgoff = (base + idxd_get_wq_portal_full_offset(wq->id, phys_portal,
+						       IDXD_IRQ_IMS)) >> PAGE_SHIFT;
+
+	/* Portals are MMIO: the mapping must be uncached. */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_private_data = mdev;
+	vma->vm_pgoff = pgoff;
+
+	dev_dbg(dev, "mmap %lx %lx %lx %lx\n", vma->vm_start, pgoff, req_size,
+		pgprot_val(vma->vm_page_prot));
+
+	return remap_pfn_range(vma, vma->vm_start, pgoff, req_size,
+			       vma->vm_page_prot);
+}
+
+static int idxd_vdcm_get_irq_count(struct vdcm_idxd *vidxd, int type)
+{
+	/*
+	 * Even though the number of MSIX vectors supported is not tied to
+	 * the number of wqs exported, the current design allows 1 vector
+	 * per WQ for the guest plus 1 vector for misc interrupts.
+	 */
+	switch (type) {
+	case VFIO_PCI_MSI_IRQ_INDEX:
+	case VFIO_PCI_MSIX_IRQ_INDEX:
+		return VIDXD_MAX_MSIX_VECS;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Host irq handler for an IMS-backed WQ completion: forward it to the
+ * guest. IMS entry N backs guest MSI-X vector N + 1 (vector 0 is the
+ * emulated misc vector), hence the +1 translation.
+ */
+static irqreturn_t idxd_guest_wq_completion(int irq, void *data)
+{
+	struct ims_irq_entry *entry = data;
+
+	vidxd_send_interrupt(entry->vidxd, entry->int_src + 1);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Tear down the eventfd trigger for guest MSI-X vector @index. Vector 0
+ * is fully emulated; for vectors >= 1 the irqbypass producer, the host
+ * irq, and the IMS host PASID setup for entry (index - 1) are released
+ * first, in the reverse order of msix_trigger_register().
+ *
+ * NOTE(review): if vidxd_disable_host_ims_pasid() fails we return with
+ * the eventfd still held and msix_trigger[index] still set - confirm
+ * that partial teardown is intended.
+ */
+static int msix_trigger_unregister(struct vdcm_idxd *vidxd, int index)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct ims_irq_entry *irq_entry;
+	int rc;
+
+	/* Nothing was registered for this vector. */
+	if (!vidxd->vdev.msix_trigger[index])
+		return 0;
+
+	dev_dbg(dev, "disable MSIX trigger %d\n", index);
+	if (index) {
+		struct irq_bypass_producer *producer;
+
+		producer = &vidxd->vdev.producer[index];
+		irq_bypass_unregister_producer(producer);
+
+		irq_entry = &vidxd->irq_entries[index - 1];
+		if (irq_entry->irq_set) {
+			free_irq(irq_entry->irq, irq_entry);
+			irq_entry->irq_set = false;
+		}
+		rc = vidxd_disable_host_ims_pasid(vidxd, index - 1);
+		if (rc)
+			return rc;
+	}
+	eventfd_ctx_put(vidxd->vdev.msix_trigger[index]);
+	vidxd->vdev.msix_trigger[index] = NULL;
+
+	return 0;
+}
+
+/*
+ * Hook up the eventfd @fd as the trigger for guest MSI-X vector @index.
+ * For vectors >= 1 this also programs the IMS backing: enable the host
+ * PASID, request the host irq, and register an irqbypass producer.
+ * Returns 0 on success or if a trigger already exists for this vector.
+ */
+static int msix_trigger_register(struct vdcm_idxd *vidxd, u32 fd, int index)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct ims_irq_entry *irq_entry;
+	struct eventfd_ctx *trigger;
+	int rc;
+
+	/* No need to do anything if trigger already created */
+	if (vidxd->vdev.msix_trigger[index])
+		return 0;
+
+	dev_dbg(dev, "enable MSIX trigger %d\n", index);
+	trigger = eventfd_ctx_fdget(fd);
+	if (IS_ERR(trigger)) {
+		dev_warn(dev, "eventfd_ctx_fdget failed %d\n", index);
+		return PTR_ERR(trigger);
+	}
+
+	/*
+	 * The MSIX vector 0 is emulated by the mdev. Starting with vector 1
+	 * the interrupt is backed by IMS and needs to be set up, but we
+	 * will be setting up entry 0 of the IMS vectors. So here we pass
+	 * in i - 1 to the host setup and irq_entries.
+	 */
+	if (index) {
+		struct irq_bypass_producer *producer;
+
+		producer = &vidxd->vdev.producer[index];
+		irq_entry = &vidxd->irq_entries[index - 1];
+		rc = vidxd_enable_host_ims_pasid(vidxd, index - 1);
+		if (rc) {
+			dev_warn(dev, "failed to enable host ims pasid\n");
+			eventfd_ctx_put(trigger);
+			return rc;
+		}
+
+		rc = request_irq(irq_entry->irq, idxd_guest_wq_completion, 0, "idxd-ims", irq_entry);
+		if (rc) {
+			dev_warn(dev, "failed to request ims irq\n");
+			eventfd_ctx_put(trigger);
+			vidxd_disable_host_ims_pasid(vidxd, index - 1);
+			return rc;
+		}
+
+		producer->token = trigger;
+		producer->irq = irq_entry->irq;
+		/*
+		 * NOTE(review): bypass registration failure is only logged;
+		 * presumably delivery falls back to eventfd signaling - confirm.
+		 */
+		rc = irq_bypass_register_producer(producer);
+		if (unlikely(rc))
+			dev_info(dev, "irq bypass producer (token %p) registration failed: %d\n",
+				 producer->token, rc);
+
+		irq_entry->irq_set = true;
+	}
+
+	vidxd->vdev.msix_trigger[index] = trigger;
+	return 0;
+}
+
+/*
+ * Handle VFIO_IRQ_SET_ACTION_TRIGGER for the MSI/MSI-X index: attach an
+ * eventfd (DATA_EVENTFD) or detach (DATA_NONE) for each requested vector.
+ * count == 0 with DATA_NONE means "disable all vectors" per the VFIO ABI.
+ *
+ * Fix: per the VFIO_DEVICE_SET_IRQS ABI the data array applies to vectors
+ * [start, start + count); the original ignored @start and always operated
+ * from vector 0, and its bound check could not protect the arrays once
+ * start != 0.
+ */
+static int vdcm_idxd_set_msix_trigger(struct vdcm_idxd *vidxd,
+				      unsigned int index, unsigned int start,
+				      unsigned int count, uint32_t flags,
+				      void *data)
+{
+	unsigned int i;
+	int rc = 0;
+
+	if (start >= VIDXD_MAX_MSIX_ENTRIES)
+		return -EINVAL;
+	if (count > VIDXD_MAX_MSIX_ENTRIES - start)
+		count = VIDXD_MAX_MSIX_ENTRIES - start;
+
+	/*
+	 * The MSIX vector 0 is emulated by the mdev. Starting with vector 1
+	 * the interrupt is backed by IMS; the trigger helpers translate
+	 * vector N to IMS entry N - 1.
+	 */
+	if (count == 0 && (flags & VFIO_IRQ_SET_DATA_NONE)) {
+		/* Disable all MSIX entries */
+		for (i = 0; i < VIDXD_MAX_MSIX_ENTRIES; i++) {
+			rc = msix_trigger_unregister(vidxd, i);
+			if (rc < 0)
+				return rc;
+		}
+		return 0;
+	}
+
+	for (i = 0; i < count; i++) {
+		if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+			u32 fd = *(u32 *)(data + i * sizeof(u32));
+
+			rc = msix_trigger_register(vidxd, fd, start + i);
+			if (rc < 0)
+				return rc;
+		} else if (flags & VFIO_IRQ_SET_DATA_NONE) {
+			rc = msix_trigger_unregister(vidxd, start + i);
+			if (rc < 0)
+				return rc;
+		}
+	}
+	return rc;
+}
+
+/*
+ * Route a VFIO_DEVICE_SET_IRQS request. Only the TRIGGER action on the
+ * MSI or MSI-X index is supported; everything else returns -ENOTTY
+ * (MASK/UNMASK are silently unsupported, INTx is not available).
+ */
+static int idxd_vdcm_set_irqs(struct vdcm_idxd *vidxd, uint32_t flags,
+			      unsigned int index, unsigned int start,
+			      unsigned int count, void *data)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u32 action = flags & VFIO_IRQ_SET_ACTION_TYPE_MASK;
+
+	switch (index) {
+	case VFIO_PCI_INTX_IRQ_INDEX:
+		dev_warn(dev, "intx interrupts not supported.\n");
+		break;
+	case VFIO_PCI_MSI_IRQ_INDEX:
+		dev_dbg(dev, "msi interrupt.\n");
+		if (action == VFIO_IRQ_SET_ACTION_TRIGGER)
+			return vdcm_idxd_set_msix_trigger(vidxd, index, start,
+							  count, flags, data);
+		break;
+	case VFIO_PCI_MSIX_IRQ_INDEX:
+		if (action == VFIO_IRQ_SET_ACTION_TRIGGER)
+			return vdcm_idxd_set_msix_trigger(vidxd, index, start,
+							  count, flags, data);
+		break;
+	}
+
+	return -ENOTTY;
+}
+
+/* Emulate a device reset on behalf of VFIO_DEVICE_RESET. */
+static void vidxd_vdcm_reset(struct vdcm_idxd *vidxd)
+{
+	vidxd_reset(vidxd);
+}
+
+/*
+ * mdev "ioctl" callback implementing the VFIO device ioctls:
+ * GET_INFO, GET_REGION_INFO, GET_IRQ_INFO, SET_IRQS and RESET.
+ *
+ * Fixes vs. original:
+ *  - GET_IRQ_INFO: the index switch fell through to the -EINVAL default
+ *    for MSI/MSI-X as well, making the success path unreachable.
+ *  - SET_IRQS: a NULL data payload was rejected unconditionally, which
+ *    broke VFIO_IRQ_SET_DATA_NONE (the disable path carries no data).
+ *  - RESET: fell out with rc still -EINVAL although the reset succeeded.
+ *  - GET_REGION_INFO: caps.buf was leaked when capability setup failed.
+ */
+static long idxd_vdcm_ioctl(struct mdev_device *mdev, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct vdcm_idxd *vidxd = mdev_get_drvdata(mdev);
+	unsigned long minsz;
+	int rc = -EINVAL;
+	struct device *dev = mdev_dev(mdev);
+
+	dev_dbg(dev, "vidxd %p ioctl, cmd: %d\n", vidxd, cmd);
+
+	mutex_lock(&vidxd->dev_lock);
+	if (cmd == VFIO_DEVICE_GET_INFO) {
+		struct vfio_device_info info;
+
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		if (info.argsz < minsz) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		info.flags = VFIO_DEVICE_FLAGS_PCI;
+		info.flags |= VFIO_DEVICE_FLAGS_RESET;
+		info.num_regions = VFIO_PCI_NUM_REGIONS;
+		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			rc = -EFAULT;
+		else
+			rc = 0;
+		goto out;
+	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+		struct vfio_region_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
+		size_t size;
+		int nr_areas = 1;
+		int cap_type_id = 0;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		if (info.argsz < minsz) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		switch (info.index) {
+		case VFIO_PCI_CONFIG_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = VIDXD_MAX_CFG_SPACE_SZ;
+			info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+			break;
+		case VFIO_PCI_BAR0_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = vidxd->bar_size[info.index];
+			if (!info.size) {
+				info.flags = 0;
+				break;
+			}
+
+			info.flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+			break;
+		case VFIO_PCI_BAR1_REGION_INDEX:
+			/* BAR1 is the upper half of 64-bit BAR0; no region of its own */
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = 0;
+			info.flags = 0;
+			break;
+		case VFIO_PCI_BAR2_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.flags = VFIO_REGION_INFO_FLAG_CAPS | VFIO_REGION_INFO_FLAG_MMAP |
+				     VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+			info.size = vidxd->bar_size[1];
+
+			/*
+			 * Every WQ has two areas for unlimited and limited
+			 * MSI-X portals. IMS portals are not reported
+			 */
+			nr_areas = 2;
+
+			size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
+			sparse = kzalloc(size, GFP_KERNEL);
+			if (!sparse) {
+				rc = -ENOMEM;
+				goto out;
+			}
+
+			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+			sparse->header.version = 1;
+			sparse->nr_areas = nr_areas;
+			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+
+			sparse->areas[0].offset = 0;
+			sparse->areas[0].size = PAGE_SIZE;
+
+			sparse->areas[1].offset = PAGE_SIZE;
+			sparse->areas[1].size = PAGE_SIZE;
+			break;
+
+		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = 0;
+			info.flags = 0;
+			dev_dbg(dev, "get region info bar:%d\n", info.index);
+			break;
+
+		case VFIO_PCI_ROM_REGION_INDEX:
+		case VFIO_PCI_VGA_REGION_INDEX:
+			dev_dbg(dev, "get region info index:%d\n", info.index);
+			break;
+		default: {
+			if (info.index >= VFIO_PCI_NUM_REGIONS)
+				rc = -EINVAL;
+			else
+				rc = 0;
+			goto out;
+		} /* default */
+		} /* info.index switch */
+
+		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
+			if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
+				rc = vfio_info_add_capability(&caps, &sparse->header,
+							      sizeof(*sparse) + (sparse->nr_areas *
+							      sizeof(*sparse->areas)));
+				kfree(sparse);
+				if (rc) {
+					/* don't leak a partially built cap chain */
+					kfree(caps.buf);
+					goto out;
+				}
+			}
+		}
+
+		if (caps.size) {
+			if (info.argsz < sizeof(info) + caps.size) {
+				/* no room for caps; tell userspace the needed argsz */
+				info.argsz = sizeof(info) + caps.size;
+				info.cap_offset = 0;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg + sizeof(info),
+						 caps.buf, caps.size)) {
+					kfree(caps.buf);
+					rc = -EFAULT;
+					goto out;
+				}
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			rc = -EFAULT;
+		else
+			rc = 0;
+		goto out;
+	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+		struct vfio_irq_info info;
+
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) {
+			rc = -EINVAL;
+			goto out;
+		}
+
+		/* Only MSI and MSI-X are supported. */
+		switch (info.index) {
+		case VFIO_PCI_MSI_IRQ_INDEX:
+		case VFIO_PCI_MSIX_IRQ_INDEX:
+			break;
+		default:
+			rc = -EINVAL;
+			goto out;
+		} /* switch(info.index) */
+
+		info.flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE;
+		info.count = idxd_vdcm_get_irq_count(vidxd, info.index);
+
+		if (copy_to_user((void __user *)arg, &info, minsz))
+			rc = -EFAULT;
+		else
+			rc = 0;
+		goto out;
+	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
+		struct vfio_irq_set hdr;
+		u8 *data = NULL;
+		size_t data_size = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz)) {
+			rc = -EFAULT;
+			goto out;
+		}
+
+		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+			int max = idxd_vdcm_get_irq_count(vidxd, hdr.index);
+
+			rc = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
+								&data_size);
+			if (rc) {
+				dev_err(dev, "intel:vfio_set_irqs_validate_and_prepare failed\n");
+				rc = -EINVAL;
+				goto out;
+			}
+			if (data_size) {
+				data = memdup_user((void __user *)(arg + minsz), data_size);
+				if (IS_ERR(data)) {
+					rc = PTR_ERR(data);
+					goto out;
+				}
+			}
+
+			/* Non-DATA_NONE requests must carry a payload. */
+			if (!data) {
+				rc = -EINVAL;
+				goto out;
+			}
+		}
+
+		rc = idxd_vdcm_set_irqs(vidxd, hdr.flags, hdr.index, hdr.start, hdr.count, data);
+		kfree(data);
+		goto out;
+	} else if (cmd == VFIO_DEVICE_RESET) {
+		vidxd_vdcm_reset(vidxd);
+		rc = 0;
+	}
+
+ out:
+	mutex_unlock(&vidxd->dev_lock);
+	return rc;
+}
+
+/*
+ * mdev parent ops: entry points the VFIO mdev core calls to create,
+ * remove, open/release and operate a virtual idxd device.
+ */
+static const struct mdev_parent_ops idxd_vdcm_ops = {
+	.create			= idxd_vdcm_create,
+	.remove			= idxd_vdcm_remove,
+	.open			= idxd_vdcm_open,
+	.release		= idxd_vdcm_release,
+	.read			= idxd_vdcm_read,
+	.write			= idxd_vdcm_write,
+	.mmap			= idxd_vdcm_mmap,
+	.ioctl			= idxd_vdcm_ioctl,
+};
+
+/*
+ * Register @idxd as an mdev parent. Requires SIOV support on the device
+ * and IOMMU aux-domain support for the parent PCI device.
+ *
+ * Fix: if mdev_register_device() fails, disable the aux-domain feature
+ * again instead of leaving it enabled with no user.
+ */
+int idxd_mdev_host_init(struct idxd_device *idxd)
+{
+	struct device *dev = &idxd->pdev->dev;
+	int rc;
+
+	if (!test_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags))
+		return -EOPNOTSUPP;
+
+	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)) {
+		dev_warn(dev, "No aux-domain feature.\n");
+		return -EOPNOTSUPP;
+	}
+
+	rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
+	if (rc < 0) {
+		dev_warn(dev, "Failed to enable aux-domain: %d\n", rc);
+		return rc;
+	}
+
+	rc = mdev_register_device(dev, &idxd_vdcm_ops);
+	if (rc < 0) {
+		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Unregister the mdev parent and tear down the aux-domain support that
+ * was enabled in idxd_mdev_host_init().
+ */
+void idxd_mdev_host_release(struct idxd_device *idxd)
+{
+	struct device *dev = &idxd->pdev->dev;
+	int rc;
+
+	mdev_unregister_device(dev);
+
+	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
+		return;
+
+	rc = iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
+	if (rc < 0)
+		dev_warn(dev, "Failed to disable aux-domain: %d\n", rc);
+}
diff --git a/drivers/dma/idxd/mdev.h b/drivers/dma/idxd/mdev.h
new file mode 100644
index 000000000000..f369d46b4435
--- /dev/null
+++ b/drivers/dma/idxd/mdev.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */
+
+#ifndef _IDXD_MDEV_H_
+#define _IDXD_MDEV_H_
+
+/* two 64-bit BARs implemented */
+#define VIDXD_MAX_BARS 2
+#define VIDXD_MAX_CFG_SPACE_SZ 4096
+#define VIDXD_MAX_MMIO_SPACE_SZ 8192
+#define VIDXD_MSIX_TBL_SZ_OFFSET 0x42
+#define VIDXD_CAP_CTRL_SZ 0x100
+#define VIDXD_GRP_CTRL_SZ 0x100
+#define VIDXD_WQ_CTRL_SZ 0x100
+#define VIDXD_WQ_OCPY_INT_SZ 0x20
+#define VIDXD_MSIX_TBL_SZ 0x90
+#define VIDXD_MSIX_PERM_TBL_SZ 0x48
+
+#define VIDXD_MSIX_TABLE_OFFSET 0x600
+#define VIDXD_MSIX_PERM_OFFSET 0x300
+#define VIDXD_GRPCFG_OFFSET 0x400
+#define VIDXD_WQCFG_OFFSET 0x500
+#define VIDXD_IMS_OFFSET 0x1000
+
+#define VIDXD_BAR0_SIZE  0x2000
+#define VIDXD_BAR2_SIZE  0x20000
+#define VIDXD_MAX_MSIX_ENTRIES  (VIDXD_MSIX_TBL_SZ / 0x10)
+#define VIDXD_MAX_WQS	1
+#define VIDXD_MAX_MSIX_VECS	2
+
+#define	VIDXD_ATS_OFFSET 0x100
+#define	VIDXD_PRS_OFFSET 0x110
+#define VIDXD_PASID_OFFSET 0x120
+#define VIDXD_MSIX_PBA_OFFSET 0x700
+
+/* Per-IMS-vector bookkeeping: maps a host irq back to its vdev/vector. */
+struct ims_irq_entry {
+	struct vdcm_idxd *vidxd;
+	int int_src;		/* guest MSI-X vector index minus 1 */
+	unsigned int irq;	/* host Linux irq number backing this entry */
+	bool irq_set;		/* true once request_irq() has succeeded */
+};
+
+/* VFIO-facing state of one virtual idxd device. */
+struct idxd_vdev {
+	struct mdev_device *mdev;
+	/* eventfd trigger per guest MSI-X vector (0 = emulated misc vector) */
+	struct eventfd_ctx *msix_trigger[VIDXD_MAX_MSIX_ENTRIES];
+	/* irqbypass producers, one per vector, for direct interrupt posting */
+	struct irq_bypass_producer producer[VIDXD_MAX_MSIX_ENTRIES];
+};
+
+/* State of one virtual idxd device (one mdev instance). */
+struct vdcm_idxd {
+	struct idxd_device *idxd;	/* backing physical device */
+	struct idxd_wq *wq;		/* backing workqueue */
+	struct idxd_vdev vdev;
+	struct vdcm_idxd_type *type;
+	int num_wqs;
+	/* host IMS slot per IMS-backed vector; -1 = unassigned — TODO confirm */
+	u64 ims_index[VIDXD_MAX_MSIX_VECS - 1];
+	struct msix_entry ims_entry;
+	struct ims_irq_entry irq_entries[VIDXD_MAX_MSIX_VECS - 1];
+
+	/* For VM use case */
+	u64 bar_val[VIDXD_MAX_BARS];	/* guest-programmed BAR base values */
+	u64 bar_size[VIDXD_MAX_BARS];
+	u8 cfg[VIDXD_MAX_CFG_SPACE_SZ];	/* emulated PCI config space image */
+	u8 bar0[VIDXD_MAX_MMIO_SPACE_SZ];	/* emulated BAR0 MMIO image */
+	struct list_head list;		/* entry on wq->vdcm_list */
+	struct mutex dev_lock; /* lock for vidxd resources */
+
+	int refcount;	/* open count; IMS entries set up on first open */
+};
+
+/* Map an embedded idxd_vdev back to its containing vdcm_idxd. */
+static inline struct vdcm_idxd *to_vidxd(struct idxd_vdev *vdev)
+{
+	return container_of(vdev, struct vdcm_idxd, vdev);
+}
+
+#define IDXD_MDEV_NAME_LEN 16
+#define IDXD_MDEV_DESCRIPTION_LEN 64
+
+/* Supported mdev types; "1_DWQ" presumably exposes one dedicated WQ — TODO confirm */
+enum idxd_mdev_type {
+	IDXD_MDEV_TYPE_1_DWQ = 0,
+};
+
+#define IDXD_MDEV_TYPES 1
+
+/* Description of one supported mdev type, exposed via sysfs. */
+struct vdcm_idxd_type {
+	char name[IDXD_MDEV_NAME_LEN];
+	char description[IDXD_MDEV_DESCRIPTION_LEN];
+	enum idxd_mdev_type type;
+	unsigned int avail_instance;
+};
+
+/* Direction selector for idxd_vdcm_rw(). */
+enum idxd_vdcm_rw {
+	IDXD_VDCM_READ = 0,
+	IDXD_VDCM_WRITE,
+};
+
+/*
+ * Load a 1/2/4/8 byte register image from @buf and zero-extend it to
+ * 64 bits. Any other size yields 0.
+ */
+static inline u64 get_reg_val(void *buf, int size)
+{
+	switch (size) {
+	case 8:
+		return *(u64 *)buf;
+	case 4:
+		return *(u32 *)buf;
+	case 2:
+		return *(u16 *)buf;
+	case 1:
+		return *(u8 *)buf;
+	default:
+		return 0;
+	}
+}
+
+#endif
diff --git a/drivers/dma/idxd/vdev.c b/drivers/dma/idxd/vdev.c
new file mode 100644
index 000000000000..4da89ee61b6e
--- /dev/null
+++ b/drivers/dma/idxd/vdev.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/sched/task.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/vfio.h>
+#include <linux/mdev.h>
+#include <linux/msi.h>
+#include <linux/intel-iommu.h>
+#include <linux/intel-svm.h>
+#include <linux/kvm_host.h>
+#include <linux/eventfd.h>
+#include <uapi/linux/idxd.h>
+#include "registers.h"
+#include "idxd.h"
+#include "../../vfio/pci/vfio_pci_private.h"
+#include "mdev.h"
+#include "vdev.h"
+
+/* Assert guest MSI-X vector @msix_idx; stub filled in by a later patch. */
+int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
+{
+	/* PLACE HOLDER */
+	return 0;
+}
+
+/* Tear down host IMS backing for entry @ims_idx; stub for a later patch. */
+int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Set up host IMS backing for entry @ims_idx; stub for a later patch. */
+int vidxd_enable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Emulate a guest MMIO (BAR0) read; stub for a later patch. */
+int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Emulate a guest MMIO (BAR0) write; stub for a later patch. */
+int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Emulate a guest PCI config space read; stub for a later patch. */
+int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Emulate a guest PCI config space write; stub for a later patch. */
+int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
+
+/* Initialize the emulated MMIO image; stub for a later patch. */
+void vidxd_mmio_init(struct vdcm_idxd *vidxd)
+{
+	/* PLACEHOLDER */
+}
+
+/* Emulate a device reset; stub for a later patch. */
+void vidxd_reset(struct vdcm_idxd *vidxd)
+{
+	/* PLACEHOLDER */
+}
+
+/* Release the IMS entries allocated at open; stub for a later patch. */
+void vidxd_free_ims_entries(struct vdcm_idxd *vidxd)
+{
+	/* PLACEHOLDER */
+}
+
+/* Allocate/set up IMS entries at first open; stub for a later patch. */
+int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd)
+{
+	/* PLACEHOLDER */
+	return 0;
+}
diff --git a/drivers/dma/idxd/vdev.h b/drivers/dma/idxd/vdev.h
new file mode 100644
index 000000000000..cd5a30d90e9a
--- /dev/null
+++ b/drivers/dma/idxd/vdev.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2019,2020 Intel Corporation. All rights rsvd. */
+
+#ifndef _IDXD_VDEV_H_
+#define _IDXD_VDEV_H_
+
+#include "mdev.h"
+
+/* Guest MMIO (BAR0) and PCI config space emulation */
+int vidxd_mmio_read(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
+int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size);
+int vidxd_cfg_read(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int count);
+int vidxd_cfg_write(struct vdcm_idxd *vidxd, unsigned int pos, void *buf, unsigned int size);
+void vidxd_mmio_init(struct vdcm_idxd *vidxd);
+void vidxd_reset(struct vdcm_idxd *vidxd);
+/* IMS backing for IMS-based guest MSI-X vectors */
+int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx);
+int vidxd_enable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx);
+int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx);
+void vidxd_free_ims_entries(struct vdcm_idxd *vidxd);
+int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd);
+
+#endif


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 10/18] dmaengine: idxd: virtual device commands emulation
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (3 preceding siblings ...)
  2020-09-15 23:28 ` [PATCH v3 07/18] dmaengine: idxd: add basic mdev registration and helper functions Dave Jiang
@ 2020-09-15 23:28 ` Dave Jiang
  2020-09-15 23:28 ` [PATCH v3 12/18] dmaengine: idxd: add mdev type as a new wq type Dave Jiang
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:28 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Add all the helper functions that support the emulation of the commands
that are submitted to the device command register.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/registers.h |   15 +-
 drivers/dma/idxd/vdev.c      |  410 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 97acd0aaecbe..6531a40fad2e 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -115,7 +115,8 @@ union gencfg_reg {
 union genctrl_reg {
 	struct {
 		u32 softerr_int_en:1;
-		u32 rsvd:31;
+		u32 halt_state_int_en:1;
+		u32 rsvd:30;
 	};
 	u32 bits;
 } __packed;
@@ -137,6 +138,8 @@ enum idxd_device_status_state {
 	IDXD_DEVICE_STATE_HALT,
 };
 
+#define IDXD_GENSTATS_MASK		0x03
+
 enum idxd_device_reset_type {
 	IDXD_DEVICE_RESET_SOFTWARE = 0,
 	IDXD_DEVICE_RESET_FLR,
@@ -149,6 +152,7 @@ enum idxd_device_reset_type {
 #define IDXD_INTC_CMD			0x02
 #define IDXD_INTC_OCCUPY			0x04
 #define IDXD_INTC_PERFMON_OVFL		0x08
+#define IDXD_INTC_HALT_STATE		0x10
 
 #define IDXD_CMD_OFFSET			0xa0
 union idxd_command_reg {
@@ -160,6 +164,7 @@ union idxd_command_reg {
 	};
 	u32 bits;
 } __packed;
+#define IDXD_CMD_INT_MASK		0x80000000
 
 enum idxd_cmd {
 	IDXD_CMD_ENABLE_DEVICE = 1,
@@ -217,7 +222,7 @@ enum idxd_cmdsts_err {
 	/* disable device errors */
 	IDXD_CMDSTS_ERR_DIS_DEV_EN = 0x31,
 	/* disable WQ, drain WQ, abort WQ, reset WQ */
-	IDXD_CMDSTS_ERR_DEV_NOT_EN,
+	IDXD_CMDSTS_ERR_WQ_NOT_EN,
 	/* request interrupt handle */
 	IDXD_CMDSTS_ERR_INVAL_INT_IDX = 0x41,
 	IDXD_CMDSTS_ERR_NO_HANDLE,
@@ -354,4 +359,10 @@ union wqcfg {
 					(n) * sizeof(union wqcfg) +\
 					sizeof(u32) * (ofs))
 
+enum idxd_wq_hw_state {
+	IDXD_WQ_DEV_DISABLED = 0,
+	IDXD_WQ_DEV_ENABLED,
+	IDXD_WQ_DEV_BUSY,
+};
+
 #endif
diff --git a/drivers/dma/idxd/vdev.c b/drivers/dma/idxd/vdev.c
index 2357d4c596d5..efdfb67ae51a 100644
--- a/drivers/dma/idxd/vdev.c
+++ b/drivers/dma/idxd/vdev.c
@@ -81,6 +81,30 @@ static void vidxd_report_error(struct vdcm_idxd *vidxd, unsigned int error)
 	}
 }
 
+/*
+ * Return the aux-domain PASID the host IOMMU assigned to this mdev.
+ *
+ * Fix: the original called mdev_get_iommu_device() directly again right
+ * after resolving it via symbol_get(), which both defeats the purpose of
+ * the dynamic lookup and would fail to link when mdev is a module.
+ */
+static int idxd_get_mdev_pasid(struct mdev_device *mdev)
+{
+	struct iommu_domain *domain;
+	struct device *dev = mdev_dev(mdev);
+	struct device *iommu_device;
+	struct device *(*fn)(struct device *dev);
+
+	/* Resolve mdev_get_iommu_device() without a hard module dependency. */
+	fn = symbol_get(mdev_get_iommu_device);
+	if (!fn)
+		return -ENODEV;
+
+	iommu_device = fn(dev);
+	symbol_put(mdev_get_iommu_device);
+
+	domain = iommu_get_domain_for_dev(dev);
+	if (!domain)
+		return -EINVAL;
+
+	return iommu_aux_get_pasid(domain, iommu_device);
+}
+
 int vidxd_mmio_write(struct vdcm_idxd *vidxd, u64 pos, void *buf, unsigned int size)
 {
 	u32 offset = pos & (vidxd->bar_size[0] - 1);
@@ -470,12 +494,347 @@ void vidxd_mmio_init(struct vdcm_idxd *vidxd)
 
 static void idxd_complete_command(struct vdcm_idxd *vidxd, enum idxd_cmdsts_err val)
 {
-	/* PLACEHOLDER */
+	u8 *bar0 = vidxd->bar0;
+	u32 *cmd = (u32 *)(bar0 + IDXD_CMD_OFFSET);
+	u32 *cmdsts = (u32 *)(bar0 + IDXD_CMDSTS_OFFSET);
+	u32 *intcause = (u32 *)(bar0 + IDXD_INTCAUSE_OFFSET);
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+
+	*cmdsts = val;
+	dev_dbg(dev, "%s: cmd: %#x  status: %#x\n", __func__, *cmd, val);
+
+	if (*cmd & IDXD_CMD_INT_MASK) {
+		*intcause |= IDXD_INTC_CMD;
+		vidxd_send_interrupt(vidxd, 0);
+	}
+}
+
+static void vidxd_enable(struct vdcm_idxd *vidxd)
+{
+	u8 *bar0 = vidxd->bar0;
+	union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET);
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+
+	dev_dbg(dev, "%s\n", __func__);
+	if (gensts->state == IDXD_DEVICE_STATE_ENABLED)
+		return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_ENABLED);
+
+	/* Check PCI configuration */
+	if (!(vidxd->cfg[PCI_COMMAND] & PCI_COMMAND_MASTER))
+		return idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_BUSMASTER_EN);
+
+	gensts->state = IDXD_DEVICE_STATE_ENABLED;
+
+	return idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_disable(struct vdcm_idxd *vidxd)
+{
+	struct idxd_wq *wq;
+	union wqcfg *wqcfg;
+	u8 *bar0 = vidxd->bar0;
+	union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET);
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u32 status;
+
+	dev_dbg(dev, "%s\n", __func__);
+	if (gensts->state == IDXD_DEVICE_STATE_DISABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN);
+		return;
+	}
+
+	wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+	wq = vidxd->wq;
+
+	/* If it is a DWQ, need to disable the DWQ as well */
+	if (wq_dedicated(wq)) {
+		idxd_wq_disable(wq, &status);
+		if (status) {
+			dev_warn(dev, "vidxd disable (wq disable) failed: %#x\n", status);
+			idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DIS_DEV_EN);
+			return;
+		}
+	} else {
+		idxd_wq_drain(wq, &status);
+		if (status)
+			dev_warn(dev, "vidxd disable (wq drain) failed: %#x\n", status);
+	}
+
+	wqcfg->wq_state = 0;
+	gensts->state = IDXD_DEVICE_STATE_DISABLED;
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_drain_all(struct vdcm_idxd *vidxd)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct idxd_wq *wq = vidxd->wq;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	idxd_wq_drain(wq, NULL);
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_wq_drain(struct vdcm_idxd *vidxd, int val)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u8 *bar0 = vidxd->bar0;
+	union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+	struct idxd_wq *wq = vidxd->wq;
+	u32 status;
+
+	dev_dbg(dev, "%s\n", __func__);
+	if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN);
+		return;
+	}
+
+	idxd_wq_drain(wq, &status);
+	if (status) {
+		dev_dbg(dev, "wq drain failed: %#x\n", status);
+		idxd_complete_command(vidxd, status);
+		return;
+	}
+
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_abort_all(struct vdcm_idxd *vidxd)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct idxd_wq *wq = vidxd->wq;
+
+	dev_dbg(dev, "%s\n", __func__);
+	idxd_wq_abort(wq, NULL);
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_wq_abort(struct vdcm_idxd *vidxd, int val)
+{
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u8 *bar0 = vidxd->bar0;
+	union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+	struct idxd_wq *wq = vidxd->wq;
+	u32 status;
+
+	dev_dbg(dev, "%s\n", __func__);
+	if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN);
+		return;
+	}
+
+	idxd_wq_abort(wq, &status);
+	if (status) {
+		dev_dbg(dev, "wq abort failed: %#x\n", status);
+		idxd_complete_command(vidxd, status);
+		return;
+	}
+
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
 }
 
 void vidxd_reset(struct vdcm_idxd *vidxd)
 {
-	/* PLACEHOLDER */
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u8 *bar0 = vidxd->bar0;
+	union gensts_reg *gensts = (union gensts_reg *)(bar0 + IDXD_GENSTATS_OFFSET);
+	struct idxd_wq *wq;
+
+	dev_dbg(dev, "%s\n", __func__);
+	gensts->state = IDXD_DEVICE_STATE_DRAIN;
+	wq = vidxd->wq;
+
+	if (wq->state == IDXD_WQ_ENABLED) {
+		idxd_wq_abort(wq, NULL);
+		idxd_wq_disable(wq, NULL);
+	}
+
+	vidxd_mmio_init(vidxd);
+	gensts->state = IDXD_DEVICE_STATE_DISABLED;
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_wq_reset(struct vdcm_idxd *vidxd, int wq_id_mask)
+{
+	struct idxd_wq *wq;
+	u8 *bar0 = vidxd->bar0;
+	union wqcfg *wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u32 status;
+
+	wq = vidxd->wq;
+	dev_dbg(dev, "vidxd reset wq %u:%u\n", 0, wq->id);
+
+	if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN);
+		return;
+	}
+
+	idxd_wq_abort(wq, &status);
+	if (status) {
+		dev_dbg(dev, "vidxd reset wq failed to abort: %#x\n", status);
+		idxd_complete_command(vidxd, status);
+		return;
+	}
+
+	idxd_wq_disable(wq, &status);
+	if (status) {
+		dev_dbg(dev, "vidxd reset wq failed to disable: %#x\n", status);
+		idxd_complete_command(vidxd, status);
+		return;
+	}
+
+	wqcfg->wq_state = IDXD_WQ_DEV_DISABLED;
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_alloc_int_handle(struct vdcm_idxd *vidxd, int vidx)
+{
+	bool ims = (vidx >> 16) & 1;
+	u32 cmdsts;
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+
+	vidx = vidx & 0xffff;
+
+	dev_dbg(dev, "allocating int handle for %x\n", vidx);
+
+	if (vidx != 1) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_INVAL_INT_IDX);
+		return;
+	}
+
+	if (ims) {
+		dev_warn(dev, "IMS allocation is not implemented yet\n");
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_NO_HANDLE);
+	} else {
+		vidx--; /* MSIX idx 0 is a slow path interrupt */
+		cmdsts = vidxd->ims_index[vidx] << 8;
+		dev_dbg(dev, "int handle %d:%lld\n", vidx, vidxd->ims_index[vidx]);
+		idxd_complete_command(vidxd, cmdsts);
+	}
+}
+
+static void vidxd_wq_enable(struct vdcm_idxd *vidxd, int wq_id)
+{
+	struct idxd_wq *wq;
+	u8 *bar0 = vidxd->bar0;
+	union wq_cap_reg *wqcap;
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	struct idxd_device *idxd;
+	union wqcfg *vwqcfg, *wqcfg;
+	unsigned long flags;
+	int wq_pasid;
+	u32 status;
+	int priv;
+
+	if (wq_id >= VIDXD_MAX_WQS) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_WQIDX);
+		return;
+	}
+
+	idxd = vidxd->idxd;
+	wq = vidxd->wq;
+
+	dev_dbg(dev, "%s: wq %u:%u\n", __func__, wq_id, wq->id);
+
+	vwqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET + wq_id * 32);
+	wqcap = (union wq_cap_reg *)(bar0 + IDXD_WQCAP_OFFSET);
+	wqcfg = &wq->wqcfg;
+
+	if (vidxd_state(vidxd) != IDXD_DEVICE_STATE_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_DEV_NOTEN);
+		return;
+	}
+
+	if (vwqcfg->wq_state != IDXD_WQ_DEV_DISABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_ENABLED);
+		return;
+	}
+
+	if (wq_dedicated(wq) && wqcap->dedicated_mode == 0) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_MODE);
+		return;
+	}
+
+	wq_pasid = idxd_get_mdev_pasid(mdev);
+	priv = 1;
+
+	if (wq_pasid >= 0) {
+		wqcfg->bits[2] &= ~0x3fffff00;
+		wqcfg->priv = priv;
+		wqcfg->pasid_en = 1;
+		wqcfg->pasid = wq_pasid;
+		dev_dbg(dev, "program pasid %d in wq %d\n", wq_pasid, wq->id);
+		spin_lock_irqsave(&idxd->dev_lock, flags);
+		idxd_wq_setup_pasid(wq, wq_pasid);
+		idxd_wq_setup_priv(wq, priv);
+		spin_unlock_irqrestore(&idxd->dev_lock, flags);
+		idxd_wq_enable(wq, &status);
+		if (status) {
+			dev_err(dev, "vidxd enable wq %d failed\n", wq->id);
+			idxd_complete_command(vidxd, status);
+			return;
+		}
+	} else {
+		dev_err(dev, "idxd pasid setup failed wq %d wq_pasid %d\n", wq->id, wq_pasid);
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_PASID_EN);
+		return;
+	}
+
+	vwqcfg->wq_state = IDXD_WQ_DEV_ENABLED;
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
+}
+
+static void vidxd_wq_disable(struct vdcm_idxd *vidxd, int wq_id_mask)
+{
+	struct idxd_wq *wq;
+	union wqcfg *wqcfg;
+	u8 *bar0 = vidxd->bar0;
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+	u32 status;
+
+	wq = vidxd->wq;
+
+	dev_dbg(dev, "vidxd disable wq %u:%u\n", 0, wq->id);
+
+	wqcfg = (union wqcfg *)(bar0 + VIDXD_WQCFG_OFFSET);
+	if (wqcfg->wq_state != IDXD_WQ_DEV_ENABLED) {
+		idxd_complete_command(vidxd, IDXD_CMDSTS_ERR_WQ_NOT_EN);
+		return;
+	}
+
+	/* If it is a DWQ, need to disable the DWQ as well */
+	if (wq_dedicated(wq)) {
+		idxd_wq_disable(wq, &status);
+		if (status) {
+			dev_warn(dev, "vidxd disable wq failed: %#x\n", status);
+			idxd_complete_command(vidxd, status);
+			return;
+		}
+	} else {
+		idxd_wq_drain(wq, &status);
+		if (status) {
+			dev_warn(dev, "vidxd disable drain wq failed: %#x\n", status);
+			idxd_complete_command(vidxd, status);
+			return;
+		}
+	}
+
+	wqcfg->wq_state = IDXD_WQ_DEV_DISABLED;
+	idxd_complete_command(vidxd, IDXD_CMDSTS_SUCCESS);
 }
 
 void vidxd_free_ims_entries(struct vdcm_idxd *vidxd)
@@ -491,5 +850,50 @@ int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd)
 
 void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val)
 {
-	/* PLACEHOLDER */
+	union idxd_command_reg *reg = (union idxd_command_reg *)(vidxd->bar0 + IDXD_CMD_OFFSET);
+	struct mdev_device *mdev = vidxd->vdev.mdev;
+	struct device *dev = mdev_dev(mdev);
+
+	reg->bits = val;
+
+	dev_dbg(dev, "%s: cmd code: %u reg: %x\n", __func__, reg->cmd, reg->bits);
+
+	switch (reg->cmd) {
+	case IDXD_CMD_ENABLE_DEVICE:
+		vidxd_enable(vidxd);
+		break;
+	case IDXD_CMD_DISABLE_DEVICE:
+		vidxd_disable(vidxd);
+		break;
+	case IDXD_CMD_DRAIN_ALL:
+		vidxd_drain_all(vidxd);
+		break;
+	case IDXD_CMD_ABORT_ALL:
+		vidxd_abort_all(vidxd);
+		break;
+	case IDXD_CMD_RESET_DEVICE:
+		vidxd_reset(vidxd);
+		break;
+	case IDXD_CMD_ENABLE_WQ:
+		vidxd_wq_enable(vidxd, reg->operand);
+		break;
+	case IDXD_CMD_DISABLE_WQ:
+		vidxd_wq_disable(vidxd, reg->operand);
+		break;
+	case IDXD_CMD_DRAIN_WQ:
+		vidxd_wq_drain(vidxd, reg->operand);
+		break;
+	case IDXD_CMD_ABORT_WQ:
+		vidxd_wq_abort(vidxd, reg->operand);
+		break;
+	case IDXD_CMD_RESET_WQ:
+		vidxd_wq_reset(vidxd, reg->operand);
+		break;
+	case IDXD_CMD_REQUEST_INT_HANDLE:
+		vidxd_alloc_int_handle(vidxd, reg->operand);
+		break;
+	default:
+		idxd_complete_command(vidxd, IDXD_CMDSTS_INVAL_CMD);
+		break;
+	}
 }


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 12/18] dmaengine: idxd: add mdev type as a new wq type
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (4 preceding siblings ...)
  2020-09-15 23:28 ` [PATCH v3 10/18] dmaengine: idxd: virtual device commands emulation Dave Jiang
@ 2020-09-15 23:28 ` Dave Jiang
  2020-09-15 23:29 ` [PATCH v3 13/18] dmaengine: idxd: add dedicated wq mdev type Dave Jiang
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:28 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Add "mdev" wq type and support helpers. The mdev wq type marks the wq
to be utilized as a VFIO mediated device.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/idxd.h  |    2 ++
 drivers/dma/idxd/sysfs.c |   13 +++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 23d3ebca1da5..23287f8fd19a 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -65,6 +65,7 @@ enum idxd_wq_type {
 	IDXD_WQT_NONE = 0,
 	IDXD_WQT_KERNEL,
 	IDXD_WQT_USER,
+	IDXD_WQT_MDEV,
 };
 
 struct idxd_cdev {
@@ -290,6 +291,7 @@ void idxd_cleanup_sysfs(struct idxd_device *idxd);
 int idxd_register_driver(void);
 void idxd_unregister_driver(void);
 struct bus_type *idxd_get_bus_type(struct idxd_device *idxd);
+bool is_idxd_wq_mdev(struct idxd_wq *wq);
 
 /* device interrupt control */
 irqreturn_t idxd_irq_handler(int vec, void *data);
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index cbb1038f1ed0..6eb67f744c8e 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -14,6 +14,7 @@ static char *idxd_wq_type_names[] = {
 	[IDXD_WQT_NONE]		= "none",
 	[IDXD_WQT_KERNEL]	= "kernel",
 	[IDXD_WQT_USER]		= "user",
+	[IDXD_WQT_MDEV]		= "mdev",
 };
 
 static void idxd_conf_device_release(struct device *dev)
@@ -69,6 +70,11 @@ static inline bool is_idxd_wq_cdev(struct idxd_wq *wq)
 	return wq->type == IDXD_WQT_USER;
 }
 
+inline bool is_idxd_wq_mdev(struct idxd_wq *wq)
+{
+	return wq->type == IDXD_WQT_MDEV ? true : false;
+}
+
 static int idxd_config_bus_match(struct device *dev,
 				 struct device_driver *drv)
 {
@@ -987,8 +993,9 @@ static ssize_t wq_type_show(struct device *dev,
 		return sprintf(buf, "%s\n",
 			       idxd_wq_type_names[IDXD_WQT_KERNEL]);
 	case IDXD_WQT_USER:
-		return sprintf(buf, "%s\n",
-			       idxd_wq_type_names[IDXD_WQT_USER]);
+		return sprintf(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]);
+	case IDXD_WQT_MDEV:
+		return sprintf(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_MDEV]);
 	case IDXD_WQT_NONE:
 	default:
 		return sprintf(buf, "%s\n",
@@ -1015,6 +1022,8 @@ static ssize_t wq_type_store(struct device *dev,
 		wq->type = IDXD_WQT_KERNEL;
 	else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_USER]))
 		wq->type = IDXD_WQT_USER;
+	else if (sysfs_streq(buf, idxd_wq_type_names[IDXD_WQT_MDEV]))
+		wq->type = IDXD_WQT_MDEV;
 	else
 		return -EINVAL;
 


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 13/18] dmaengine: idxd: add dedicated wq mdev type
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (5 preceding siblings ...)
  2020-09-15 23:28 ` [PATCH v3 12/18] dmaengine: idxd: add mdev type as a new wq type Dave Jiang
@ 2020-09-15 23:29 ` Dave Jiang
  2020-09-15 23:29 ` [PATCH v3 14/18] dmaengine: idxd: add new wq state for mdev Dave Jiang
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:29 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Add the support code for "1dwq" mdev type. This mdev type follows the
standard VFIO mdev flow. The "1dwq" type will export a single dedicated wq
to the mdev. The dwq will have read-only configuration that is configured
by the host. The mdev type does not support PASID and SVA and will match
the stage 1 driver in functional support. For backward compatibility, the
mdev will maintain the DSA spec definition of this mdev type once the
commit goes upstream.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/mdev.c |  142 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 133 insertions(+), 9 deletions(-)

diff --git a/drivers/dma/idxd/mdev.c b/drivers/dma/idxd/mdev.c
index 6b91abd4d8d9..2d3ff1a50d39 100644
--- a/drivers/dma/idxd/mdev.c
+++ b/drivers/dma/idxd/mdev.c
@@ -99,21 +99,58 @@ static void idxd_vdcm_release(struct mdev_device *mdev)
 	mutex_unlock(&vidxd->dev_lock);
 }
 
+static struct idxd_wq *find_any_dwq(struct idxd_device *idxd)
+{
+	int i;
+	struct idxd_wq *wq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&idxd->dev_lock, flags);
+	for (i = 0; i < idxd->max_wqs; i++) {
+		wq = &idxd->wqs[i];
+
+		if (wq->state != IDXD_WQ_ENABLED)
+			continue;
+
+		if (!wq_dedicated(wq))
+			continue;
+
+		if (idxd_wq_refcount(wq) != 0)
+			continue;
+
+		spin_unlock_irqrestore(&idxd->dev_lock, flags);
+		mutex_lock(&wq->wq_lock);
+		if (idxd_wq_refcount(wq)) {
+			spin_lock_irqsave(&idxd->dev_lock, flags);
+			continue;
+		}
+
+		idxd_wq_get(wq);
+		mutex_unlock(&wq->wq_lock);
+		return wq;
+	}
+
+	spin_unlock_irqrestore(&idxd->dev_lock, flags);
+	return NULL;
+}
+
 static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev_device *mdev,
 					   struct vdcm_idxd_type *type)
 {
 	struct vdcm_idxd *vidxd;
 	struct idxd_wq *wq = NULL;
-	int i;
-
-	/* PLACEHOLDER, wq matching comes later */
+	int i, rc;
 
+	if (type->type == IDXD_MDEV_TYPE_1_DWQ)
+		wq = find_any_dwq(idxd);
 	if (!wq)
 		return ERR_PTR(-ENODEV);
 
 	vidxd = kzalloc(sizeof(*vidxd), GFP_KERNEL);
-	if (!vidxd)
-		return ERR_PTR(-ENOMEM);
+	if (!vidxd) {
+		rc = -ENOMEM;
+		goto err;
+	}
 
 	mutex_init(&vidxd->dev_lock);
 	vidxd->idxd = idxd;
@@ -127,14 +164,23 @@ static struct vdcm_idxd *vdcm_vidxd_create(struct idxd_device *idxd, struct mdev
 		vidxd->ims_index[i] = -1;
 
 	idxd_vdcm_init(vidxd);
-	mutex_lock(&wq->wq_lock);
-	idxd_wq_get(wq);
-	mutex_unlock(&wq->wq_lock);
 
 	return vidxd;
+
+ err:
+	mutex_lock(&wq->wq_lock);
+	idxd_wq_put(wq);
+	mutex_unlock(&wq->wq_lock);
+	return ERR_PTR(rc);
 }
 
-static struct vdcm_idxd_type idxd_mdev_types[IDXD_MDEV_TYPES];
+static struct vdcm_idxd_type idxd_mdev_types[IDXD_MDEV_TYPES] = {
+	{
+		.name = "1dwq-v1",
+		.description = "IDXD MDEV with 1 dedicated workqueue",
+		.type = IDXD_MDEV_TYPE_1_DWQ,
+	},
+};
 
 static struct vdcm_idxd_type *idxd_vdcm_find_vidxd_type(struct device *dev,
 							const char *name)
@@ -910,7 +956,85 @@ static long idxd_vdcm_ioctl(struct mdev_device *mdev, unsigned int cmd,
 	return rc;
 }
 
+static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+{
+	struct vdcm_idxd_type *type;
+
+	type = idxd_vdcm_find_vidxd_type(dev, kobject_name(kobj));
+
+	if (type)
+		return sprintf(buf, "%s\n", type->description);
+
+	return -EINVAL;
+}
+static MDEV_TYPE_ATTR_RO(name);
+
+static int find_available_mdev_instances(struct idxd_device *idxd, struct vdcm_idxd_type *type)
+{
+	int count = 0, i;
+	unsigned long flags;
+
+	if (type->type != IDXD_MDEV_TYPE_1_DWQ)
+		return 0;
+
+	spin_lock_irqsave(&idxd->dev_lock, flags);
+	for (i = 0; i < idxd->max_wqs; i++) {
+		struct idxd_wq *wq;
+
+		wq = &idxd->wqs[i];
+		if (!is_idxd_wq_mdev(wq) || !wq_dedicated(wq) || idxd_wq_refcount(wq))
+			continue;
+
+		count++;
+	}
+	spin_unlock_irqrestore(&idxd->dev_lock, flags);
+
+	return count;
+}
+
+static ssize_t available_instances_show(struct kobject *kobj,
+					struct device *dev, char *buf)
+{
+	int count;
+	struct idxd_device *idxd = dev_get_drvdata(dev);
+	struct vdcm_idxd_type *type;
+
+	type = idxd_vdcm_find_vidxd_type(dev, kobject_name(kobj));
+	if (!type)
+		return -EINVAL;
+
+	count = find_available_mdev_instances(idxd, type);
+
+	return sprintf(buf, "%d\n", count);
+}
+static MDEV_TYPE_ATTR_RO(available_instances);
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+			       char *buf)
+{
+	return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+static MDEV_TYPE_ATTR_RO(device_api);
+
+static struct attribute *idxd_mdev_types_attrs[] = {
+	&mdev_type_attr_name.attr,
+	&mdev_type_attr_device_api.attr,
+	&mdev_type_attr_available_instances.attr,
+	NULL,
+};
+
+static struct attribute_group idxd_mdev_type_group0 = {
+	.name = "1dwq-v1",
+	.attrs = idxd_mdev_types_attrs,
+};
+
+static struct attribute_group *idxd_mdev_type_groups[] = {
+	&idxd_mdev_type_group0,
+	NULL,
+};
+
 static const struct mdev_parent_ops idxd_vdcm_ops = {
+	.supported_type_groups	= idxd_mdev_type_groups,
 	.create			= idxd_vdcm_create,
 	.remove			= idxd_vdcm_remove,
 	.open			= idxd_vdcm_open,


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 14/18] dmaengine: idxd: add new wq state for mdev
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (6 preceding siblings ...)
  2020-09-15 23:29 ` [PATCH v3 13/18] dmaengine: idxd: add dedicated wq mdev type Dave Jiang
@ 2020-09-15 23:29 ` Dave Jiang
  2020-09-15 23:29 ` [PATCH v3 15/18] dmaengine: idxd: add error notification from host driver to mediated device Dave Jiang
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:29 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

When a dedicated wq is enabled as mdev, we must disable the wq on the
device in order to program the pasid to the wq. Introduce a software-only
wq state, IDXD_WQ_LOCKED, to prevent the user from modifying the
configuration while the mdev wq is in this state. While in this state,
the wq is not in the DISABLED state, which prevents any modifications to
the configuration. It is also not in the ENABLED state, and therefore it
prevents any actions allowed only in the ENABLED state.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/idxd.h  |    1 +
 drivers/dma/idxd/mdev.c  |    4 +++-
 drivers/dma/idxd/sysfs.c |    2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 23287f8fd19a..f67c0036f968 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -55,6 +55,7 @@ struct idxd_group {
 enum idxd_wq_state {
 	IDXD_WQ_DISABLED = 0,
 	IDXD_WQ_ENABLED,
+	IDXD_WQ_LOCKED,
 };
 
 enum idxd_wq_flag {
diff --git a/drivers/dma/idxd/mdev.c b/drivers/dma/idxd/mdev.c
index 2d3ff1a50d39..ea07e7c1ba31 100644
--- a/drivers/dma/idxd/mdev.c
+++ b/drivers/dma/idxd/mdev.c
@@ -72,8 +72,10 @@ static void idxd_vdcm_init(struct vdcm_idxd *vidxd)
 
 	vidxd_mmio_init(vidxd);
 
-	if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED)
+	if (wq_dedicated(wq) && wq->state == IDXD_WQ_ENABLED) {
 		idxd_wq_disable(wq, NULL);
+		wq->state = IDXD_WQ_LOCKED;
+	}
 }
 
 static void idxd_vdcm_release(struct mdev_device *mdev)
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 6eb67f744c8e..4cc05392130c 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -797,6 +797,8 @@ static ssize_t wq_state_show(struct device *dev,
 		return sprintf(buf, "disabled\n");
 	case IDXD_WQ_ENABLED:
 		return sprintf(buf, "enabled\n");
+	case IDXD_WQ_LOCKED:
+		return sprintf(buf, "locked\n");
 	}
 
 	return sprintf(buf, "unknown\n");


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 15/18] dmaengine: idxd: add error notification from host driver to mediated device
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (7 preceding siblings ...)
  2020-09-15 23:29 ` [PATCH v3 14/18] dmaengine: idxd: add new wq state for mdev Dave Jiang
@ 2020-09-15 23:29 ` Dave Jiang
  2020-09-15 23:29 ` [PATCH v3 16/18] dmaengine: idxd: add ABI documentation for mediated device support Dave Jiang
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:29 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

When a device error occurs, the mediated device needs to be notified in
order to notify the guest of the device error. Add support to notify the
specific mdev when an error is wq specific, and to broadcast errors to all
mdevs when it is a generic device error.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/dma/idxd/idxd.h |   12 ++++++++++++
 drivers/dma/idxd/irq.c  |    4 ++++
 drivers/dma/idxd/vdev.c |   32 ++++++++++++++++++++++++++++++++
 drivers/dma/idxd/vdev.h |    1 +
 4 files changed, 49 insertions(+)

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index f67c0036f968..07e1e3fcd4aa 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -359,4 +359,16 @@ void idxd_wq_del_cdev(struct idxd_wq *wq);
 int idxd_mdev_host_init(struct idxd_device *idxd);
 void idxd_mdev_host_release(struct idxd_device *idxd);
 
+#ifdef CONFIG_INTEL_IDXD_MDEV
+void idxd_vidxd_send_errors(struct idxd_device *idxd);
+void idxd_wq_vidxd_send_errors(struct idxd_wq *wq);
+#else
+static inline void idxd_vidxd_send_errors(struct idxd_device *idxd)
+{
+}
+static inline void idxd_wq_vidxd_send_errors(struct idxd_wq *wq)
+{
+}
+#endif /* CONFIG_INTEL_IDXD_MDEV */
+
 #endif
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 04f00255dae0..f9df0910b799 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -79,6 +79,8 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 
 			if (wq->type == IDXD_WQT_USER)
 				wake_up_interruptible(&wq->idxd_cdev.err_queue);
+			else if (wq->type == IDXD_WQT_MDEV)
+				idxd_wq_vidxd_send_errors(wq);
 		} else {
 			int i;
 
@@ -87,6 +89,8 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 
 				if (wq->type == IDXD_WQT_USER)
 					wake_up_interruptible(&wq->idxd_cdev.err_queue);
+				else if (wq->type == IDXD_WQT_MDEV)
+					idxd_wq_vidxd_send_errors(wq);
 			}
 		}
 
diff --git a/drivers/dma/idxd/vdev.c b/drivers/dma/idxd/vdev.c
index fa7c27d746cc..ce2f19b9b860 100644
--- a/drivers/dma/idxd/vdev.c
+++ b/drivers/dma/idxd/vdev.c
@@ -978,3 +978,35 @@ void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val)
 		break;
 	}
 }
+
+static void vidxd_send_errors(struct vdcm_idxd *vidxd)
+{
+	struct idxd_device *idxd = vidxd->idxd;
+	u8 *bar0 = vidxd->bar0;
+	union sw_err_reg *swerr = (union sw_err_reg *)(bar0 + IDXD_SWERR_OFFSET);
+	union genctrl_reg *genctrl = (union genctrl_reg *)(bar0 + IDXD_GENCTRL_OFFSET);
+	int i;
+
+	if (swerr->valid) {
+		if (!swerr->overflow)
+			swerr->overflow = 1;
+		return;
+	}
+
+	lockdep_assert_held(&idxd->dev_lock);
+	for (i = 0; i < 4; i++) {
+		swerr->bits[i] = idxd->sw_err.bits[i];
+		swerr++;
+	}
+
+	if (genctrl->softerr_int_en)
+		vidxd_send_interrupt(vidxd, 0);
+}
+
+void idxd_wq_vidxd_send_errors(struct idxd_wq *wq)
+{
+	struct vdcm_idxd *vidxd;
+
+	list_for_each_entry(vidxd, &wq->vdcm_list, list)
+		vidxd_send_errors(vidxd);
+}
diff --git a/drivers/dma/idxd/vdev.h b/drivers/dma/idxd/vdev.h
index e04c92c432d8..1bfdcdeed8e7 100644
--- a/drivers/dma/idxd/vdev.h
+++ b/drivers/dma/idxd/vdev.h
@@ -25,5 +25,6 @@ int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx);
 void vidxd_free_ims_entries(struct vdcm_idxd *vidxd);
 int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd);
 void vidxd_do_command(struct vdcm_idxd *vidxd, u32 val);
+void idxd_wq_vidxd_send_errors(struct idxd_wq *wq);
 
 #endif


^ permalink raw reply	[flat|nested] 50+ messages in thread

* [PATCH v3 16/18] dmaengine: idxd: add ABI documentation for mediated device support
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (8 preceding siblings ...)
  2020-09-15 23:29 ` [PATCH v3 15/18] dmaengine: idxd: add error notification from host driver to mediated device Dave Jiang
@ 2020-09-15 23:29 ` Dave Jiang
  2020-09-17 15:06 ` [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver Jason Gunthorpe
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-15 23:29 UTC (permalink / raw)
  To: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

From: Jing Lin <jing.lin@intel.com>

Add the sysfs attribute bits in ABI/stable for mediated device and guest
support.

Signed-off-by: Jing Lin <jing.lin@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 Documentation/ABI/stable/sysfs-driver-dma-idxd |   11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
index b44183880935..6bb925b55027 100644
--- a/Documentation/ABI/stable/sysfs-driver-dma-idxd
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -77,6 +77,12 @@ Contact:        dmaengine@vger.kernel.org
 Description:    The operation capability bit mask specify the operation types
 		supported by the this device.
 
+What:           /sys/bus/dsa/devices/dsa<m>/ims_size
+Date:           Sep 8, 2020
+KernelVersion:  5.10.0
+Contact:        dmaengine@vger.kernel.org
+Description:	Number of entries in the interrupt message storage table.
+
 What:           /sys/bus/dsa/devices/dsa<m>/state
 Date:           Oct 25, 2019
 KernelVersion:  5.6.0
@@ -139,8 +145,9 @@ Date:           Oct 25, 2019
 KernelVersion:  5.6.0
 Contact:        dmaengine@vger.kernel.org
 Description:    The type of this work queue, it can be "kernel" type for work
-		queue usages in the kernel space or "user" type for work queue
-		usages by applications in user space.
+		queue usages in the kernel space, "user" type for work queue
+		usages by applications in user space, or "mdev" type for
+		VFIO mediated devices.
 
 What:           /sys/bus/dsa/devices/wq<m>.<n>/cdev_minor
 Date:           Oct 25, 2019


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
       [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
                   ` (9 preceding siblings ...)
  2020-09-15 23:29 ` [PATCH v3 16/18] dmaengine: idxd: add ABI documentation for mediated device support Dave Jiang
@ 2020-09-17 15:06 ` Jason Gunthorpe
  2020-09-17 17:15   ` Dave Jiang
       [not found] ` <160021248280.67751.12525558281536923518.stgit@djiang5-desk3.ch.intel.com>
       [not found] ` <160021253189.67751.12686144284999931703.stgit@djiang5-desk3.ch.intel.com>
  12 siblings, 1 reply; 50+ messages in thread
From: Jason Gunthorpe @ 2020-09-17 15:06 UTC (permalink / raw)
  To: Dave Jiang
  Cc: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Tue, Sep 15, 2020 at 04:27:35PM -0700, Dave Jiang wrote:
>  drivers/dma/idxd/idxd.h                            |   65 +
>  drivers/dma/idxd/init.c                            |  100 ++
>  drivers/dma/idxd/irq.c                             |    6 
>  drivers/dma/idxd/mdev.c                            | 1089 ++++++++++++++++++++
>  drivers/dma/idxd/mdev.h                            |  118 ++

It is common that drivers of a subsystem will be under that
subsystem's directory tree. This allows the subsystem community to
manage patches related to their subsystem and its drivers.

Should the mdev parts be moved there?

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
  2020-09-17 15:06 ` [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver Jason Gunthorpe
@ 2020-09-17 17:15   ` Dave Jiang
  2020-09-17 17:27     ` Jason Gunthorpe
  2020-09-17 17:30     ` Alex Williamson
  0 siblings, 2 replies; 50+ messages in thread
From: Dave Jiang @ 2020-09-17 17:15 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm



On 9/17/2020 8:06 AM, Jason Gunthorpe wrote:
> On Tue, Sep 15, 2020 at 04:27:35PM -0700, Dave Jiang wrote:
>>   drivers/dma/idxd/idxd.h                            |   65 +
>>   drivers/dma/idxd/init.c                            |  100 ++
>>   drivers/dma/idxd/irq.c                             |    6
>>   drivers/dma/idxd/mdev.c                            | 1089 ++++++++++++++++++++
>>   drivers/dma/idxd/mdev.h                            |  118 ++
> 
> It is common that drivers of a subsystem will be under that
> subsystem's directory tree. This allows the subsystem community to
> manage pages related to their subsystem and it's drivers.
> 
> Should the mdev parts be moved there?

I personally don't have a preference. I'll defer to Alex or Kirti to provide 
that guidance. It may make certain things like dealing with dma fault regions 
and etc easier using vfio calls from vfio_pci_private.h later on for vSVM 
support. It also may be the better code review and maintenance domain and 
alleviate Vinod having to deal with that portion since it's not dmaengine domain.

> 
> Jason
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
  2020-09-17 17:15   ` Dave Jiang
@ 2020-09-17 17:27     ` Jason Gunthorpe
  2020-09-17 17:30     ` Alex Williamson
  1 sibling, 0 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2020-09-17 17:27 UTC (permalink / raw)
  To: Dave Jiang
  Cc: vkoul, megha.dey, maz, bhelgaas, tglx, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Thu, Sep 17, 2020 at 10:15:24AM -0700, Dave Jiang wrote:
> 
> 
> On 9/17/2020 8:06 AM, Jason Gunthorpe wrote:
> > On Tue, Sep 15, 2020 at 04:27:35PM -0700, Dave Jiang wrote:
> > >   drivers/dma/idxd/idxd.h                            |   65 +
> > >   drivers/dma/idxd/init.c                            |  100 ++
> > >   drivers/dma/idxd/irq.c                             |    6
> > >   drivers/dma/idxd/mdev.c                            | 1089 ++++++++++++++++++++
> > >   drivers/dma/idxd/mdev.h                            |  118 ++
> > 
> > It is common that drivers of a subsystem will be under that
> > subsystem's directory tree. This allows the subsystem community to
> > manage pages related to their subsystem and it's drivers.
> > 
> > Should the mdev parts be moved there?
> 
> I personally don't have a preference. I'll defer to Alex or Kirti to provide
> that guidance. It may make certains things like dealing with dma fault
> regions and etc easier using vfio calls from vfio_pci_private.h later on for
> vSVM support. It also may be the better code review and maintenance domain
> and alleviate Vinod having to deal with that portion since it's not
> dmaengine domain.

That is the general reason, yes. Asking the dmaengine maintainer to
review mdev just means it won't be reviewed properly.

This mistake has been made before and I view it as a lesson from the
ARM SOC disaggregation.

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
  2020-09-17 17:15   ` Dave Jiang
  2020-09-17 17:27     ` Jason Gunthorpe
@ 2020-09-17 17:30     ` Alex Williamson
  2020-09-17 17:37       ` Jason Gunthorpe
  1 sibling, 1 reply; 50+ messages in thread
From: Alex Williamson @ 2020-09-17 17:30 UTC (permalink / raw)
  To: Dave Jiang
  Cc: Jason Gunthorpe, vkoul, megha.dey, maz, bhelgaas, tglx,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Thu, 17 Sep 2020 10:15:24 -0700
Dave Jiang <dave.jiang@intel.com> wrote:

> On 9/17/2020 8:06 AM, Jason Gunthorpe wrote:
> > On Tue, Sep 15, 2020 at 04:27:35PM -0700, Dave Jiang wrote:  
> >>   drivers/dma/idxd/idxd.h                            |   65 +
> >>   drivers/dma/idxd/init.c                            |  100 ++
> >>   drivers/dma/idxd/irq.c                             |    6
> >>   drivers/dma/idxd/mdev.c                            | 1089 ++++++++++++++++++++
> >>   drivers/dma/idxd/mdev.h                            |  118 ++  
> > 
> > It is common that drivers of a subsystem will be under that
> > subsystem's directory tree. This allows the subsystem community to
> > manage pages related to their subsystem and it's drivers.
> > 
> > Should the mdev parts be moved there?  
> 
> I personally don't have a preference. I'll defer to Alex or Kirti to provide 
> that guidance. It may make certains things like dealing with dma fault regions 
> and etc easier using vfio calls from vfio_pci_private.h later on for vSVM 
> support. It also may be the better code review and maintenance domain and 
> alleviate Vinod having to deal with that portion since it's not dmaengine domain.

TBH, I'd expect an mdev driver to be co-located with the remainder of
its parent driver.  It's hopefully sharing more code with that driver
than anything in mdev of vfio (either of which should be exported for
the vendor driver).  mdev support is largely expected to be a feature of
a driver, much like it is for i915, not necessarily its sole purpose for
existing (see the rejected fpga mdev driver).  Also being nearby
vfio_pci_private shouldn't invite anyone other than vfio_pci to make
use of that header.  Thanks,

Alex


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
  2020-09-17 17:30     ` Alex Williamson
@ 2020-09-17 17:37       ` Jason Gunthorpe
  0 siblings, 0 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2020-09-17 17:37 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, tglx, jacob.jun.pan,
	ashok.raj, yi.l.liu, baolu.lu, kevin.tian, sanjay.k.kumar,
	tony.luck, jing.lin, dan.j.williams, kwankhede, eric.auger,
	parav, rafael, netanelg, shahafs, yan.y.zhao, pbonzini,
	samuel.ortiz, mona.hossain, dmaengine, linux-kernel, x86,
	linux-pci, kvm

On Thu, Sep 17, 2020 at 11:30:16AM -0600, Alex Williamson wrote:
> On Thu, 17 Sep 2020 10:15:24 -0700
> Dave Jiang <dave.jiang@intel.com> wrote:
> 
> > On 9/17/2020 8:06 AM, Jason Gunthorpe wrote:
> > > On Tue, Sep 15, 2020 at 04:27:35PM -0700, Dave Jiang wrote:  
> > >>   drivers/dma/idxd/idxd.h                            |   65 +
> > >>   drivers/dma/idxd/init.c                            |  100 ++
> > >>   drivers/dma/idxd/irq.c                             |    6
> > >>   drivers/dma/idxd/mdev.c                            | 1089 ++++++++++++++++++++
> > >>   drivers/dma/idxd/mdev.h                            |  118 ++  
> > > 
> > > It is common that drivers of a subsystem will be under that
> > > subsystem's directory tree. This allows the subsystem community to
> > > manage pages related to their subsystem and it's drivers.
> > > 
> > > Should the mdev parts be moved there?  
> > 
> > I personally don't have a preference. I'll defer to Alex or Kirti to provide 
> > that guidance. It may make certains things like dealing with dma fault regions 
> > and etc easier using vfio calls from vfio_pci_private.h later on for vSVM 
> > support. It also may be the better code review and maintenance domain and 
> > alleviate Vinod having to deal with that portion since it's not dmaengine domain.
> 
> TBH, I'd expect an mdev driver to be co-located with the remainder of
> its parent driver. 

Multifunction drivers are always split up according to the subsystem
their functions are part of.

See the recent lost argument about the Habanalab NIC driver not being
under net/ even though the rest of the driver is in misc/

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver
  2020-09-15 23:27 ` [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver Dave Jiang
@ 2020-09-30 18:23   ` Thomas Gleixner
  2020-10-01 22:59     ` Dey, Megha
  0 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 18:23 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> +config IMS_MSI_ARRAY
> +	bool "IMS Interrupt Message Storm MSI controller for device memory storage arrays"

Hehe, you missed a Message Storm :)

> +	depends on PCI
> +	select IMS_MSI
> +	select GENERIC_MSI_IRQ_DOMAIN
> +	help
> +	  Support for IMS Interrupt Message Storm MSI controller

and another one.

> +	  with IMS slot storage in a slot array in device memory
> +
> +static void ims_array_mask_irq(struct irq_data *data)
> +{
> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
> +	u32 __iomem *ctrl = &slot->ctrl;
> +
> +	iowrite32(ioread32(ctrl) | IMS_VECTOR_CTRL_MASK, ctrl);
> +	ioread32(ctrl); /* Flush write to device */

Bah, I fundamentaly hate tail comments. They are a distraction and
disturb the reading flow. Put it above the ioread32() please.

> +static void ims_array_unmask_irq(struct irq_data *data)
> +{
> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
> +	u32 __iomem *ctrl = &slot->ctrl;
> +
> +	iowrite32(ioread32(ctrl) & ~IMS_VECTOR_CTRL_MASK, ctrl);

Why is this one not flushed?

> +}
> +
> +static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
> +{
> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
> +
> +	iowrite32(msg->address_lo, &slot->address_lo);
> +	iowrite32(msg->address_hi, &slot->address_hi);
> +	iowrite32(msg->data, &slot->data);
> +	ioread32(slot);

Yuck? slot points to the struct and just because ioread32() accepts a
void pointer does not make it any more readable.

> +static void ims_array_reset_slot(struct ims_slot __iomem *slot)
> +{
> +	iowrite32(0, &slot->address_lo);
> +	iowrite32(0, &slot->address_hi);
> +	iowrite32(0, &slot->data);
> +	iowrite32(0, &slot->ctrl);

Flush?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
  2020-09-15 23:27 ` [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support Dave Jiang
@ 2020-09-30 18:32   ` Thomas Gleixner
  2020-10-01 23:26     ` Dey, Megha
  2020-10-08  7:54     ` David Woodhouse
  0 siblings, 2 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 18:32 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
> @@ -1303,9 +1303,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>  	case X86_IRQ_ALLOC_TYPE_HPET:
>  	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
>  	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
> +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
>  		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
>  			set_hpet_sid(irte, info->devid);
> -		else
> +		else if (info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
>  			set_msi_sid(irte,
>  			msi_desc_to_pci_dev(info->desc));

Gah. this starts to become unreadable.

diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 8f4ce72570ce..0c1ea8ceec31 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1271,6 +1271,16 @@ static struct irq_chip intel_ir_chip = {
 	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
 };
 
+static void irte_prepare_msg(struct msi_msg *msg, int index, int sub_handle)
+{
+	msg->address_hi = MSI_ADDR_BASE_HI;
+	msg->data = sub_handle;
+	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+			  MSI_ADDR_IR_SHV |
+			  MSI_ADDR_IR_INDEX1(index) |
+			  MSI_ADDR_IR_INDEX2(index);
+}
+
 static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
 					     struct irq_cfg *irq_cfg,
 					     struct irq_alloc_info *info,
@@ -1312,19 +1322,18 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
 		break;
 
 	case X86_IRQ_ALLOC_TYPE_HPET:
+		set_hpet_sid(irte, info->hpet_id);
+		irte_prepare_msg(msg, index, sub_handle);
+		break;
+
 	case X86_IRQ_ALLOC_TYPE_MSI:
 	case X86_IRQ_ALLOC_TYPE_MSIX:
-		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
-			set_hpet_sid(irte, info->hpet_id);
-		else
-			set_msi_sid(irte, info->msi_dev);
-
-		msg->address_hi = MSI_ADDR_BASE_HI;
-		msg->data = sub_handle;
-		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
-				  MSI_ADDR_IR_SHV |
-				  MSI_ADDR_IR_INDEX1(index) |
-				  MSI_ADDR_IR_INDEX2(index);
+		set_msi_sid(irte, info->msi_dev);
+		irte_prepare_msg(msg, index, sub_handle);
+		break;
+
+	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
+		irte_prepare_msg(msg, index, sub_handle);
 		break;
 
 	default:

Hmm?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 04/18] dmaengine: idxd: add interrupt handle request support
       [not found] ` <160021248280.67751.12525558281536923518.stgit@djiang5-desk3.ch.intel.com>
@ 2020-09-30 18:36   ` Thomas Gleixner
  2020-10-01 20:16     ` Dave Jiang
  0 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 18:36 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>  
> +#define INT_HANDLE_IMS_TABLE	0x10000
> +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx,
> +				   int *handle, enum idxd_interrupt_type irq_type)

New lines exist for a reason and this glued together define and function
definition is unreadable garbage.

Also is that magic bit a software flag or defined by hardware? If the
latter then you want to move it to the other hardware defines.

Thanks,

        tglx
 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-15 23:28 ` [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver Dave Jiang
@ 2020-09-30 18:47   ` Thomas Gleixner
  2020-09-30 18:51     ` Jason Gunthorpe
  2020-10-01 20:48     ` Dave Jiang
  0 siblings, 2 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 18:47 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>  struct idxd_device {
> @@ -170,6 +171,7 @@ struct idxd_device {
>  
>  	int num_groups;
>  
> +	u32 ims_offset;
>  	u32 msix_perm_offset;
>  	u32 wqcfg_offset;
>  	u32 grpcfg_offset;
> @@ -177,6 +179,7 @@ struct idxd_device {
>  
>  	u64 max_xfer_bytes;
>  	u32 max_batch_size;
> +	int ims_size;
>  	int max_groups;
>  	int max_engines;
>  	int max_tokens;
> @@ -196,6 +199,7 @@ struct idxd_device {
>  	struct work_struct work;
>  
>  	int *int_handles;
> +	struct sbitmap ims_sbmap;

This bitmap is needed for what?

> --- a/drivers/dma/idxd/init.c
> +++ b/drivers/dma/idxd/init.c
> @@ -231,10 +231,51 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
>  	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
>  	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
>  		idxd->msix_perm_offset);
> +	idxd->ims_offset = offsets.ims * 0x100;

Magic constant pulled out of thin air. #define ....

> +	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
>  	idxd->perfmon_offset = offsets.perfmon * 0x100;
>  	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
>  }
>  
> +#define PCI_DEVSEC_CAP		0x23
> +#define SIOVDVSEC1(offset)	((offset) + 0x4)
> +#define SIOVDVSEC2(offset)	((offset) + 0x8)
> +#define DVSECID			0x5
> +#define SIOVCAP(offset)		((offset) + 0x14)
> +
> +static void idxd_check_siov(struct idxd_device *idxd)
> +{
> +	struct pci_dev *pdev = idxd->pdev;
> +	struct device *dev = &pdev->dev;
> +	int dvsec;
> +	u16 val16;
> +	u32 val32;
> +
> +	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
> +	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
> +	if (val16 != PCI_VENDOR_ID_INTEL) {
> +		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
> +		return;
> +	}
> +
> +	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
> +	if (val16 != DVSECID) {
> +		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
> +		return;
> +	}
> +
> +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> +		return;
> +	}
> +
> +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");

It's really hard to find the code inside all of this dev_dbg()
noise. But why is this capability check done in this driver? Is this
capability stuff really IDXD specific or is the next device which
supports this going to copy and pasta the above?

>  static void idxd_read_caps(struct idxd_device *idxd)
>  {
>  	struct device *dev = &idxd->pdev->dev;
> @@ -253,6 +294,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
>  	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
>  	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
>  	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
> +	idxd_check_siov(idxd);
>  	if (idxd->hw.gen_cap.config_en)
>  		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
>  
> @@ -347,9 +389,19 @@ static int idxd_probe(struct idxd_device *idxd)
>  
>  	idxd->major = idxd_cdev_get_major(idxd);
>  
> +	if (idxd->ims_size) {
> +		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
> +				       GFP_KERNEL, dev_to_node(dev));
> +		if (rc < 0)
> +			goto sbitmap_fail;
> +	}

Ah, here the bitmap is allocated, but it's still completely unclear what
it is used for.

The subject line is misleading as hell. This does not add support, it's
doing some magic capability checks and allocates stuff which nobody
knows what it is used for.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 18:47   ` Thomas Gleixner
@ 2020-09-30 18:51     ` Jason Gunthorpe
  2020-09-30 21:48       ` Thomas Gleixner
  2020-09-30 21:49       ` Raj, Ashok
  2020-10-01 20:48     ` Dave Jiang
  1 sibling, 2 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2020-09-30 18:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:

> > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> > +		return;
> > +	}
> > +
> > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> 
> It's really hard to find the code inside all of this dev_dbg()
> noise. But why is this capability check done in this driver? Is this
> capability stuff really IDXD specific or is the next device which
> supports this going to copy and pasta the above?

It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
SIOV cookbook, but as far as I can see it serves no purpose at
all.

Last time I asked I got some unclear mumbling about "OEMs".

I expect you'll see all Intel drivers copying this code.

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
       [not found] ` <160021253189.67751.12686144284999931703.stgit@djiang5-desk3.ch.intel.com>
@ 2020-09-30 19:57   ` Thomas Gleixner
  2020-10-07 21:54     ` Dave Jiang
  0 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 19:57 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, jgg, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
> index a39392157dc2..115a8f49aab3 100644
> --- a/drivers/dma/Kconfig
> +++ b/drivers/dma/Kconfig
> @@ -301,6 +301,7 @@ config INTEL_IDXD_MDEV
>  	depends on INTEL_IDXD
>  	depends on VFIO_MDEV
>  	depends on VFIO_MDEV_DEVICE
> +	depends on IMS_MSI_ARRAY

select?

> int idxd_mdev_host_init(struct idxd_device *idxd)
> {
>	struct device *dev = &idxd->pdev->dev;

> +	ims_info.max_slots = idxd->ims_size;
> +	ims_info.slots = idxd->reg_base + idxd->ims_offset;
> +	dev->msi_domain =
> pci_ims_array_create_msi_irq_domain(idxd->pdev, &ims_info);

1) creating the domain can fail and checking the return code is overrated

2) dev_set_msi_domain() exists for a reason. If we change any of this in
   struct device then we can chase all the open coded mess in drivers
   like this.

Also can you please explain how this is supposed to work?

idxd->pdev is the host PCI device. So why are you overwriting the MSI
domain of the underlying host device? This works by chance because you
allocate the regular MSIX interrupts for the host device _before_
invoking this.

IIRC, I provided you ASCII art to show how all of this is supposed to be
structured...

>  int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
>  {
>  	int rc = -1;
> @@ -44,15 +46,63 @@ int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
>  	return rc;
>  }
>  
> +#define IMS_PASID_ENABLE	0x8
>  int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)

Yet more unreadable glue. The coding style of this stuff is horrible.

>  {
> -	/* PLACEHOLDER */
> +	struct mdev_device *mdev = vidxd->vdev.mdev;
> +	struct device *dev = mdev_dev(mdev);
> +	unsigned int ims_offset;
> +	struct idxd_device *idxd = vidxd->idxd;
> +	u32 val;
> +
> +	/*
> +	 * Current implementation limits to 1 WQ for the vdev and therefore
> +	 * also only 1 IMS interrupt for that vdev.
> +	 */
> +	if (ims_idx >= VIDXD_MAX_WQS) {
> +		dev_warn(dev, "ims_idx greater than vidxd allowed: %d\n", ims_idx);

This warning text makes no sense whatsoever.

> +		return -EINVAL;
> +	}
> +
> +	ims_offset = idxd->ims_offset + vidxd->ims_index[ims_idx] * 0x10;
> +	val = ioread32(idxd->reg_base + ims_offset + 12);
> +	val &= ~IMS_PASID_ENABLE;
> +	iowrite32(val, idxd->reg_base + ims_offset + 12);

*0x10 + 12 !?!? 

Reusing struct ims_slot from the irq chip driver would not be convoluted
enough, right?

Aside of that this is fiddling in the IMS storage array behind the irq
chips back without any comment here and a big fat comment about the
shared usage of ims_slot::ctrl in the irq chip driver.

This is kernel programming, not the obfuscated C code contest.

> +	/* Setup the PASID filtering */
> +	pasid = idxd_get_mdev_pasid(mdev);
> +
> +	if (pasid >= 0) {
> +		ims_offset = idxd->ims_offset + vidxd->ims_index[ims_idx] * 0x10;
> +		val = ioread32(idxd->reg_base + ims_offset + 12);
> +		val |= IMS_PASID_ENABLE | (pasid << 12) | (val & 0x7);
> +		iowrite32(val, idxd->reg_base + ims_offset + 12);

More magic numbers and more fiddling in the IMS slot.

> +	} else {
> +		dev_warn(dev, "pasid setup failed for ims entry %lld\n", vidxd->ims_index[ims_idx]);
> +		return -ENXIO;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -839,12 +889,43 @@ static void vidxd_wq_disable(struct vdcm_idxd *vidxd, int wq_id_mask)
>  
>  void vidxd_free_ims_entries(struct vdcm_idxd *vidxd)
>  {
> -	/* PLACEHOLDER */
> +	struct irq_domain *irq_domain;
> +	struct mdev_device *mdev = vidxd->vdev.mdev;
> +	struct device *dev = mdev_dev(mdev);
> +	int i;
> +
> +	for (i = 0; i < VIDXD_MAX_MSIX_VECS - 1; i++)
> +		vidxd->ims_index[i] = -1;
> +
> +	irq_domain = vidxd->idxd->pdev->dev.msi_domain;

See above.

> +	msi_domain_free_irqs(irq_domain, dev);

>  int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd)
>  {
> -	/* PLACEHOLDER */
> +	struct irq_domain *irq_domain;
> +	struct idxd_device *idxd = vidxd->idxd;
> +	struct mdev_device *mdev = vidxd->vdev.mdev;
> +	struct device *dev = mdev_dev(mdev);
> +	int vecs = VIDXD_MAX_MSIX_VECS - 1;
> +	struct msi_desc *entry;
> +	struct ims_irq_entry *irq_entry;
> +	int rc, i = 0;
> +
> +	irq_domain = idxd->pdev->dev.msi_domain;

Ditto.

> +	rc = msi_domain_alloc_irqs(irq_domain, dev, vecs);
> +	if (rc < 0)
> +		return rc;
> +
> +	for_each_msi_entry(entry, dev) {
> +		irq_entry = &vidxd->irq_entries[i];
> +		irq_entry->vidxd = vidxd;
> +		irq_entry->int_src = i;

Redundant information because it's the index in the array. What for?

> +		irq_entry->irq = entry->irq;
> +		vidxd->ims_index[i] = entry->device_msi.hwirq;

The point of having two arrays to store related information is?

It's at least orders of magnitudes better than the previous trainwreck,
but oh well...

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 18:51     ` Jason Gunthorpe
@ 2020-09-30 21:48       ` Thomas Gleixner
  2020-09-30 21:49       ` Raj, Ashok
  1 sibling, 0 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 21:48 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Wed, Sep 30 2020 at 15:51, Jason Gunthorpe wrote:

> On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:
>
>> > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
>> > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
>> > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
>> > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
>> > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
>> > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
>> > +		return;
>> > +	}
>> > +
>> > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
>> 
>> It's really hard to find the code inside all of this dev_dbg()
>> noise. But why is this capability check done in this driver? Is this
>> capability stuff really IDXD specific or is the next device which
>> supports this going to copy and pasta the above?
>
> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> SIOV cookbook, but as far as I can see it serves no purpose at
> all.

Why am I not surprised?

> Last time I asked I got some unclear mumbling about "OEMs".

See above.

But it reads the IMS storage array size out of this capability, so it
looks like it has some value.

> I expect you'll see all Intel drivers copying this code.

Just to set the expectations straight:

   1) Has this capability stuff any value aside of being mentioned in
      the SIOV cookbook?

   2) If it has no value, then just remove the mess

   3) If it has value then this wants to go to the PCI core and fill in
      some SIOV specific data structure when PCI evaluates the
      capabilities. Or at least have a generic function which can be
      called by the magic SIOV capable drivers.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 18:51     ` Jason Gunthorpe
  2020-09-30 21:48       ` Thomas Gleixner
@ 2020-09-30 21:49       ` Raj, Ashok
  2020-09-30 21:57         ` Thomas Gleixner
  2020-09-30 22:38         ` Jason Gunthorpe
  1 sibling, 2 replies; 50+ messages in thread
From: Raj, Ashok @ 2020-09-30 21:49 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

Hi Jason

On Wed, Sep 30, 2020 at 03:51:03PM -0300, Jason Gunthorpe wrote:
> On Wed, Sep 30, 2020 at 08:47:00PM +0200, Thomas Gleixner wrote:
> 
> > > +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
> > > +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
> > > +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
> > > +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
> > > +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
> > > +		dev_dbg(&pdev->dev, "IMS supported for device\n");
> > > +		return;
> > > +	}
> > > +
> > > +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> > 
> > It's really hard to find the code inside all of this dev_dbg()
> > noise. But why is this capability check done in this driver? Is this
> > capability stuff really IDXD specific or is the next device which
> > supports this going to copy and pasta the above?
> 
> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> SIOV cookbook, but as far as I can see it serves no purpose at
> all.
> 
> Last time I asked I got some unclear mumbling about "OEMs".
> 
One of the parameters it has is the "supported system page-sizes" which is
usually there in the SRIOV properties. So it needed a placeholder for
that. 

IMS is a device-specific capability, and I almost forgot why we needed it
until I had to check internally. Remember when a device is given to a
guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
and such.

When we provision an entire PCI device that is IMS capable. The guest
driver does know it can update the IMS entries directly without going to
the host. But in order to do remapping we need something like how we manage
PASID allocation from guest, so an IRTE entry can be allocated and the host
driver can write the proper values for IMS.

Hope this helps clear things up.

Cheers,
Ashok


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 21:49       ` Raj, Ashok
@ 2020-09-30 21:57         ` Thomas Gleixner
  2020-10-01  1:07           ` Raj, Ashok
  2020-09-30 22:38         ` Jason Gunthorpe
  1 sibling, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-09-30 21:57 UTC (permalink / raw)
  To: Raj, Ashok, Jason Gunthorpe
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian, sanjay.k.kumar,
	tony.luck, jing.lin, dan.j.williams, kwankhede, eric.auger,
	parav, rafael, netanelg, shahafs, yan.y.zhao, pbonzini,
	samuel.ortiz, mona.hossain, dmaengine, linux-kernel, x86,
	linux-pci, kvm, Ashok Raj

On Wed, Sep 30 2020 at 14:49, Ashok Raj wrote:
>> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
>> SIOV cookbook, but as far as I can see it serves no purpose at
>> all.
>> 
>> Last time I asked I got some unclear mumbling about "OEMs".
>> 
> One of the parameters it has is the "supported system page-sizes" which is
> usually there in the SRIOV properties. So it needed a place holder for
> that. 
>
> IMS is a device specific capability, and I almost forgot why we needed
> until I had to checking internally. Remember when a device is given to a
> guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
> and such.

-ENOPARSE

> When we provision an entire PCI device that is IMS capable. The guest
> driver does know it can update the IMS entries directly without going to
> the host. But in order to do remapping we need something like how we manage
> PASID allocation from guest, so an IRTE entry can be allocated and the host
> driver can write the proper values for IMS.

And how is that related to that capability thing?

Also this stuff is host side and not guest side. I seriously doubt that
you want to hand in the whole PCI device which contains the IMS
thing. The whole point of IMS was as far as I was told that you can
create gazillions of subdevices and have separate MSI interrupts for
each of them.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 21:49       ` Raj, Ashok
  2020-09-30 21:57         ` Thomas Gleixner
@ 2020-09-30 22:38         ` Jason Gunthorpe
  1 sibling, 0 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2020-09-30 22:38 UTC (permalink / raw)
  To: Raj, Ashok
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Wed, Sep 30, 2020 at 02:49:41PM -0700, Raj, Ashok wrote:

> One of the parameters it has is the "supported system page-sizes" which is
> usually there in the SRIOV properties. So it needed a place holder for
> that. 

No idea why this would be a PCI cap. It is certainly not something so
universal it needs standardizing. There are many ways a device can
manage a BAR to match a required protection granularity.

> When we provision an entire PCI device that is IMS capable. The guest
> driver does know it can update the IMS entries directly without going to
> the host. But in order to do remapping we need something like how we manage
> PASID allocation from guest, so an IRTE entry can be allocated and the host
> driver can write the proper values for IMS.

Look at the architecture we ended up with.

You need to make pci_subdevice_msi_create_irq_domain() fail if the
platform can't provide the functionality.

Working around that with PCI caps is pretty gross.

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 21:57         ` Thomas Gleixner
@ 2020-10-01  1:07           ` Raj, Ashok
  2020-10-01  8:44             ` Thomas Gleixner
  0 siblings, 1 reply; 50+ messages in thread
From: Raj, Ashok @ 2020-10-01  1:07 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jason Gunthorpe, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

Hi Thomas,

On Wed, Sep 30, 2020 at 11:57:22PM +0200, Thomas Gleixner wrote:
> On Wed, Sep 30 2020 at 14:49, Ashok Raj wrote:
> >> It is the weirdest thing, IMHO. Intel defined a dvsec cap in their
> >> SIOV cookbook, but as far as I can see it serves no purpose at
> >> all.
> >> 
> >> Last time I asked I got some unclear mumbling about "OEMs".
> >> 
> > One of the parameters it has is the "supported system page-sizes" which is
> > usually there in the SRIOV properties. So it needed a place holder for
> > that. 
> >
> > IMS is a device specific capability, and I almost forgot why we needed
> > until I had to checking internally. Remember when a device is given to a
> > guest, MSIX routing via Interrupt Remapping is automatic via the VFIO/IRQFD
> > and such.
> 
> -ENOPARSE

Let me try again to see if this will turn into ESUCCESS :-)

Devices exposed to a guest need host OS support for programming interrupt
entries in the IOMMU interrupt remapping table. VFIO provides those 
services for standard interrupt schemes like MSI/MSI-X, for instance. 
Since IMS is device specific, VFIO can't provide an intercept when 
IMS entries are programmed by the guest OS. 

If the virtualisation software doesn't expose a vIOMMU with virtual
capabilities to allocate IRTE entries and support for vIRTE in the guest,
then it's expected to turn off the IMS capability. Hence the driver running 
in the guest will not enable IMS.

> 
> > When we provision an entire PCI device that is IMS capable. The guest
> > driver does know it can update the IMS entries directly without going to
> > the host. But in order to do remapping we need something like how we manage
> > PASID allocation from guest, so an IRTE entry can be allocated and the host
> > driver can write the proper values for IMS.
> 
> And how is that related to that capbility thing?
> 
> Also this stuff is host side and not guest side. I seriously doubt that
> you want to hand in the whole PCI device which contains the IMS

You are right, but nothing prevents a user from simply taking a full PCI
device and assigning it to a guest. 

> thing. The whole point of IMS was as far as I was told that you can
> create gazillions of subdevices and have seperate MSI interrupts for
> each of them.


Cheers,
Ashok

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-10-01  1:07           ` Raj, Ashok
@ 2020-10-01  8:44             ` Thomas Gleixner
  0 siblings, 0 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-01  8:44 UTC (permalink / raw)
  To: Raj, Ashok
  Cc: Jason Gunthorpe, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

On Wed, Sep 30 2020 at 18:07, Ashok Raj wrote:
> On Wed, Sep 30, 2020 at 11:57:22PM +0200, Thomas Gleixner wrote:
>
> Devices exposed to guest need host OS support for programming interrupt
> entries in the IOMMU interrupt remapping table. VFIO provides those 
> services for standard interrupt schemes like MSI/MSIx for instance. 
> Since IMS is device specific VFIO can't provide an intercept when 
> IMS entries are programmed by the guest OS.

Why is IMS exposed to the guest in the first place? You expose a
subdevice to a guest, right? And that subdevice should not even know
that IMS exists simply because IMS is strictly host specific.

The obvious emulation here is to make the subdevice look like a PCI
device and expose emulated MSIX (not MSI) which is intercepted when
accessing the MSIX table and then redirected to the proper place along
with IRTE and PASID and whatever.

>> Also this stuff is host side and not guest side. I seriously doubt that
>> you want to hand in the whole PCI device which contains the IMS
>
> You are right, but nothing prevents a user from simply taking a full PCI
> device and assign to guest. 

You surely can and should prevent that because it makes no sense and
cannot work.

That's why you want a generic check for 'this device is magic SIOV or
whatever' and not something buried deep in a driver.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 04/18] dmaengine: idxd: add interrupt handle request support
  2020-09-30 18:36   ` [PATCH v3 04/18] dmaengine: idxd: add interrupt handle request support Thomas Gleixner
@ 2020-10-01 20:16     ` Dave Jiang
  0 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-10-01 20:16 UTC (permalink / raw)
  To: Thomas Gleixner, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm



On 9/30/2020 11:36 AM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>>   
>> +#define INT_HANDLE_IMS_TABLE	0x10000
>> +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx,
>> +				   int *handle, enum idxd_interrupt_type irq_type)
> 
> New lines exist for a reason and this glued together define and function
> definition is unreadable garbage.
> 
> Also is that magic bit a software flag or defined by hardware? If the
> latter then you want to move it to the other hardware defines.

Will move this to hardware register header.

> 
> Thanks,
> 
>          tglx
>   
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver
  2020-09-30 18:47   ` Thomas Gleixner
  2020-09-30 18:51     ` Jason Gunthorpe
@ 2020-10-01 20:48     ` Dave Jiang
  1 sibling, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-10-01 20:48 UTC (permalink / raw)
  To: Thomas Gleixner, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm



On 9/30/2020 11:47 AM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>>   struct idxd_device {
>> @@ -170,6 +171,7 @@ struct idxd_device {
>>   
>>   	int num_groups;
>>   
>> +	u32 ims_offset;
>>   	u32 msix_perm_offset;
>>   	u32 wqcfg_offset;
>>   	u32 grpcfg_offset;
>> @@ -177,6 +179,7 @@ struct idxd_device {
>>   
>>   	u64 max_xfer_bytes;
>>   	u32 max_batch_size;
>> +	int ims_size;
>>   	int max_groups;
>>   	int max_engines;
>>   	int max_tokens;
>> @@ -196,6 +199,7 @@ struct idxd_device {
>>   	struct work_struct work;
>>   
>>   	int *int_handles;
>> +	struct sbitmap ims_sbmap;
> 
> This bitmap is needed for what?

Nothing anymore. I forgot to remove. All this is handled by MSI core now with 
code from you.

> 
>> --- a/drivers/dma/idxd/init.c
>> +++ b/drivers/dma/idxd/init.c
>> @@ -231,10 +231,51 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
>>   	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
>>   	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
>>   		idxd->msix_perm_offset);
>> +	idxd->ims_offset = offsets.ims * 0x100;
> 
> Magic constant pulled out of thin air. #define ....

Will fix

> 
>> +	dev_dbg(dev, "IDXD IMS Offset: %#x\n", idxd->ims_offset);
>>   	idxd->perfmon_offset = offsets.perfmon * 0x100;
>>   	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
>>   }
>>   
>> +#define PCI_DEVSEC_CAP		0x23
>> +#define SIOVDVSEC1(offset)	((offset) + 0x4)
>> +#define SIOVDVSEC2(offset)	((offset) + 0x8)
>> +#define DVSECID			0x5
>> +#define SIOVCAP(offset)		((offset) + 0x14)
>> +
>> +static void idxd_check_siov(struct idxd_device *idxd)
>> +{
>> +	struct pci_dev *pdev = idxd->pdev;
>> +	struct device *dev = &pdev->dev;
>> +	int dvsec;
>> +	u16 val16;
>> +	u32 val32;
>> +
>> +	dvsec = pci_find_ext_capability(pdev, PCI_DEVSEC_CAP);
>> +	pci_read_config_word(pdev, SIOVDVSEC1(dvsec), &val16);
>> +	if (val16 != PCI_VENDOR_ID_INTEL) {
>> +		dev_dbg(&pdev->dev, "DVSEC vendor id is not Intel\n");
>> +		return;
>> +	}
>> +
>> +	pci_read_config_word(pdev, SIOVDVSEC2(dvsec), &val16);
>> +	if (val16 != DVSECID) {
>> +		dev_dbg(&pdev->dev, "DVSEC ID is not SIOV\n");
>> +		return;
>> +	}
>> +
>> +	pci_read_config_dword(pdev, SIOVCAP(dvsec), &val32);
>> +	if ((val32 & 0x1) && idxd->hw.gen_cap.max_ims_mult) {
>> +		idxd->ims_size = idxd->hw.gen_cap.max_ims_mult * 256ULL;
>> +		dev_dbg(dev, "IMS size: %u\n", idxd->ims_size);
>> +		set_bit(IDXD_FLAG_SIOV_SUPPORTED, &idxd->flags);
>> +		dev_dbg(&pdev->dev, "IMS supported for device\n");
>> +		return;
>> +	}
>> +
>> +	dev_dbg(&pdev->dev, "SIOV unsupported for device\n");
> 
> It's really hard to find the code inside all of this dev_dbg()
> noise. But why is this capability check done in this driver? Is this
> capability stuff really IDXD specific or is the next device which
> supports this going to copy and pasta the above?

Will look into move this into a common detection function for all similar 
devices. This should be common for all Intel devices that support SIOV.

> 
>>   static void idxd_read_caps(struct idxd_device *idxd)
>>   {
>>   	struct device *dev = &idxd->pdev->dev;
>> @@ -253,6 +294,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
>>   	dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes);
>>   	idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift;
>>   	dev_dbg(dev, "max batch size: %u\n", idxd->max_batch_size);
>> +	idxd_check_siov(idxd);
>>   	if (idxd->hw.gen_cap.config_en)
>>   		set_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags);
>>   
>> @@ -347,9 +389,19 @@ static int idxd_probe(struct idxd_device *idxd)
>>   
>>   	idxd->major = idxd_cdev_get_major(idxd);
>>   
>> +	if (idxd->ims_size) {
>> +		rc = sbitmap_init_node(&idxd->ims_sbmap, idxd->ims_size, -1,
>> +				       GFP_KERNEL, dev_to_node(dev));
>> +		if (rc < 0)
>> +			goto sbitmap_fail;
>> +	}
> 
> Ah, here the bitmap is allocated, but it's still completely unclear what
> it is used for.

Need to remove.

> 
> The subject line is misleading as hell. This does not add support, it's
> doing some magic capability checks and allocates stuff which nobody
> knows what it is used for.

With the unneeded code removal and moving the SIOV detection code to common 
implementation, it should be more clear.

> 
> Thanks,
> 
>          tglx
> 
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver
  2020-09-30 18:23   ` Thomas Gleixner
@ 2020-10-01 22:59     ` Dey, Megha
  0 siblings, 0 replies; 50+ messages in thread
From: Dey, Megha @ 2020-10-01 22:59 UTC (permalink / raw)
  To: Thomas Gleixner, Dave Jiang, vkoul, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Hi Thomas,

On 9/30/2020 11:23 AM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
>> From: Thomas Gleixner <tglx@linutronix.de>
>> +config IMS_MSI_ARRAY
>> +	bool "IMS Interrupt Message Storm MSI controller for device memory storage arrays"
> Hehe, you missed a Message Storm :)
i will change this in the next version
>> +	depends on PCI
>> +	select IMS_MSI
>> +	select GENERIC_MSI_IRQ_DOMAIN
>> +	help
>> +	  Support for IMS Interrupt Message Storm MSI controller
> and another one.
ok :)
>
>> +	  with IMS slot storage in a slot array in device memory
>> +
>> +static void ims_array_mask_irq(struct irq_data *data)
>> +{
>> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
>> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
>> +	u32 __iomem *ctrl = &slot->ctrl;
>> +
>> +	iowrite32(ioread32(ctrl) | IMS_VECTOR_CTRL_MASK, ctrl);
>> +	ioread32(ctrl); /* Flush write to device */
> Bah, I fundamentaly hate tail comments. They are a distraction and
> disturb the reading flow. Put it above the ioread32() please.
Will do.
>
>> +static void ims_array_unmask_irq(struct irq_data *data)
>> +{
>> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
>> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
>> +	u32 __iomem *ctrl = &slot->ctrl;
>> +
>> +	iowrite32(ioread32(ctrl) & ~IMS_VECTOR_CTRL_MASK, ctrl);
> Why is this one not flushed?
I will add it.
>
>> +}
>> +
>> +static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
>> +{
>> +	struct msi_desc *desc = irq_data_get_msi_desc(data);
>> +	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
>> +
>> +	iowrite32(msg->address_lo, &slot->address_lo);
>> +	iowrite32(msg->address_hi, &slot->address_hi);
>> +	iowrite32(msg->data, &slot->data);
>> +	ioread32(slot);
> Yuck? slot points to the struct and just because ioread32() accepts a
> void pointer does not make it any more readable.
hmm ok , will change this.
>
>> +static void ims_array_reset_slot(struct ims_slot __iomem *slot)
>> +{
>> +	iowrite32(0, &slot->address_lo);
>> +	iowrite32(0, &slot->address_hi);
>> +	iowrite32(0, &slot->data);
>> +	iowrite32(0, &slot->ctrl);
> Flush?

ok!

-Megha

>
> Thanks,
>
>          tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
  2020-09-30 18:32   ` Thomas Gleixner
@ 2020-10-01 23:26     ` Dey, Megha
  2020-10-02 11:13       ` Thomas Gleixner
  2020-10-08  7:54     ` David Woodhouse
  1 sibling, 1 reply; 50+ messages in thread
From: Dey, Megha @ 2020-10-01 23:26 UTC (permalink / raw)
  To: Thomas Gleixner, Dave Jiang, vkoul, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Hi Thomas,

On 9/30/2020 11:32 AM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
>> @@ -1303,9 +1303,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>>   	case X86_IRQ_ALLOC_TYPE_HPET:
>>   	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
>>   	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
>> +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
>>   		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
>>   			set_hpet_sid(irte, info->devid);
>> -		else
>> +		else if (info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
>>   			set_msi_sid(irte,
>>   			msi_desc_to_pci_dev(info->desc));
> Gah. this starts to become unreadable.
hmm ok will change it.
>
> diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
> index 8f4ce72570ce..0c1ea8ceec31 100644
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -1271,6 +1271,16 @@ static struct irq_chip intel_ir_chip = {
>   	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
>   };
>   
> +static void irte_prepare_msg(struct msi_msg *msg, int index, int subhandle)
> +{
> +	msg->address_hi = MSI_ADDR_BASE_HI;
> +	msg->data = sub_handle;
> +	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
> +			  MSI_ADDR_IR_SHV |
> +			  MSI_ADDR_IR_INDEX1(index) |
> +			  MSI_ADDR_IR_INDEX2(index);
> +}
> +
>   static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>   					     struct irq_cfg *irq_cfg,
>   					     struct irq_alloc_info *info,
> @@ -1312,19 +1322,18 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>   		break;
>   
>   	case X86_IRQ_ALLOC_TYPE_HPET:
> +		set_hpet_sid(irte, info->hpet_id);
> +		irte_prepare_msg(msg, index, sub_handle);
> +		break;
> +
>   	case X86_IRQ_ALLOC_TYPE_MSI:
>   	case X86_IRQ_ALLOC_TYPE_MSIX:
> -		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
> -			set_hpet_sid(irte, info->hpet_id);
> -		else
> -			set_msi_sid(irte, info->msi_dev);
> -
> -		msg->address_hi = MSI_ADDR_BASE_HI;
> -		msg->data = sub_handle;
> -		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
> -				  MSI_ADDR_IR_SHV |
> -				  MSI_ADDR_IR_INDEX1(index) |
> -				  MSI_ADDR_IR_INDEX2(index);
> +		set_msi_sid(irte, info->msi_dev);
> +		irte_prepare_msg(msg, index, sub_handle);
> +		break;
> +
> +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
> +		irte_prepare_msg(msg, index, sub_handle);
>   		break;
>   
>   	default:
>
> Hmm?

ok so I have no clue what happened here. This was the patch that was 
sent out:

https://lore.kernel.org/lkml/160021246905.67751.1674517279122764758.stgit@djiang5-desk3.ch.intel.com/

and this does not have the above change. Not sure what happened here.

Anyways, this should not be there.

>
> Thanks,
>
>          tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
  2020-10-01 23:26     ` Dey, Megha
@ 2020-10-02 11:13       ` Thomas Gleixner
  0 siblings, 0 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-02 11:13 UTC (permalink / raw)
  To: Dey, Megha, Dave Jiang, vkoul, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Thu, Oct 01 2020 at 16:26, Megha Dey wrote:
> On 9/30/2020 11:32 AM, Thomas Gleixner wrote:
>> diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
>> index 8f4ce72570ce..0c1ea8ceec31 100644
>> --- a/drivers/iommu/intel/irq_remapping.c
>> +++ b/drivers/iommu/intel/irq_remapping.c
>> @@ -1271,6 +1271,16 @@ static struct irq_chip intel_ir_chip = {
>>   	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
>>   };
>>   
>> +static void irte_prepare_msg(struct msi_msg *msg, int index, int subhandle)
>> +{
>> +	msg->address_hi = MSI_ADDR_BASE_HI;
>> +	msg->data = sub_handle;
>> +	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
>> +			  MSI_ADDR_IR_SHV |
>> +			  MSI_ADDR_IR_INDEX1(index) |
>> +			  MSI_ADDR_IR_INDEX2(index);
>> +}
>> +
>>   static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>>   					     struct irq_cfg *irq_cfg,
>>   					     struct irq_alloc_info *info,
>> @@ -1312,19 +1322,18 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>>   		break;
>>   
>>   	case X86_IRQ_ALLOC_TYPE_HPET:
>> +		set_hpet_sid(irte, info->hpet_id);
>> +		irte_prepare_msg(msg, index, sub_handle);
>> +		break;
>> +
>>   	case X86_IRQ_ALLOC_TYPE_MSI:
>>   	case X86_IRQ_ALLOC_TYPE_MSIX:
>> -		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
>> -			set_hpet_sid(irte, info->hpet_id);
>> -		else
>> -			set_msi_sid(irte, info->msi_dev);
>> -
>> -		msg->address_hi = MSI_ADDR_BASE_HI;
>> -		msg->data = sub_handle;
>> -		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
>> -				  MSI_ADDR_IR_SHV |
>> -				  MSI_ADDR_IR_INDEX1(index) |
>> -				  MSI_ADDR_IR_INDEX2(index);
>> +		set_msi_sid(irte, info->msi_dev);
>> +		irte_prepare_msg(msg, index, sub_handle);
>> +		break;
>> +
>> +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
>> +		irte_prepare_msg(msg, index, sub_handle);
>>   		break;
>>   
>>   	default:
>>
>> Hmm?
>
> ok so I have no clue what happened here. This was the patch that was 
> sent out:
> and this does not have the above change. Not sure what happened here.

Of course it was not there. I added this in my reply obviously for
illustration. It's not '> ' quoted, right?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-09-30 19:57   ` [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm Thomas Gleixner
@ 2020-10-07 21:54     ` Dave Jiang
  2020-10-08  7:39       ` Thomas Gleixner
  0 siblings, 1 reply; 50+ messages in thread
From: Dave Jiang @ 2020-10-07 21:54 UTC (permalink / raw)
  To: Thomas Gleixner, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm



On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:28, Dave Jiang wrote:
>> diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
>> index a39392157dc2..115a8f49aab3 100644
>> --- a/drivers/dma/Kconfig
>> +++ b/drivers/dma/Kconfig
>> @@ -301,6 +301,7 @@ config INTEL_IDXD_MDEV
>>   	depends on INTEL_IDXD
>>   	depends on VFIO_MDEV
>>   	depends on VFIO_MDEV_DEVICE
>> +	depends on IMS_MSI_ARRAY
> 
> select?

Will fix

> 
>> int idxd_mdev_host_init(struct idxd_device *idxd)
>> {
>> 	struct device *dev = &idxd->pdev->dev;
> 
>> +	ims_info.max_slots = idxd->ims_size;
>> +	ims_info.slots = idxd->reg_base + idxd->ims_offset;
>> +	dev->msi_domain =
>> pci_ims_array_create_msi_irq_domain(idxd->pdev, &ims_info);
> 
> 1) creating the domain can fail and checking the return code is overrated
> 
> 2) dev_set_msi_domain() exists for a reason. If we change any of this in
>     struct device then we can chase all the open coded mess in drivers
>     like this.
> 
> Also can you please explain how this is supposed to work?
> 
> idxd->pdev is the host PCI device. So why are you overwriting the MSI
> domain of the underlying host device? This works by chance because you
> allocate the regular MSIX interrupts for the host device _before_
> invoking this.
> 
> IIRC, I provided you ASCII art to show how all of this is supposed to be
> structured...

Yes. I see now that the implementation is wrong. In the updated code for next rev, I'm saving the domain to the idxd driver context. 

	ims_info.max_slots = idxd->ims_size;
	ims_info.slots = idxd->reg_base + idxd->ims_offset;
	idxd->ims_domain = pci_ims_array_create_msi_irq_domain(idxd->pdev, &ims_info);

dev_set_msi_domain() will be called with mdev later on when mdevs are being created. 

	struct device *dev = mdev_dev(mdev);

        irq_domain = idxd->ims_domain;
        dev_set_msi_domain(dev, irq_domain);
        rc = msi_domain_alloc_irqs(irq_domain, dev, vecs);
        if (rc < 0)
                return rc;

        for_each_msi_entry(entry, dev) {
                irq_entry = &vidxd->irq_entries[i];
                irq_entry->vidxd = vidxd;
                irq_entry->entry = entry;
                irq_entry->id = i;
                i++;
        }


> 
>>   int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
>>   {
>>   	int rc = -1;
>> @@ -44,15 +46,63 @@ int vidxd_send_interrupt(struct vdcm_idxd *vidxd, int msix_idx)
>>   	return rc;
>>   }
>>   
>> +#define IMS_PASID_ENABLE	0x8
>>   int vidxd_disable_host_ims_pasid(struct vdcm_idxd *vidxd, int ims_idx)
> 
> Yet more unreadable glue. The coding style of this stuff is horrible.
> 
>>   {
>> -	/* PLACEHOLDER */
>> +	struct mdev_device *mdev = vidxd->vdev.mdev;
>> +	struct device *dev = mdev_dev(mdev);
>> +	unsigned int ims_offset;
>> +	struct idxd_device *idxd = vidxd->idxd;
>> +	u32 val;
>> +
>> +	/*
>> +	 * Current implementation limits to 1 WQ for the vdev and therefore
>> +	 * also only 1 IMS interrupt for that vdev.
>> +	 */
>> +	if (ims_idx >= VIDXD_MAX_WQS) {
>> +		dev_warn(dev, "ims_idx greater than vidxd allowed: %d\n", ims_idx);
> 
> This warning text makes no sense whatsoever.
> 
>> +		return -EINVAL;
>> +	}
>> +
>> +	ims_offset = idxd->ims_offset + vidxd->ims_index[ims_idx] * 0x10;
>> +	val = ioread32(idxd->reg_base + ims_offset + 12);
>> +	val &= ~IMS_PASID_ENABLE;
>> +	iowrite32(val, idxd->reg_base + ims_offset + 12);
> 
> *0x10 + 12 !?!?
> 
> Reusing struct ims_slot from the irq chip driver would not be convoluted
> enough, right?

Yes. Fixing that.

> 
> Aside of that this is fiddling in the IMS storage array behind the irq
> chips back without any comment here and a big fat comment about the
> shared usage of ims_slot::ctrl in the irq chip driver.
> 

This is to program the pasid fields in the IMS table entry. Was thinking the pasid fields may be considered device specific so didn't attempt to add the support to the core code. 

> This is kernel programming, not the obfuscated C code contest.
> 
>> +	/* Setup the PASID filtering */
>> +	pasid = idxd_get_mdev_pasid(mdev);
>> +
>> +	if (pasid >= 0) {
>> +		ims_offset = idxd->ims_offset + vidxd->ims_index[ims_idx] * 0x10;
>> +		val = ioread32(idxd->reg_base + ims_offset + 12);
>> +		val |= IMS_PASID_ENABLE | (pasid << 12) | (val & 0x7);
>> +		iowrite32(val, idxd->reg_base + ims_offset + 12);
> 
> More magic numbers and more fiddling in the IMS slot.
> 
>> +	} else {
>> +		dev_warn(dev, "pasid setup failed for ims entry %lld\n", vidxd->ims_index[ims_idx]);
>> +		return -ENXIO;
>> +	}
>> +
>>   	return 0;
>>   }
>>   
>> @@ -839,12 +889,43 @@ static void vidxd_wq_disable(struct vdcm_idxd *vidxd, int wq_id_mask)
>>   
>>   void vidxd_free_ims_entries(struct vdcm_idxd *vidxd)
>>   {
>> -	/* PLACEHOLDER */
>> +	struct irq_domain *irq_domain;
>> +	struct mdev_device *mdev = vidxd->vdev.mdev;
>> +	struct device *dev = mdev_dev(mdev);
>> +	int i;
>> +
>> +	for (i = 0; i < VIDXD_MAX_MSIX_VECS - 1; i++)
>> +		vidxd->ims_index[i] = -1;
>> +
>> +	irq_domain = vidxd->idxd->pdev->dev.msi_domain;
> 
> See above.

struct device *dev = mdev_dev(mdev);

irq_domain = dev_get_msi_domain(dev);

> 
>> +	msi_domain_free_irqs(irq_domain, dev);
> 
>>   int vidxd_setup_ims_entries(struct vdcm_idxd *vidxd)
>>   {
>> -	/* PLACEHOLDER */
>> +	struct irq_domain *irq_domain;
>> +	struct idxd_device *idxd = vidxd->idxd;
>> +	struct mdev_device *mdev = vidxd->vdev.mdev;
>> +	struct device *dev = mdev_dev(mdev);
>> +	int vecs = VIDXD_MAX_MSIX_VECS - 1;
>> +	struct msi_desc *entry;
>> +	struct ims_irq_entry *irq_entry;
>> +	int rc, i = 0;
>> +
>> +	irq_domain = idxd->pdev->dev.msi_domain;
> 
> Ditto.
> 
>> +	rc = msi_domain_alloc_irqs(irq_domain, dev, vecs);
>> +	if (rc < 0)
>> +		return rc;
>> +
>> +	for_each_msi_entry(entry, dev) {
>> +		irq_entry = &vidxd->irq_entries[i];
>> +		irq_entry->vidxd = vidxd;
>> +		irq_entry->int_src = i;
> 
> Redundant information because it's the index in the array. What for?

Yes. I'm setting a ptr to the entry in order to retrieve the needed info. No more duplication.

> 
>> +		irq_entry->irq = entry->irq;
>> +		vidxd->ims_index[i] = entry->device_msi.hwirq;
> 
> The point of having two arrays to store related information is?
> 
> It's at least orders of magnitudes better than the previous trainwreck,
> but oh well...
> 
> Thanks,
> 
>          tglx
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-07 21:54     ` Dave Jiang
@ 2020-10-08  7:39       ` Thomas Gleixner
  2020-10-08 16:51         ` Dave Jiang
  0 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-08  7:39 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
>> Aside of that this is fiddling in the IMS storage array behind the irq
>> chips back without any comment here and a big fat comment about the
>> shared usage of ims_slot::ctrl in the irq chip driver.
>> 
> This is to program the pasid fields in the IMS table entry. Was
> thinking the pasid fields may be considered device specific so didn't
> attempt to add the support to the core code.

Well, the problem is that this is not really irq chip functionality.

But the PASID programming needs to touch the IMS storage which is also
touched by the irq chip.

This might be correct as is, but without a big fat comment explaining
WHY it is safe to do so without any form of serialization this is just
voodoo and unreviewable.

Can you please explain when the PASID is programmed and what the state
of the interrupt is at that point? Is this a one off setup operation or
does this happen dynamically at random points during runtime?

This needs to be clarified first.

Thanks,

        tglx



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
  2020-09-30 18:32   ` Thomas Gleixner
  2020-10-01 23:26     ` Dey, Megha
@ 2020-10-08  7:54     ` David Woodhouse
  2020-10-20 21:42       ` Dey, Megha
  1 sibling, 1 reply; 50+ messages in thread
From: David Woodhouse @ 2020-10-08  7:54 UTC (permalink / raw)
  To: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm


[-- Attachment #1: Type: text/plain, Size: 2769 bytes --]

On Wed, 2020-09-30 at 20:32 +0200, Thomas Gleixner wrote:
> On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
> > @@ -1303,9 +1303,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
> >  	case X86_IRQ_ALLOC_TYPE_HPET:
> >  	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
> >  	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
> > +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
> >  		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
> >  			set_hpet_sid(irte, info->devid);
> > -		else
> > +		else if (info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
> >  			set_msi_sid(irte,
> >  			msi_desc_to_pci_dev(info->desc));
> 
> Gah. this starts to become unreadable.
> 
> diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
> index 8f4ce72570ce..0c1ea8ceec31 100644
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -1271,6 +1271,16 @@ static struct irq_chip intel_ir_chip = {
>  	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
>  };
>  
> +static void irte_prepare_msg(struct msi_msg *msg, int index, int subhandle)
> +{
> +	msg->address_hi = MSI_ADDR_BASE_HI;
> +	msg->data = sub_handle;
> +	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
> +			  MSI_ADDR_IR_SHV |
> +			  MSI_ADDR_IR_INDEX1(index) |
> +			  MSI_ADDR_IR_INDEX2(index);
> +}
> +
>  static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>  					     struct irq_cfg *irq_cfg,
>  					     struct irq_alloc_info *info,
> @@ -1312,19 +1322,18 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
>  		break;
>  
>  	case X86_IRQ_ALLOC_TYPE_HPET:
> +		set_hpet_sid(irte, info->hpet_id);
> +		irte_prepare_msg(msg, index, sub_handle);
> +		break;
> +
>  	case X86_IRQ_ALLOC_TYPE_MSI:
>  	case X86_IRQ_ALLOC_TYPE_MSIX:
> -		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
> -			set_hpet_sid(irte, info->hpet_id);
> -		else
> -			set_msi_sid(irte, info->msi_dev);
> -
> -		msg->address_hi = MSI_ADDR_BASE_HI;
> -		msg->data = sub_handle;
> -		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
> -				  MSI_ADDR_IR_SHV |
> -				  MSI_ADDR_IR_INDEX1(index) |
> -				  MSI_ADDR_IR_INDEX2(index);
> +		set_msi_sid(irte, info->msi_dev);
> +		irte_prepare_msg(msg, index, sub_handle);
> +		break;
> +
> +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
> +		irte_prepare_msg(msg, index, sub_handle);
>  		break;
>  
>  	default:
> 
> Hmm?

It'd get a bit nicer if you *always* did the irte_prepare_msg() part to
generate the MSI message. Let the IOAPIC driver swizzle that into the
IOAPIC RTE for itself. You have no business composing an IOAPIC RTE
here.

Then your switch statement is *only* for setting the SID in the IRTE
appropriately.


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08  7:39       ` Thomas Gleixner
@ 2020-10-08 16:51         ` Dave Jiang
  2020-10-08 23:17           ` Thomas Gleixner
  0 siblings, 1 reply; 50+ messages in thread
From: Dave Jiang @ 2020-10-08 16:51 UTC (permalink / raw)
  To: Thomas Gleixner, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, ashok.raj, jgg, yi.l.liu,
	baolu.lu, kevin.tian, sanjay.k.kumar, tony.luck, jing.lin,
	dan.j.williams, kwankhede, eric.auger, parav, rafael, netanelg,
	shahafs, yan.y.zhao, pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm



On 10/8/2020 12:39 AM, Thomas Gleixner wrote:
> On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
>> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
>>> Aside of that this is fiddling in the IMS storage array behind the irq
>>> chips back without any comment here and a big fat comment about the
>>> shared usage of ims_slot::ctrl in the irq chip driver.
>>>
>> This is to program the pasid fields in the IMS table entry. Was
>> thinking the pasid fields may be considered device specific so didn't
>> attempt to add the support to the core code.
> 
> Well, the problem is that this is not really irq chip functionality.
> 
> But the PASID programming needs to touch the IMS storage which is also
> touched by the irq chip.
> 
> This might be correct as is, but without a big fat comment explaining
> WHY it is safe to do so without any form of serialization this is just
> voodoo and unreviewable.
> 
> Can you please explain when the PASID is programmed and what the state
> of the interrupt is at that point? Is this a one off setup operation or
> does this happen dynamically at random points during runtime?

I will put in comments for the function to explain why and when we modify the 
pasid field for the IMS entry. Programming of the pasid is done right before we 
request irq. And the clearing is done after we free the irq. We will not be 
touching the IMS field at runtime. So the touching of the entry should be safe.

> 
> This needs to be clarified first.
> 
> Thanks,
> 
>          tglx
> 
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08 16:51         ` Dave Jiang
@ 2020-10-08 23:17           ` Thomas Gleixner
  2020-10-08 23:32             ` Jason Gunthorpe
  0 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-08 23:17 UTC (permalink / raw)
  To: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, jgg, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm

Dave,

On Thu, Oct 08 2020 at 09:51, Dave Jiang wrote:
> On 10/8/2020 12:39 AM, Thomas Gleixner wrote:
>> On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
>>> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
>>>> Aside of that this is fiddling in the IMS storage array behind the irq
>>>> chips back without any comment here and a big fat comment about the
>>>> shared usage of ims_slot::ctrl in the irq chip driver.
>>>>
>>> This is to program the pasid fields in the IMS table entry. Was
>>> thinking the pasid fields may be considered device specific so didn't
>>> attempt to add the support to the core code.
>> 
>> Well, the problem is that this is not really irq chip functionality.
>> 
>> But the PASID programming needs to touch the IMS storage which is also
>> touched by the irq chip.
>> 
>> This might be correct as is, but without a big fat comment explaining
>> WHY it is safe to do so without any form of serialization this is just
>> voodoo and unreviewable.
>> 
>> Can you please explain when the PASID is programmed and what the state
>> of the interrupt is at that point? Is this a one off setup operation or
>> does this happen dynamically at random points during runtime?
>
> I will put in comments for the function to explain why and when we modify the 
> pasid field for the IMS entry. Programming of the pasid is done right before we 
> request irq. And the clearing is done after we free the irq. We will not be 
> touching the IMS field at runtime. So the touching of the entry should be safe.

Thanks for clarifying that.

Thinking more about it, that very same thing will be needed for any
other IMS device and of course this is not going to end well because
some driver will fiddle with the PASID at the wrong time.

Find below an infrastructure patch which adds ASID support to the core
and a delta patch for the IMS irq chip. All of it neither compiled nor
tested.

Setting IMS_PASID_ENABLE might have other conditions which I don't know
of, so the IMS part might be completely wrong, but you get the idea.

Thanks,

        tglx

8<-----------
Subject: genirq: Provide irq_set_asid()
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 08 Oct 2020 23:32:16 +0200

Provide storage and a setter for an Address Space IDentifier.

The identifier is stored in the top level irq_data and it only can be
modified when the interrupt is not requested.

Add the necessary storage and helper functions and validate that interrupts
which require an ASID have one assigned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h    |    8 ++++++++
 kernel/irq/internals.h |   10 ++++++++++
 kernel/irq/irqdesc.c   |    1 +
 kernel/irq/manage.c    |   40 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+)

--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -161,6 +161,7 @@ struct irq_common_data {
  * @mask:		precomputed bitmask for accessing the chip registers
  * @irq:		interrupt number
  * @hwirq:		hardware interrupt number, local to the interrupt domain
+ * @asid:		Optional address space ID for this interrupt.
  * @common:		point to data shared by all irqchips
  * @chip:		low level interrupt hardware access
  * @domain:		Interrupt translation domain; responsible for mapping
@@ -174,6 +175,7 @@ struct irq_data {
 	u32			mask;
 	unsigned int		irq;
 	unsigned long		hwirq;
+	u32			asid;
 	struct irq_common_data	*common;
 	struct irq_chip		*chip;
 	struct irq_domain	*domain;
@@ -183,6 +185,8 @@ struct irq_data {
 	void			*chip_data;
 };
 
+#define IRQ_INVALID_ASID	U32_MAX
+
 /*
  * Bit masks for irq_common_data.state_use_accessors
  *
@@ -555,6 +559,8 @@ struct irq_chip {
  * IRQCHIP_EOI_THREADED:	Chip requires eoi() on unmask in threaded mode
  * IRQCHIP_SUPPORTS_LEVEL_MSI	Chip can provide two doorbells for Level MSIs
  * IRQCHIP_SUPPORTS_NMI:	Chip can deliver NMIs, only for root irqchips
+ * IRQCHIP_REQUIRES_ASID:	Interrupt chip requires a valid Address Space
+ *				IDentifier
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED		= (1 <<  0),
@@ -566,6 +572,7 @@ enum {
 	IRQCHIP_EOI_THREADED		= (1 <<  6),
 	IRQCHIP_SUPPORTS_LEVEL_MSI	= (1 <<  7),
 	IRQCHIP_SUPPORTS_NMI		= (1 <<  8),
+	IRQCHIP_REQUIRES_ASID		= (1 <<  9),
 };
 
 #include <linux/irqdesc.h>
@@ -792,6 +799,7 @@ extern int irq_set_irq_type(unsigned int
 extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
 extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
 				struct msi_desc *entry);
+extern int irq_set_asid(unsigned int irq, u32 asid);
 extern struct irq_data *irq_get_irq_data(unsigned int irq);
 
 static inline struct irq_chip *irq_get_chip(unsigned int irq)
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -281,6 +281,16 @@ static inline void
 irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
 #endif
 
+static inline bool irq_requires_asid(struct irq_desc *desc)
+{
+	return desc->irq_data.chip->flags & IRQCHIP_REQUIRES_ASID;
+}
+
+static inline bool irq_invalid_asid(struct irq_desc *desc)
+{
+	return desc->irq_data.asid == IRQ_INVALID_ASID;
+}
+
 #ifdef CONFIG_IRQ_TIMINGS
 
 #define IRQ_TIMINGS_SHIFT	5
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -115,6 +115,7 @@ static void desc_set_defaults(unsigned i
 	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
 	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+	desc->irq_data.asid = IRQ_INVALID_ASID;
 	desc->handle_irq = handle_bad_irq;
 	desc->depth = 1;
 	desc->irq_count = 0;
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1520,6 +1520,22 @@ static int
 	}
 
 	/*
+	 * If the topmost interrupt chip requires that a valid Address
+	 * Space IDentifier is set, validate that the interrupt is not
+	 * shared and that a valid ASID is set.
+	 */
+	if (irq_requires_asid(desc)) {
+		if (shared || irq_invalid_asid(desc)) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		/*
+		 * CHECKME: Is there a way to figure out that the ASID
+		 * makes sense in the context of the caller?
+		 */
+	}
+
+	/*
 	 * Setup the thread mask for this irqaction for ONESHOT. For
 	 * !ONESHOT irqs the thread mask is 0 so we can avoid a
 	 * conditional in irq_wake_thread().
@@ -1848,6 +1864,8 @@ static struct irqaction *__free_irq(stru
 		 */
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		irq_domain_deactivate_irq(&desc->irq_data);
+		/* Invalidate the ASID associated to this interrupt */
+		desc->irq_data.asid = IRQ_INVALID_ASID;
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 		irq_release_resources(desc);
@@ -2752,3 +2770,25 @@ int irq_set_irqchip_state(unsigned int i
 	return err;
 }
 EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+int irq_set_asid(unsigned int irq, u32 asid)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	unsigned long flags;
+	int err = -EBUSY;
+
+	desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+	if (!desc)
+		return -EINVAL;
+
+	/* If the interrupt is active changing the ASID is a no-no */
+	if (!desc->action) {
+		data = irq_desc_get_irq_data(desc);
+		data->asid = asid;
+	}
+
+	irq_put_desc_busunlock(desc, flags);
+	return err;
+}
+EXPORT_SYMBOL_GPL(irq_set_asid);

8<------------------

Subject: irqchip/ims: PASID support
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 09 Oct 2020 01:02:09 +0200

NOT-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-ims-msi.c       |   29 ++++++++++++++++++++++-------
 include/linux/irqchip/irq-ims-msi.h |    2 ++
 2 files changed, 24 insertions(+), 7 deletions(-)

--- a/drivers/irqchip/irq-ims-msi.c
+++ b/drivers/irqchip/irq-ims-msi.c
@@ -18,14 +18,26 @@ struct ims_array_data {
 	unsigned long		map[0];
 };
 
+#define IMS_ASID_SHIFT		12
+
+static inline u32 ims_ctrl_val(struct irq_data *data, u32 ctrl)
+{
+	return ctrl | (data->asid) << 12 | IMS_PASID_ENABLE;
+}
+
+static inline void iowrite32_and_flush(u32 value, void __iomem *addr)
+{
+	iowrite32(value, addr);
+	ioread32(addr);
+}
+
 static void ims_array_mask_irq(struct irq_data *data)
 {
 	struct msi_desc *desc = irq_data_get_msi_desc(data);
 	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
 	u32 __iomem *ctrl = &slot->ctrl;
 
-	iowrite32(ioread32(ctrl) | IMS_VECTOR_CTRL_MASK, ctrl);
-	ioread32(ctrl); /* Flush write to device */
+	iowrite32_and_flush(ims_ctrl_val(data, IMS_VECTOR_CTRL_MASK), ctrl);
 }
 
 static void ims_array_unmask_irq(struct irq_data *data)
@@ -34,7 +46,10 @@ static void ims_array_unmask_irq(struct
 	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
 	u32 __iomem *ctrl = &slot->ctrl;
 
-	iowrite32(ioread32(ctrl) & ~IMS_VECTOR_CTRL_MASK, ctrl);
+	if (!WARN_ON_ONCE(data->asid == IRQ_INVALID_ASID))
+		return;
+
+	iowrite32_and_flush(ims_ctrl_val(data, 0), ctrl);
 }
 
 static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
@@ -44,8 +59,7 @@ static void ims_array_write_msi_msg(stru
 
 	iowrite32(msg->address_lo, &slot->address_lo);
 	iowrite32(msg->address_hi, &slot->address_hi);
-	iowrite32(msg->data, &slot->data);
-	ioread32(slot);
+	iowrite32_and_flush(msg->data, &slot->data);
 }
 
 static const struct irq_chip ims_array_msi_controller = {
@@ -54,7 +68,8 @@ static const struct irq_chip ims_array_m
 	.irq_unmask		= ims_array_unmask_irq,
 	.irq_write_msi_msg	= ims_array_write_msi_msg,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.flags			= IRQCHIP_SKIP_SET_WAKE |
+				  IRQCHIP_REQUIRES_ASID,
 };
 
 static void ims_array_reset_slot(struct ims_slot __iomem *slot)
@@ -62,7 +77,7 @@ static void ims_array_reset_slot(struct
 	iowrite32(0, &slot->address_lo);
 	iowrite32(0, &slot->address_hi);
 	iowrite32(0, &slot->data);
-	iowrite32(0, &slot->ctrl);
+	iowrite32_and_flush(IMS_VECTOR_CTRL_MASK, &slot->ctrl);
 }
 
 static void ims_array_free_msi_store(struct irq_domain *domain,
--- a/include/linux/irqchip/irq-ims-msi.h
+++ b/include/linux/irqchip/irq-ims-msi.h
@@ -25,6 +25,8 @@ struct ims_slot {
 
 /* Bit to mask the interrupt in ims_hw_slot::ctrl */
 #define IMS_VECTOR_CTRL_MASK	0x01
+/* Bit to enable PASID in ims_hw_slot::ctrl */
+#define IMS_PASID_ENABLE	0x08
 
 /**
  * struct ims_array_info - Information to create an IMS array domain

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08 23:17           ` Thomas Gleixner
@ 2020-10-08 23:32             ` Jason Gunthorpe
  2020-10-09  0:27               ` Dave Jiang
                                 ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: Jason Gunthorpe @ 2020-10-08 23:32 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Fri, Oct 09, 2020 at 01:17:38AM +0200, Thomas Gleixner wrote:
> Dave,
> 
> On Thu, Oct 08 2020 at 09:51, Dave Jiang wrote:
> > On 10/8/2020 12:39 AM, Thomas Gleixner wrote:
> >> On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
> >>> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
> >>>> Aside of that this is fiddling in the IMS storage array behind the irq
> >>>> chips back without any comment here and a big fat comment about the
> >>>> shared usage of ims_slot::ctrl in the irq chip driver.
> >>>>
> >>> This is to program the pasid fields in the IMS table entry. Was
> >>> thinking the pasid fields may be considered device specific so didn't
> >>> attempt to add the support to the core code.
> >> 
> >> Well, the problem is that this is not really irq chip functionality.
> >> 
> >> But the PASID programming needs to touch the IMS storage which is also
> >> touched by the irq chip.
> >> 
> >> This might be correct as is, but without a big fat comment explaining
> >> WHY it is safe to do so without any form of serialization this is just
> >> voodoo and unreviewable.
> >> 
> >> Can you please explain when the PASID is programmed and what the state
> >> of the interrupt is at that point? Is this a one off setup operation or
> >> does this happen dynamically at random points during runtime?
> >
> > I will put in comments for the function to explain why and when we modify the 
> > pasid field for the IMS entry. Programming of the pasid is done right before we 
> > request irq. And the clearing is done after we free the irq. We will not be 
> > touching the IMS field at runtime. So the touching of the entry should be safe.
> 
> Thanks for clarifying that.
> 
> Thinking more about it, that very same thing will be needed for any
> other IMS device and of course this is not going to end well because
> some driver will fiddle with the PASID at the wrong time.

Why? This looks like some quirk of the IDXD HW where it just randomly
put PASID along with the IRQ mask register. Probably because PASID is
not the full 32 bits.

AFAIK the PASID is not tagged on the MemWr TLP triggering the
interrupt, so it really is unrelated to the irq.

I think the ioread to get the PASID is rather ugly, it should pluck
the PASID out of some driver specific data structure with proper
locking, and thus use the sleepable version of the irqchip?

This is really not that different from what I was describing for queue
contexts - the queue context needs to be assigned to the irq # before
it can be used in the irq chip other wise there is no idea where to
write the msg to. Just like pasid here.

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08 23:32             ` Jason Gunthorpe
@ 2020-10-09  0:27               ` Dave Jiang
  2020-10-09  1:22               ` Raj, Ashok
  2020-10-09 14:44               ` Thomas Gleixner
  2 siblings, 0 replies; 50+ messages in thread
From: Dave Jiang @ 2020-10-09  0:27 UTC (permalink / raw)
  To: Jason Gunthorpe, Thomas Gleixner
  Cc: vkoul, megha.dey, maz, bhelgaas, alex.williamson, jacob.jun.pan,
	ashok.raj, yi.l.liu, baolu.lu, kevin.tian, sanjay.k.kumar,
	tony.luck, jing.lin, dan.j.williams, kwankhede, eric.auger,
	parav, rafael, netanelg, shahafs, yan.y.zhao, pbonzini,
	samuel.ortiz, mona.hossain, dmaengine, linux-kernel, x86,
	linux-pci, kvm



On 10/8/2020 4:32 PM, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 01:17:38AM +0200, Thomas Gleixner wrote:
>> Dave,
>>
>> On Thu, Oct 08 2020 at 09:51, Dave Jiang wrote:
>>> On 10/8/2020 12:39 AM, Thomas Gleixner wrote:
>>>> On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
>>>>> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
>>>>>> Aside of that this is fiddling in the IMS storage array behind the irq
>>>>>> chips back without any comment here and a big fat comment about the
>>>>>> shared usage of ims_slot::ctrl in the irq chip driver.
>>>>>>
>>>>> This is to program the pasid fields in the IMS table entry. Was
>>>>> thinking the pasid fields may be considered device specific so didn't
>>>>> attempt to add the support to the core code.
>>>>
>>>> Well, the problem is that this is not really irq chip functionality.
>>>>
>>>> But the PASID programming needs to touch the IMS storage which is also
>>>> touched by the irq chip.
>>>>
>>>> This might be correct as is, but without a big fat comment explaining
>>>> WHY it is safe to do so without any form of serialization this is just
>>>> voodoo and unreviewable.
>>>>
>>>> Can you please explain when the PASID is programmed and what the state
>>>> of the interrupt is at that point? Is this a one off setup operation or
>>>> does this happen dynamically at random points during runtime?
>>>
>>> I will put in comments for the function to explain why and when we modify the
>>> pasid field for the IMS entry. Programming of the pasid is done right before we
>>> request irq. And the clearing is done after we free the irq. We will not be
>>> touching the IMS field at runtime. So the touching of the entry should be safe.
>>
>> Thanks for clarifying that.
>>
>> Thinking more about it, that very same thing will be needed for any
>> other IMS device and of course this is not going to end well because
>> some driver will fiddle with the PASID at the wrong time.
> 
> Why? This looks like some quirk of the IDXD HW where it just randomly
> put PASID along with the IRQ mask register. Probably because PASID is
> not the full 32 bits.

The hardware checks that the PASID in the descriptor matches the PASID in the 
IMS entry, to prevent user-mode software from arbitrarily choosing any interrupt 
vector it wants. User mode software has to request an IMS entry from the kernel 
driver and the driver fills in the PASID in the IMS so that only that process 
can use that IMS entry.

> 
> AFAIK the PASID is not tagged on the MemWr TLP triggering the
> interrupt, so it really is unrelated to the irq.
> 
> I think the ioread to get the PASID is rather ugly, it should pluck
> the PASID out of some driver specific data structure with proper
> locking, and thus use the sleepable version of the irqchip?
> 
> This is really not that different from what I was describing for queue
> contexts - the queue context needs to be assigned to the irq # before
> it can be used in the irq chip other wise there is no idea where to
> write the msg to. Just like pasid here.
> 
> Jason
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08 23:32             ` Jason Gunthorpe
  2020-10-09  0:27               ` Dave Jiang
@ 2020-10-09  1:22               ` Raj, Ashok
  2020-10-09 11:57                 ` Jason Gunthorpe
  2020-10-09 14:44               ` Thomas Gleixner
  2 siblings, 1 reply; 50+ messages in thread
From: Raj, Ashok @ 2020-10-09  1:22 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

Hi Jason

On Thu, Oct 08, 2020 at 08:32:10PM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 01:17:38AM +0200, Thomas Gleixner wrote:
> > Dave,
> > 
> > On Thu, Oct 08 2020 at 09:51, Dave Jiang wrote:
> > > On 10/8/2020 12:39 AM, Thomas Gleixner wrote:
> > >> On Wed, Oct 07 2020 at 14:54, Dave Jiang wrote:
> > >>> On 9/30/2020 12:57 PM, Thomas Gleixner wrote:
> > >>>> Aside of that this is fiddling in the IMS storage array behind the irq
> > >>>> chips back without any comment here and a big fat comment about the
> > >>>> shared usage of ims_slot::ctrl in the irq chip driver.
> > >>>>
> > >>> This is to program the pasid fields in the IMS table entry. Was
> > >>> thinking the pasid fields may be considered device specific so didn't
> > >>> attempt to add the support to the core code.
> > >> 
> > >> Well, the problem is that this is not really irq chip functionality.
> > >> 
> > >> But the PASID programming needs to touch the IMS storage which is also
> > >> touched by the irq chip.
> > >> 
> > >> This might be correct as is, but without a big fat comment explaining
> > >> WHY it is safe to do so without any form of serialization this is just
> > >> voodoo and unreviewable.
> > >> 
> > >> Can you please explain when the PASID is programmed and what the state
> > >> of the interrupt is at that point? Is this a one off setup operation or
> > >> does this happen dynamically at random points during runtime?
> > >
> > > I will put in comments for the function to explain why and when we modify the 
> > > pasid field for the IMS entry. Programming of the pasid is done right before we 
> > > request irq. And the clearing is done after we free the irq. We will not be 
> > > touching the IMS field at runtime. So the touching of the entry should be safe.
> > 
> > Thanks for clarifying that.
> > 
> > Thinking more about it, that very same thing will be needed for any
> > other IMS device and of course this is not going to end well because
> > some driver will fiddle with the PASID at the wrong time.
> 
> Why? This looks like some quirk of the IDXD HW where it just randomly
> put PASID along with the IRQ mask register. Probably because PASID is
> not the full 32 bits.

Not randomly put there Jason :-).. There is a good reason for it. I'm sure
Dave must have responded already. ENQCMD for DSA has the interrupt handle
on which the notification should be sent. Since the data comes from user
space, HW will verify if the PASID for the IMS entry matches what is there in
the descriptor. 

Check description in section 9.2.2.1 of the DSA specification, when PASID
enable is 1, this field is checked against the PASID field of the
descriptor. Also check Section 5.4 and Interrupt Virtualization 7.3.3 for
more info.

> 
> AFAIK the PASID is not tagged on the MemWr TLP triggering the
> interrupt, so it really is unrelated to the irq.

Correct, the purpose is not to send a PASID prefix for interrupt transactions.

> 
> I think the ioread to get the PASID is rather ugly, it should pluck

Where do you see the ioread? I suppose idxd driver will fill in from the
aux_domain default PASID. Not reading from the device IMS entry.

> the PASID out of some driver specific data structure with proper
> locking, and thus use the sleepable version of the irqchip?
> 
> This is really not that different from what I was describing for queue
> contexts - the queue context needs to be assigned to the irq # before
> it can be used in the irq chip other wise there is no idea where to
> write the msg to. Just like pasid here.

Sorry, I don't follow you on this.. you mean context in hardware or user
context that holds interrupt addr/data values?


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09  1:22               ` Raj, Ashok
@ 2020-10-09 11:57                 ` Jason Gunthorpe
  2020-10-09 12:43                   ` Raj, Ashok
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Gunthorpe @ 2020-10-09 11:57 UTC (permalink / raw)
  To: Raj, Ashok
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:

> Not randomly put there Jason :-).. There is a good reason for it. 

Sure the PASID value being associated with the IRQ make sense, but
combining that register with the interrupt mask is just a completely
random thing to do.

If this HW was using MSI-X PASID would have been given its own
register.

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 11:57                 ` Jason Gunthorpe
@ 2020-10-09 12:43                   ` Raj, Ashok
  2020-10-09 12:49                     ` Jason Gunthorpe
  0 siblings, 1 reply; 50+ messages in thread
From: Raj, Ashok @ 2020-10-09 12:43 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

On Fri, Oct 09, 2020 at 08:57:37AM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:
> 
> > Not randomly put there Jason :-).. There is a good reason for it. 
> 
> Sure the PASID value being associated with the IRQ make sense, but
> combining that register with the interrupt mask is just a compltely
> random thing to do.

Hummm... Not sure what you are complaining.. but in any case giving
hardware a more efficient way to store interrupt entries breaking any
boundaries that may be implied by the spec is why IMS was defined.

> 
> If this HW was using MSI-X PASID would have been given its own
> register.

Well there is no MSI-X PASID is there? 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 12:43                   ` Raj, Ashok
@ 2020-10-09 12:49                     ` Jason Gunthorpe
  2020-10-09 13:02                       ` Raj, Ashok
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Gunthorpe @ 2020-10-09 12:49 UTC (permalink / raw)
  To: Raj, Ashok
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Fri, Oct 09, 2020 at 05:43:07AM -0700, Raj, Ashok wrote:
> On Fri, Oct 09, 2020 at 08:57:37AM -0300, Jason Gunthorpe wrote:
> > On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:
> > 
> > > Not randomly put there Jason :-).. There is a good reason for it. 
> > 
> > Sure the PASID value being associated with the IRQ make sense, but
> > combining that register with the interrupt mask is just a compltely
> > random thing to do.
> 
> Hummm... Not sure what you are complaining.. but in any case giving
> hardware a more efficient way to store interrupt entries breaking any
> boundaries that maybe implied by the spec is why IMS was defined.

I'm saying this PASID stuff is just some HW detail of IDXD and nothing
that the core irqchip code should concern itself with

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 12:49                     ` Jason Gunthorpe
@ 2020-10-09 13:02                       ` Raj, Ashok
  2020-10-09 13:12                         ` Jason Gunthorpe
  0 siblings, 1 reply; 50+ messages in thread
From: Raj, Ashok @ 2020-10-09 13:02 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

On Fri, Oct 09, 2020 at 09:49:45AM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 05:43:07AM -0700, Raj, Ashok wrote:
> > On Fri, Oct 09, 2020 at 08:57:37AM -0300, Jason Gunthorpe wrote:
> > > On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:
> > > 
> > > > Not randomly put there Jason :-).. There is a good reason for it. 
> > > 
> > > Sure the PASID value being associated with the IRQ make sense, but
> > > combining that register with the interrupt mask is just a compltely
> > > random thing to do.
> > 
> > Hummm... Not sure what you are complaining.. but in any case giving
> > hardware a more efficient way to store interrupt entries breaking any
> > boundaries that maybe implied by the spec is why IMS was defined.
> 
> I'm saying this PASID stuff is just some HW detail of IDXD and nothing
> that the core irqchip code should concern itself with

Ok, so you are saying this is device specific why is generic framework
having to worry about the PASID stuff? 

I thought we are consolidating code that otherwise similar drivers would
require anyway. I thought that's what Thomas was accomplishing with the new
framework.

He is the architect in chief for this... having PASID in the framework
seems like everybody could benefit instead of just being idxd specific.

This isn't the only game in town. 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 13:02                       ` Raj, Ashok
@ 2020-10-09 13:12                         ` Jason Gunthorpe
  2020-10-09 13:40                           ` Raj, Ashok
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Gunthorpe @ 2020-10-09 13:12 UTC (permalink / raw)
  To: Raj, Ashok
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Fri, Oct 09, 2020 at 06:02:09AM -0700, Raj, Ashok wrote:
> On Fri, Oct 09, 2020 at 09:49:45AM -0300, Jason Gunthorpe wrote:
> > On Fri, Oct 09, 2020 at 05:43:07AM -0700, Raj, Ashok wrote:
> > > On Fri, Oct 09, 2020 at 08:57:37AM -0300, Jason Gunthorpe wrote:
> > > > On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:
> > > > 
> > > > > Not randomly put there Jason :-).. There is a good reason for it. 
> > > > 
> > > > Sure the PASID value being associated with the IRQ make sense, but
> > > > combining that register with the interrupt mask is just a compltely
> > > > random thing to do.
> > > 
> > > Hummm... Not sure what you are complaining.. but in any case giving
> > > hardware a more efficient way to store interrupt entries breaking any
> > > boundaries that maybe implied by the spec is why IMS was defined.
> > 
> > I'm saying this PASID stuff is just some HW detail of IDXD and nothing
> > that the core irqchip code should concern itself with
> 
> Ok, so you are saying this is device specific why is generic framework
> having to worry about the PASID stuff? 
> 
> I thought we are consolidating code that otherwise similar drivers would
> require anyway. I thought that's what Thomas was accomplishing with the new
> framework.

My point is why would another driver combine PASID and the IRQ mask in
one register? There is no spec saying to do this, no common design
reason, it has *nothing* to do with the IRQ mask other than IDXD made
a completely random choice to put the IRQ mask and PASID in the same 32
bit register.

At the very least we should see a bunch more drivers doing this same
thing before we declare some kind of pattern

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 13:12                         ` Jason Gunthorpe
@ 2020-10-09 13:40                           ` Raj, Ashok
  0 siblings, 0 replies; 50+ messages in thread
From: Raj, Ashok @ 2020-10-09 13:40 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Thomas Gleixner, Dave Jiang, vkoul, megha.dey, maz, bhelgaas,
	alex.williamson, jacob.jun.pan, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm, Ashok Raj

On Fri, Oct 09, 2020 at 10:12:18AM -0300, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 06:02:09AM -0700, Raj, Ashok wrote:
> > On Fri, Oct 09, 2020 at 09:49:45AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Oct 09, 2020 at 05:43:07AM -0700, Raj, Ashok wrote:
> > > > On Fri, Oct 09, 2020 at 08:57:37AM -0300, Jason Gunthorpe wrote:
> > > > > On Thu, Oct 08, 2020 at 06:22:31PM -0700, Raj, Ashok wrote:
> > > > > 
> > > > > > Not randomly put there Jason :-).. There is a good reason for it. 
> > > > > 
> > > > > Sure the PASID value being associated with the IRQ make sense, but
> > > > > combining that register with the interrupt mask is just a compltely
> > > > > random thing to do.
> > > > 
> > > > Hummm... Not sure what you are complaining.. but in any case giving
> > > > hardware a more efficient way to store interrupt entries breaking any
> > > > boundaries that maybe implied by the spec is why IMS was defined.
> > > 
> > > I'm saying this PASID stuff is just some HW detail of IDXD and nothing
> > > that the core irqchip code should concern itself with
> > 
> > Ok, so you are saying this is device specific why is generic framework
> > having to worry about the PASID stuff? 
> > 
> > I thought we are consolidating code that otherwise similar drivers would
> > require anyway. I thought that's what Thomas was accomplishing with the new
> > framework.
> 
> My point is why would another driver combine PASID and the IRQ mask in
> one register? There is no spec saying to do this, no common design

IMS is a concept. How a device organizes its interrupt data is completely
hardware specific. Some vendor could keep them organized like how MSIx is
done today, and put PASID's in a separate offset. Or put all interrupt
related entries all in one place like how idxd handles it today.

Cheers,
Ashok

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-08 23:32             ` Jason Gunthorpe
  2020-10-09  0:27               ` Dave Jiang
  2020-10-09  1:22               ` Raj, Ashok
@ 2020-10-09 14:44               ` Thomas Gleixner
  2020-10-09 14:52                 ` Jason Gunthorpe
  2 siblings, 1 reply; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-09 14:44 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Thu, Oct 08 2020 at 20:32, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 01:17:38AM +0200, Thomas Gleixner wrote:
>> Thinking more about it, that very same thing will be needed for any
>> other IMS device and of course this is not going to end well because
>> some driver will fiddle with the PASID at the wrong time.
>
> Why? This looks like some quirk of the IDXD HW where it just randomly
> put PASID along with the IRQ mask register. Probably because PASID is
> not the full 32 bits.
>
> AFAIK the PASID is not tagged on the MemWr TLP triggering the
> interrupt, so it really is unrelated to the irq.

Right you are. With brain awake this does not make sense.

> I think the ioread to get the PASID is rather ugly, it should pluck
> the PASID out of some driver specific data structure with proper
> locking, and thus use the sleepable version of the irqchip?
>
> This is really not that different from what I was describing for queue
> contexts - the queue context needs to be assigned to the irq # before
> it can be used in the irq chip other wise there is no idea where to
> write the msg to. Just like pasid here.

Not really. In the IDXD case the storage is known when the host device
and the irq domain is initialized which is not the case for your variant
and it neither needs to send a magic command to the device to update the
data.

Due to the already explained racy behaviour of MSI affinity changes when
interrupt remapping is disabled, that async update would just not work
and I really don't want to see the resulting tinkering to paper over the
problem. TBH, even if that problem would not exist, my faith in driver
writers getting any of this correct is close to zero and I rather spend
a few brain cycles now than staring at the creative mess later.

So putting the brainfart of yesterday aside, we can simply treat this as
auxiliary data.

The patch below implements an interface for that and adds support to the
IMS driver. That interface is properly serialized and does not put any
restrictions on when this is called. (I know another interrupt chip
which could be simplified that way).

All the IDXD driver has to do is:

   auxval = ims_ctrl_pasid_aux(pasid, enabled);
   irq_set_auxdata(irqnr, IMS_AUXDATA_CONTROL_WORD, auxval);

I agree that irq_set_auxdata() is not the most elegant thing, but the
alternative solutions I looked at are just worse.

For now I just kept the data in the IMS storage which still requires an
ioread(), but these interrupts are not frequently masked and unmasked
during normal operation so performance is not a concern and caching that
value is an orthogonal optimization if someone cares.

Thanks,

        tglx

8<-------------

 drivers/irqchip/irq-ims-msi.c       |   35 +++++++++++++++++++++++++++++------
 include/linux/interrupt.h           |    2 ++
 include/linux/irq.h                 |    4 ++++
 include/linux/irqchip/irq-ims-msi.h |   25 ++++++++++++++++++++++++-
 kernel/irq/manage.c                 |   32 ++++++++++++++++++++++++++++++++
 5 files changed, 91 insertions(+), 7 deletions(-)

--- a/drivers/irqchip/irq-ims-msi.c
+++ b/drivers/irqchip/irq-ims-msi.c
@@ -18,14 +18,19 @@ struct ims_array_data {
 	unsigned long		map[0];
 };
 
+static inline void iowrite32_and_flush(u32 value, void __iomem *addr)
+{
+	iowrite32(value, addr);
+	ioread32(addr);
+}
+
 static void ims_array_mask_irq(struct irq_data *data)
 {
 	struct msi_desc *desc = irq_data_get_msi_desc(data);
 	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
 	u32 __iomem *ctrl = &slot->ctrl;
 
-	iowrite32(ioread32(ctrl) | IMS_VECTOR_CTRL_MASK, ctrl);
-	ioread32(ctrl); /* Flush write to device */
+	iowrite32_and_flush(ioread32(ctrl) | IMS_CTRL_VECTOR_MASKBIT, ctrl);
 }
 
 static void ims_array_unmask_irq(struct irq_data *data)
@@ -34,7 +39,7 @@ static void ims_array_unmask_irq(struct
 	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
 	u32 __iomem *ctrl = &slot->ctrl;
 
-	iowrite32(ioread32(ctrl) & ~IMS_VECTOR_CTRL_MASK, ctrl);
+	iowrite32_and_flush(ioread32(ctrl) & ~IMS_CTRL_VECTOR_MASKBIT, ctrl);
 }
 
 static void ims_array_write_msi_msg(struct irq_data *data, struct msi_msg *msg)
@@ -44,8 +49,24 @@ static void ims_array_write_msi_msg(stru
 
 	iowrite32(msg->address_lo, &slot->address_lo);
 	iowrite32(msg->address_hi, &slot->address_hi);
-	iowrite32(msg->data, &slot->data);
-	ioread32(slot);
+	iowrite32_and_flush(msg->data, &slot->data);
+}
+
+static int ims_array_set_auxdata(struct irq_data *data, unsigned int which,
+				 u64 auxval)
+{
+	struct msi_desc *desc = irq_data_get_msi_desc(data);
+	struct ims_slot __iomem *slot = desc->device_msi.priv_iomem;
+	u32 val, __iomem *ctrl = &slot->ctrl;
+
+	if (which != IMS_AUXDATA_CONTROL_WORD)
+		return -EINVAL;
+	if (auxval & ~(u64)IMS_CONTROL_WORD_AUXMASK)
+		return -EINVAL;
+
+	val = ioread32(ctrl) & IMS_CONTROL_WORD_IRQMASK;
+	iowrite32_and_flush(val | (u32) auxval, ctrl);
+	return 0;
 }
 
 static const struct irq_chip ims_array_msi_controller = {
@@ -53,6 +74,7 @@ static const struct irq_chip ims_array_m
 	.irq_mask		= ims_array_mask_irq,
 	.irq_unmask		= ims_array_unmask_irq,
 	.irq_write_msi_msg	= ims_array_write_msi_msg,
+	.irq_set_auxdata	= ims_array_set_auxdata,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
@@ -62,7 +84,7 @@ static void ims_array_reset_slot(struct
 	iowrite32(0, &slot->address_lo);
 	iowrite32(0, &slot->address_hi);
 	iowrite32(0, &slot->data);
-	iowrite32(0, &slot->ctrl);
+	iowrite32_and_flush(IMS_CTRL_VECTOR_MASKBIT, &slot->ctrl);
 }
 
 static void ims_array_free_msi_store(struct irq_domain *domain,
@@ -97,6 +119,7 @@ static int ims_array_alloc_msi_store(str
 			goto fail;
 		set_bit(idx, ims->map);
 		entry->device_msi.priv_iomem = &ims->info.slots[idx];
+		ims_array_reset_slot(entry->device_msi.priv_iomem);
 		entry->device_msi.hwirq = idx;
 	}
 	return 0;
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -487,6 +487,8 @@ extern int irq_get_irqchip_state(unsigne
 extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
 				 bool state);
 
+int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val);
+
 #ifdef CONFIG_IRQ_FORCED_THREADING
 # ifdef CONFIG_PREEMPT_RT
 #  define force_irqthreads	(true)
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -481,6 +481,8 @@ static inline irq_hw_number_t irqd_to_hw
  *				irq_request_resources
  * @irq_compose_msi_msg:	optional to compose message content for MSI
  * @irq_write_msi_msg:	optional to write message content for MSI
+ * @irq_set_auxdata:	Optional function to update auxiliary data e.g. in
+ *			shared registers
  * @irq_get_irqchip_state:	return the internal state of an interrupt
  * @irq_set_irqchip_state:	set the internal state of a interrupt
  * @irq_set_vcpu_affinity:	optional to target a vCPU in a virtual machine
@@ -528,6 +530,8 @@ struct irq_chip {
 	void		(*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg);
 	void		(*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg);
 
+	int		(*irq_set_auxdata)(struct irq_data *data, unsigned int which, u64 auxval);
+
 	int		(*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state);
 	int		(*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state);
 
--- a/include/linux/irqchip/irq-ims-msi.h
+++ b/include/linux/irqchip/irq-ims-msi.h
@@ -5,6 +5,7 @@
 #define _LINUX_IRQCHIP_IRQ_IMS_MSI_H
 
 #include <linux/types.h>
+#include <linux/bits.h>
 
 /**
  * ims_hw_slot - The hardware layout of an IMS based MSI message
@@ -23,8 +24,30 @@ struct ims_slot {
 	u32	ctrl;
 } __packed;
 
+/*
+ * The IMS control word utilizes bit 0-2 for interrupt control. The remaining
+ * bits can contain auxiliary data.
+ */
+#define IMS_CONTROL_WORD_IRQMASK	GENMASK(2, 0)
+#define IMS_CONTROL_WORD_AUXMASK	GENMASK(31, 3)
+
 /* Bit to mask the interrupt in ims_hw_slot::ctrl */
-#define IMS_VECTOR_CTRL_MASK	0x01
+#define IMS_CTRL_VECTOR_MASKBIT		BIT(0)
+
+/* Auxiliary control word data related defines */
+enum {
+	IMS_AUXDATA_CONTROL_WORD,
+};
+
+#define IMS_CTRL_PASID_ENABLE		BIT(3)
+#define IMS_CTRL_PASID_SHIFT		12
+
+static inline u32 ims_ctrl_pasid_aux(unsigned int pasid, bool enable)
+{
+	u32 auxval = pasid << IMS_CTRL_PASID_SHIFT;
+
+	return enable ? auxval | IMS_CTRL_PASID_ENABLE : auxval;
+}
 
 /**
  * struct ims_array_info - Information to create an IMS array domain
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2752,3 +2752,35 @@ int irq_set_irqchip_state(unsigned int i
 	return err;
 }
 EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_set_auxdata - Set auxiliary data
+ * @irq:	Interrupt to update
+ * @which:	Selector which data to update
+ * @val:	Auxiliary data value
+ *
+ * Function to update auxiliary data for an interrupt, e.g. to update data
+ * which is stored in a shared register or data storage (e.g. IMS).
+ */
+int irq_set_auxdata(unsigned int irq, unsigned int which, u64 val)
+{
+	struct irq_desc *desc;
+	struct irq_data *data;
+	unsigned long flags;
+	int res = -ENODEV;
+
+	desc = irq_get_desc_buslock(irq, &flags, 0);
+	if (!desc)
+		return -EINVAL;
+
+	for (data = &desc->irq_data; data; data = irqd_get_parent_data(data)) {
+		if (data->chip->irq_set_auxdata) {
+			res = data->chip->irq_set_auxdata(data, which, val);
+			break;
+		}
+	}
+
+	irq_put_desc_busunlock(desc, flags);
+	return res;
+}
+EXPORT_SYMBOL_GPL(irq_set_auxdata);

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 14:44               ` Thomas Gleixner
@ 2020-10-09 14:52                 ` Jason Gunthorpe
  2020-10-09 16:02                   ` Thomas Gleixner
  0 siblings, 1 reply; 50+ messages in thread
From: Jason Gunthorpe @ 2020-10-09 14:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Fri, Oct 09, 2020 at 04:44:27PM +0200, Thomas Gleixner wrote:
> > This is really not that different from what I was describing for queue
> > contexts - the queue context needs to be assigned to the irq # before
> > it can be used in the irq chip other wise there is no idea where to
> > write the msg to. Just like pasid here.
> 
> Not really. In the IDXD case the storage is known when the host device
> and the irq domain is initialized which is not the case for your variant
> and it neither needs to send a magic command to the device to update the
> data.

I mean, needing the PASID vs needing the memory address before the IRQ
can be used are basically the same issue. Data needs to be attached to
the IRQ before it can be programmed.. In this case programming with
the wrong PASID could lead to a security issue.

> All the IDXD driver has to do is:
> 
>    auxval = ims_ctrl_pasid_aux(pasid, enabled);
>    irq_set_auxdata(irqnr, IMS_AUXDATA_CONTROL_WORD, auxval);
> 
> I agree that irq_set_auxdata() is not the most elegant thing, but the
> alternative solutions I looked at are just worse.

It seems reasonable, but quite an obfuscated way to tell a driver they
need to hold irq_get_desc_buslock() when touching data shared with the
irqchip ops.. Not that I have a better suggestion

Jason

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm
  2020-10-09 14:52                 ` Jason Gunthorpe
@ 2020-10-09 16:02                   ` Thomas Gleixner
  0 siblings, 0 replies; 50+ messages in thread
From: Thomas Gleixner @ 2020-10-09 16:02 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Dave Jiang, vkoul, megha.dey, maz, bhelgaas, alex.williamson,
	jacob.jun.pan, ashok.raj, yi.l.liu, baolu.lu, kevin.tian,
	sanjay.k.kumar, tony.luck, jing.lin, dan.j.williams, kwankhede,
	eric.auger, parav, rafael, netanelg, shahafs, yan.y.zhao,
	pbonzini, samuel.ortiz, mona.hossain, dmaengine, linux-kernel,
	x86, linux-pci, kvm

On Fri, Oct 09 2020 at 11:52, Jason Gunthorpe wrote:
> On Fri, Oct 09, 2020 at 04:44:27PM +0200, Thomas Gleixner wrote:
>> > This is really not that different from what I was describing for queue
>> > contexts - the queue context needs to be assigned to the irq # before
>> > it can be used in the irq chip other wise there is no idea where to
>> > write the msg to. Just like pasid here.
>> 
>> Not really. In the IDXD case the storage is known when the host device
>> and the irq domain is initialized which is not the case for your variant
>> and it neither needs to send a magic command to the device to update the
>> data.
>
> I mean, needing the PASID vs needing the memory address before the IRQ
> can be use are basically the same issue. Data needs to be attached to
> the IRQ before it can be programmed.. In this case programming with
> the wrong PASID could lead to a security issue.

Yeah. I looked at doing it similar to the callback I added for
retrieving the shadow storage pointer, but the PASID is not necessarily
established at that point.

>> I agree that irq_set_auxdata() is not the most elegant thing, but the
>> alternative solutions I looked at are just worse.
>
> It seems reasonable, but quite an obfuscated way to tell a driver they
> need to hold irq_get_desc_buslock() when touching data shared with the
> irqchip ops.. Not that I have a better suggestion

It's an obfuscated way to make obfuscated hardware supported :)

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support
  2020-10-08  7:54     ` David Woodhouse
@ 2020-10-20 21:42       ` Dey, Megha
  0 siblings, 0 replies; 50+ messages in thread
From: Dey, Megha @ 2020-10-20 21:42 UTC (permalink / raw)
  To: David Woodhouse, Thomas Gleixner, Jiang, Dave, vkoul, maz,
	bhelgaas, alex.williamson, Pan, Jacob jun, Raj, Ashok, jgg, Liu,
	Yi L, Lu, Baolu, Tian, Kevin, Kumar, Sanjay K, Luck, Tony, Lin,
	Jing, Williams, Dan J, kwankhede, eric.auger, parav, rafael,
	netanelg, shahafs, yan.y.zhao, pbonzini, Ortiz, Samuel, Hossain,
	Mona
  Cc: dmaengine, linux-kernel, x86, linux-pci, kvm



Hi David,
 
> On Wed, 2020-09-30 at 20:32 +0200, Thomas Gleixner wrote:
> > On Tue, Sep 15 2020 at 16:27, Dave Jiang wrote:
> > > @@ -1303,9 +1303,10 @@ static void
> intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
> > >  	case X86_IRQ_ALLOC_TYPE_HPET:
> > >  	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
> > >  	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
> > > +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
> > >  		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
> > >  			set_hpet_sid(irte, info->devid);
> > > -		else
> > > +		else if (info->type != X86_IRQ_ALLOC_TYPE_DEV_MSI)
> > >  			set_msi_sid(irte,
> > >  			msi_desc_to_pci_dev(info->desc));
> >
> > Gah. this starts to become unreadable.
> >
> > diff --git a/drivers/iommu/intel/irq_remapping.c
> b/drivers/iommu/intel/irq_remapping.c
> > index 8f4ce72570ce..0c1ea8ceec31 100644
> > --- a/drivers/iommu/intel/irq_remapping.c
> > +++ b/drivers/iommu/intel/irq_remapping.c
> > @@ -1271,6 +1271,16 @@ static struct irq_chip intel_ir_chip = {
> >  	.irq_set_vcpu_affinity	= intel_ir_set_vcpu_affinity,
> >  };
> >
> > +static void irte_prepare_msg(struct msi_msg *msg, int index, int sub_handle)
> > +{
> > +	msg->address_hi = MSI_ADDR_BASE_HI;
> > +	msg->data = sub_handle;
> > +	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
> > +			  MSI_ADDR_IR_SHV |
> > +			  MSI_ADDR_IR_INDEX1(index) |
> > +			  MSI_ADDR_IR_INDEX2(index);
> > +}
> > +
> >  static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
> >  					     struct irq_cfg *irq_cfg,
> >  					     struct irq_alloc_info *info,
> > @@ -1312,19 +1322,18 @@ static void
> intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
> >  		break;
> >
> >  	case X86_IRQ_ALLOC_TYPE_HPET:
> > +		set_hpet_sid(irte, info->hpet_id);
> > +		irte_prepare_msg(msg, index, sub_handle);
> > +		break;
> > +
> >  	case X86_IRQ_ALLOC_TYPE_MSI:
> >  	case X86_IRQ_ALLOC_TYPE_MSIX:
> > -		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
> > -			set_hpet_sid(irte, info->hpet_id);
> > -		else
> > -			set_msi_sid(irte, info->msi_dev);
> > -
> > -		msg->address_hi = MSI_ADDR_BASE_HI;
> > -		msg->data = sub_handle;
> > -		msg->address_lo = MSI_ADDR_BASE_LO |
> MSI_ADDR_IR_EXT_INT |
> > -				  MSI_ADDR_IR_SHV |
> > -				  MSI_ADDR_IR_INDEX1(index) |
> > -				  MSI_ADDR_IR_INDEX2(index);
> > +		set_msi_sid(irte, info->msi_dev);
> > +		irte_prepare_msg(msg, index, sub_handle);
> > +		break;
> > +
> > +	case X86_IRQ_ALLOC_TYPE_DEV_MSI:
> > +		irte_prepare_msg(msg, index, sub_handle);
> >  		break;
> >
> >  	default:
> >
> > Hmm?
> 
> It'd get a bit nicer if you *always* did the irte_prepare_msg() part to
> generate the MSI message. Let the IOAPIC driver swizzle that into the
> IOAPIC RTE for itself. You have no business composing an IOAPIC RTE
> here.
> 
> Then your switch statement is *only* for setting the SID in the IRTE
> appropriately.

I don’t think I fully understand what needs to be done, but if we move the IOAPIC RTE configuration into the IOAPIC driver, wouldn't that mean that the IOAPIC driver should be aware of interrupt remapping?

Right now the IOAPIC case here configures an RTE entry, and the MSI-related cases configure the MSI message (through irte_prepare_msg()). So unless irte_prepare_msg() somehow also configures the IOAPIC message, we cannot always do irte_prepare_msg(), right?

It would be great if you could provide some insight here 😊 

Thanks,
Megha

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, back to index

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <160021207013.67751.8220471499908137671.stgit@djiang5-desk3.ch.intel.com>
2020-09-15 23:27 ` [PATCH v3 01/18] irqchip: Add IMS (Interrupt Message Storage) driver Dave Jiang
2020-09-30 18:23   ` Thomas Gleixner
2020-10-01 22:59     ` Dey, Megha
2020-09-15 23:27 ` [PATCH v3 02/18] iommu/vt-d: Add DEV-MSI support Dave Jiang
2020-09-30 18:32   ` Thomas Gleixner
2020-10-01 23:26     ` Dey, Megha
2020-10-02 11:13       ` Thomas Gleixner
2020-10-08  7:54     ` David Woodhouse
2020-10-20 21:42       ` Dey, Megha
2020-09-15 23:28 ` [PATCH v3 05/18] dmaengine: idxd: add IMS support in base driver Dave Jiang
2020-09-30 18:47   ` Thomas Gleixner
2020-09-30 18:51     ` Jason Gunthorpe
2020-09-30 21:48       ` Thomas Gleixner
2020-09-30 21:49       ` Raj, Ashok
2020-09-30 21:57         ` Thomas Gleixner
2020-10-01  1:07           ` Raj, Ashok
2020-10-01  8:44             ` Thomas Gleixner
2020-09-30 22:38         ` Jason Gunthorpe
2020-10-01 20:48     ` Dave Jiang
2020-09-15 23:28 ` [PATCH v3 07/18] dmaengine: idxd: add basic mdev registration and helper functions Dave Jiang
2020-09-15 23:28 ` [PATCH v3 10/18] dmaengine: idxd: virtual device commands emulation Dave Jiang
2020-09-15 23:28 ` [PATCH v3 12/18] dmaengine: idxd: add mdev type as a new wq type Dave Jiang
2020-09-15 23:29 ` [PATCH v3 13/18] dmaengine: idxd: add dedicated wq mdev type Dave Jiang
2020-09-15 23:29 ` [PATCH v3 14/18] dmaengine: idxd: add new wq state for mdev Dave Jiang
2020-09-15 23:29 ` [PATCH v3 15/18] dmaengine: idxd: add error notification from host driver to mediated device Dave Jiang
2020-09-15 23:29 ` [PATCH v3 16/18] dmaengine: idxd: add ABI documentation for mediated device support Dave Jiang
2020-09-17 15:06 ` [PATCH v3 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver Jason Gunthorpe
2020-09-17 17:15   ` Dave Jiang
2020-09-17 17:27     ` Jason Gunthorpe
2020-09-17 17:30     ` Alex Williamson
2020-09-17 17:37       ` Jason Gunthorpe
     [not found] ` <160021248280.67751.12525558281536923518.stgit@djiang5-desk3.ch.intel.com>
2020-09-30 18:36   ` [PATCH v3 04/18] dmaengine: idxd: add interrupt handle request support Thomas Gleixner
2020-10-01 20:16     ` Dave Jiang
     [not found] ` <160021253189.67751.12686144284999931703.stgit@djiang5-desk3.ch.intel.com>
2020-09-30 19:57   ` [PATCH v3 11/18] dmaengine: idxd: ims setup for the vdcm Thomas Gleixner
2020-10-07 21:54     ` Dave Jiang
2020-10-08  7:39       ` Thomas Gleixner
2020-10-08 16:51         ` Dave Jiang
2020-10-08 23:17           ` Thomas Gleixner
2020-10-08 23:32             ` Jason Gunthorpe
2020-10-09  0:27               ` Dave Jiang
2020-10-09  1:22               ` Raj, Ashok
2020-10-09 11:57                 ` Jason Gunthorpe
2020-10-09 12:43                   ` Raj, Ashok
2020-10-09 12:49                     ` Jason Gunthorpe
2020-10-09 13:02                       ` Raj, Ashok
2020-10-09 13:12                         ` Jason Gunthorpe
2020-10-09 13:40                           ` Raj, Ashok
2020-10-09 14:44               ` Thomas Gleixner
2020-10-09 14:52                 ` Jason Gunthorpe
2020-10-09 16:02                   ` Thomas Gleixner

dmaengine Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/dmaengine/0 dmaengine/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 dmaengine dmaengine/ https://lore.kernel.org/dmaengine \
		dmaengine@vger.kernel.org
	public-inbox-index dmaengine

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.dmaengine


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git