All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/6] ATS capability support for Intel IOMMU
@ 2009-02-12 12:50 Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 1/6] PCI: support the ATS capability Yu Zhao
                   ` (9 more replies)
  0 siblings, 10 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

This patch series implements Address Translation Service support for
the Intel IOMMU. ATS enables a PCI Endpoint to request DMA address
translations from the IOMMU and cache them in the Endpoint, thus
alleviating IOMMU pressure and improving hardware performance in
I/O virtualization environments.


Changelog: v2 -> v3
  1, print an error message if the VT-d hardware detects an invalid
     descriptor on the Queued Invalidation interface (David Woodhouse)
  2, avoid using pci_find_ext_capability every time when reading ATS
     Invalidate Queue Depth (Matthew Wilcox)
Changelog: v1 -> v2
  added 'static' prefix to a local LIST_HEAD (Andrew Morton)


Yu Zhao (6):
  PCI: support the ATS capability
  VT-d: parse ATSR in DMA Remapping Reporting Structure
  VT-d: add queue invalidation fault status support
  VT-d: add device IOTLB invalidation support
  VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
  VT-d: support the device IOTLB

 drivers/pci/dmar.c           |  230 ++++++++++++++++++++++++++++++++++++++----
 drivers/pci/intel-iommu.c    |  135 ++++++++++++++++++++-----
 drivers/pci/intr_remapping.c |   21 ++--
 drivers/pci/pci.c            |   72 +++++++++++++
 include/linux/dmar.h         |    9 ++
 include/linux/intel-iommu.h  |   19 +++-
 include/linux/pci.h          |   16 +++
 include/linux/pci_regs.h     |   10 ++
 8 files changed, 457 insertions(+), 55 deletions(-)


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 1/6] PCI: support the ATS capability
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 2/6] VT-d: parse ATSR in DMA Remapping Reporting Structure Yu Zhao
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

The ATS spec can be found at http://www.pcisig.com/specifications/iov/ats/
(it requires membership).

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/pci.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h      |   16 ++++++++++
 include/linux/pci_regs.h |   10 ++++++
 3 files changed, 98 insertions(+), 0 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e3efe6b..87018ab 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1462,6 +1462,78 @@ void pci_enable_ari(struct pci_dev *dev)
 }
 
 /**
+ * pci_enable_ats - enable the ATS capability
+ * @dev: the PCI device
+ * @ps: the IOMMU page shift
+ *
+ * Returns 0 on success, or a negative value on error.
+ */
+int pci_enable_ats(struct pci_dev *dev, int ps)
+{
+	int pos;
+	u16 ctrl;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return -ENODEV;
+
+	if (ps < PCI_ATS_MIN_STU)
+		return -EINVAL;
+
+	ctrl = PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU) | PCI_ATS_CTRL_ENABLE;
+	pci_write_config_word(dev, pos + PCI_ATS_CTRL, ctrl);
+
+	dev->ats_enabled = 1;
+
+	return 0;
+}
+
+/**
+ * pci_disable_ats - disable the ATS capability
+ * @dev: the PCI device
+ */
+void pci_disable_ats(struct pci_dev *dev)
+{
+	int pos;
+	u16 ctrl;
+
+	if (!dev->ats_enabled)
+		return;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return;
+
+	pci_read_config_word(dev, pos + PCI_ATS_CTRL, &ctrl);
+	ctrl &= ~PCI_ATS_CTRL_ENABLE;
+	pci_write_config_word(dev, pos + PCI_ATS_CTRL, ctrl);
+}
+
+/**
+ * pci_ats_queue_depth - query ATS Invalidate Queue Depth
+ * @dev: the PCI device
+ *
+ * Returns the queue depth on success, or 0 on error.
+ */
+int pci_ats_queue_depth(struct pci_dev *dev)
+{
+	int pos;
+	u16 cap;
+
+	if (dev->ats_qdep)
+		return dev->ats_qdep;
+
+	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS);
+	if (!pos)
+		return 0;
+
+	pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap);
+	dev->ats_qdep = PCI_ATS_CAP_QDEP(cap) ?  PCI_ATS_CAP_QDEP(cap) :
+						 PCI_ATS_MAX_QDEP;
+	return dev->ats_qdep;
+}
+
+/**
  * pci_swizzle_interrupt_pin - swizzle INTx for device behind bridge
  * @dev: the PCI device
  * @pin: the INTx pin (1=INTA, 2=INTB, 3=INTD, 4=INTD)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7bd624b..cab680b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -254,6 +254,7 @@ struct pci_dev {
 	unsigned int 	msi_enabled:1;
 	unsigned int	msix_enabled:1;
 	unsigned int	ari_enabled:1;	/* ARI forwarding */
+	unsigned int	ats_enabled:1;	/* Address Translation Service */
 	unsigned int	is_managed:1;
 	unsigned int	is_pcie:1;
 	unsigned int	state_saved:1;
@@ -270,6 +271,7 @@ struct pci_dev {
 	struct list_head msi_list;
 #endif
 	struct pci_vpd *vpd;
+	int		ats_qdep;	/* ATS Invalidate Queue Depth */
 };
 
 extern struct pci_dev *alloc_pci_dev(void);
@@ -1194,5 +1196,19 @@ int pci_ext_cfg_avail(struct pci_dev *dev);
 
 void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
 
+extern int pci_enable_ats(struct pci_dev *dev, int ps);
+extern void pci_disable_ats(struct pci_dev *dev);
+extern int pci_ats_queue_depth(struct pci_dev *dev);
+/**
+ * pci_ats_enabled - query the ATS status
+ * @dev: the PCI device
+ *
+ * Returns 1 if ATS capability is enabled, or 0 if not.
+ */
+static inline int pci_ats_enabled(struct pci_dev *dev)
+{
+	return dev->ats_enabled;
+}
+
 #endif /* __KERNEL__ */
 #endif /* LINUX_PCI_H */
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index 027815b..3858b4f 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -498,6 +498,7 @@
 #define PCI_EXT_CAP_ID_DSN	3
 #define PCI_EXT_CAP_ID_PWR	4
 #define PCI_EXT_CAP_ID_ARI	14
+#define PCI_EXT_CAP_ID_ATS	15
 
 /* Advanced Error Reporting */
 #define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
@@ -615,4 +616,13 @@
 #define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
 #define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
 
+/* Address Translation Service */
+#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
+#define  PCI_ATS_CAP_QDEP(x)	((x) & 0x1f)	/* Invalidate Queue Depth */
+#define  PCI_ATS_MAX_QDEP	32	/* Max Invalidate Queue Depth */
+#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
+#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
+#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f)	/* Smallest Translation Unit */
+#define  PCI_ATS_MIN_STU	12	/* shift of minimum STU block */
+
 #endif /* LINUX_PCI_REGS_H */
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 2/6] VT-d: parse ATSR in DMA Remapping Reporting Structure
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 1/6] PCI: support the ATS capability Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 3/6] VT-d: add queue invalidation fault status support Yu Zhao
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

Parse the Root Port ATS Capability Reporting Structure in the DMA
Remapping Reporting Structure ACPI table.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/dmar.c          |  112 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/dmar.h        |    9 ++++
 include/linux/intel-iommu.h |    1 +
 3 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index f5a662a..bd37b3c 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -254,6 +254,84 @@ rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
 	}
 	return ret;
 }
+
+static LIST_HEAD(dmar_atsr_units);
+
+static int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
+{
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
+	atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
+	if (!atsru)
+		return -ENOMEM;
+
+	atsru->hdr = hdr;
+	atsru->include_all = atsr->flags & 0x1;
+
+	list_add(&atsru->list, &dmar_atsr_units);
+
+	return 0;
+}
+
+static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
+{
+	int rc;
+	struct acpi_dmar_atsr *atsr;
+
+	if (atsru->include_all)
+		return 0;
+
+	atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+	rc = dmar_parse_dev_scope((void *)(atsr + 1),
+				(void *)atsr + atsr->header.length,
+				&atsru->devices_cnt, &atsru->devices,
+				atsr->segment);
+	if (rc || !atsru->devices_cnt) {
+		list_del(&atsru->list);
+		kfree(atsru);
+	}
+
+	return rc;
+}
+
+int dmar_find_matched_atsr_unit(struct pci_dev *dev)
+{
+	int i;
+	struct pci_bus *bus;
+	struct acpi_dmar_atsr *atsr;
+	struct dmar_atsr_unit *atsru;
+
+	list_for_each_entry(atsru, &dmar_atsr_units, list) {
+		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
+		if (atsr->segment == pci_domain_nr(dev->bus))
+			goto found;
+	}
+
+	return 0;
+
+found:
+	for (bus = dev->bus; bus; bus = bus->parent) {
+		struct pci_dev *bridge = bus->self;
+
+		if (!bridge || !bridge->is_pcie ||
+		    bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
+			return 0;
+
+		if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
+			for (i = 0; i < atsru->devices_cnt; i++)
+				if (atsru->devices[i] == bridge)
+					return 1;
+			break;
+		}
+	}
+
+	if (atsru->include_all)
+		return 1;
+
+	return 0;
+}
 #endif
 
 static void __init
@@ -261,22 +339,28 @@ dmar_table_print_dmar_entry(struct acpi_dmar_header *header)
 {
 	struct acpi_dmar_hardware_unit *drhd;
 	struct acpi_dmar_reserved_memory *rmrr;
+	struct acpi_dmar_atsr *atsr;
 
 	switch (header->type) {
 	case ACPI_DMAR_TYPE_HARDWARE_UNIT:
-		drhd = (struct acpi_dmar_hardware_unit *)header;
+		drhd = container_of(header, struct acpi_dmar_hardware_unit,
+				    header);
 		printk (KERN_INFO PREFIX
-			"DRHD (flags: 0x%08x)base: 0x%016Lx\n",
-			drhd->flags, (unsigned long long)drhd->address);
+			"DRHD base: %#016Lx flags: %#x\n",
+			(unsigned long long)drhd->address, drhd->flags);
 		break;
 	case ACPI_DMAR_TYPE_RESERVED_MEMORY:
-		rmrr = (struct acpi_dmar_reserved_memory *)header;
-
+		rmrr = container_of(header, struct acpi_dmar_reserved_memory,
+				    header);
 		printk (KERN_INFO PREFIX
-			"RMRR base: 0x%016Lx end: 0x%016Lx\n",
+			"RMRR base: %#016Lx end: %#016Lx\n",
 			(unsigned long long)rmrr->base_address,
 			(unsigned long long)rmrr->end_address);
 		break;
+	case ACPI_DMAR_TYPE_ATSR:
+		atsr = container_of(header, struct acpi_dmar_atsr, header);
+		printk(KERN_INFO PREFIX "ATSR flags: %#x\n", atsr->flags);
+		break;
 	}
 }
 
@@ -341,6 +425,11 @@ parse_dmar_table(void)
 			ret = dmar_parse_one_rmrr(entry_header);
 #endif
 			break;
+		case ACPI_DMAR_TYPE_ATSR:
+#ifdef CONFIG_DMAR
+			ret = dmar_parse_one_atsr(entry_header);
+#endif
+			break;
 		default:
 			printk(KERN_WARNING PREFIX
 				"Unknown DMAR structure type\n");
@@ -409,11 +498,19 @@ int __init dmar_dev_scope_init(void)
 #ifdef CONFIG_DMAR
 	{
 		struct dmar_rmrr_unit *rmrr, *rmrr_n;
+		struct dmar_atsr_unit *atsr, *atsr_n;
+
 		list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
 			ret = rmrr_parse_dev(rmrr);
 			if (ret)
 				return ret;
 		}
+
+		list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
+			ret = atsr_parse_dev(atsr);
+			if (ret)
+				return ret;
+		}
 	}
 #endif
 
@@ -446,6 +543,9 @@ int __init dmar_table_init(void)
 #ifdef CONFIG_DMAR
 	if (list_empty(&dmar_rmrr_units))
 		printk(KERN_INFO PREFIX "No RMRR found\n");
+
+	if (list_empty(&dmar_atsr_units))
+		printk(KERN_INFO PREFIX "No ATSR found\n");
 #endif
 
 #ifdef CONFIG_INTR_REMAP
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index f284407..d3a1234 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -142,6 +142,15 @@ struct dmar_rmrr_unit {
 
 #define for_each_rmrr_units(rmrr) \
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
+
+struct dmar_atsr_unit {
+	struct list_head list;		/* list of ATSR units */
+	struct acpi_dmar_header *hdr;	/* ACPI header */
+	struct pci_dev **devices;	/* target devices */
+	int devices_cnt;		/* target device count */
+	u8 include_all:1;		/* include all ports */
+};
+
 /* Intel DMAR  initialization functions */
 extern int intel_iommu_init(void);
 #else
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index c4f6c10..5323ad9 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -316,6 +316,7 @@ static inline void __iommu_flush_cache(
 }
 
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
+extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
 
 extern int alloc_iommu(struct dmar_drhd_unit *drhd);
 extern void free_iommu(struct intel_iommu *iommu);
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 3/6] VT-d: add queue invalidation fault status support
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 1/6] PCI: support the ATS capability Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 2/6] VT-d: parse ATSR in DMA Remapping Reporting Structure Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 4/6] VT-d: add device IOTLB invalidation support Yu Zhao
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

Check the fault register after submitting a queue invalidation request.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/dmar.c           |   63 ++++++++++++++++++++++++++++++++----------
 drivers/pci/intr_remapping.c |   21 ++++++++------
 include/linux/intel-iommu.h  |    4 ++-
 3 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index bd37b3c..66dda07 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -671,19 +671,53 @@ static inline void reclaim_free_desc(struct q_inval *qi)
 	}
 }
 
+static int qi_check_fault(struct intel_iommu *iommu, int index)
+{
+	u32 fault;
+	int head;
+	struct q_inval *qi = iommu->qi;
+	int wait_index = (index + 1) % QI_LENGTH;
+
+	fault = readl(iommu->reg + DMAR_FSTS_REG);
+
+	/*
+	 * If IQE happens, the head points to the descriptor associated
+	 * with the error. No new descriptors are fetched until the IQE
+	 * is cleared.
+	 */
+	if (fault & DMA_FSTS_IQE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		if ((head >> DMAR_IQ_OFFSET) == index) {
+			printk(KERN_ERR "VT-d detected invalid descriptor: "
+				"low=%llx, high=%llx\n",
+				(unsigned long long)qi->desc[index].low,
+				(unsigned long long)qi->desc[index].high);
+			memcpy(&qi->desc[index], &qi->desc[wait_index],
+					sizeof(struct qi_desc));
+			__iommu_flush_cache(iommu, &qi->desc[index],
+					sizeof(struct qi_desc));
+			writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Submit the queued invalidation descriptor to the remapping
  * hardware unit and wait for its completion.
  */
-void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
+int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 {
+	int rc = 0;
 	struct q_inval *qi = iommu->qi;
 	struct qi_desc *hw, wait_desc;
 	int wait_index, index;
 	unsigned long flags;
 
 	if (!qi)
-		return;
+		return 0;
 
 	hw = qi->desc;
 
@@ -701,7 +735,8 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 
 	hw[index] = *desc;
 
-	wait_desc.low = QI_IWD_STATUS_DATA(2) | QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
+	wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
+			QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
 	wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
 
 	hw[wait_index] = wait_desc;
@@ -712,13 +747,11 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 	qi->free_head = (qi->free_head + 2) % QI_LENGTH;
 	qi->free_cnt -= 2;
 
-	spin_lock(&iommu->register_lock);
 	/*
 	 * update the HW tail register indicating the presence of
 	 * new descriptors.
 	 */
-	writel(qi->free_head << 4, iommu->reg + DMAR_IQT_REG);
-	spin_unlock(&iommu->register_lock);
+	writel(qi->free_head << DMAR_IQ_OFFSET, iommu->reg + DMAR_IQT_REG);
 
 	while (qi->desc_status[wait_index] != QI_DONE) {
 		/*
@@ -728,6 +761,10 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 		 * a deadlock where the interrupt context can wait indefinitely
 		 * for free slots in the queue.
 		 */
+		rc = qi_check_fault(iommu, index);
+		if (rc)
+			break;
+
 		spin_unlock(&qi->q_lock);
 		cpu_relax();
 		spin_lock(&qi->q_lock);
@@ -737,6 +774,8 @@ void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 
 	reclaim_free_desc(qi);
 	spin_unlock_irqrestore(&qi->q_lock, flags);
+
+	return rc;
 }
 
 /*
@@ -749,13 +788,13 @@ void qi_global_iec(struct intel_iommu *iommu)
 	desc.low = QI_IEC_TYPE;
 	desc.high = 0;
 
+	/* should never fail */
 	qi_submit_sync(&desc, iommu);
 }
 
 int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
 		     u64 type, int non_present_entry_flush)
 {
-
 	struct qi_desc desc;
 
 	if (non_present_entry_flush) {
@@ -769,10 +808,7 @@ int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm,
 			| QI_CC_GRAN(type) | QI_CC_TYPE;
 	desc.high = 0;
 
-	qi_submit_sync(&desc, iommu);
-
-	return 0;
-
+	return qi_submit_sync(&desc, iommu);
 }
 
 int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
@@ -802,10 +838,7 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 	desc.high = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih)
 		| QI_IOTLB_AM(size_order);
 
-	qi_submit_sync(&desc, iommu);
-
-	return 0;
-
+	return qi_submit_sync(&desc, iommu);
 }
 
 /*
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f78371b..45effc5 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -207,7 +207,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	return index;
 }
 
-static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
+static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
 {
 	struct qi_desc desc;
 
@@ -215,7 +215,7 @@ static void qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
 		   | QI_IEC_SELECTIVE;
 	desc.high = 0;
 
-	qi_submit_sync(&desc, iommu);
+	return qi_submit_sync(&desc, iommu);
 }
 
 int map_irq_to_irte_handle(int irq, u16 *sub_handle)
@@ -283,6 +283,7 @@ int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
 
 int modify_irte(int irq, struct irte *irte_modified)
 {
+	int rc;
 	int index;
 	struct irte *irte;
 	struct intel_iommu *iommu;
@@ -303,14 +304,15 @@ int modify_irte(int irq, struct irte *irte_modified)
 	set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
 	__iommu_flush_cache(iommu, irte, sizeof(*irte));
 
-	qi_flush_iec(iommu, index, 0);
-
+	rc = qi_flush_iec(iommu, index, 0);
 	spin_unlock(&irq_2_ir_lock);
-	return 0;
+
+	return rc;
 }
 
 int flush_irte(int irq)
 {
+	int rc;
 	int index;
 	struct intel_iommu *iommu;
 	struct irq_2_iommu *irq_iommu;
@@ -326,10 +328,10 @@ int flush_irte(int irq)
 
 	index = irq_iommu->irte_index + irq_iommu->sub_handle;
 
-	qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+	rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 	spin_unlock(&irq_2_ir_lock);
 
-	return 0;
+	return rc;
 }
 
 struct intel_iommu *map_ioapic_to_ir(int apic)
@@ -355,6 +357,7 @@ struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
 
 int free_irte(int irq)
 {
+	int rc = 0;
 	int index, i;
 	struct irte *irte;
 	struct intel_iommu *iommu;
@@ -375,7 +378,7 @@ int free_irte(int irq)
 	if (!irq_iommu->sub_handle) {
 		for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
 			set_64bit((unsigned long *)irte, 0);
-		qi_flush_iec(iommu, index, irq_iommu->irte_mask);
+		rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 	}
 
 	irq_iommu->iommu = NULL;
@@ -385,7 +388,7 @@ int free_irte(int irq)
 
 	spin_unlock(&irq_2_ir_lock);
 
-	return 0;
+	return rc;
 }
 
 static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5323ad9..0a220c9 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -53,6 +53,7 @@
 #define	DMAR_PHMLIMIT_REG 0x78	/* pmrr high limit */
 #define DMAR_IQH_REG	0x80	/* Invalidation queue head register */
 #define DMAR_IQT_REG	0x88	/* Invalidation queue tail register */
+#define DMAR_IQ_OFFSET	4	/* Invalidation queue head/tail offset */
 #define DMAR_IQA_REG	0x90	/* Invalidation queue addr register */
 #define DMAR_ICS_REG	0x98	/* Invalidation complete status register */
 #define DMAR_IRTA_REG	0xb8    /* Interrupt remapping table addr register */
@@ -194,6 +195,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 /* FSTS_REG */
 #define DMA_FSTS_PPF ((u32)2)
 #define DMA_FSTS_PFO ((u32)1)
+#define DMA_FSTS_IQE (1 << 4)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
@@ -329,7 +331,7 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 			  unsigned int size_order, u64 type,
 			  int non_present_entry_flush);
 
-extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
+extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
 extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t);
 extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t);
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 4/6] VT-d: add device IOTLB invalidation support
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (2 preceding siblings ...)
  2009-02-12 12:50 ` [PATCH v3 3/6] VT-d: add queue invalidation fault status support Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 5/6] VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps Yu Zhao
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

Support device IOTLB invalidation to flush the translation cached in the
Endpoint.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/dmar.c          |   63 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/intel-iommu.h |   13 ++++++++-
 2 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 66dda07..93b38e7 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -664,7 +664,8 @@ void free_iommu(struct intel_iommu *iommu)
  */
 static inline void reclaim_free_desc(struct q_inval *qi)
 {
-	while (qi->desc_status[qi->free_tail] == QI_DONE) {
+	while (qi->desc_status[qi->free_tail] == QI_DONE ||
+	       qi->desc_status[qi->free_tail] == QI_ABORT) {
 		qi->desc_status[qi->free_tail] = QI_FREE;
 		qi->free_tail = (qi->free_tail + 1) % QI_LENGTH;
 		qi->free_cnt++;
@@ -674,10 +675,13 @@ static inline void reclaim_free_desc(struct q_inval *qi)
 static int qi_check_fault(struct intel_iommu *iommu, int index)
 {
 	u32 fault;
-	int head;
+	int head, tail;
 	struct q_inval *qi = iommu->qi;
 	int wait_index = (index + 1) % QI_LENGTH;
 
+	if (qi->desc_status[wait_index] == QI_ABORT)
+		return -EAGAIN;
+
 	fault = readl(iommu->reg + DMAR_FSTS_REG);
 
 	/*
@@ -701,6 +705,32 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
 		}
 	}
 
+	/*
+	 * If ITE happens, all pending wait_desc commands are aborted.
+	 * No new descriptors are fetched until the ITE is cleared.
+	 */
+	if (fault & DMA_FSTS_ITE) {
+		head = readl(iommu->reg + DMAR_IQH_REG);
+		head = ((head >> DMAR_IQ_OFFSET) - 1 + QI_LENGTH) % QI_LENGTH;
+		head |= 1;
+		tail = readl(iommu->reg + DMAR_IQT_REG);
+		tail = ((tail >> DMAR_IQ_OFFSET) - 1 + QI_LENGTH) % QI_LENGTH;
+
+		writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
+
+		do {
+			if (qi->desc_status[head] == QI_IN_USE)
+				qi->desc_status[head] = QI_ABORT;
+			head = (head - 2 + QI_LENGTH) % QI_LENGTH;
+		} while (head != tail);
+
+		if (qi->desc_status[wait_index] == QI_ABORT)
+			return -EAGAIN;
+	}
+
+	if (fault & DMA_FSTS_ICE)
+		writel(DMA_FSTS_ICE, iommu->reg + DMAR_FSTS_REG);
+
 	return 0;
 }
 
@@ -710,7 +740,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int index)
  */
 int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 {
-	int rc = 0;
+	int rc;
 	struct q_inval *qi = iommu->qi;
 	struct qi_desc *hw, wait_desc;
 	int wait_index, index;
@@ -721,6 +751,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 
 	hw = qi->desc;
 
+restart:
+	rc = 0;
+
 	spin_lock_irqsave(&qi->q_lock, flags);
 	while (qi->free_cnt < 3) {
 		spin_unlock_irqrestore(&qi->q_lock, flags);
@@ -775,6 +808,9 @@ int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu)
 	reclaim_free_desc(qi);
 	spin_unlock_irqrestore(&qi->q_lock, flags);
 
+	if (rc == -EAGAIN)
+		goto restart;
+
 	return rc;
 }
 
@@ -841,6 +877,27 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 	return qi_submit_sync(&desc, iommu);
 }
 
+int qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			u64 addr, unsigned mask)
+{
+	struct qi_desc desc;
+
+	if (mask) {
+		BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
+		addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
+	} else
+		desc.high = QI_DEV_IOTLB_ADDR(addr);
+
+	if (qdep >= QI_DEV_IOTLB_MAX_INVS)
+		qdep = 0;
+
+	desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
+		   QI_DIOTLB_TYPE;
+
+	return qi_submit_sync(&desc, iommu);
+}
+
 /*
  * Enable Queued Invalidation interface. This is a must to support
  * interrupt-remapping. Also used by DMA-remapping, which replaces
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0a220c9..d82bdac 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -196,6 +196,8 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define DMA_FSTS_PPF ((u32)2)
 #define DMA_FSTS_PFO ((u32)1)
 #define DMA_FSTS_IQE (1 << 4)
+#define DMA_FSTS_ICE (1 << 5)
+#define DMA_FSTS_ITE (1 << 6)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
@@ -224,7 +226,8 @@ do {									\
 enum {
 	QI_FREE,
 	QI_IN_USE,
-	QI_DONE
+	QI_DONE,
+	QI_ABORT
 };
 
 #define QI_CC_TYPE		0x1
@@ -253,6 +256,12 @@ enum {
 #define QI_CC_DID(did)		(((u64)did) << 16)
 #define QI_CC_GRAN(gran)	(((u64)gran) >> (DMA_CCMD_INVL_GRANU_OFFSET-4))
 
+#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
+#define QI_DEV_IOTLB_QDEP(qdep)	(((qdep) & 0x1f) << 16)
+#define QI_DEV_IOTLB_ADDR(addr)	((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_SIZE	1
+#define QI_DEV_IOTLB_MAX_INVS	32
+
 struct qi_desc {
 	u64 low, high;
 };
@@ -330,6 +339,8 @@ extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
 extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 			  unsigned int size_order, u64 type,
 			  int non_present_entry_flush);
+extern int qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
+			       u64 addr, unsigned mask);
 
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 5/6] VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (3 preceding siblings ...)
  2009-02-12 12:50 ` [PATCH v3 4/6] VT-d: add device IOTLB invalidation support Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-12 12:50 ` [PATCH v3 6/6] VT-d: support the device IOTLB Yu Zhao
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

Make iommu_flush_iotlb_psi() and flush_unmaps() easier to read.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/intel-iommu.c |   46 +++++++++++++++++++++-----------------------
 1 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index f4b7c79..5fdbed3 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -925,30 +925,27 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	u64 addr, unsigned int pages, int non_present_entry_flush)
 {
-	unsigned int mask;
+	int rc;
+	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
 
 	BUG_ON(addr & (~VTD_PAGE_MASK));
 	BUG_ON(pages == 0);
 
-	/* Fallback to domain selective flush if no PSI support */
-	if (!cap_pgsel_inv(iommu->cap))
-		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-						DMA_TLB_DSI_FLUSH,
-						non_present_entry_flush);
-
 	/*
+	 * Fallback to domain selective flush if no PSI support or the size is
+	 * too big.
 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
 	 * aligned to the size
 	 */
-	mask = ilog2(__roundup_pow_of_two(pages));
-	/* Fallback to domain selective flush if size is too big */
-	if (mask > cap_max_amask_val(iommu->cap))
-		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
-			DMA_TLB_DSI_FLUSH, non_present_entry_flush);
-
-	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
-					DMA_TLB_PSI_FLUSH,
-					non_present_entry_flush);
+	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
+		rc = iommu->flush.flush_iotlb(iommu, did, 0, 0,
+						DMA_TLB_DSI_FLUSH,
+						non_present_entry_flush);
+	else
+		rc = iommu->flush.flush_iotlb(iommu, did, addr, mask,
+						DMA_TLB_PSI_FLUSH,
+						non_present_entry_flush);
+	return rc;
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -2301,15 +2298,16 @@ static void flush_unmaps(void)
 		if (!iommu)
 			continue;
 
-		if (deferred_flush[i].next) {
-			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
-						 DMA_TLB_GLOBAL_FLUSH, 0);
-			for (j = 0; j < deferred_flush[i].next; j++) {
-				__free_iova(&deferred_flush[i].domain[j]->iovad,
-						deferred_flush[i].iova[j]);
-			}
-			deferred_flush[i].next = 0;
+		if (!deferred_flush[i].next)
+			continue;
+
+		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
+					 DMA_TLB_GLOBAL_FLUSH, 0);
+		for (j = 0; j < deferred_flush[i].next; j++) {
+			__free_iova(&deferred_flush[i].domain[j]->iovad,
+					deferred_flush[i].iova[j]);
 		}
+		deferred_flush[i].next = 0;
 	}
 
 	list_size = 0;
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH v3 6/6] VT-d: support the device IOTLB
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (4 preceding siblings ...)
  2009-02-12 12:50 ` [PATCH v3 5/6] VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps Yu Zhao
@ 2009-02-12 12:50 ` Yu Zhao
  2009-02-14 23:20   ` Grant Grundler
  2009-02-13  3:44 ` [PATCH v3 0/6] ATS capability support for Intel IOMMU Matthew Wilcox
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 20+ messages in thread
From: Yu Zhao @ 2009-02-12 12:50 UTC (permalink / raw)
  To: jbarnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel, Yu Zhao

Support device IOTLB (i.e. ATS) for both native and KVM environments.

Signed-off-by: Yu Zhao <yu.zhao@intel.com>
---
 drivers/pci/intel-iommu.c   |   95 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/intel-iommu.h |    1 +
 2 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5fdbed3..fe09e7a 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -125,6 +125,7 @@ static inline void context_set_fault_enable(struct context_entry *context)
 }
 
 #define CONTEXT_TT_MULTI_LEVEL 0
+#define CONTEXT_TT_DEV_IOTLB	1
 
 static inline void context_set_translation_type(struct context_entry *context,
 						unsigned long value)
@@ -240,6 +241,7 @@ struct device_domain_info {
 	struct list_head global; /* link to global list */
 	u8 bus;			/* PCI bus numer */
 	u8 devfn;		/* PCI devfn number */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
 	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
 	struct dmar_domain *domain; /* pointer to domain */
 };
@@ -922,6 +924,74 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 	return 0;
 }
 
+static struct device_domain_info *
+iommu_support_dev_iotlb(struct dmar_domain *domain, u8 bus, u8 devfn)
+{
+	int found = 0;
+	unsigned long flags;
+	struct device_domain_info *info;
+	struct intel_iommu *iommu = device_to_iommu(bus, devfn);
+
+	if (!ecap_dev_iotlb_support(iommu->ecap))
+		return NULL;
+
+	if (!iommu->qi)
+		return NULL;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link)
+		if (info->dev && info->bus == bus && info->devfn == devfn) {
+			found = 1;
+			break;
+		}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+
+	if (!found)
+		return NULL;
+
+	if (!dmar_find_matched_atsr_unit(info->dev))
+		return NULL;
+
+	info->iommu = iommu;
+	if (!pci_ats_queue_depth(info->dev))
+		return NULL;
+
+	return info;
+}
+
+static void iommu_enable_dev_iotlb(struct device_domain_info *info)
+{
+	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
+}
+
+static void iommu_disable_dev_iotlb(struct device_domain_info *info)
+{
+	if (info->dev && pci_ats_enabled(info->dev))
+		pci_disable_ats(info->dev);
+}
+
+static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
+				  u64 addr, unsigned mask)
+{
+	int rc;
+	u16 sid, qdep;
+	unsigned long flags;
+	struct device_domain_info *info;
+
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->dev || !pci_ats_enabled(info->dev))
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		qdep = pci_ats_queue_depth(info->dev);
+		rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+		if (rc)
+			printk(KERN_ERR "IOMMU: flush device IOTLB failed\n");
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}
+
 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 	u64 addr, unsigned int pages, int non_present_entry_flush)
 {
@@ -945,6 +1015,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
 		rc = iommu->flush.flush_iotlb(iommu, did, addr, mask,
 						DMA_TLB_PSI_FLUSH,
 						non_present_entry_flush);
+	if (!rc && !non_present_entry_flush)
+		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
+
 	return rc;
 }
 
@@ -1469,6 +1542,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	unsigned long ndomains;
 	int id;
 	int agaw;
+	struct device_domain_info *info;
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
@@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	context_set_domain_id(context, id);
 	context_set_address_width(context, iommu->agaw);
 	context_set_address_root(context, virt_to_phys(pgd));
-	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
+	info = iommu_support_dev_iotlb(domain, bus, devfn);
+	if (info)
+		context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB);
+	else
+		context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 	context_set_fault_enable(context);
 	context_set_present(context);
 	domain_flush_cache(domain, context, sizeof(*context));
@@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 		iommu_flush_write_buffer(iommu);
 	else
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
+	if (info)
+		iommu_enable_dev_iotlb(info);
 
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
@@ -1687,6 +1767,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
 			info->dev->dev.archdata.iommu = NULL;
 		spin_unlock_irqrestore(&device_domain_lock, flags);
 
+		iommu_disable_dev_iotlb(info);
 		iommu = device_to_iommu(info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
 		free_devinfo_mem(info);
@@ -2304,8 +2385,14 @@ static void flush_unmaps(void)
 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
 					 DMA_TLB_GLOBAL_FLUSH, 0);
 		for (j = 0; j < deferred_flush[i].next; j++) {
-			__free_iova(&deferred_flush[i].domain[j]->iovad,
-					deferred_flush[i].iova[j]);
+			unsigned long mask;
+			struct iova *iova = deferred_flush[i].iova[j];
+
+			mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT;
+			mask = ilog2(mask >> VTD_PAGE_SHIFT);
+			iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
+					iova->pfn_lo << PAGE_SHIFT, mask);
+			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
 		}
 		deferred_flush[i].next = 0;
 	}
@@ -2792,6 +2879,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
 				info->dev->dev.archdata.iommu = NULL;
 			spin_unlock_irqrestore(&device_domain_lock, flags);
 
+			iommu_disable_dev_iotlb(info);
 			iommu_detach_dev(iommu, info->bus, info->devfn);
 			free_devinfo_mem(info);
 
@@ -2840,6 +2928,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 
 		spin_unlock_irqrestore(&device_domain_lock, flags1);
 
+		iommu_disable_dev_iotlb(info);
 		iommu = device_to_iommu(info->bus, info->devfn);
 		iommu_detach_dev(iommu, info->bus, info->devfn);
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d82bdac..609af82 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -123,6 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define ecap_qis(e)		((e) & 0x2)
 #define ecap_eim_support(e)	((e >> 4) & 0x1)
 #define ecap_ir_support(e)	((e >> 3) & 0x1)
+#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
 
 
-- 
1.6.1


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (5 preceding siblings ...)
  2009-02-12 12:50 ` [PATCH v3 6/6] VT-d: support the device IOTLB Yu Zhao
@ 2009-02-13  3:44 ` Matthew Wilcox
  2009-02-13  5:27   ` Zhao, Yu
  2009-02-14 22:59 ` Grant Grundler
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 20+ messages in thread
From: Matthew Wilcox @ 2009-02-13  3:44 UTC (permalink / raw)
  To: Yu Zhao; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Thu, Feb 12, 2009 at 08:50:32PM +0800, Yu Zhao wrote:
>   2, avoid using pci_find_ext_capability every time when reading ATS
>      Invalidate Queue Depth (Matthew Wilcox)

er ... I didn't say it was a problem.  I said I couldn't tell if it was a
problem.  There's no point in taking up an extra 4 bytes per pci_dev if
it's not a performance problem.  How often do we query the queue depth?
Is this something a device driver will call once per device and then
remember for itself, or only use at setup?  Or is it something we call
every millisecond?

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-13  3:44 ` [PATCH v3 0/6] ATS capability support for Intel IOMMU Matthew Wilcox
@ 2009-02-13  5:27   ` Zhao, Yu
  0 siblings, 0 replies; 20+ messages in thread
From: Zhao, Yu @ 2009-02-13  5:27 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

Matthew Wilcox wrote:
> On Thu, Feb 12, 2009 at 08:50:32PM +0800, Yu Zhao wrote:
>>   2, avoid using pci_find_ext_capability every time when reading ATS
>>      Invalidate Queue Depth (Matthew Wilcox)
> 
> er ... I didn't say it was a problem.  I said I couldn't tell if it was a
> problem.  There's no point in taking up an extra 4 bytes per pci_dev if
> it's not a performance problem.  How often do we query the queue depth?
> Is this something a device driver will call once per device and then
> remember for itself, or only use at setup?  Or is it something we call
> every millisecond?
> 

The query function is called only once per device in the v2 patch series. The
queue depth is cached in a VT-d private data structure, and it's used when the
device driver calls pci_unmap_single or pci_unmap_sg. My concern was that
packing everything into `pci_dev' would make it grow very fast. 

For v3, the queue depth is removed from the VT-d `device_domain_info'.
Instead, it's cached in `pci_dev', and the query function is called every
time the queue depth is needed. It would be easier to write the
IOMMU-specific ATS code for AMD, IBM and other vendors' IOMMUs, if they
have the same requirement as Intel's (needs to query the queue depth every
time when invalidating the IOMMU TLB cache inside the device). So it looks
like having the queue depth in `pci_dev' is reasonable, but I'm not sure.

Following is the logics I used in v2.

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 261b6bd..a7ff7cb 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -240,6 +241,8 @@ struct device_domain_info {
 	struct list_head global; /* link to global list */
 	u8 bus;			/* PCI bus numer */
 	u8 devfn;		/* PCI devfn number */
+	int qdep;		/* invalidate queue depth */
+	struct intel_iommu *iommu; /* IOMMU used by this device */
 	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
 	struct dmar_domain *domain; /* pointer to domain */
 };

<snipped>

+	info->iommu = iommu;
+	info->qdep = pci_ats_qdep(info->dev);
+	if (!info->qdep)
+		return NULL;

<snipped>

+	list_for_each_entry(info, &domain->devices, link) {
+		if (!info->dev || !pci_ats_enabled(info->dev))
+			continue;
+
+		sid = info->bus << 8 | info->devfn;
+		rc = qi_flush_dev_iotlb(info->iommu, sid,
+					info->qdep, addr, mask);
+		if (rc)
+			printk(KERN_ERR "IOMMU: flush device IOTLB failed\n");
+	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
+}

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (6 preceding siblings ...)
  2009-02-13  3:44 ` [PATCH v3 0/6] ATS capability support for Intel IOMMU Matthew Wilcox
@ 2009-02-14 22:59 ` Grant Grundler
  2009-02-26  2:50   ` Yu Zhao
  2009-03-20  2:30 ` Jesse Barnes
  2009-03-20  2:30 ` Jesse Barnes
  9 siblings, 1 reply; 20+ messages in thread
From: Grant Grundler @ 2009-02-14 22:59 UTC (permalink / raw)
  To: Yu Zhao; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Thu, Feb 12, 2009 at 08:50:32PM +0800, Yu Zhao wrote:
> This patch series implements Address Translation Service support for
> the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
> DMA address translation from the IOMMU and cache the translation in
> the Endpoint, thus alleviate IOMMU pressure and improve the hardware
> performance in the I/O virtualization environment.
> 
> 
> Changelog: v2 -> v3
>   1, throw error message if VT-d hardware detects invalid descriptor
>      on Queued Invalidation interface (David Woodhouse)
>   2, avoid using pci_find_ext_capability every time when reading ATS
>      Invalidate Queue Depth (Matthew Wilcox)
> Changelog: v1 -> v2
>   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
> 
> 
> Yu Zhao (6):
>   PCI: support the ATS capability
>   VT-d: parse ATSR in DMA Remapping Reporting Structure
>   VT-d: add queue invalidation fault status support
>   VT-d: add device IOTLB invalidation support
>   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
>   VT-d: support the device IOTLB
> 
>  drivers/pci/dmar.c           |  230 ++++++++++++++++++++++++++++++++++++++----

Yu,
Can you please add something to Documentation/PCI/pci.txt?
New API I'm seeing are:
+extern int pci_enable_ats(struct pci_dev *dev, int ps);
+extern void pci_disable_ats(struct pci_dev *dev);
+extern int pci_ats_queue_depth(struct pci_dev *dev);

Do these also need to be EXPORT_SYMBOL_GPL() as well?
Or are drivers never expected to call the above?

thanks,
grant

>  drivers/pci/intel-iommu.c    |  135 ++++++++++++++++++++-----
>  drivers/pci/intr_remapping.c |   21 ++--
>  drivers/pci/pci.c            |   72 +++++++++++++
>  include/linux/dmar.h         |    9 ++
>  include/linux/intel-iommu.h  |   19 +++-
>  include/linux/pci.h          |   16 +++
>  include/linux/pci_regs.h     |   10 ++
>  8 files changed, 457 insertions(+), 55 deletions(-)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 6/6] VT-d: support the device IOTLB
  2009-02-12 12:50 ` [PATCH v3 6/6] VT-d: support the device IOTLB Yu Zhao
@ 2009-02-14 23:20   ` Grant Grundler
  2009-02-26  3:21     ` Yu Zhao
  0 siblings, 1 reply; 20+ messages in thread
From: Grant Grundler @ 2009-02-14 23:20 UTC (permalink / raw)
  To: Yu Zhao; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Thu, Feb 12, 2009 at 08:50:38PM +0800, Yu Zhao wrote:
> Support device IOTLB (i.e. ATS) for both native and KVM environments.
> 
> Signed-off-by: Yu Zhao <yu.zhao@intel.com>
> ---
>  drivers/pci/intel-iommu.c   |   95 +++++++++++++++++++++++++++++++++++++++++-
>  include/linux/intel-iommu.h |    1 +
>  2 files changed, 93 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
> index 5fdbed3..fe09e7a 100644
> --- a/drivers/pci/intel-iommu.c
> +++ b/drivers/pci/intel-iommu.c
> @@ -125,6 +125,7 @@ static inline void context_set_fault_enable(struct context_entry *context)
>  }
>  
>  #define CONTEXT_TT_MULTI_LEVEL 0
> +#define CONTEXT_TT_DEV_IOTLB	1
>  
>  static inline void context_set_translation_type(struct context_entry *context,
>  						unsigned long value)
> @@ -240,6 +241,7 @@ struct device_domain_info {
>  	struct list_head global; /* link to global list */
>  	u8 bus;			/* PCI bus numer */
>  	u8 devfn;		/* PCI devfn number */
> +	struct intel_iommu *iommu; /* IOMMU used by this device */
>  	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
>  	struct dmar_domain *domain; /* pointer to domain */
>  };
> @@ -922,6 +924,74 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
>  	return 0;
>  }
>  
> +static struct device_domain_info *
> +iommu_support_dev_iotlb(struct dmar_domain *domain, u8 bus, u8 devfn)
> +{
> +	int found = 0;
> +	unsigned long flags;
> +	struct device_domain_info *info;
> +	struct intel_iommu *iommu = device_to_iommu(bus, devfn);
> +
> +	if (!ecap_dev_iotlb_support(iommu->ecap))
> +		return NULL;
> +
> +	if (!iommu->qi)
> +		return NULL;
> +
> +	spin_lock_irqsave(&device_domain_lock, flags);
> +	list_for_each_entry(info, &domain->devices, link)
> +		if (info->dev && info->bus == bus && info->devfn == devfn) {
> +			found = 1;
> +			break;
> +		}
> +	spin_unlock_irqrestore(&device_domain_lock, flags);
> +
> +	if (!found)
> +		return NULL;
> +
> +	if (!dmar_find_matched_atsr_unit(info->dev))
> +		return NULL;
> +
> +	info->iommu = iommu;
> +	if (!pci_ats_queue_depth(info->dev))
> +		return NULL;
> +
> +	return info;
> +}
> +
> +static void iommu_enable_dev_iotlb(struct device_domain_info *info)
> +{
> +	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
> +}

Why is a static function defined that calls a global function?

> +
> +static void iommu_disable_dev_iotlb(struct device_domain_info *info)
> +{
> +	if (info->dev && pci_ats_enabled(info->dev))
> +		pci_disable_ats(info->dev);
> +}

ditto. pci_disable_ats() should be able to handle the case when
"info->dev" is NULL and will know if ATS is enabled.

I think both of these functions can be dropped and just directly call pci_*_ats().

> +
> +static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
> +				  u64 addr, unsigned mask)
> +{
> +	int rc;
> +	u16 sid, qdep;
> +	unsigned long flags;
> +	struct device_domain_info *info;
> +
> +	spin_lock_irqsave(&device_domain_lock, flags);
> +	list_for_each_entry(info, &domain->devices, link) {

Would it be possible to define a single domain for each PCI device?
Or does "domain" represent an IOMMU?
Sorry, I forgot...I'm sure someone has mentioned this the past.

I want to point out list_for_each_entry() is effectively a "nested loop".
iommu_flush_dev_iotlb() will get called alot from flush_unmaps().
Perhaps do the lookup once there and pass that as a parameter?
I don't know if that is feasible. But if this is a very frequently
used code path, every CPU cycle counts.


> +		if (!info->dev || !pci_ats_enabled(info->dev))
> +			continue;
> +
> +		sid = info->bus << 8 | info->devfn;
> +		qdep = pci_ats_queue_depth(info->dev);

re Matthew Wilcox's comment - looks like caching ats_queue_depth
is appropriate.

thanks,
grant

> +		rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
> +		if (rc)
> +			printk(KERN_ERR "IOMMU: flush device IOTLB failed\n");

Can this be a dev_printk please?

Perhaps in general review the use of "printk" so when errors are reported,
users will know which devices might be affected by the failure.
If more than a few printk's should be "converted" to dev_printk(), I'd
be happy if that were a seperate patch (submitted later).


> +	}
> +	spin_unlock_irqrestore(&device_domain_lock, flags);
> +}
> +
>  static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
>  	u64 addr, unsigned int pages, int non_present_entry_flush)
>  {
> @@ -945,6 +1015,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
>  		rc = iommu->flush.flush_iotlb(iommu, did, addr, mask,
>  						DMA_TLB_PSI_FLUSH,
>  						non_present_entry_flush);
> +	if (!rc && !non_present_entry_flush)
> +		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
> +
>  	return rc;
>  }
>  
> @@ -1469,6 +1542,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
>  	unsigned long ndomains;
>  	int id;
>  	int agaw;
> +	struct device_domain_info *info;
>  
>  	pr_debug("Set context mapping for %02x:%02x.%d\n",
>  		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
> @@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
>  	context_set_domain_id(context, id);
>  	context_set_address_width(context, iommu->agaw);
>  	context_set_address_root(context, virt_to_phys(pgd));
> -	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
> +	info = iommu_support_dev_iotlb(domain, bus, devfn);
> +	if (info)
> +		context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB);
> +	else
> +		context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);

Would it be ok to rewrite this as:
+		context_set_translation_type(context,
+			 info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL);

>  	context_set_fault_enable(context);
>  	context_set_present(context);
>  	domain_flush_cache(domain, context, sizeof(*context));
> @@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
>  		iommu_flush_write_buffer(iommu);
>  	else
>  		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);

Adding a blank line here would make this more readable.
(AFAIK, not required by coding style, just my opinion.)

> +	if (info)
> +		iommu_enable_dev_iotlb(info);

Could iommu_enable_dev_iotlb() (or pci_enable_ats()) check if "info" is NULL?
Then this would just be a simple function call.

And it would be consistent with usage of iommu_disable_dev_iotlb().

thanks,
grant

>  
>  	spin_unlock_irqrestore(&iommu->lock, flags);
>  
> @@ -1687,6 +1767,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
>  			info->dev->dev.archdata.iommu = NULL;
>  		spin_unlock_irqrestore(&device_domain_lock, flags);
>  
> +		iommu_disable_dev_iotlb(info);
>  		iommu = device_to_iommu(info->bus, info->devfn);
>  		iommu_detach_dev(iommu, info->bus, info->devfn);
>  		free_devinfo_mem(info);
> @@ -2304,8 +2385,14 @@ static void flush_unmaps(void)
>  		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
>  					 DMA_TLB_GLOBAL_FLUSH, 0);
>  		for (j = 0; j < deferred_flush[i].next; j++) {
> -			__free_iova(&deferred_flush[i].domain[j]->iovad,
> -					deferred_flush[i].iova[j]);
> +			unsigned long mask;
> +			struct iova *iova = deferred_flush[i].iova[j];
> +
> +			mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT;
> +			mask = ilog2(mask >> VTD_PAGE_SHIFT);
> +			iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
> +					iova->pfn_lo << PAGE_SHIFT, mask);
> +			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
>  		}
>  		deferred_flush[i].next = 0;
>  	}
> @@ -2792,6 +2879,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
>  				info->dev->dev.archdata.iommu = NULL;
>  			spin_unlock_irqrestore(&device_domain_lock, flags);
>  
> +			iommu_disable_dev_iotlb(info);
>  			iommu_detach_dev(iommu, info->bus, info->devfn);
>  			free_devinfo_mem(info);
>  
> @@ -2840,6 +2928,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
>  
>  		spin_unlock_irqrestore(&device_domain_lock, flags1);
>  
> +		iommu_disable_dev_iotlb(info);
>  		iommu = device_to_iommu(info->bus, info->devfn);
>  		iommu_detach_dev(iommu, info->bus, info->devfn);
>  
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index d82bdac..609af82 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -123,6 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
>  #define ecap_qis(e)		((e) & 0x2)
>  #define ecap_eim_support(e)	((e >> 4) & 0x1)
>  #define ecap_ir_support(e)	((e >> 3) & 0x1)
> +#define ecap_dev_iotlb_support(e)	(((e) >> 2) & 0x1)
>  #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
>  
>  
> -- 
> 1.6.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-14 22:59 ` Grant Grundler
@ 2009-02-26  2:50   ` Yu Zhao
  2009-02-26  3:46     ` Greg KH
  2009-02-27  7:19     ` Grant Grundler
  0 siblings, 2 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-26  2:50 UTC (permalink / raw)
  To: Grant Grundler; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Sun, Feb 15, 2009 at 06:59:10AM +0800, Grant Grundler wrote:
> On Thu, Feb 12, 2009 at 08:50:32PM +0800, Yu Zhao wrote:
> > This patch series implements Address Translation Service support for
> > the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
> > DMA address translation from the IOMMU and cache the translation in
> > the Endpoint, thus alleviate IOMMU pressure and improve the hardware
> > performance in the I/O virtualization environment.
> > 
> > 
> > Changelog: v2 -> v3
> >   1, throw error message if VT-d hardware detects invalid descriptor
> >      on Queued Invalidation interface (David Woodhouse)
> >   2, avoid using pci_find_ext_capability every time when reading ATS
> >      Invalidate Queue Depth (Matthew Wilcox)
> > Changelog: v1 -> v2
> >   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
> > 
> > 
> > Yu Zhao (6):
> >   PCI: support the ATS capability
> >   VT-d: parse ATSR in DMA Remapping Reporting Structure
> >   VT-d: add queue invalidation fault status support
> >   VT-d: add device IOTLB invalidation support
> >   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
> >   VT-d: support the device IOTLB
> > 
> >  drivers/pci/dmar.c           |  230 ++++++++++++++++++++++++++++++++++++++----
> 
> Yu,
> Can you please add something to Documentation/PCI/pci.txt?
> New API I'm seeing are:
> +extern int pci_enable_ats(struct pci_dev *dev, int ps);
> +extern void pci_disable_ats(struct pci_dev *dev);
> +extern int pci_ats_queue_depth(struct pci_dev *dev);

Yes, I'll document these new API.

> Do these also need to be EXPORT_SYMBOL_GPL() as well?
> Or are drivers never expected to call the above?

PCI device driver shouldn't use these API, only IOMMU driver (can't be module)
would use them. Anyway it's a good idea to export them :-)

Thanks,
Yu

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 6/6] VT-d: support the device IOTLB
  2009-02-14 23:20   ` Grant Grundler
@ 2009-02-26  3:21     ` Yu Zhao
  0 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-02-26  3:21 UTC (permalink / raw)
  To: Grant Grundler; +Cc: jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Sun, Feb 15, 2009 at 07:20:52AM +0800, Grant Grundler wrote:
> On Thu, Feb 12, 2009 at 08:50:38PM +0800, Yu Zhao wrote:
> > Support device IOTLB (i.e. ATS) for both native and KVM environments.
> > +
> > +static void iommu_enable_dev_iotlb(struct device_domain_info *info)
> > +{
> > +	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
> > +}
> 
> Why is a static function defined that calls a global function?

There would be some extra steps to do before VT-d enables ATS in the
future, so this wrapper makes the code easier to extend later.

> 
> > +
> > +static void iommu_disable_dev_iotlb(struct device_domain_info *info)
> > +{
> > +	if (info->dev && pci_ats_enabled(info->dev))
> > +		pci_disable_ats(info->dev);
> > +}
> 
> ditto. pci_disable_ats() should be able to handle the case when
> "info->dev" is NULL and will know if ATS is enabled.

The info->dev could be NULL only because the VT-d code makes it so. AMD
and IBM IOMMUs may not have this requirement. If we make pci_disable_ats()
accept a NULL pci_dev, it would fail to catch some errors, like calling
pci_disable_ats without having called pci_enable_ats first.

> 
> I think both of these functions can be dropped and just directly call pci_*_ats().
> 
> > +
> > +static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
> > +				  u64 addr, unsigned mask)
> > +{
> > +	int rc;
> > +	u16 sid, qdep;
> > +	unsigned long flags;
> > +	struct device_domain_info *info;
> > +
> > +	spin_lock_irqsave(&device_domain_lock, flags);
> > +	list_for_each_entry(info, &domain->devices, link) {
> 
> Would it be possible to define a single domain for each PCI device?
> Or does "domain" represent an IOMMU?
> Sorry, I forgot...I'm sure someone has mentioned this the past.

A domain represents one translation mapping. For device used by the host,
there is one domain per device. Device assigned to a guest shares one
domain.

> 
> I want to point out list_for_each_entry() is effectively a "nested loop".
> iommu_flush_dev_iotlb() will get called alot from flush_unmaps().
> Perhaps do the lookup once there and pass that as a parameter?
> I don't know if that is feasible. But if this is a very frequently
> used code path, every CPU cycle counts.

iommu_flush_dev_iotlb() is only used to flush the devices used in the
host, which means there is always one entry on the list.

> 
> 
> > +		if (!info->dev || !pci_ats_enabled(info->dev))
> > +			continue;
> > +
> > +		sid = info->bus << 8 | info->devfn;
> > +		qdep = pci_ats_queue_depth(info->dev);
> 
> re Matthew Wilcox's comment - looks like caching ats_queue_depth
> is appropriate.

Yes, it's cached as of v3.

> > +		rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
> > +		if (rc)
> > +			printk(KERN_ERR "IOMMU: flush device IOTLB failed\n");
> 
> Can this be a dev_printk please?

Yes, will replace it with dev_err().

> Perhaps in general review the use of "printk" so when errors are reported,
> users will know which devices might be affected by the failure.
> If more than a few printk's should be "converted" to dev_printk(), I'd
> be happy if that were a seperate patch (submitted later).
> 
> 
> >  	pr_debug("Set context mapping for %02x:%02x.%d\n",
> >  		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
> > @@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
> >  	context_set_domain_id(context, id);
> >  	context_set_address_width(context, iommu->agaw);
> >  	context_set_address_root(context, virt_to_phys(pgd));
> > -	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
> > +	info = iommu_support_dev_iotlb(domain, bus, devfn);
> > +	if (info)
> > +		context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB);
> > +	else
> > +		context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
> 
> Would it be ok to rewrite this as:
> +		context_set_translation_type(context,
> +			 info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL);

Yes, this one looks better.

> 
> >  	context_set_fault_enable(context);
> >  	context_set_present(context);
> >  	domain_flush_cache(domain, context, sizeof(*context));
> > @@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
> >  		iommu_flush_write_buffer(iommu);
> >  	else
> >  		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
> 
> Adding a blank line here would make this more readable.
> (AFAIK, not required by coding style, just my opinion.)

Yes, I prefer a blank line here too, somehow I missed it.

> 
> > +	if (info)
> > +		iommu_enable_dev_iotlb(info);
> 
> Could iommu_enable_dev_iotlb() (or pci_enable_ats()) check if "info" is NULL?
> Then this would just be a simple function call.
> 
> And it would be consistent with usage of iommu_disable_dev_iotlb().

Yes, good idea.

Thanks a lot for reviewing it!
Yu

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-26  2:50   ` Yu Zhao
@ 2009-02-26  3:46     ` Greg KH
  2009-02-27  7:19     ` Grant Grundler
  1 sibling, 0 replies; 20+ messages in thread
From: Greg KH @ 2009-02-26  3:46 UTC (permalink / raw)
  To: Yu Zhao
  Cc: Grant Grundler, jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Thu, Feb 26, 2009 at 10:50:35AM +0800, Yu Zhao wrote:
> On Sun, Feb 15, 2009 at 06:59:10AM +0800, Grant Grundler wrote:
> > On Thu, Feb 12, 2009 at 08:50:32PM +0800, Yu Zhao wrote:
> > > This patch series implements Address Translation Service support for
> > > the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
> > > DMA address translation from the IOMMU and cache the translation in
> > > the Endpoint, thus alleviate IOMMU pressure and improve the hardware
> > > performance in the I/O virtualization environment.
> > > 
> > > 
> > > Changelog: v2 -> v3
> > >   1, throw error message if VT-d hardware detects invalid descriptor
> > >      on Queued Invalidation interface (David Woodhouse)
> > >   2, avoid using pci_find_ext_capability every time when reading ATS
> > >      Invalidate Queue Depth (Matthew Wilcox)
> > > Changelog: v1 -> v2
> > >   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
> > > 
> > > 
> > > Yu Zhao (6):
> > >   PCI: support the ATS capability
> > >   VT-d: parse ATSR in DMA Remapping Reporting Structure
> > >   VT-d: add queue invalidation fault status support
> > >   VT-d: add device IOTLB invalidation support
> > >   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
> > >   VT-d: support the device IOTLB
> > > 
> > >  drivers/pci/dmar.c           |  230 ++++++++++++++++++++++++++++++++++++++----
> > 
> > Yu,
> > Can you please add something to Documentation/PCI/pci.txt?
> > New API I'm seeing are:
> > +extern int pci_enable_ats(struct pci_dev *dev, int ps);
> > +extern void pci_disable_ats(struct pci_dev *dev);
> > +extern int pci_ats_queue_depth(struct pci_dev *dev);
> 
> Yes, I'll document these new APIs.
> 
> > Do these also need to be EXPORT_SYMBOL_GPL() as well?
> > Or are drivers never expected to call the above?
> 
> PCI device driver shouldn't use these API, only IOMMU driver (can't be module)
> would use them. Anyway it's a good idea to export them :-)

Don't export them if no one is using them, that's just a waste of space.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-26  2:50   ` Yu Zhao
  2009-02-26  3:46     ` Greg KH
@ 2009-02-27  7:19     ` Grant Grundler
  1 sibling, 0 replies; 20+ messages in thread
From: Grant Grundler @ 2009-02-27  7:19 UTC (permalink / raw)
  To: Yu Zhao
  Cc: Grant Grundler, jbarnes, dwmw2, linux-pci, iommu, kvm, linux-kernel

On Thu, Feb 26, 2009 at 10:50:35AM +0800, Yu Zhao wrote:
...
> > Yu,
> > Can you please add something to Documentation/PCI/pci.txt?
> > New API I'm seeing are:
> > +extern int pci_enable_ats(struct pci_dev *dev, int ps);
> > +extern void pci_disable_ats(struct pci_dev *dev);
> > +extern int pci_ats_queue_depth(struct pci_dev *dev);
> 
> Yes, I'll document these new API.

Thank you! (But maybe this was a bad idea)

> > Do these also need to be EXPORT_SYMBOL_GPL() as well?
> > Or are drivers never expected to call the above?
> 
> PCI device driver shouldn't use these API, only IOMMU driver (can't be module)
> would use them. Anyway it's a good idea to export them :-)

No, it's not a good idea to export if only IOMMU drivers should use them.
Exporting the symbols can only lead to abuse.

In fact, my request to add them to pci.txt sounds like a bad idea.
I was thinking this was for device drivers.

In any case, documenting the API and intended use is good.
It's probably sufficient to add comments where the functions are defined.

thanks,
grant

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (7 preceding siblings ...)
  2009-02-14 22:59 ` Grant Grundler
@ 2009-03-20  2:30 ` Jesse Barnes
  2009-03-20  2:30 ` Jesse Barnes
  9 siblings, 0 replies; 20+ messages in thread
From: Jesse Barnes @ 2009-03-20  2:30 UTC (permalink / raw)
  To: Yu Zhao; +Cc: dwmw2, linux-pci, iommu, kvm, linux-kernel, Yu Zhao

On Thu, 12 Feb 2009 20:50:32 +0800
Yu Zhao <yu.zhao@intel.com> wrote:

> This patch series implements Address Translation Service support for
> the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
> DMA address translation from the IOMMU and cache the translation in
> the Endpoint, thus alleviate IOMMU pressure and improve the hardware
> performance in the I/O virtualization environment.
> 
> 
> Changelog: v2 -> v3
>   1, throw error message if VT-d hardware detects invalid descriptor
>      on Queued Invalidation interface (David Woodhouse)
>   2, avoid using pci_find_ext_capability every time when reading ATS
>      Invalidate Queue Depth (Matthew Wilcox)
> Changelog: v1 -> v2
>   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
> 
> 
> Yu Zhao (6):
>   PCI: support the ATS capability
>   VT-d: parse ATSR in DMA Remapping Reporting Structure
>   VT-d: add queue invalidation fault status support
>   VT-d: add device IOTLB invalidation support
>   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
>   VT-d: support the device IOTLB

Is this one ready too, Yu?  Care to send a respin incorporating the
last set of feedback?

Thanks,
-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
                   ` (8 preceding siblings ...)
  2009-03-20  2:30 ` Jesse Barnes
@ 2009-03-20  2:30 ` Jesse Barnes
  2009-03-20  2:47   ` Zhao, Yu
  9 siblings, 1 reply; 20+ messages in thread
From: Jesse Barnes @ 2009-03-20  2:30 UTC (permalink / raw)
  To: Yu Zhao; +Cc: dwmw2, linux-pci, iommu, kvm, linux-kernel, Yu Zhao

On Thu, 12 Feb 2009 20:50:32 +0800
Yu Zhao <yu.zhao@intel.com> wrote:

> This patch series implements Address Translation Service support for
> the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
> DMA address translation from the IOMMU and cache the translation in
> the Endpoint, thus alleviate IOMMU pressure and improve the hardware
> performance in the I/O virtualization environment.
> 
> 
> Changelog: v2 -> v3
>   1, throw error message if VT-d hardware detects invalid descriptor
>      on Queued Invalidation interface (David Woodhouse)
>   2, avoid using pci_find_ext_capability every time when reading ATS
>      Invalidate Queue Depth (Matthew Wilcox)
> Changelog: v1 -> v2
>   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
> 
> 
> Yu Zhao (6):
>   PCI: support the ATS capability
>   VT-d: parse ATSR in DMA Remapping Reporting Structure
>   VT-d: add queue invalidation fault status support
>   VT-d: add device IOTLB invalidation support
>   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
>   VT-d: support the device IOTLB

Um nevermind, this should go through the IOMMU tree (David?).

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-03-20  2:30 ` Jesse Barnes
@ 2009-03-20  2:47   ` Zhao, Yu
  2009-03-20 11:15     ` David Woodhouse
  0 siblings, 1 reply; 20+ messages in thread
From: Zhao, Yu @ 2009-03-20  2:47 UTC (permalink / raw)
  To: Jesse Barnes, dwmw2; +Cc: linux-pci, iommu, kvm, linux-kernel

Jesse Barnes wrote:
> On Thu, 12 Feb 2009 20:50:32 +0800
> Yu Zhao <yu.zhao@intel.com> wrote:
> 
>> This patch series implements Address Translation Service support for
>> the Intel IOMMU. ATS makes the PCI Endpoint be able to request the
>> DMA address translation from the IOMMU and cache the translation in
>> the Endpoint, thus alleviate IOMMU pressure and improve the hardware
>> performance in the I/O virtualization environment.
>>
>>
>> Changelog: v2 -> v3
>>   1, throw error message if VT-d hardware detects invalid descriptor
>>      on Queued Invalidation interface (David Woodhouse)
>>   2, avoid using pci_find_ext_capability every time when reading ATS
>>      Invalidate Queue Depth (Matthew Wilcox)
>> Changelog: v1 -> v2
>>   added 'static' prefix to a local LIST_HEAD (Andrew Morton)
>>
>>
>> Yu Zhao (6):
>>   PCI: support the ATS capability
>>   VT-d: parse ATSR in DMA Remapping Reporting Structure
>>   VT-d: add queue invalidation fault status support
>>   VT-d: add device IOTLB invalidation support
>>   VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps
>>   VT-d: support the device IOTLB
> 
> Um nevermind, this should go through the IOMMU tree (David?).

If it's possible, I'd like it go through the PCI tree because the ATS 
depends on the SR-IOV. This dependency is not reflected in this v3 
series since the SR-IOV is not in-tree and I don't want to break the 
build after people apply the ATS on their tree.

So Dave, can I get an ack from you and let Jesse pull the IOMMU change 
to his tree? Or let this ATS go to 2.6.31?

Thanks,
Yu

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-03-20  2:47   ` Zhao, Yu
@ 2009-03-20 11:15     ` David Woodhouse
  2009-03-23  5:22       ` Yu Zhao
  0 siblings, 1 reply; 20+ messages in thread
From: David Woodhouse @ 2009-03-20 11:15 UTC (permalink / raw)
  To: Zhao, Yu; +Cc: Jesse Barnes, linux-pci, iommu, kvm, linux-kernel

On Fri, 2009-03-20 at 10:47 +0800, Zhao, Yu wrote:
> If it's possible, I'd like it go through the PCI tree because the ATS 
> depends on the SR-IOV. This dependency is not reflected in this v3 
> series since the SR-IOV is not in-tree and I don't want to break the 
> build after people apply the ATS on their tree.

In what way will it depend on SR-IOV?

> So Dave, can I get an ack from you and let Jesse pull the IOMMU change
> to his tree? Or let this ATS go to 2.6.31?

Want to show the latest version of the patches which depend on SR-IOV,
and I can ack them?

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 0/6] ATS capability support for Intel IOMMU
  2009-03-20 11:15     ` David Woodhouse
@ 2009-03-23  5:22       ` Yu Zhao
  0 siblings, 0 replies; 20+ messages in thread
From: Yu Zhao @ 2009-03-23  5:22 UTC (permalink / raw)
  To: David Woodhouse; +Cc: Jesse Barnes, linux-pci, iommu, kvm, linux-kernel

On Fri, Mar 20, 2009 at 07:15:51PM +0800, David Woodhouse wrote:
> On Fri, 2009-03-20 at 10:47 +0800, Zhao, Yu wrote:
> > If it's possible, I'd like it go through the PCI tree because the ATS 
> > depends on the SR-IOV. This dependency is not reflected in this v3 
> > series since the SR-IOV is not in-tree and I don't want to break the 
> > build after people apply the ATS on their tree.
> 
> In what way will it depend on SR-IOV?

The SR-IOV spec section 3.7.4 says that the Smallest Translation Unit and
the Invalidate Queue Depth fields in the Virtual Function's ATS capability
are hard-wired to 0. So we need some special handling when enabling the ATS
capability for the Virtual Function.

                  Table 3-26: ATS Capability Register
-------------+-----------------------------------------+---------------+--------------
Bit Location | PF and VF Register Differences From ATS | PF Attributes | VF Attributes
-------------+-----------------------------------------+---------------+--------------
             |    Smallest Translation Unit (STU)      |               |
   20:16     |    Hardwired to 0 for VFs.              |      ATS      |      RO
             |    PF value applies to all VFs.         |               |
-------------+-----------------------------------------+---------------+--------------
             |    Invalidate Queue Depth               |               |
   28:24     |    Hardwired to 0 for VFs.              |      ATS      |      RO
             |    Depth of shared PF input queue.      |               |
-------------+-----------------------------------------+---------------+--------------

> > So Dave, can I get an ack from you and let Jesse pull the IOMMU change
> > to his tree? Or let this ATS go to 2.6.31?
> 
> Want to show the latest version of the patches which depend on SR-IOV,
> and I can ack them?

Sure, thanks!

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2009-03-23  5:21 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-12 12:50 [PATCH v3 0/6] ATS capability support for Intel IOMMU Yu Zhao
2009-02-12 12:50 ` [PATCH v3 1/6] PCI: support the ATS capability Yu Zhao
2009-02-12 12:50 ` [PATCH v3 2/6] VT-d: parse ATSR in DMA Remapping Reporting Structure Yu Zhao
2009-02-12 12:50 ` [PATCH v3 3/6] VT-d: add queue invalidation fault status support Yu Zhao
2009-02-12 12:50 ` [PATCH v3 4/6] VT-d: add device IOTLB invalidation support Yu Zhao
2009-02-12 12:50 ` [PATCH v3 5/6] VT-d: cleanup iommu_flush_iotlb_psi and flush_unmaps Yu Zhao
2009-02-12 12:50 ` [PATCH v3 6/6] VT-d: support the device IOTLB Yu Zhao
2009-02-14 23:20   ` Grant Grundler
2009-02-26  3:21     ` Yu Zhao
2009-02-13  3:44 ` [PATCH v3 0/6] ATS capability support for Intel IOMMU Matthew Wilcox
2009-02-13  5:27   ` Zhao, Yu
2009-02-14 22:59 ` Grant Grundler
2009-02-26  2:50   ` Yu Zhao
2009-02-26  3:46     ` Greg KH
2009-02-27  7:19     ` Grant Grundler
2009-03-20  2:30 ` Jesse Barnes
2009-03-20  2:30 ` Jesse Barnes
2009-03-20  2:47   ` Zhao, Yu
2009-03-20 11:15     ` David Woodhouse
2009-03-23  5:22       ` Yu Zhao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.