linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] iommu/dmar: expose fault counters via sysfs
@ 2019-10-15 15:11 Yuri Volchkov
  2019-10-15 15:11 ` [PATCH 1/2] iommu/dmar: collect fault statistics Yuri Volchkov
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Yuri Volchkov @ 2019-10-15 15:11 UTC (permalink / raw)
  To: iommu, linux-kernel, linux-pci
  Cc: joro, bhelgaas, dwmw2, neugebar, Yuri Volchkov

For health monitoring, it can be useful to know if iommu is behaving as
expected. DMAR faults can be an indicator that a device:
 - has been misconfigured, or
 - has experienced a hardware hiccup and replacement should
   be considered, or
 - has been issuing faults due to malicious activity

Currently the only way to check if there were any DMAR faults on the
host is to scan the dmesg output. However this approach is not very
elegant. The information we are looking for can be wrapped out of the
buffer, or masked (since it is a rate-limited print) by another
device.

The series adds counters for DMAR faults and exposes them via sysfs.

Yuri Volchkov (2):
  iommu/dmar: collect fault statistics
  iommu/dmar: catch early fault occurrences

 drivers/iommu/dmar.c        | 182 ++++++++++++++++++++++++++++++++----
 drivers/iommu/intel-iommu.c |   1 +
 drivers/pci/pci-sysfs.c     |  20 ++++
 include/linux/intel-iommu.h |   4 +
 include/linux/pci.h         |  11 +++
 5 files changed, 201 insertions(+), 17 deletions(-)

-- 
2.23.0




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] iommu/dmar: collect fault statistics
  2019-10-15 15:11 [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Yuri Volchkov
@ 2019-10-15 15:11 ` Yuri Volchkov
  2019-10-16 17:16   ` Bjorn Helgaas
  2019-10-15 15:11 ` [PATCH 2/2] iommu/dmar: catch early fault occurrences Yuri Volchkov
  2019-10-16  0:45 ` [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Lu Baolu
  2 siblings, 1 reply; 5+ messages in thread
From: Yuri Volchkov @ 2019-10-15 15:11 UTC (permalink / raw)
  To: iommu, linux-kernel, linux-pci
  Cc: joro, bhelgaas, dwmw2, neugebar, Yuri Volchkov

Currently dmar_fault handler only prints a message in the dmesg. This
commit introduces counters - how many faults have happened, and
exposes them via sysfs. Each pci device will have an entry
'dmar_faults' reading from which will give user 3 lines
  remap: xxx
  read: xxx
  write: xxx

This functionality is targeted for health monitoring daemons.

Signed-off-by: Yuri Volchkov <volchkov@amazon.de>
---
 drivers/iommu/dmar.c        | 133 +++++++++++++++++++++++++++++++-----
 drivers/pci/pci-sysfs.c     |  20 ++++++
 include/linux/intel-iommu.h |   3 +
 include/linux/pci.h         |  11 +++
 4 files changed, 150 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index eecd6a421667..0749873e3e41 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1107,6 +1107,7 @@ static void free_iommu(struct intel_iommu *iommu)
 	}
 
 	if (iommu->irq) {
+		destroy_workqueue(iommu->fault_wq);
 		if (iommu->pr_irq) {
 			free_irq(iommu->pr_irq, iommu);
 			dmar_free_hwirq(iommu->pr_irq);
@@ -1672,9 +1673,46 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
-static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
-		u8 fault_reason, int pasid, u16 source_id,
-		unsigned long long addr)
+struct dmar_fault_info {
+	struct work_struct work;
+	struct intel_iommu *iommu;
+	int type;
+	int pasid;
+	u16 source_id;
+	unsigned long long addr;
+	u8 fault_reason;
+};
+
+static struct kmem_cache *dmar_fault_info_cache;
+int __init dmar_fault_info_cache_init(void)
+{
+	int ret = 0;
+
+	dmar_fault_info_cache =
+		kmem_cache_create("dmar_fault_info",
+				  sizeof(struct dmar_fault_info), 0,
+				  SLAB_HWCACHE_ALIGN, NULL);
+	if (!dmar_fault_info_cache) {
+		pr_err("Couldn't create dmar_fault_info cache\n");
+		ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static inline void *alloc_dmar_fault_info(void)
+{
+	return kmem_cache_alloc(dmar_fault_info_cache, GFP_ATOMIC);
+}
+
+static inline void free_dmar_fault_info(void *vaddr)
+{
+	kmem_cache_free(dmar_fault_info_cache, vaddr);
+}
+
+static int dmar_fault_dump_one(struct intel_iommu *iommu, int type,
+			       u8 fault_reason, int pasid, u16 source_id,
+			       unsigned long long addr)
 {
 	const char *reason;
 	int fault_type;
@@ -1695,6 +1733,57 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
 	return 0;
 }
 
+static int dmar_fault_handle_one(struct dmar_fault_info *info)
+{
+	struct pci_dev *pdev;
+	u8 devfn;
+	atomic_t *pcnt;
+
+	devfn = PCI_DEVFN(PCI_SLOT(info->source_id), PCI_FUNC(info->source_id));
+	pdev = pci_get_domain_bus_and_slot(info->iommu->segment,
+					   PCI_BUS_NUM(info->source_id), devfn);
+
+	if (!pdev)
+		return -ENXIO;
+
+	if (info->fault_reason == INTR_REMAP)
+		pcnt = &pdev->faults_cnt.remap;
+	else if (info->type)
+		pcnt = &pdev->faults_cnt.read;
+	else
+		pcnt = &pdev->faults_cnt.write;
+
+	atomic_inc(pcnt);
+	return 0;
+}
+
+static void dmar_fault_handle_work(struct work_struct *work)
+{
+	struct dmar_fault_info *info;
+
+	info = container_of(work, struct dmar_fault_info, work);
+
+	dmar_fault_handle_one(info);
+	free_dmar_fault_info(info);
+}
+
+static int dmar_fault_schedule_one(struct intel_iommu *iommu, int type,
+				   u8 fault_reason, int pasid, u16 source_id,
+				   unsigned long long addr)
+{
+	struct dmar_fault_info *info = alloc_dmar_fault_info();
+
+	INIT_WORK(&info->work, dmar_fault_handle_work);
+	info->iommu = iommu;
+	info->type = type;
+	info->fault_reason = fault_reason;
+	info->pasid = pasid;
+	info->source_id = source_id;
+	info->addr = addr;
+
+	return queue_work(iommu->fault_wq, &info->work);
+}
+
 #define PRIMARY_FAULT_REG_LEN (16)
 irqreturn_t dmar_fault(int irq, void *dev_id)
 {
@@ -1733,20 +1822,18 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
 		if (!(data & DMA_FRCD_F))
 			break;
 
-		if (!ratelimited) {
-			fault_reason = dma_frcd_fault_reason(data);
-			type = dma_frcd_type(data);
+		fault_reason = dma_frcd_fault_reason(data);
+		type = dma_frcd_type(data);
 
-			pasid = dma_frcd_pasid_value(data);
-			data = readl(iommu->reg + reg +
-				     fault_index * PRIMARY_FAULT_REG_LEN + 8);
-			source_id = dma_frcd_source_id(data);
+		pasid = dma_frcd_pasid_value(data);
+		data = readl(iommu->reg + reg +
+			     fault_index * PRIMARY_FAULT_REG_LEN + 8);
+		source_id = dma_frcd_source_id(data);
 
-			pasid_present = dma_frcd_pasid_present(data);
-			guest_addr = dmar_readq(iommu->reg + reg +
+		pasid_present = dma_frcd_pasid_present(data);
+		guest_addr = dmar_readq(iommu->reg + reg +
 					fault_index * PRIMARY_FAULT_REG_LEN);
-			guest_addr = dma_frcd_page_addr(guest_addr);
-		}
+		guest_addr = dma_frcd_page_addr(guest_addr);
 
 		/* clear the fault */
 		writel(DMA_FRCD_F, iommu->reg + reg +
@@ -1756,9 +1843,13 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
 
 		if (!ratelimited)
 			/* Using pasid -1 if pasid is not present */
-			dmar_fault_do_one(iommu, type, fault_reason,
-					  pasid_present ? pasid : -1,
-					  source_id, guest_addr);
+			dmar_fault_dump_one(iommu, type, fault_reason,
+					    pasid_present ? pasid : -1,
+					    source_id, guest_addr);
+		if (irq != -1)
+			dmar_fault_schedule_one(iommu, type, fault_reason,
+						pasid_present ? pasid : -1,
+						source_id, guest_addr);
 
 		fault_index++;
 		if (fault_index >= cap_num_fault_regs(iommu->cap))
@@ -1784,6 +1875,10 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
 	if (iommu->irq)
 		return 0;
 
+	iommu->fault_wq = alloc_ordered_workqueue("fault_%s", 0, iommu->name);
+	if (!iommu->fault_wq)
+		return -ENOMEM;
+
 	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
 	if (irq > 0) {
 		iommu->irq = irq;
@@ -1803,6 +1898,9 @@ int __init enable_drhd_fault_handling(void)
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu;
 
+	if (dmar_fault_info_cache_init())
+		return -1;
+
 	/*
 	 * Enable fault control interrupt.
 	 */
@@ -1813,6 +1911,7 @@ int __init enable_drhd_fault_handling(void)
 		if (ret) {
 			pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
 			       (unsigned long long)drhd->reg_base_addr, ret);
+			kmem_cache_destroy(dmar_fault_info_cache);
 			return -1;
 		}
 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 07bd84c50238..d01514fca6d1 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -263,6 +263,23 @@ static ssize_t ari_enabled_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(ari_enabled);
 
+#ifdef CONFIG_DMAR_TABLE
+static ssize_t dmar_faults_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int remap, read, write;
+
+	remap = atomic_xchg(&pdev->faults_cnt.remap, 0);
+	read = atomic_xchg(&pdev->faults_cnt.read, 0);
+	write = atomic_xchg(&pdev->faults_cnt.write, 0);
+
+	return sprintf(buf, "remap: %d\nread: %d\nwrite: %d\n", remap, read,
+		       write);
+}
+static DEVICE_ATTR_RO(dmar_faults);
+#endif
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 			     char *buf)
 {
@@ -623,6 +640,9 @@ static struct attribute *pci_dev_attrs[] = {
 #endif
 	&dev_attr_driver_override.attr,
 	&dev_attr_ari_enabled.attr,
+#ifdef CONFIG_DMAR_TABLE
+	&dev_attr_dmar_faults.attr,
+#endif
 	NULL,
 };
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ed11ef594378..f8963c833fb0 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -552,6 +552,9 @@ struct intel_iommu {
 	struct iommu_device iommu;  /* IOMMU core code handle */
 	int		node;
 	u32		flags;      /* Software defined flags */
+#ifdef CONFIG_DMAR_TABLE
+	struct workqueue_struct *fault_wq; /* Fault handler */
+#endif
 };
 
 /* PCI domain-device relationship */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 14efa2586a8c..d9cc94fbf0ee 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -286,6 +286,14 @@ struct pci_vpd;
 struct pci_sriov;
 struct pci_p2pdma;
 
+#ifdef CONFIG_DMAR_TABLE
+struct pci_dmar_faults_cnt {
+	atomic_t read; /* How many read faults occurred */
+	atomic_t write; /* How many write faults occurred */
+	atomic_t remap; /* How many remap faults occurred */
+};
+#endif
+
 /* The pci_dev structure describes PCI devices */
 struct pci_dev {
 	struct list_head bus_list;	/* Node in per-bus list */
@@ -468,6 +476,9 @@ struct pci_dev {
 	char		*driver_override; /* Driver name to force a match */
 
 	unsigned long	priv_flags;	/* Private flags for the PCI driver */
+#ifdef CONFIG_DMAR_TABLE
+	struct pci_dmar_faults_cnt faults_cnt; /* Dmar fault statistics */
+#endif
 };
 
 static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
-- 
2.23.0




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] iommu/dmar: catch early fault occurrences
  2019-10-15 15:11 [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Yuri Volchkov
  2019-10-15 15:11 ` [PATCH 1/2] iommu/dmar: collect fault statistics Yuri Volchkov
@ 2019-10-15 15:11 ` Yuri Volchkov
  2019-10-16  0:45 ` [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Lu Baolu
  2 siblings, 0 replies; 5+ messages in thread
From: Yuri Volchkov @ 2019-10-15 15:11 UTC (permalink / raw)
  To: iommu, linux-kernel, linux-pci
  Cc: joro, bhelgaas, dwmw2, neugebar, Yuri Volchkov

First dmar faults can happen even before linux have scanned PCI
bus. In such case worker will not have chance to lookup a
corresponding 'struct pci_dev'.

This commit defers processing of the fault until intel_iommu_init
function has been called. At this point of time pci devices will be
already initialized.

Signed-off-by: Yuri Volchkov <volchkov@amazon.de>
---
 drivers/iommu/dmar.c        | 51 ++++++++++++++++++++++++++++++++++++-
 drivers/iommu/intel-iommu.c |  1 +
 include/linux/intel-iommu.h |  1 +
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 0749873e3e41..8db2af3de29f 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1674,7 +1674,10 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
 }
 
 struct dmar_fault_info {
-	struct work_struct work;
+	union {
+		struct work_struct work;
+		struct list_head backlog_list;
+	};
 	struct intel_iommu *iommu;
 	int type;
 	int pasid;
@@ -1757,12 +1760,58 @@ static int dmar_fault_handle_one(struct dmar_fault_info *info)
 	return 0;
 }
 
+struct fault_backlog {
+	struct list_head queue;
+	struct mutex lock;
+	bool is_active;
+};
+
+static struct fault_backlog fault_backlog = {
+	.queue = LIST_HEAD_INIT(fault_backlog.queue),
+	.lock = __MUTEX_INITIALIZER(fault_backlog.lock),
+	.is_active = true,
+};
+
+void dmar_process_deferred_faults(void)
+{
+	struct dmar_fault_info *info, *tmp;
+
+	mutex_lock(&fault_backlog.lock);
+	WARN_ON(!fault_backlog.is_active);
+
+	list_for_each_entry_safe(info, tmp, &fault_backlog.queue,
+				 backlog_list) {
+		dmar_fault_handle_one(info);
+		list_del(&info->backlog_list);
+		free_dmar_fault_info(info);
+	}
+	fault_backlog.is_active = false;
+	mutex_unlock(&fault_backlog.lock);
+}
+
 static void dmar_fault_handle_work(struct work_struct *work)
 {
 	struct dmar_fault_info *info;
 
 	info = container_of(work, struct dmar_fault_info, work);
 
+	if (fault_backlog.is_active) {
+		/* Postpone handling until PCI devices will be
+		 * initialized
+		 */
+
+		mutex_lock(&fault_backlog.lock);
+		if (!fault_backlog.is_active) {
+			mutex_unlock(&fault_backlog.lock);
+			goto process;
+		}
+
+		list_add(&info->backlog_list, &fault_backlog.queue);
+		mutex_unlock(&fault_backlog.lock);
+		return;
+	}
+
+process:
 	dmar_fault_handle_one(info);
 	free_dmar_fault_info(info);
 }
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 3f974919d3bd..a97c05fac5e9 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5041,6 +5041,7 @@ int __init intel_iommu_init(void)
 		iommu_disable_protect_mem_regions(iommu);
 	}
 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
+	dmar_process_deferred_faults();
 
 	intel_iommu_enabled = 1;
 	intel_iommu_debugfs_init();
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f8963c833fb0..480a31b41263 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -649,6 +649,7 @@ extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
 extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
 extern int dmar_ir_support(void);
+extern void dmar_process_deferred_faults(void);
 
 void *alloc_pgtable_page(int node);
 void free_pgtable_page(void *vaddr);
-- 
2.23.0




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 0/2] iommu/dmar: expose fault counters via sysfs
  2019-10-15 15:11 [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Yuri Volchkov
  2019-10-15 15:11 ` [PATCH 1/2] iommu/dmar: collect fault statistics Yuri Volchkov
  2019-10-15 15:11 ` [PATCH 2/2] iommu/dmar: catch early fault occurrences Yuri Volchkov
@ 2019-10-16  0:45 ` Lu Baolu
  2 siblings, 0 replies; 5+ messages in thread
From: Lu Baolu @ 2019-10-16  0:45 UTC (permalink / raw)
  To: Yuri Volchkov, iommu, linux-kernel, linux-pci
  Cc: baolu.lu, joro, bhelgaas, dwmw2, neugebar

Hi,

On 10/15/19 11:11 PM, Yuri Volchkov wrote:
> For health monitoring, it can be useful to know if iommu is behaving as
> expected. DMAR faults can be an indicator that a device:
>   - has been misconfigured, or
>   - has experienced a hardware hiccup and replacement should
>     be considered, or
>   - has been issuing faults due to malicious activity
> 
> Currently the only way to check if there were any DMAR faults on the
> host is to scan the dmesg output. However this approach is not very
> elegant. The information we are looking for can be wrapped out of the
> buffer, or masked (since it is a rate-limited print) by another
> device.
> 
> The series adds counters for DMAR faults and exposes them via sysfs.
> 

We now have an iommu API named iommu_register_fault_handler() to
register callbacks for dmar faults. How about monitoring the dmar
fault through this api so that your code could be generic and vendor
agnostic?

Best regards,
Baolu

> Yuri Volchkov (2):
>    iommu/dmar: collect fault statistics
>    iommu/dmar: catch early fault occurrences
> 
>   drivers/iommu/dmar.c        | 182 ++++++++++++++++++++++++++++++++----
>   drivers/iommu/intel-iommu.c |   1 +
>   drivers/pci/pci-sysfs.c     |  20 ++++
>   include/linux/intel-iommu.h |   4 +
>   include/linux/pci.h         |  11 +++
>   5 files changed, 201 insertions(+), 17 deletions(-)
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] iommu/dmar: collect fault statistics
  2019-10-15 15:11 ` [PATCH 1/2] iommu/dmar: collect fault statistics Yuri Volchkov
@ 2019-10-16 17:16   ` Bjorn Helgaas
  0 siblings, 0 replies; 5+ messages in thread
From: Bjorn Helgaas @ 2019-10-16 17:16 UTC (permalink / raw)
  To: Yuri Volchkov; +Cc: iommu, linux-kernel, linux-pci, joro, dwmw2, neugebar

Hi Yuri,

On Tue, Oct 15, 2019 at 05:11:11PM +0200, Yuri Volchkov wrote:
> Currently dmar_fault handler only prints a message in the dmesg. This
> commit introduces counters - how many faults have happened, and
> exposes them via sysfs. Each pci device will have an entry
> 'dmar_faults' reading from which will give user 3 lines
>   remap: xxx
>   read: xxx
>   write: xxx

I think you should have three files instead of putting all these in a
single file.  See https://lore.kernel.org/r/20190621072911.GA21600@kroah.com
They should also be documented in Documentation/ABI/

I'm not sure this should be DMAR-specific.  Couldn't we count similar
events for other IOMMUs as well?

> This functionality is targeted for health monitoring daemons.
> 
> Signed-off-by: Yuri Volchkov <volchkov@amazon.de>
> ---
>  drivers/iommu/dmar.c        | 133 +++++++++++++++++++++++++++++++-----
>  drivers/pci/pci-sysfs.c     |  20 ++++++
>  include/linux/intel-iommu.h |   3 +
>  include/linux/pci.h         |  11 +++
>  4 files changed, 150 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> index eecd6a421667..0749873e3e41 100644
> --- a/drivers/iommu/dmar.c
> +++ b/drivers/iommu/dmar.c
> @@ -1107,6 +1107,7 @@ static void free_iommu(struct intel_iommu *iommu)
>  	}
>  
>  	if (iommu->irq) {
> +		destroy_workqueue(iommu->fault_wq);
>  		if (iommu->pr_irq) {
>  			free_irq(iommu->pr_irq, iommu);
>  			dmar_free_hwirq(iommu->pr_irq);
> @@ -1672,9 +1673,46 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
>  	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
>  }
>  
> -static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
> -		u8 fault_reason, int pasid, u16 source_id,
> -		unsigned long long addr)
> +struct dmar_fault_info {
> +	struct work_struct work;
> +	struct intel_iommu *iommu;
> +	int type;
> +	int pasid;
> +	u16 source_id;
> +	unsigned long long addr;
> +	u8 fault_reason;
> +};
> +
> +static struct kmem_cache *dmar_fault_info_cache;
> +int __init dmar_fault_info_cache_init(void)
> +{
> +	int ret = 0;
> +
> +	dmar_fault_info_cache =
> +		kmem_cache_create("dmar_fault_info",
> +				  sizeof(struct dmar_fault_info), 0,
> +				  SLAB_HWCACHE_ALIGN, NULL);
> +	if (!dmar_fault_info_cache) {
> +		pr_err("Couldn't create dmar_fault_info cache\n");
> +		ret = -ENOMEM;
> +	}
> +
> +	return ret;
> +}
> +
> +static inline void *alloc_dmar_fault_info(void)
> +{
> +	return kmem_cache_alloc(dmar_fault_info_cache, GFP_ATOMIC);
> +}
> +
> +static inline void free_dmar_fault_info(void *vaddr)
> +{
> +	kmem_cache_free(dmar_fault_info_cache, vaddr);
> +}
> +
> +static int dmar_fault_dump_one(struct intel_iommu *iommu, int type,
> +			       u8 fault_reason, int pasid, u16 source_id,
> +			       unsigned long long addr)
>  {
>  	const char *reason;
>  	int fault_type;
> @@ -1695,6 +1733,57 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
>  	return 0;
>  }
>  
> +static int dmar_fault_handle_one(struct dmar_fault_info *info)
> +{
> +	struct pci_dev *pdev;
> +	u8 devfn;
> +	atomic_t *pcnt;
> +
> +	devfn = PCI_DEVFN(PCI_SLOT(info->source_id), PCI_FUNC(info->source_id));
> +	pdev = pci_get_domain_bus_and_slot(info->iommu->segment,
> +					   PCI_BUS_NUM(info->source_id), devfn);

I'm sure you've considered this already, but it's not completely clear
to me whether these counters should be in the pci_dev (as in this
patch) or in something IOMMU-related.

The pci_dev is nice because you automatically have counters for each
PCI device, and most faults can be tied back to a device.

But on the other hand, it's not the PCI device actually detecting and
reporting the error.  It's the IOMMU reporting the fault, and while
it's *likely* there's a pci_dev corresponding to the
bus/device/function info in the IOMMU error registers, there's no
guarantee: the device may have been hot-removed between reading the
IOMMU registers and doing the pci_get_domain_bus_and_slot(), or (I
suspect) faults could be caused by corrupted or malicious TLPs.

Another possible issue is that if the counts are in the pci_dev,
they're lost if the device is removed, which might happen while
diagnosing faulty hardware.

So I tend to think this is really IOMMU error information and the
IOMMU driver should handle it itself, including logging and exposing
it via sysfs.

> +	if (!pdev)
> +		return -ENXIO;
> +
> +	if (info->fault_reason == INTR_REMAP)
> +		pcnt = &pdev->faults_cnt.remap;
> +	else if (info->type)
> +		pcnt = &pdev->faults_cnt.read;
> +	else
> +		pcnt = &pdev->faults_cnt.write;
> +
> +	atomic_inc(pcnt);

pci_get_domain_bus_and_slot() increments pdev's reference count, so
you would need to release it here.

> +	return 0;
> +}
> +
> +static void dmar_fault_handle_work(struct work_struct *work)
> +{
> +	struct dmar_fault_info *info;
> +
> +	info = container_of(work, struct dmar_fault_info, work);
> +
> +	dmar_fault_handle_one(info);
> +	free_dmar_fault_info(info);
> +}
> +
> +static int dmar_fault_schedule_one(struct intel_iommu *iommu, int type,
> +				   u8 fault_reason, int pasid, u16 source_id,
> +				   unsigned long long addr)
> +{
> +	struct dmar_fault_info *info = alloc_dmar_fault_info();
> +
> +	INIT_WORK(&info->work, dmar_fault_handle_work);
> +	info->iommu = iommu;
> +	info->type = type;
> +	info->fault_reason = fault_reason;
> +	info->pasid = pasid;
> +	info->source_id = source_id;
> +	info->addr = addr;
> +
> +	return queue_work(iommu->fault_wq, &info->work);
> +}
> +
>  #define PRIMARY_FAULT_REG_LEN (16)
>  irqreturn_t dmar_fault(int irq, void *dev_id)
>  {
> @@ -1733,20 +1822,18 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
>  		if (!(data & DMA_FRCD_F))
>  			break;
>  
> -		if (!ratelimited) {
> -			fault_reason = dma_frcd_fault_reason(data);
> -			type = dma_frcd_type(data);
> +		fault_reason = dma_frcd_fault_reason(data);
> +		type = dma_frcd_type(data);
>  
> -			pasid = dma_frcd_pasid_value(data);
> -			data = readl(iommu->reg + reg +
> -				     fault_index * PRIMARY_FAULT_REG_LEN + 8);
> -			source_id = dma_frcd_source_id(data);
> +		pasid = dma_frcd_pasid_value(data);
> +		data = readl(iommu->reg + reg +
> +			     fault_index * PRIMARY_FAULT_REG_LEN + 8);
> +		source_id = dma_frcd_source_id(data);
>  
> -			pasid_present = dma_frcd_pasid_present(data);
> -			guest_addr = dmar_readq(iommu->reg + reg +
> +		pasid_present = dma_frcd_pasid_present(data);
> +		guest_addr = dmar_readq(iommu->reg + reg +
>  					fault_index * PRIMARY_FAULT_REG_LEN);
> -			guest_addr = dma_frcd_page_addr(guest_addr);
> -		}
> +		guest_addr = dma_frcd_page_addr(guest_addr);
>  
>  		/* clear the fault */
>  		writel(DMA_FRCD_F, iommu->reg + reg +
> @@ -1756,9 +1843,13 @@ irqreturn_t dmar_fault(int irq, void *dev_id)
>  
>  		if (!ratelimited)
>  			/* Using pasid -1 if pasid is not present */
> -			dmar_fault_do_one(iommu, type, fault_reason,
> -					  pasid_present ? pasid : -1,
> -					  source_id, guest_addr);
> +			dmar_fault_dump_one(iommu, type, fault_reason,
> +					    pasid_present ? pasid : -1,
> +					    source_id, guest_addr);
> +		if (irq != -1)
> +			dmar_fault_schedule_one(iommu, type, fault_reason,
> +						pasid_present ? pasid : -1,
> +						source_id, guest_addr);
>  
>  		fault_index++;
>  		if (fault_index >= cap_num_fault_regs(iommu->cap))
> @@ -1784,6 +1875,10 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
>  	if (iommu->irq)
>  		return 0;
>  
> +	iommu->fault_wq = alloc_ordered_workqueue("fault_%s", 0, iommu->name);
> +	if (!iommu->fault_wq)
> +		return -ENOMEM;
> +
>  	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
>  	if (irq > 0) {
>  		iommu->irq = irq;
> @@ -1803,6 +1898,9 @@ int __init enable_drhd_fault_handling(void)
>  	struct dmar_drhd_unit *drhd;
>  	struct intel_iommu *iommu;
>  
> +	if (dmar_fault_info_cache_init())
> +		return -1;
> +
>  	/*
>  	 * Enable fault control interrupt.
>  	 */
> @@ -1813,6 +1911,7 @@ int __init enable_drhd_fault_handling(void)
>  		if (ret) {
>  			pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
>  			       (unsigned long long)drhd->reg_base_addr, ret);
> +			kmem_cache_destroy(dmar_fault_info_cache);
>  			return -1;
>  		}
>  
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 07bd84c50238..d01514fca6d1 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -263,6 +263,23 @@ static ssize_t ari_enabled_show(struct device *dev,
>  }
>  static DEVICE_ATTR_RO(ari_enabled);
>  
> +#ifdef CONFIG_DMAR_TABLE
> +static ssize_t dmar_faults_show(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	int remap, read, write;
> +
> +	remap = atomic_xchg(&pdev->faults_cnt.remap, 0);
> +	read = atomic_xchg(&pdev->faults_cnt.read, 0);
> +	write = atomic_xchg(&pdev->faults_cnt.write, 0);

Doesn't this clear-on-read approach allow anybody to clear these
counters?  This looks like it's world-readable, and I'm not sure an
unprivileged user should be able to clear the counters.

> +
> +	return sprintf(buf, "remap: %d\nread: %d\nwrite: %d\n", remap, read,
> +		       write);
> +}
> +static DEVICE_ATTR_RO(dmar_faults);
> +#endif
> +
>  static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
>  			     char *buf)
>  {
> @@ -623,6 +640,9 @@ static struct attribute *pci_dev_attrs[] = {
>  #endif
>  	&dev_attr_driver_override.attr,
>  	&dev_attr_ari_enabled.attr,
> +#ifdef CONFIG_DMAR_TABLE
> +	&dev_attr_dmar_faults.attr,
> +#endif
>  	NULL,
>  };
>  
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index ed11ef594378..f8963c833fb0 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -552,6 +552,9 @@ struct intel_iommu {
>  	struct iommu_device iommu;  /* IOMMU core code handle */
>  	int		node;
>  	u32		flags;      /* Software defined flags */
> +#ifdef CONFIG_DMAR_TABLE
> +	struct workqueue_struct *fault_wq; /* Fault handler */
> +#endif
>  };
>  
>  /* PCI domain-device relationship */
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 14efa2586a8c..d9cc94fbf0ee 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -286,6 +286,14 @@ struct pci_vpd;
>  struct pci_sriov;
>  struct pci_p2pdma;
>  
> +#ifdef CONFIG_DMAR_TABLE
> +struct pci_dmar_faults_cnt {
> +	atomic_t read; /* How many read faults occurred */
> +	atomic_t write; /* How many write faults occurred */
> +	atomic_t remap; /* How many remap faults occurred */
> +};
> +#endif
> +
>  /* The pci_dev structure describes PCI devices */
>  struct pci_dev {
>  	struct list_head bus_list;	/* Node in per-bus list */
> @@ -468,6 +476,9 @@ struct pci_dev {
>  	char		*driver_override; /* Driver name to force a match */
>  
>  	unsigned long	priv_flags;	/* Private flags for the PCI driver */
> +#ifdef CONFIG_DMAR_TABLE
> +	struct pci_dmar_faults_cnt faults_cnt; /* Dmar fault statistics */
> +#endif
>  };
>  
>  static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
> -- 
> 2.23.0
> 
> 
> 
> 
> Amazon Development Center Germany GmbH
> Krausenstr. 38
> 10117 Berlin
> Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
> Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
> Sitz: Berlin
> Ust-ID: DE 289 237 879
> 
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-10-16 17:26 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-15 15:11 [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Yuri Volchkov
2019-10-15 15:11 ` [PATCH 1/2] iommu/dmar: collect fault statistics Yuri Volchkov
2019-10-16 17:16   ` Bjorn Helgaas
2019-10-15 15:11 ` [PATCH 2/2] iommu/dmar: catch early fault occurrences Yuri Volchkov
2019-10-16  0:45 ` [PATCH 0/2] iommu/dmar: expose fault counters via sysfs Lu Baolu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).