* [PATCH]intel-iommu batched iotlb flushes
@ 2008-02-11 22:41 mark gross
  2008-02-11 23:27 ` Randy Dunlap
  2008-02-12  8:52 ` Muli Ben-Yehuda
  0 siblings, 2 replies; 16+ messages in thread
From: mark gross @ 2008-02-11 22:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-mm

The intel-iommu hardware requires a polling operation to flush IOTLB
PTE's after an unmap operation.  Through some TSC instrumentation of a
netperf UDP stream with small packets test case it was seen that the
flush operations were sucking up to 16% of the CPU time doing
iommu_flush_iotlb's.

The following patch batches the IOTLB flushes, removing most of the
overhead in flushing the IOTLB's.  It works by building a list of to be
released IOVA's that is iterated over when a timer goes off or when a
high water mark is reached.
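
In outline, the queue-and-flush path added below looks like this
(condensed from the full patch; the names are the patch's own):

	/*
	 * Instead of flushing the IOTLB on every unmap, queue the IOVA and
	 * mark the owning IOMMU dirty; a 10ms timer (or the high water mark
	 * check in intel_unmap_single) later triggers flush_unmaps(), which
	 * does one global IOTLB flush per dirty IOMMU and frees all queued
	 * IOVA's, amortizing the flush cost over many unmaps.
	 */
	static void add_unmap(struct dmar_domain *dom, struct iova *iova)
	{
		unsigned long flags;

		spin_lock_irqsave(&async_umap_flush_lock, flags);
		iova->dmar = dom;
		list_add(&iova->list, &unmaps_to_do);
		set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
		if (!timer_on) {
			unmap_timer.expires = jiffies + msecs_to_jiffies(10);
			mod_timer(&unmap_timer, unmap_timer.expires);
			timer_on = 1;
		}
		list_size++;
		spin_unlock_irqrestore(&async_umap_flush_lock, flags);
	}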

The wrinkle this has is that the memory protection and page fault
warnings from errant DMA operations are somewhat reduced, hence a kernel
parameter is added to revert to the "strict" page flush / unmap
behavior.

The hole is the following scenarios: 
do many map_signal operations, do some unmap_signals, reuse a recently
unmapped page, <errant DMA hardware sneaks through and steps on reused
memory>

Or: you have rouge hardware using DMA's to look at pages: do many
map_signal's, do many unmap_singles, reuse some unmapped pages : 
<rouge hardware looks at reused page>

Note: these holes are very hard to get to, as the IOTLB is small and
only the PTE's still in the IOTLB can be accessed through this
mechanism.

It's recommended that strict is used when developing drivers that do DMA
operations to catch bugs early.  For production code where performance
is desired, running with the batched IOTLB flushing is a good way to
go.
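
Per the option parser in the patch, strict mode is selected through the
existing intel_iommu= boot option (next to the forcedac handling), e.g.
on the kernel command line:

	intel_iommu=strict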


--mgross


Signed-off-by: <mgross@linux.intel.com>



Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-07 13:03:10.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-11 10:38:49.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable bached IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,29 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1757,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1960,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2035,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2373,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2392,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-07 13:03:10.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-07 13:06:28.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-11 13:44:23.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-11 14:23:37.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every umap_signle will
+			result in a hardware IOTLB flush opperation as
+			opposed to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-11 22:41 [PATCH]intel-iommu batched iotlb flushes mark gross
@ 2008-02-11 23:27 ` Randy Dunlap
  2008-02-12 16:05   ` mark gross
  2008-02-12  8:52 ` Muli Ben-Yehuda
  1 sibling, 1 reply; 16+ messages in thread
From: Randy Dunlap @ 2008-02-11 23:27 UTC (permalink / raw)
  To: mgross; +Cc: Andrew Morton, linux-kernel, linux-mm

On Mon, 11 Feb 2008 14:41:05 -0800 mark gross wrote:

> The hole is the following scenarios: 
> do many map_signal operations, do some unmap_signals, reuse a recently
> unmapped page, <errant DMA hardware sneaks through and steps on reused
> memory>
> 
> Or: you have rouge hardware using DMA's to look at pages: do many

  or rogue hardware?

> map_signal's, do many unmap_singles, reuse some unmapped pages : 

  signal .................... single

> <rouge hardware looks at reused page>

   why rouge?

> Note: these holes are very hard to get to, as the IOTLB is small and
> only the PTE's still in the IOTLB can be accessed through this
> mechanism.
> 
> It's recommended that strict is used when developing drivers that do DMA
> operations to catch bugs early.  For production code where performance
> is desired, running with the batched IOTLB flushing is a good way to
> go.
> 
> 
> --mgross
> 
> 
> Signed-off-by: <mgross@linux.intel.com>
> 
> 
> 
> Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-07 13:03:10.000000000 -0800
> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-11 10:38:49.000000000 -0800

> @@ -50,11 +52,39 @@
>  
>  #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
>  
> +
> +static void flush_unmaps_timeout(unsigned long data);
> +
> +static struct timer_list unmap_timer =
> +	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> +
> +struct unmap_list {
> +	struct list_head list;
> +	struct dmar_domain *domain;
> +	struct iova *iova;
> +};
> +
> +static struct intel_iommu *g_iommus;
> +/* bitmap for indexing intel_iommus */
> +static unsigned long 	*g_iommus_to_flush;
> +static int g_num_of_iommus;
> +
> +static DEFINE_SPINLOCK(async_umap_flush_lock);
> +static LIST_HEAD(unmaps_to_do);
> +
> +static int timer_on;
> +static long list_size;
> +static int high_watermark;
> +
> +static struct dentry *intel_iommu_debug, *debug;
> +
> +
>  static void domain_remove_dev_info(struct dmar_domain *domain);
>  
>  static int dmar_disabled;
>  static int __initdata dmar_map_gfx = 1;
>  static int dmar_forcedac;
> +static int intel_iommu_strict;
>  
>  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
>  static DEFINE_SPINLOCK(device_domain_lock);
> @@ -73,9 +103,13 @@
>  			printk(KERN_INFO
>  				"Intel-IOMMU: disable GFX device mapping\n");
>  		} else if (!strncmp(str, "forcedac", 8)) {
> -			printk (KERN_INFO
> +			printk(KERN_INFO
>  				"Intel-IOMMU: Forcing DAC for PCI devices\n");
>  			dmar_forcedac = 1;
> +		} else if (!strncmp(str, "strict", 6)) {
> +			printk(KERN_INFO
> +				"Intel-IOMMU: disable bached IOTLB flush\n");

                                                      batched

> +			intel_iommu_strict = 1;
>  		}
>  
>  		str += strcspn(str, ",");

> @@ -1672,7 +1702,29 @@
>  	for_each_drhd_unit(drhd) {
>  		if (drhd->ignored)
>  			continue;
> -		iommu = alloc_iommu(drhd);
> +		g_num_of_iommus++;
> +	}
> +
> +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
> +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> +	if (!g_iommus_to_flush) {
> +		printk(KERN_ERR "Allocating bitmap array failed\n");

                identify:       "IOMMU:

> +		return -ENOMEM;
> +	}
> +
> +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> +	if (!g_iommus) {
> +		kfree(g_iommus_to_flush);
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	i = 0;
> +	for_each_drhd_unit(drhd) {
> +		if (drhd->ignored)
> +			continue;
> +		iommu = alloc_iommu(&g_iommus[i], drhd);
> +		i++;
>  		if (!iommu) {
>  			ret = -ENOMEM;
>  			goto error;

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-11 13:44:23.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-11 14:23:37.000000000 -0800
> @@ -822,6 +822,10 @@
>  			than 32 bit addressing. The default is to look
>  			for translation below 32 bit and if not available
>  			then look in the higher range.
> +		strict [Default Off]
> +			With this option on every umap_signle will

                                         on, every unmap_si{ngle,gnal} ??

> +			result in a hardware IOTLB flush opperation as
> +			opposed to batching them for performance.
>  
>  	io_delay=	[X86-32,X86-64] I/O delay method
>  		0x80


---
~Randy


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-11 22:41 [PATCH]intel-iommu batched iotlb flushes mark gross
  2008-02-11 23:27 ` Randy Dunlap
@ 2008-02-12  8:52 ` Muli Ben-Yehuda
  2008-02-12  9:00   ` David Miller
  2008-02-12 15:37   ` mark gross
  1 sibling, 2 replies; 16+ messages in thread
From: Muli Ben-Yehuda @ 2008-02-12  8:52 UTC (permalink / raw)
  To: mark gross; +Cc: Andrew Morton, linux-kernel, linux-mm

On Mon, Feb 11, 2008 at 02:41:05PM -0800, mark gross wrote:

> The intel-iommu hardware requires a polling operation to flush IOTLB
> PTE's after an unmap operation.  Through some TSC instrumentation of
> a netperf UDP stream with small packets test case it was seen that
> the flush operations were sucking up to 16% of the CPU time doing
> iommu_flush_iotlb's.
> 
> The following patch batches the IOTLB flushes, removing most of the
> overhead in flushing the IOTLB's.  It works by building a list of to
> be released IOVA's that is iterated over when a timer goes off or
> when a high water mark is reached.
> 
> The wrinkle this has is that the memory protection and page fault
> warnings from errant DMA operations are somewhat reduced, hence a kernel
> parameter is added to revert to the "strict" page flush / unmap
> behavior. 
> 
> The hole is the following scenarios: 
> do many map_signal operations, do some unmap_signals, reuse a recently
> unmapped page, <errant DMA hardware sneaks through and steps on reused
> memory>
> 
> Or: you have rouge hardware using DMA's to look at pages: do many
> map_signal's, do many unmap_singles, reuse some unmapped pages : 
> <rouge hardware looks at reused page>
> 
> Note: these holes are very hard to get to, as the IOTLB is small
> and only the PTE's still in the IOTLB can be accessed through this
> mechanism.
> 
> It's recommended that strict is used when developing drivers that do
> DMA operations to catch bugs early.  For production code where
> performance is desired, running with the batched IOTLB flushing is a
> good way to go.

While I don't disagree with this patch in principle (Calgary does the
same thing due to expensive IOTLB flushes) the right way to fix it
IMHO is to fix the drivers to batch mapping and unmapping operations
or map up-front and unmap when done. The streaming DMA-API was
designed to conserve IOMMU mappings for machines where IOMMU mappings
are a scarce resource, and is a poor fit for a modern IOMMU such as
VT-d with a 64-bit IO address space (or even an IOMMU with a 32-bit
address space such as Calgary) where there are plenty of IOMMU
mappings available.
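
For illustration, the map-up-front pattern could look roughly like the
sketch below for a driver that recycles a receive buffer; only standard
DMA-API calls are used, and the rxb structure, RXB_SIZE, b, and dev are
made up for the example:

	/* Map a recycled RX buffer once, then pass ownership back and
	 * forth with the sync calls instead of doing a map/unmap (and
	 * the IOTLB flush the unmap implies) for every packet. */
	struct rxb {
		void		*data;	/* kmalloc'd buffer */
		dma_addr_t	dma;	/* bus address, lives as long as the ring */
	};

	/* ring setup: one mapping, one set of IOMMU PTE's, per buffer */
	b->dma = dma_map_single(dev, b->data, RXB_SIZE, DMA_FROM_DEVICE);

	/* per-packet fast path: no map, no unmap, no IOTLB flush */
	dma_sync_single_for_cpu(dev, b->dma, RXB_SIZE, DMA_FROM_DEVICE);
	/* ... driver/stack consumes the data ... */
	dma_sync_single_for_device(dev, b->dma, RXB_SIZE, DMA_FROM_DEVICE);

	/* ring teardown: the only unmap, and so the only flush */
	dma_unmap_single(dev, b->dma, RXB_SIZE, DMA_FROM_DEVICE);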

Cheers,
Muli


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12  8:52 ` Muli Ben-Yehuda
@ 2008-02-12  9:00   ` David Miller
  2008-02-12  9:07     ` Muli Ben-Yehuda
  2008-02-12 15:54     ` mark gross
  2008-02-12 15:37   ` mark gross
  1 sibling, 2 replies; 16+ messages in thread
From: David Miller @ 2008-02-12  9:00 UTC (permalink / raw)
  To: muli; +Cc: mgross, akpm, linux-kernel, linux-mm

From: Muli Ben-Yehuda <muli@il.ibm.com>
Date: Tue, 12 Feb 2008 10:52:56 +0200

> The streaming DMA-API was designed to conserve IOMMU mappings for
> machines where IOMMU mappings are a scarce resource, and is a poor
> fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> (or even an IOMMU with a 32-bit address space such as Calgary) where
> there are plenty of IOMMU mappings available.

For the 64-bit case what you are suggesting eventually amounts
to mapping all available RAM in the IOMMU.

Although an extreme version of your suggestion, it would be the
most efficient as it would require zero IOMMU flush operations.

But we'd lose things like protection and other benefits.


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12  9:00   ` David Miller
@ 2008-02-12  9:07     ` Muli Ben-Yehuda
  2008-02-12 15:54     ` mark gross
  1 sibling, 0 replies; 16+ messages in thread
From: Muli Ben-Yehuda @ 2008-02-12  9:07 UTC (permalink / raw)
  To: David Miller; +Cc: mgross, akpm, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 01:00:06AM -0800, David Miller wrote:
> From: Muli Ben-Yehuda <muli@il.ibm.com>
> Date: Tue, 12 Feb 2008 10:52:56 +0200
> 
> > The streaming DMA-API was designed to conserve IOMMU mappings for
> > machines where IOMMU mappings are a scarce resource, and is a poor
> > fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> > (or even an IOMMU with a 32-bit address space such as Calgary)
> > where there are plenty of IOMMU mappings available.
> 
> For the 64-bit case what you are suggesting eventually amounts to
> mapping all available RAM in the IOMMU.
> 
> Although an extreme version of your suggestion, it would be the most
> efficient as it would require zero IOMMU flush operations.
>
> But we'd lose things like protection and other benefits.

For the extreme case you are correct. There's an inherent trade-off
between IOMMU performance and protection guarantees, where one end of
the spectrum is represented by the streaming DMA-API and the other end
is represented by simply mapping all available memory. It's an open
question what is the right point in between. I think that an optimal
strategy might be "keep the mapping around for as long as it is safe",
i.e., keep a mapping to a frame for as long as the frame is owned by
whoever requested the mapping in the first place. Once ownership of
the frame is passed to another entity (e.g., from the driver to the
stack), revoke the original mapping. This implies a way for the kernel
to track frame ownership and communicate frame ownership changes to
the DMA-API layer, which we don't currently have.

Cheers,
Muli



* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12  8:52 ` Muli Ben-Yehuda
  2008-02-12  9:00   ` David Miller
@ 2008-02-12 15:37   ` mark gross
  1 sibling, 0 replies; 16+ messages in thread
From: mark gross @ 2008-02-12 15:37 UTC (permalink / raw)
  To: Muli Ben-Yehuda; +Cc: Andrew Morton, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 10:52:56AM +0200, Muli Ben-Yehuda wrote:
> On Mon, Feb 11, 2008 at 02:41:05PM -0800, mark gross wrote:
> 
> > The intel-iommu hardware requires a polling operation to flush IOTLB
> > PTE's after an unmap operation.  Through some TSC instrumentation of
> > a netperf UDP stream with small packets test case it was seen that
> > the flush operations were sucking up to 16% of the CPU time doing
> > iommu_flush_iotlb's.
> > 
> > The following patch batches the IOTLB flushes, removing most of the
> > overhead in flushing the IOTLB's.  It works by building a list of to
> > be released IOVA's that is iterated over when a timer goes off or
> > when a high water mark is reached.
> > 
> > The wrinkle this has is that the memory protection and page fault
> > warnings from errant DMA operations are somewhat reduced, hence a kernel
> > parameter is added to revert to the "strict" page flush / unmap
> > behavior. 
> > 
> > The hole is the following scenarios: 
> > do many map_signal operations, do some unmap_signals, reuse a recently
> > unmapped page, <errant DMA hardware sneaks through and steps on reused
> > memory>
> > 
> > Or: you have rouge hardware using DMA's to look at pages: do many
> > map_signal's, do many unmap_singles, reuse some unmapped pages : 
> > <rouge hardware looks at reused page>
> > 
> > Note: these holes are very hard to get to, as the IOTLB is small
> > and only the PTE's still in the IOTLB can be accessed through this
> > mechanism.
> > 
> > It's recommended that strict is used when developing drivers that do
> > DMA operations to catch bugs early.  For production code where
> > performance is desired, running with the batched IOTLB flushing is a
> > good way to go.
> 
> While I don't disagree with this patch in principle (Calgary does the
> same thing due to expensive IOTLB flushes) the right way to fix it
> IMHO is to fix the drivers to batch mapping and unmapping operations
> or map up-front and unmap when done. The streaming DMA-API was
> designed to conserve IOMMU mappings for machines where IOMMU mappings
> are a scarce resource, and is a poor fit for a modern IOMMU such as
> VT-d with a 64-bit IO address space (or even an IOMMU with a 32-bit
> address space such as Calgary) where there are plenty of IOMMU
> mappings available.

Yes, having a DMA pool of DMA addresses to use and re-use in the stack
instead of setting up and tearing down the PTE's is something we need to
look at closely for network and other high-DMA-traffic stacks.

--mgross




* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12  9:00   ` David Miller
  2008-02-12  9:07     ` Muli Ben-Yehuda
@ 2008-02-12 15:54     ` mark gross
  2008-02-12 23:46       ` David Miller
  1 sibling, 1 reply; 16+ messages in thread
From: mark gross @ 2008-02-12 15:54 UTC (permalink / raw)
  To: David Miller; +Cc: muli, akpm, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 01:00:06AM -0800, David Miller wrote:
> From: Muli Ben-Yehuda <muli@il.ibm.com>
> Date: Tue, 12 Feb 2008 10:52:56 +0200
> 
> > The streaming DMA-API was designed to conserve IOMMU mappings for
> > machines where IOMMU mappings are a scarce resource, and is a poor
> > fit for a modern IOMMU such as VT-d with a 64-bit IO address space
> > (or even an IOMMU with a 32-bit address space such as Calgary) where
> > there are plenty of IOMMU mappings available.
> 
> For the 64-bit case what you are suggesting eventually amounts
> to mapping all available RAM in the IOMMU.

Something could be done:
we could enable drivers to have DMA-pools they manage that get mapped
and are re-used.

I would rather the DMA-pools be tied to PID's; that way any bad behavior
would be limited to the address space of the process using the device.
I haven't thought about how hard this would be to do but it would be
nice.  I think this could be tricky.

Application sets up ring buffer of device DMA memory, passes this to
driver/stack.  Need to handle hitting high water marks and
application-exit cleanup sanely...

--mgross

> 
> Although an extreme version of your suggestion, it would be the
> most efficient as it would require zero IOMMU flush operations.
> 
> But we'd lose things like protection and other benefits.


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-11 23:27 ` Randy Dunlap
@ 2008-02-12 16:05   ` mark gross
  2008-02-12 16:34     ` Randy Dunlap
  0 siblings, 1 reply; 16+ messages in thread
From: mark gross @ 2008-02-12 16:05 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: Andrew Morton, linux-kernel, linux-mm

On Mon, Feb 11, 2008 at 03:27:16PM -0800, Randy Dunlap wrote:
> On Mon, 11 Feb 2008 14:41:05 -0800 mark gross wrote:
> 
> > The hole is the following scenarios: 
> > do many map_signal operations, do some unmap_signals, reuse a recently
> > unmapped page, <errant DMA hardware sneaks through and steps on reused
> > memory>
> > 
> > Or: you have rouge hardware using DMA's to look at pages: do many
> 
>   or rogue hardware?

yes bad-guy hardware.

> 
> > map_signal's, do many unmap_singles, reuse some unmapped pages : 
> 
>   signal .................... single
> 
> > <rouge hardware looks at reused page>
> 
>    why rouge?

because I'm a dumb ass.

> 
> > Note: these holes are very hard to get to, as the IOTLB is small and
> > only the PTE's still in the IOTLB can be accessed through this
> > mechanism.
> > 
> > It's recommended that strict is used when developing drivers that do DMA
> > operations to catch bugs early.  For production code where performance
> > is desired, running with the batched IOTLB flushing is a good way to
> > go.
> > 
> > 
> > --mgross
> > 
> > 
> > Signed-off-by: <mgross@linux.intel.com>
> > 
> > 
> > 
> > Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-07 13:03:10.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-11 10:38:49.000000000 -0800
> 
> > @@ -50,11 +52,39 @@
> >  
> >  #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
> >  
> > +
> > +static void flush_unmaps_timeout(unsigned long data);
> > +
> > +static struct timer_list unmap_timer =
> > +	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> > +
> > +struct unmap_list {
> > +	struct list_head list;
> > +	struct dmar_domain *domain;
> > +	struct iova *iova;
> > +};
> > +
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long 	*g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> >  static void domain_remove_dev_info(struct dmar_domain *domain);
> >  
> >  static int dmar_disabled;
> >  static int __initdata dmar_map_gfx = 1;
> >  static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >  
> >  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> >  static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> >  			printk(KERN_INFO
> >  				"Intel-IOMMU: disable GFX device mapping\n");
> >  		} else if (!strncmp(str, "forcedac", 8)) {
> > -			printk (KERN_INFO
> > +			printk(KERN_INFO
> >  				"Intel-IOMMU: Forcing DAC for PCI devices\n");
> >  			dmar_forcedac = 1;
> > +		} else if (!strncmp(str, "strict", 6)) {
> > +			printk(KERN_INFO
> > +				"Intel-IOMMU: disable bached IOTLB flush\n");
> 
>                                                       batched

fixed.

> 
> > +			intel_iommu_strict = 1;
> >  		}
> >  
> >  		str += strcspn(str, ",");
> 
> > @@ -1672,7 +1702,29 @@
> >  	for_each_drhd_unit(drhd) {
> >  		if (drhd->ignored)
> >  			continue;
> > -		iommu = alloc_iommu(drhd);
> > +		g_num_of_iommus++;
> > +	}
> > +
> > +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
> > +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > +	if (!g_iommus_to_flush) {
> > +		printk(KERN_ERR "Allocating bitmap array failed\n");
> 
>                 identify:       "IOMMU:
> 
> > +		return -ENOMEM;
> > +	}
> > +
> > +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > +	if (!g_iommus) {
> > +		kfree(g_iommus_to_flush);
> > +		ret = -ENOMEM;
> > +		goto error;
> > +	}
> > +
> > +	i = 0;
> > +	for_each_drhd_unit(drhd) {
> > +		if (drhd->ignored)
> > +			continue;
> > +		iommu = alloc_iommu(&g_iommus[i], drhd);
> > +		i++;
> >  		if (!iommu) {
> >  			ret = -ENOMEM;
> >  			goto error;
> 
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-11 13:44:23.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-11 14:23:37.000000000 -0800
> > @@ -822,6 +822,10 @@
> >  			than 32 bit addressing. The default is to look
> >  			for translation below 32 bit and if not available
> >  			then look in the higher range.
> > +		strict [Default Off]
> > +			With this option on every umap_signle will
> 
>                                          on, every unmap_si{ngle,gnal} ??
>

fixed.

 
> > +			result in a hardware IOTLB flush opperation as
> > +			opposed to batching them for performance.
> >  
> >  	io_delay=	[X86-32,X86-64] I/O delay method
> >  		0x80
> 
>

thanks!

Signed-off-by: <mgross@linux.intel.com>

 
Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 07:47:04.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,30 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: \
+			Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1758,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2036,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2393,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 07:47:02.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every umap_signal opperation will
+			result in a hardware IOTLB flush opperation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 16:05   ` mark gross
@ 2008-02-12 16:34     ` Randy Dunlap
  2008-02-12 19:55       ` mark gross
  0 siblings, 1 reply; 16+ messages in thread
From: Randy Dunlap @ 2008-02-12 16:34 UTC (permalink / raw)
  To: mgross; +Cc: Andrew Morton, linux-kernel, linux-mm

mark gross wrote:
>  
> Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 07:47:04.000000000 -0800

> @@ -1672,7 +1702,30 @@
>  	for_each_drhd_unit(drhd) {
>  		if (drhd->ignored)
>  			continue;
> -		iommu = alloc_iommu(drhd);
> +		g_num_of_iommus++;
> +	}
> +
> +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
> +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> +	if (!g_iommus_to_flush) {
> +		printk(KERN_ERR "Intel-IOMMU: \
> +			Allocating bitmap array failed\n");

I think that will make a string like below:
Intel-IOMMU: 			Allocating bitmap array failed

> +		return -ENOMEM;
> +	}
> +
> +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> +	if (!g_iommus) {
> +		kfree(g_iommus_to_flush);
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	i = 0;
> +	for_each_drhd_unit(drhd) {
> +		if (drhd->ignored)
> +			continue;
> +		iommu = alloc_iommu(&g_iommus[i], drhd);
> +		i++;
>  		if (!iommu) {
>  			ret = -ENOMEM;
>  			goto error;

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 07:47:02.000000000 -0800
> @@ -822,6 +822,10 @@
>  			than 32 bit addressing. The default is to look
>  			for translation below 32 bit and if not available
>  			then look in the higher range.
> +		strict [Default Off]
> +			With this option on every umap_signal opperation will

			                          unmap_single operation

> +			result in a hardware IOTLB flush opperation as opposed
> +			to batching them for performance.
>  
>  	io_delay=	[X86-32,X86-64] I/O delay method
>  		0x80


-- 
~Randy


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 16:34     ` Randy Dunlap
@ 2008-02-12 19:55       ` mark gross
  2008-02-12 20:21         ` Randy Dunlap
  0 siblings, 1 reply; 16+ messages in thread
From: mark gross @ 2008-02-12 19:55 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: Andrew Morton, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 08:34:39AM -0800, Randy Dunlap wrote:
> mark gross wrote:
>>  Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 
>> 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 
>> 07:47:04.000000000 -0800
>
>> @@ -1672,7 +1702,30 @@
>>  	for_each_drhd_unit(drhd) {
>>  		if (drhd->ignored)
>>  			continue;
>> -		iommu = alloc_iommu(drhd);
>> +		g_num_of_iommus++;
>> +	}
>> +
>> +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
>> +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
>> +	if (!g_iommus_to_flush) {
>> +		printk(KERN_ERR "Intel-IOMMU: \
>> +			Allocating bitmap array failed\n");
>
> I think that will make a string like below:
> Intel-IOMMU: 			Allocating bitmap array failed
>

you are right.

>> +		return -ENOMEM;
>> +	}
>> +		strict [Default Off]
>> +			With this option on every umap_signal opperation will
>
> 			                          unmap_single operation
>
>> +			result in a hardware IOTLB flush opperation as opposed

Signed-off-by: <mgross@linux.intel.com>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,30 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: "
+			"Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1758,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2036,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2393,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 11:36:07.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every umap_signle operation will
+			result in a hardware IOTLB flush operation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 19:55       ` mark gross
@ 2008-02-12 20:21         ` Randy Dunlap
  2008-02-13 18:10           ` mark gross
  0 siblings, 1 reply; 16+ messages in thread
From: Randy Dunlap @ 2008-02-12 20:21 UTC (permalink / raw)
  To: mgross; +Cc: Andrew Morton, linux-kernel, linux-mm

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 11:36:07.000000000 -0800
> @@ -822,6 +822,10 @@
>  			than 32 bit addressing. The default is to look
>  			for translation below 32 bit and if not available
>  			then look in the higher range.
> +		strict [Default Off]
> +			With this option on every umap_signle operation will

	umap_signle ???  (again)

> +			result in a hardware IOTLB flush operation as opposed
> +			to batching them for performance.
>  
>  	io_delay=	[X86-32,X86-64] I/O delay method
>  		0x80


-- 
~Randy


* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 15:54     ` mark gross
@ 2008-02-12 23:46       ` David Miller
  2008-02-13 18:31         ` mark gross
  0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2008-02-12 23:46 UTC (permalink / raw)
  To: mgross; +Cc: muli, akpm, linux-kernel, linux-mm

From: mark gross <mgross@linux.intel.com>
Date: Tue, 12 Feb 2008 07:54:48 -0800

> Something could be done:
> we could enable drivers to have DMA-pools they manage that get mapped
> and are re-used.
> 
> I would rather the DMA-pools be tied to PID's; that way any bad behavior
> would be limited to the address space of the process using the device.
> I haven't thought about how hard this would be to do but it would be
> nice.  I think this could be tricky.

Yes, this is a good idea, especially for networking.

For transmit on 10Gb links the IOMMU setup is near the top
of the profiles.

What a driver could do is determine the maximum number of
IOMMU pages it could need to map one maximally sized packet.
So then it allocates enough space for all such entries in
its TX ring.

This eliminates the range allocation from the transmit path.
All that's left is "remap DMA range X to scatterlist Y".

And yes it would be nice to have dma_map_skb() type interfaces
so that we don't walk into the IOMMU code N times per packet.
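
As a rough sketch of that sizing (the 64KB figure and the
alloc_iova_window() helper are illustrative, not an existing API):

	/* Worst case for one maximally sized packet: a 64KB TSO frame whose
	 * MAX_SKB_FRAGS fragments may each straddle an extra 4K page. */
	max_pages = DIV_ROUND_UP(65536, PAGE_SIZE_4K) + MAX_SKB_FRAGS;

	/* Reserve a fixed IOVA window per TX descriptor once, at ring init,
	 * so the hot path only rewrites PTE's ("remap DMA range X to
	 * scatterlist Y") and never allocates or frees IOVA space. */
	ring->iova_base = alloc_iova_window(dev,
				tx_ring_size * max_pages * PAGE_SIZE_4K);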

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 20:21         ` Randy Dunlap
@ 2008-02-13 18:10           ` mark gross
  2008-02-13 18:23             ` Randy Dunlap
  0 siblings, 1 reply; 16+ messages in thread
From: mark gross @ 2008-02-13 18:10 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: Andrew Morton, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 12:21:08PM -0800, Randy Dunlap wrote:
>> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 
>> 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 
>> 11:36:07.000000000 -0800
>> @@ -822,6 +822,10 @@
>>  			than 32 bit addressing. The default is to look
>>  			for translation below 32 bit and if not available
>>  			then look in the higher range.
>> +		strict [Default Off]
>> +			With this option on every umap_signle operation will
>
> 	umap_signle ???  (again)
>
>> +			result in a hardware IOTLB flush operation as opposed
>> +			to batching them for performance.
>>   	io_delay=	[X86-32,X86-64] I/O delay method
>>  		0x80
>
>
> -- 
> ~Randy
>

Signed-off-by: <mgross@linux.intel.com>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 8)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,30 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: "
+			"Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1758,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2036,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2393,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 11:36:07.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every umap_single operation will
+			result in a hardware IOTLB flush operation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80
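
For anyone who wants to play with the idea outside the kernel, here is a
minimal stand-alone sketch of the deferred-flush pattern the patch uses.
This is user-space C, not driver code: a pthread mutex stands in for the
spinlock, a printf stands in for iommu_flush_iotlb_global(), and the
10ms safety timer is left out for brevity.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define HIGH_WATERMARK 250	/* same starting default as the patch */

struct deferred {		/* stands in for the queued struct iova */
	struct deferred *next;
	void *cookie;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct deferred *pending;
static long list_size;

/* stands in for iommu_flush_iotlb_global(): one expensive operation
   that covers every unmap queued so far */
static void expensive_flush(void)
{
	printf("global flush covering %ld queued unmaps\n", list_size);
}

static void flush_pending(void)
{
	struct deferred *d, *n;

	pthread_mutex_lock(&lock);
	expensive_flush();
	for (d = pending; d; d = n) {	/* only now is reuse safe */
		n = d->next;
		free(d);
	}
	pending = NULL;
	list_size = 0;
	pthread_mutex_unlock(&lock);
}

static void queue_unmap(void *cookie)
{
	struct deferred *d = malloc(sizeof(*d));

	d->cookie = cookie;
	pthread_mutex_lock(&lock);
	d->next = pending;
	pending = d;
	list_size++;
	pthread_mutex_unlock(&lock);

	if (list_size > HIGH_WATERMARK)	/* watermark hit: flush now */
		flush_pending();
}

int main(void)
{
	long i;

	for (i = 0; i < 600; i++)
		queue_unmap((void *)i);
	flush_pending();	/* final drain, as the timer would do */
	return 0;
}

The win is the same as in the driver: the 600 queued unmaps above cost
three global flushes instead of 600 page-selective ones.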

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-13 18:10           ` mark gross
@ 2008-02-13 18:23             ` Randy Dunlap
  2008-02-13 19:35               ` mark gross
  0 siblings, 1 reply; 16+ messages in thread
From: Randy Dunlap @ 2008-02-13 18:23 UTC (permalink / raw)
  To: mgross; +Cc: Andrew Morton, linux-kernel, linux-mm

> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 11:36:07.000000000 -0800
> @@ -822,6 +822,10 @@
>  			than 32 bit addressing. The default is to look
>  			for translation below 32 bit and if not available
>  			then look in the higher range.
> +		strict [Default Off]
> +			With this option on every umap_single operation will

so I'll ask one more time :(
Shouldn't this be "unmap_single" ?

> +			result in a hardware IOTLB flush operation as opposed
> +			to batching them for performance.
>  
>  	io_delay=	[X86-32,X86-64] I/O delay method
>  		0x80


-- 
~Randy

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-12 23:46       ` David Miller
@ 2008-02-13 18:31         ` mark gross
  0 siblings, 0 replies; 16+ messages in thread
From: mark gross @ 2008-02-13 18:31 UTC (permalink / raw)
  To: David Miller; +Cc: muli, akpm, linux-kernel, linux-mm

On Tue, Feb 12, 2008 at 07:54:48AM -0800, David Miller wrote:
> > Something could be done:
> > we could enable drivers to have DMA-pools they manage that get mapped
> > and are re-used.
> > 
> > I would rather the DMA-pools be tied to PIDs; that way any bad behavior
> > would be limited to the address space of the process using the device.
> > I haven't thought about how hard this would be to do but it would be
> > nice.  I think this could be tricky.
> 
> Yes, this is a good idea especially for networking.
> 
> For transmit on 10GB links the IOMMU setup is near the top
> of the profiles.

true.
 
> What a driver could do is determine the maximum number of
> IOMMU pages it could need to map one maximally sized packet.
> So then it allocates enough space for all such entries in
> its TX ring.
> 
> This eliminates the range allocation from the transmit path.
> All that's left is "remap DMA range X to scatterlist Y"
> 
> And yes it would be nice to have dma_map_skb() type interfaces
> so that we don't walk into the IOMMU code N times per packet.


/me starts looking more closely at how this could be done...
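
Very roughly, I'm picturing something along these lines.  Pure strawman
at this point: dma_map_skb() doesn't exist today, and the signature
below is only meant to make the idea concrete.

/*
 * Strawman: map the linear data and every page fragment of an skb in
 * one call, so the IOMMU address allocation and locking happen once
 * per packet instead of once per fragment.
 */
struct skb_dma_map {
	dma_addr_t	head;			/* mapping of skb->data */
	dma_addr_t	frags[MAX_SKB_FRAGS];	/* one per page fragment */
};

int dma_map_skb(struct device *dev, struct sk_buff *skb,
		struct skb_dma_map *map, enum dma_data_direction dir);
void dma_unmap_skb(struct device *dev, struct sk_buff *skb,
		struct skb_dma_map *map, enum dma_data_direction dir);

A driver could then embed one struct skb_dma_map per TX ring entry,
sized worst-case as you describe, so the per-packet cost collapses to a
single walk into the IOMMU code.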

--mgross
 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]intel-iommu batched iotlb flushes
  2008-02-13 18:23             ` Randy Dunlap
@ 2008-02-13 19:35               ` mark gross
  0 siblings, 0 replies; 16+ messages in thread
From: mark gross @ 2008-02-13 19:35 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: Andrew Morton, linux-kernel, linux-mm

On Wed, Feb 13, 2008 at 10:23:34AM -0800, Randy Dunlap wrote:
>> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
>> ===================================================================
>> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
>> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-12 11:36:07.000000000 -0800
>> @@ -822,6 +822,10 @@
>>  			than 32 bit addressing. The default is to look
>>  			for translation below 32 bit and if not available
>>  			then look in the higher range.
>> +		strict [Default Off]
>> +			With this option on every umap_single operation will
>
> so I'll ask one more time :(
> Shouldn't this be "unmap_single" ?
>
>> +			result in a hardware IOTLB flush operation as opposed
>> +			to batching them for performance.
>>   	io_delay=	[X86-32,X86-64] I/O delay method
>>  		0x80
>
>

Crap, I was focused on fixing the "signle" half and missed the "umap".

Signed-off-by: <mgross@linux.intel.com>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 8)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,30 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: "
+			"Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1758,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2036,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2393,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every unmap_single operation will
+			result in a hardware IOTLB flush operation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80
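
A quick usage note for anyone trying this out: the strict behavior is
selected through the option string this hunk documents (as part of the
intel_iommu= parameter, i.e. booting with intel_iommu=strict), and,
assuming debugfs is mounted at the usual /sys/kernel/debug, the batching
watermark can be tuned at run time:

	echo 500 > /sys/kernel/debug/intel_iommu/high_watermark

That maps straight onto the debugfs_create_u32("high_watermark", ...)
call in the patch; 250 is only the starting default.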

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2008-02-13 19:37 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-11 22:41 [PATCH]intel-iommu batched iotlb flushes mark gross
2008-02-11 23:27 ` Randy Dunlap
2008-02-12 16:05   ` mark gross
2008-02-12 16:34     ` Randy Dunlap
2008-02-12 19:55       ` mark gross
2008-02-12 20:21         ` Randy Dunlap
2008-02-13 18:10           ` mark gross
2008-02-13 18:23             ` Randy Dunlap
2008-02-13 19:35               ` mark gross
2008-02-12  8:52 ` Muli Ben-Yehuda
2008-02-12  9:00   ` David Miller
2008-02-12  9:07     ` Muli Ben-Yehuda
2008-02-12 15:54     ` mark gross
2008-02-12 23:46       ` David Miller
2008-02-13 18:31         ` mark gross
2008-02-12 15:37   ` mark gross
