* [PATCH]iommu-iotlb-flushing
@ 2008-02-21  0:06 mark gross
  2008-02-23  8:05 ` [PATCH]iommu-iotlb-flushing Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: mark gross @ 2008-02-21  0:06 UTC (permalink / raw)
  To: Andrew Morton, lkml

The following patch is for batching up the flushing of the IOTLB for
the DMAR implementation found in the Intel VT-d hardware.  It works by
building a list of to be flushed IOTLB entries and a bitmap list of
which DMAR engine they are from.

After either a high water mark (250 accessible via debugfs) or 10ms the
list of iova's will be reclaimed and the DMAR engines associated are
IOTLB-flushed.

This approach recovers 15 to 20% of the performance lost when using the
IOMMU for my netperf udp stream benchmark with small packets.  It can be
disabled with a kernel boot parameter
"intel_iommu=strict".

Its use does weaken the IOMMU protections a bit.

I would like to see this go into MM for a while and then onto mainline.

Thanks,


Signed-off-by: <mgross@linux.intel.com>


Index: linux-2.6.24-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/intel-iommu.c	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/intel-iommu.c	2008-02-12 11:35:42.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,39 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+static struct timer_list unmap_timer =
+	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
+
+struct unmap_list {
+	struct list_head list;
+	struct dmar_domain *domain;
+	struct iova *iova;
+};
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +103,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 8)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +999,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1426,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1691,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1702,30 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: "
+			"Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1758,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1909,6 +1961,54 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/*just flush them all*/
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
+		mod_timer(&unmap_timer, unmap_timer.expires);
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2036,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2374,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2393,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.24-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
+++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
@@ -822,6 +822,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every unmap_single operation will
+			result in a hardware IOTLB flush operation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80


* Re: [PATCH]iommu-iotlb-flushing
  2008-02-21  0:06 [PATCH]iommu-iotlb-flushing mark gross
@ 2008-02-23  8:05 ` Andrew Morton
  2008-02-25 16:28   ` [PATCH]iommu-iotlb-flushing mark gross
  2008-02-29 23:18   ` [PATCH]iommu-iotlb-flushing mark gross
  0 siblings, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2008-02-23  8:05 UTC (permalink / raw)
  To: mgross; +Cc: lkml

On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:

> The following patch is for batching up the flushing of the IOTLB for
> the DMAR implementation found in the Intel VT-d hardware.  It works by
> building a list of to be flushed IOTLB entries and a bitmap list of
> which DMAR engine they are from.
> 
> After either a high water mark (250 accessible via debugfs) or 10ms the
> list of iova's will be reclaimed and the DMAR engines associated are
> IOTLB-flushed.
> 
> This approach recovers 15 to 20% of the performance lost when using the
> IOMMU for my netperf udp stream benchmark with small packets.  It can be
> disabled with a kernel boot parameter
> "intel_iommu=strict".
> 
> Its use does weaken the IOMMU protections a bit.
> 
> I would like to see this go into MM for a while and then onto mainline.
> 
> ...
>
> +static struct timer_list unmap_timer =
> +	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);

Could use DEFINE_TIMER here.
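Something like this would do (untested):

	static DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);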

> +struct unmap_list {
> +	struct list_head list;
> +	struct dmar_domain *domain;
> +	struct iova *iova;
> +};

unmap_list doesn't seem to be used anywhere?

> +static struct intel_iommu *g_iommus;
> +/* bitmap for indexing intel_iommus */
> +static unsigned long 	*g_iommus_to_flush;
> +static int g_num_of_iommus;
> +
> +static DEFINE_SPINLOCK(async_umap_flush_lock);
> +static LIST_HEAD(unmaps_to_do);
> +
> +static int timer_on;
> +static long list_size;
> +static int high_watermark;
> +
> +static struct dentry *intel_iommu_debug, *debug;
> +
> +
>  static void domain_remove_dev_info(struct dmar_domain *domain);
>  
>  static int dmar_disabled;
>  static int __initdata dmar_map_gfx = 1;
>  static int dmar_forcedac;
> +static int intel_iommu_strict;
>  
>  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
>  static DEFINE_SPINLOCK(device_domain_lock);
> @@ -73,9 +103,13 @@
>  			printk(KERN_INFO
>  				"Intel-IOMMU: disable GFX device mapping\n");
>  		} else if (!strncmp(str, "forcedac", 8)) {
> -			printk (KERN_INFO
> +			printk(KERN_INFO
>  				"Intel-IOMMU: Forcing DAC for PCI devices\n");
>  			dmar_forcedac = 1;
> +		} else if (!strncmp(str, "strict", 8)) {

s/8/6/

> +			printk(KERN_INFO
> +				"Intel-IOMMU: disable batched IOTLB flush\n");
> +			intel_iommu_strict = 1;
>  		}
>  
>  		str += strcspn(str, ",");
> @@ -965,17 +999,13 @@
>  		set_bit(0, iommu->domain_ids);
>  	return 0;
>  }
> -
> -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> +					struct dmar_drhd_unit *drhd)
>  {
> -	struct intel_iommu *iommu;
>  	int ret;
>  	int map_size;
>  	u32 ver;
>  
> -	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> -	if (!iommu)
> -		return NULL;
>  	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
>  	if (!iommu->reg) {
>  		printk(KERN_ERR "IOMMU: can't map the region\n");
> @@ -1396,7 +1426,7 @@
>  	int index;
>  
>  	while (dev) {
> -		for (index = 0; index < cnt; index ++)
> +		for (index = 0; index < cnt; index++)
>  			if (dev == devices[index])
>  				return 1;
>  
> @@ -1661,7 +1691,7 @@
>  	struct dmar_rmrr_unit *rmrr;
>  	struct pci_dev *pdev;
>  	struct intel_iommu *iommu;
> -	int ret, unit = 0;
> +	int nlongs, i, ret, unit = 0;
>  
>  	/*
>  	 * for each drhd
> @@ -1672,7 +1702,30 @@
>  	for_each_drhd_unit(drhd) {
>  		if (drhd->ignored)
>  			continue;
> -		iommu = alloc_iommu(drhd);
> +		g_num_of_iommus++;

No locking needed for g_num_of_iommus?

> +	}
> +
> +	nlongs = BITS_TO_LONGS(g_num_of_iommus);

Would this code be neater if it used the <linux/bitmap.h> stuff?

> +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> +	if (!g_iommus_to_flush) {
> +		printk(KERN_ERR "Intel-IOMMU: "
> +			"Allocating bitmap array failed\n");
> +		return -ENOMEM;

Are you sure we aren't leaking anything here?  Like the alloc_iommu() above?

> +	}
> +
> +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> +	if (!g_iommus) {
> +		kfree(g_iommus_to_flush);
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +
> +	i = 0;
> +	for_each_drhd_unit(drhd) {
> +		if (drhd->ignored)
> +			continue;
> +		iommu = alloc_iommu(&g_iommus[i], drhd);
> +		i++;
>  		if (!iommu) {
>  			ret = -ENOMEM;
>  			goto error;
> @@ -1705,7 +1758,6 @@
>  	 * endfor
>  	 */
>  	for_each_rmrr_units(rmrr) {
> -		int i;
>  		for (i = 0; i < rmrr->devices_cnt; i++) {
>  			pdev = rmrr->devices[i];
>  			/* some BIOS lists non-exist devices in DMAR table */
> @@ -1909,6 +1961,54 @@
>  	return 0;
>  }
>  
> +static void flush_unmaps(void)
> +{
> +	struct iova *node, *n;
> +	unsigned long flags;
> +	int i;
> +
> +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> +	timer_on = 0;
> +
> +	/*just flush them all*/

I'm surprised that checkpatch didn't grump about the odd commenting style.

> +	for (i = 0; i < g_num_of_iommus; i++) {
> +		if (test_and_clear_bit(i, g_iommus_to_flush))
> +			iommu_flush_iotlb_global(&g_iommus[i], 0);
> +	}
> +
> +	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> +		/* free iova */
> +		list_del(&node->list);
> +		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> +
> +	}
> +	list_size = 0;
> +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}
> +
> +static void flush_unmaps_timeout(unsigned long data)
> +{
> +	flush_unmaps();
> +}
> +
> +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&async_umap_flush_lock, flags);

How scalable is this?

> +	iova->dmar = dom;
> +	list_add(&iova->list, &unmaps_to_do);
> +	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> +
> +	if (!timer_on) {
> +		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> +		mod_timer(&unmap_timer, unmap_timer.expires);

No, this modifies unmap_timer.expires twice.  Might be racy too.  You want

	mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));

> +		timer_on = 1;
> +	}
> +	list_size++;
> +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}
> +
>  static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
>  	size_t size, int dir)
>  {
> @@ -1936,13 +2036,21 @@
>  	dma_pte_clear_range(domain, start_addr, start_addr + size);
>  	/* free page tables */
>  	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> -
> -	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> -			size >> PAGE_SHIFT_4K, 0))
> -		iommu_flush_write_buffer(domain->iommu);
> -
> -	/* free iova */
> -	__free_iova(&domain->iovad, iova);
> +	if (intel_iommu_strict) {
> +		if (iommu_flush_iotlb_psi(domain->iommu,
> +			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> +			iommu_flush_write_buffer(domain->iommu);
> +		/* free iova */
> +		__free_iova(&domain->iovad, iova);
> +	} else {
> +		add_unmap(domain, iova);
> +		/*
> +		 * queue up the release of the unmap to save the 1/6th of the
> +		 * cpu used up by the iotlb flush operation...
> +		 */
> +		if (list_size > high_watermark)
> +			flush_unmaps();
> +	}
>  }
>  
>  static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> @@ -2266,6 +2374,10 @@
>  	if (dmar_table_init())
>  		return 	-ENODEV;
>  
> +	high_watermark = 250;
> +	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> +	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> +					intel_iommu_debug, &high_watermark);
>  	iommu_init_mempool();
>  	dmar_init_reserved_ranges();
>  
> @@ -2281,6 +2393,7 @@
>  	printk(KERN_INFO
>  	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
>  
> +	init_timer(&unmap_timer);

I see timers being added, but I see no del_timer_sync()s added on cleanup
paths.  Are you sure that we don't have races on various teardown paths?

>  	force_iommu = 1;
>  	dma_ops = &intel_dma_ops;
>  	return 0;
> Index: linux-2.6.24-mm1/drivers/pci/iova.h
> ===================================================================
> --- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
> @@ -23,6 +23,8 @@
>  	struct rb_node	node;
>  	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
>  	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
> +	struct list_head list;
> +	void *dmar;
>  };
>  
>  /* holds all the iova translations for a domain */
> Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> ===================================================================
> --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
> @@ -822,6 +822,10 @@
>  			than 32 bit addressing. The default is to look
>  			for translation below 32 bit and if not available
>  			then look in the higher range.
> +		strict [Default Off]
> +			With this option on every unmap_single operation will
> +			result in a hardware IOTLB flush operation as opposed
> +			to batching them for performance.

boot-time options suck.  Is it not possible to tweak this at runtime?


* Re: [PATCH]iommu-iotlb-flushing
  2008-02-23  8:05 ` [PATCH]iommu-iotlb-flushing Andrew Morton
@ 2008-02-25 16:28   ` mark gross
  2008-02-25 18:40     ` [PATCH]iommu-iotlb-flushing Andrew Morton
  2008-02-29 23:18   ` [PATCH]iommu-iotlb-flushing mark gross
  1 sibling, 1 reply; 11+ messages in thread
From: mark gross @ 2008-02-25 16:28 UTC (permalink / raw)
  To: Andrew Morton; +Cc: lkml

On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:
> 
> > The following patch is for batching up the flushing of the IOTLB for
> > the DMAR implementation found in the Intel VT-d hardware.  It works by
> > building a list of to be flushed IOTLB entries and a bitmap list of
> > which DMAR engine they are from.
> > 
> > After either a high water mark (250 accessible via debugfs) or 10ms the
> > list of iova's will be reclaimed and the DMAR engines associated are
> > IOTLB-flushed.
> > 
> > This approach recovers 15 to 20% of the performance lost when using the
> > IOMMU for my netperf udp stream benchmark with small packets.  It can be
> > disabled with a kernel boot parameter
> > "intel_iommu=strict".
> > 
> > Its use does weaken the IOMMU protections a bit.
> > 
> > I would like to see this go into MM for a while and then onto mainline.
> > 
> > ...
> >
> > +static struct timer_list unmap_timer =
> > +	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> 
> Could use DEFINE_TIMER here.

ok
> 
> > +struct unmap_list {
> > +	struct list_head list;
> > +	struct dmar_domain *domain;
> > +	struct iova *iova;
> > +};
> 
> unmap_list doesn't seem to be used anywhere?

oops, left over from earlier version.

> 
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long 	*g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> >  static void domain_remove_dev_info(struct dmar_domain *domain);
> >  
> >  static int dmar_disabled;
> >  static int __initdata dmar_map_gfx = 1;
> >  static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >  
> >  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> >  static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> >  			printk(KERN_INFO
> >  				"Intel-IOMMU: disable GFX device mapping\n");
> >  		} else if (!strncmp(str, "forcedac", 8)) {
> > -			printk (KERN_INFO
> > +			printk(KERN_INFO
> >  				"Intel-IOMMU: Forcing DAC for PCI devices\n");
> >  			dmar_forcedac = 1;
> > +		} else if (!strncmp(str, "strict", 8)) {
> 
> s/8/6/

ack.

> 
> > +			printk(KERN_INFO
> > +				"Intel-IOMMU: disable batched IOTLB flush\n");
> > +			intel_iommu_strict = 1;
> >  		}
> >  
> >  		str += strcspn(str, ",");
> > @@ -965,17 +999,13 @@
> >  		set_bit(0, iommu->domain_ids);
> >  	return 0;
> >  }
> > -
> > -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> > +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> > +					struct dmar_drhd_unit *drhd)
> >  {
> > -	struct intel_iommu *iommu;
> >  	int ret;
> >  	int map_size;
> >  	u32 ver;
> >  
> > -	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > -	if (!iommu)
> > -		return NULL;
> >  	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
> >  	if (!iommu->reg) {
> >  		printk(KERN_ERR "IOMMU: can't map the region\n");
> > @@ -1396,7 +1426,7 @@
> >  	int index;
> >  
> >  	while (dev) {
> > -		for (index = 0; index < cnt; index ++)
> > +		for (index = 0; index < cnt; index++)
> >  			if (dev == devices[index])
> >  				return 1;
> >  
> > @@ -1661,7 +1691,7 @@
> >  	struct dmar_rmrr_unit *rmrr;
> >  	struct pci_dev *pdev;
> >  	struct intel_iommu *iommu;
> > -	int ret, unit = 0;
> > +	int nlongs, i, ret, unit = 0;
> >  
> >  	/*
> >  	 * for each drhd
> > @@ -1672,7 +1702,30 @@
> >  	for_each_drhd_unit(drhd) {
> >  		if (drhd->ignored)
> >  			continue;
> > -		iommu = alloc_iommu(drhd);
> > +		g_num_of_iommus++;
> 
> No locking needed for g_num_of_iommus?
> 

I'll double-check if it's needed, but it wouldn't hurt.  This code is on
the kernel startup / init path.

> > +	}
> > +
> > +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
> 
> Would this code be neater if it used the <linux/bitmap.h> stuff?

I'll look into it.

> 
> > +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > +	if (!g_iommus_to_flush) {
> > +		printk(KERN_ERR "Intel-IOMMU: "
> > +			"Allocating bitmap array failed\n");
> > +		return -ENOMEM;
> 
> Are you sure we aren't leaking anything here?  Like the alloc_iommu() above?

Once you set up the IOMMUs you never take them down or re-set them up.
This code runs once and only once at boot up.

> 
> > +	}
> > +
> > +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > +	if (!g_iommus) {
> > +		kfree(g_iommus_to_flush);
> > +		ret = -ENOMEM;
> > +		goto error;
> > +	}
> > +
> > +	i = 0;
> > +	for_each_drhd_unit(drhd) {
> > +		if (drhd->ignored)
> > +			continue;
> > +		iommu = alloc_iommu(&g_iommus[i], drhd);
> > +		i++;
> >  		if (!iommu) {
> >  			ret = -ENOMEM;
> >  			goto error;
> > @@ -1705,7 +1758,6 @@
> >  	 * endfor
> >  	 */
> >  	for_each_rmrr_units(rmrr) {
> > -		int i;
> >  		for (i = 0; i < rmrr->devices_cnt; i++) {
> >  			pdev = rmrr->devices[i];
> >  			/* some BIOS lists non-exist devices in DMAR table */
> > @@ -1909,6 +1961,54 @@
> >  	return 0;
> >  }
> >  
> > +static void flush_unmaps(void)
> > +{
> > +	struct iova *node, *n;
> > +	unsigned long flags;
> > +	int i;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> > +	timer_on = 0;
> > +
> > +	/*just flush them all*/
> 
> I'm surprised that checkpatch didn't grump about the odd commenting style.

It didn't.  What's odd about the comment style here?

> 
> > +	for (i = 0; i < g_num_of_iommus; i++) {
> > +		if (test_and_clear_bit(i, g_iommus_to_flush))
> > +			iommu_flush_iotlb_global(&g_iommus[i], 0);
> > +	}
> > +
> > +	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> > +		/* free iova */
> > +		list_del(&node->list);
> > +		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> > +
> > +	}
> > +	list_size = 0;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > +static void flush_unmaps_timeout(unsigned long data)
> > +{
> > +	flush_unmaps();
> > +}
> > +
> > +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> 
> How scalable is this?

Not very.  But it's better than blocking on the hardware poll IOTLB
flush operation on every unmap.  Is there a lockless way to insert
into this list?  I suppose I could have one lock per DMAR engine, but
that would still have the scalability issue.  (perhaps a list per IOVA?
/me needs to think on this a bit)

The best way is to get the network stack to reuse DMA buffers when using
an IOMMU and avoid the unmap operation altogether.  But that's a longer
term goal.

> 
> > +	iova->dmar = dom;
> > +	list_add(&iova->list, &unmaps_to_do);
> > +	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> > +
> > +	if (!timer_on) {
> > +		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> > +		mod_timer(&unmap_timer, unmap_timer.expires);
> 
> No, this modifies unmap_timer.expires twice.  Might be racy too.  You want
> 
> 	mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));

ack

> 
> > +		timer_on = 1;
> > +	}
> > +	list_size++;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> >  static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
> >  	size_t size, int dir)
> >  {
> > @@ -1936,13 +2036,21 @@
> >  	dma_pte_clear_range(domain, start_addr, start_addr + size);
> >  	/* free page tables */
> >  	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> > -
> > -	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> > -			size >> PAGE_SHIFT_4K, 0))
> > -		iommu_flush_write_buffer(domain->iommu);
> > -
> > -	/* free iova */
> > -	__free_iova(&domain->iovad, iova);
> > +	if (intel_iommu_strict) {
> > +		if (iommu_flush_iotlb_psi(domain->iommu,
> > +			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> > +			iommu_flush_write_buffer(domain->iommu);
> > +		/* free iova */
> > +		__free_iova(&domain->iovad, iova);
> > +	} else {
> > +		add_unmap(domain, iova);
> > +		/*
> > +		 * queue up the release of the unmap to save the 1/6th of the
> > +		 * cpu used up by the iotlb flush operation...
> > +		 */
> > +		if (list_size > high_watermark)
> > +			flush_unmaps();
> > +	}
> >  }
> >  
> >  static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> > @@ -2266,6 +2374,10 @@
> >  	if (dmar_table_init())
> >  		return 	-ENODEV;
> >  
> > +	high_watermark = 250;
> > +	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> > +	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> > +					intel_iommu_debug, &high_watermark);
> >  	iommu_init_mempool();
> >  	dmar_init_reserved_ranges();
> >  
> > @@ -2281,6 +2393,7 @@
> >  	printk(KERN_INFO
> >  	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
> >  
> > +	init_timer(&unmap_timer);
> 
> I see timers being added, but I see no del_timer_sync()s added on cleanup
> paths.  Are you sure that we don't have races on various teardown paths?
> 

This code doesn't really tear down well.  However, at this point in
intel_iommu_init there are no further error paths to put tear-down code in.


> >  	force_iommu = 1;
> >  	dma_ops = &intel_dma_ops;
> >  	return 0;
> > Index: linux-2.6.24-mm1/drivers/pci/iova.h
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
> > @@ -23,6 +23,8 @@
> >  	struct rb_node	node;
> >  	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
> >  	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
> > +	struct list_head list;
> > +	void *dmar;
> >  };
> >  
> >  /* holds all the iova translations for a domain */
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
> > @@ -822,6 +822,10 @@
> >  			than 32 bit addressing. The default is to look
> >  			for translation below 32 bit and if not available
> >  			then look in the higher range.
> > +		strict [Default Off]
> > +			With this option on every unmap_single operation will
> > +			result in a hardware IOTLB flush operation as opposed
> > +			to batching them for performance.
> 
> boot-time options suck.  Is it not possible to tweak this at runtime?

Yes they do.  There may be a way to enable / disable this behavior at
runtime.  Let me think on it a bit.

Thank you for looking at this!

--mgross



* Re: [PATCH]iommu-iotlb-flushing
  2008-02-25 16:28   ` [PATCH]iommu-iotlb-flushing mark gross
@ 2008-02-25 18:40     ` Andrew Morton
  0 siblings, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2008-02-25 18:40 UTC (permalink / raw)
  To: mgross; +Cc: lkml

On Mon, 25 Feb 2008 08:28:51 -0800 mark gross <mgross@linux.intel.com> wrote:

> > > +	/*just flush them all*/
> > 
> > I'm surprised that checkpatch didn't grump about the odd commenting style.
> 
> It didn't.  What's odd about the comment style here?

We normally put a space after "/*" and before "*/"
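i.e. "/* just flush them all */" rather than "/*just flush them all*/".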


* Re: [PATCH]iommu-iotlb-flushing
  2008-02-23  8:05 ` [PATCH]iommu-iotlb-flushing Andrew Morton
  2008-02-25 16:28   ` [PATCH]iommu-iotlb-flushing mark gross
@ 2008-02-29 23:18   ` mark gross
  2008-03-01  5:54     ` [PATCH]iommu-iotlb-flushing Greg KH
  2008-03-01  7:10     ` [PATCH]iommu-iotlb-flushing Grant Grundler
  1 sibling, 2 replies; 11+ messages in thread
From: mark gross @ 2008-02-29 23:18 UTC (permalink / raw)
  To: Andrew Morton, greg; +Cc: lkml, linux-pci

On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:
> 
> > The following patch is for batching up the flushing of the IOTLB for
> > the DMAR implementation found in the Intel VT-d hardware.  It works by
> > building a list of to be flushed IOTLB entries and a bitmap list of
> > which DMAR engine they are from.
> > 
> > After either a high water mark (250 accessible via debugfs) or 10ms the
> > list of iova's will be reclaimed and the DMAR engines associated are
> > IOTLB-flushed.
> > 
> > This approach recovers 15 to 20% of the performance lost when using the
> > IOMMU for my netperf udp stream benchmark with small packets.  It can be
> > disabled with a kernel boot parameter
> > "intel_iommu=strict".
> > 
> > Its use does weaken the IOMMU protections a bit.
> > 
> > I would like to see this go into MM for a while and then onto mainline.
> > 
> > ...
> >
> > +static struct timer_list unmap_timer =
> > +	TIMER_INITIALIZER(flush_unmaps_timeout, 0, 0);
> 
> Could use DEFINE_TIMER here.

fixed.

> 
> > +struct unmap_list {
> > +	struct list_head list;
> > +	struct dmar_domain *domain;
> > +	struct iova *iova;
> > +};
> 
> > unmap_list doesn't seem to be used anywhere?

fixed.

> 
> > +static struct intel_iommu *g_iommus;
> > +/* bitmap for indexing intel_iommus */
> > +static unsigned long 	*g_iommus_to_flush;
> > +static int g_num_of_iommus;
> > +
> > +static DEFINE_SPINLOCK(async_umap_flush_lock);
> > +static LIST_HEAD(unmaps_to_do);
> > +
> > +static int timer_on;
> > +static long list_size;
> > +static int high_watermark;
> > +
> > +static struct dentry *intel_iommu_debug, *debug;
> > +
> > +
> >  static void domain_remove_dev_info(struct dmar_domain *domain);
> >  
> >  static int dmar_disabled;
> >  static int __initdata dmar_map_gfx = 1;
> >  static int dmar_forcedac;
> > +static int intel_iommu_strict;
> >  
> >  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> >  static DEFINE_SPINLOCK(device_domain_lock);
> > @@ -73,9 +103,13 @@
> >  			printk(KERN_INFO
> >  				"Intel-IOMMU: disable GFX device mapping\n");
> >  		} else if (!strncmp(str, "forcedac", 8)) {
> > -			printk (KERN_INFO
> > +			printk(KERN_INFO
> >  				"Intel-IOMMU: Forcing DAC for PCI devices\n");
> >  			dmar_forcedac = 1;
> > +		} else if (!strncmp(str, "strict", 8)) {
> 
> s/8/6/

fixed.

> 
> > +			printk(KERN_INFO
> > +				"Intel-IOMMU: disable batched IOTLB flush\n");
> > +			intel_iommu_strict = 1;
> >  		}
> >  
> >  		str += strcspn(str, ",");
> > @@ -965,17 +999,13 @@
> >  		set_bit(0, iommu->domain_ids);
> >  	return 0;
> >  }
> > -
> > -static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
> > +static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
> > +					struct dmar_drhd_unit *drhd)
> >  {
> > -	struct intel_iommu *iommu;
> >  	int ret;
> >  	int map_size;
> >  	u32 ver;
> >  
> > -	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > -	if (!iommu)
> > -		return NULL;
> >  	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
> >  	if (!iommu->reg) {
> >  		printk(KERN_ERR "IOMMU: can't map the region\n");
> > @@ -1396,7 +1426,7 @@
> >  	int index;
> >  
> >  	while (dev) {
> > -		for (index = 0; index < cnt; index ++)
> > +		for (index = 0; index < cnt; index++)
> >  			if (dev == devices[index])
> >  				return 1;
> >  
> > @@ -1661,7 +1691,7 @@
> >  	struct dmar_rmrr_unit *rmrr;
> >  	struct pci_dev *pdev;
> >  	struct intel_iommu *iommu;
> > -	int ret, unit = 0;
> > +	int nlongs, i, ret, unit = 0;
> >  
> >  	/*
> >  	 * for each drhd
> > @@ -1672,7 +1702,30 @@
> >  	for_each_drhd_unit(drhd) {
> >  		if (drhd->ignored)
> >  			continue;
> > -		iommu = alloc_iommu(drhd);
> > +		g_num_of_iommus++;
> 
> No locking needed for g_num_of_iommus?

Yup, not needed.  Only the single-threaded __init path increments this,
and after that only read accesses happen.

> 
> > +	}
> > +
> > +	nlongs = BITS_TO_LONGS(g_num_of_iommus);
> 
> Would this code be neater if it used the <linux/bitmap.h> stuff?
> 

I just looked, I don't think bitmap.h helps me with my bitmap use in
this module.


> > +	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
> > +	if (!g_iommus_to_flush) {
> > +		printk(KERN_ERR "Intel-IOMMU: "
> > +			"Allocating bitmap array failed\n");
> > +		return -ENOMEM;
> 
> Are you sure we aren't leaking anything here?  Like the alloc_iommu() above?
> 

Fixed.  Was leaking the g_iommus_to_flush on the error path.


> > +	}
> > +
> > +	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
> > +	if (!g_iommus) {
> > +		kfree(g_iommus_to_flush);
> > +		ret = -ENOMEM;
> > +		goto error;
> > +	}
> > +
> > +	i = 0;
> > +	for_each_drhd_unit(drhd) {
> > +		if (drhd->ignored)
> > +			continue;
> > +		iommu = alloc_iommu(&g_iommus[i], drhd);
> > +		i++;
> >  		if (!iommu) {
> >  			ret = -ENOMEM;
> >  			goto error;
> > @@ -1705,7 +1758,6 @@
> >  	 * endfor
> >  	 */
> >  	for_each_rmrr_units(rmrr) {
> > -		int i;
> >  		for (i = 0; i < rmrr->devices_cnt; i++) {
> >  			pdev = rmrr->devices[i];
> >  			/* some BIOS lists non-exist devices in DMAR table */
> > @@ -1909,6 +1961,54 @@
> >  	return 0;
> >  }
> >  
> > +static void flush_unmaps(void)
> > +{
> > +	struct iova *node, *n;
> > +	unsigned long flags;
> > +	int i;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> > +	timer_on = 0;
> > +
> > +	/*just flush them all*/
> 
> I'm surprised that checkpatch didn't grump about the odd commenting style.
> 

fixed.

> > +	for (i = 0; i < g_num_of_iommus; i++) {
> > +		if (test_and_clear_bit(i, g_iommus_to_flush))
> > +			iommu_flush_iotlb_global(&g_iommus[i], 0);
> > +	}
> > +
> > +	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> > +		/* free iova */
> > +		list_del(&node->list);
> > +		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> > +
> > +	}
> > +	list_size = 0;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> > +static void flush_unmaps_timeout(unsigned long data)
> > +{
> > +	flush_unmaps();
> > +}
> > +
> > +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> 
> How scalable is this?

It's not very, but I'm doing an insert walk of a list and need the locks.
On the bright side, it's way better than spin locking and polling the
IOTLB bit for cleared on every unmap.

> 
> > +	iova->dmar = dom;
> > +	list_add(&iova->list, &unmaps_to_do);
> > +	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> > +
> > +	if (!timer_on) {
> > +		unmap_timer.expires = jiffies + msecs_to_jiffies(10);
> > +		mod_timer(&unmap_timer, unmap_timer.expires);
> 
> No, this modifies unmap_timer.expires twice.  Might be racy too.  You want
> 
> 	mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
> 

Fixed.

> > +		timer_on = 1;
> > +	}
> > +	list_size++;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> > +
> >  static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
> >  	size_t size, int dir)
> >  {
> > @@ -1936,13 +2036,21 @@
> >  	dma_pte_clear_range(domain, start_addr, start_addr + size);
> >  	/* free page tables */
> >  	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
> > -
> > -	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
> > -			size >> PAGE_SHIFT_4K, 0))
> > -		iommu_flush_write_buffer(domain->iommu);
> > -
> > -	/* free iova */
> > -	__free_iova(&domain->iovad, iova);
> > +	if (intel_iommu_strict) {
> > +		if (iommu_flush_iotlb_psi(domain->iommu,
> > +			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
> > +			iommu_flush_write_buffer(domain->iommu);
> > +		/* free iova */
> > +		__free_iova(&domain->iovad, iova);
> > +	} else {
> > +		add_unmap(domain, iova);
> > +		/*
> > +		 * queue up the release of the unmap to save the 1/6th of the
> > +		 * cpu used up by the iotlb flush operation...
> > +		 */
> > +		if (list_size > high_watermark)
> > +			flush_unmaps();
> > +	}
> >  }
> >  
> >  static void * intel_alloc_coherent(struct device *hwdev, size_t size,
> > @@ -2266,6 +2374,10 @@
> >  	if (dmar_table_init())
> >  		return 	-ENODEV;
> >  
> > +	high_watermark = 250;
> > +	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
> > +	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
> > +					intel_iommu_debug, &high_watermark);
> >  	iommu_init_mempool();
> >  	dmar_init_reserved_ranges();
> >  
> > @@ -2281,6 +2393,7 @@
> >  	printk(KERN_INFO
> >  	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
> >  
> > +	init_timer(&unmap_timer);
> 
> I see timers being added, but I see no del_timer_sync()s added on cleanup
> paths.  Are you sure that we don't have races on various teardown paths?
> 

The only error path that could delete the single timer created is at the
bottom of intel_iommu_init.  I don't think there is a place where the timer
needs to be deleted, or any error cleanup paths to put such code in.

> >  	force_iommu = 1;
> >  	dma_ops = &intel_dma_ops;
> >  	return 0;
> > Index: linux-2.6.24-mm1/drivers/pci/iova.h
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/drivers/pci/iova.h	2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/drivers/pci/iova.h	2008-02-12 07:39:53.000000000 -0800
> > @@ -23,6 +23,8 @@
> >  	struct rb_node	node;
> >  	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
> >  	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
> > +	struct list_head list;
> > +	void *dmar;
> >  };
> >  
> >  /* holds all the iova translations for a domain */
> > Index: linux-2.6.24-mm1/Documentation/kernel-parameters.txt
> > ===================================================================
> > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
> > @@ -822,6 +822,10 @@
> >  			than 32 bit addressing. The default is to look
> >  			for translation below 32 bit and if not available
> >  			then look in the higher range.
> > +		strict [Default Off]
> > +			With this option on every unmap_single operation will
> > +			result in a hardware IOTLB flush operation as opposed
> > +			to batching them for performance.
> 
> boot-time options suck.  Is it not possible to tweak this at runtime?

Yes, I could easily create a sysfs or debugfs mechanism for turning it
on / off at run time.  I would like input on the preferred way to do this.
sysfs or debugfs?
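
If debugfs is acceptable, I'm thinking of something along these lines
(untested sketch; intel_iommu_strict would have to become a u32 so it can
sit next to the existing high_watermark file):

	debugfs_create_u32("strict", S_IWUGO | S_IRUGO,
				intel_iommu_debug, &intel_iommu_strict);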


Thanks again for your review on this.  The following is the updated
patch.

--mgross

Signed-off-by: <mgross@linux.intel.com>



Index: linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.25-rc2-mm1.orig/drivers/pci/intel-iommu.c	2008-02-26 11:15:46.000000000 -0800
+++ linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c	2008-02-29 14:36:55.000000000 -0800
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/bitmap.h>
+#include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
@@ -30,6 +31,7 @@
 #include <linux/dmar.h>
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
+#include <linux/timer.h>
 #include "iova.h"
 #include "intel-iommu.h"
 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
@@ -50,11 +52,32 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+
+static void flush_unmaps_timeout(unsigned long data);
+
+DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
+
+static struct intel_iommu *g_iommus;
+/* bitmap for indexing intel_iommus */
+static unsigned long 	*g_iommus_to_flush;
+static int g_num_of_iommus;
+
+static DEFINE_SPINLOCK(async_umap_flush_lock);
+static LIST_HEAD(unmaps_to_do);
+
+static int timer_on;
+static long list_size;
+static int high_watermark;
+
+static struct dentry *intel_iommu_debug, *debug;
+
+
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
 static int dmar_disabled;
 static int __initdata dmar_map_gfx = 1;
 static int dmar_forcedac;
+static int intel_iommu_strict;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -73,9 +96,13 @@
 			printk(KERN_INFO
 				"Intel-IOMMU: disable GFX device mapping\n");
 		} else if (!strncmp(str, "forcedac", 8)) {
-			printk (KERN_INFO
+			printk(KERN_INFO
 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 			dmar_forcedac = 1;
+		} else if (!strncmp(str, "strict", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable batched IOTLB flush\n");
+			intel_iommu_strict = 1;
 		}
 
 		str += strcspn(str, ",");
@@ -965,17 +992,13 @@
 		set_bit(0, iommu->domain_ids);
 	return 0;
 }
-
-static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
+static struct intel_iommu *alloc_iommu(struct intel_iommu *iommu,
+					struct dmar_drhd_unit *drhd)
 {
-	struct intel_iommu *iommu;
 	int ret;
 	int map_size;
 	u32 ver;
 
-	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
-	if (!iommu)
-		return NULL;
 	iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
 	if (!iommu->reg) {
 		printk(KERN_ERR "IOMMU: can't map the region\n");
@@ -1396,7 +1419,7 @@
 	int index;
 
 	while (dev) {
-		for (index = 0; index < cnt; index ++)
+		for (index = 0; index < cnt; index++)
 			if (dev == devices[index])
 				return 1;
 
@@ -1661,7 +1684,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int ret, unit = 0;
+	int nlongs, i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1672,7 +1695,35 @@
 	for_each_drhd_unit(drhd) {
 		if (drhd->ignored)
 			continue;
-		iommu = alloc_iommu(drhd);
+		g_num_of_iommus++;
+		/*
+		 * lock not needed as this is only incremented in the single
+		 * threaded kernel __init code path all other access are read
+		 * only
+		 */
+	}
+
+	nlongs = BITS_TO_LONGS(g_num_of_iommus);
+	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
+	if (!g_iommus_to_flush) {
+		printk(KERN_ERR "Intel-IOMMU: "
+			"Allocating bitmap array failed\n");
+		return -ENOMEM;
+	}
+
+	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
+	if (!g_iommus) {
+		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	i = 0;
+	for_each_drhd_unit(drhd) {
+		if (drhd->ignored)
+			continue;
+		iommu = alloc_iommu(&g_iommus[i], drhd);
+		i++;
 		if (!iommu) {
 			ret = -ENOMEM;
 			goto error;
@@ -1705,7 +1756,6 @@
 	 * endfor
 	 */
 	for_each_rmrr_units(rmrr) {
-		int i;
 		for (i = 0; i < rmrr->devices_cnt; i++) {
 			pdev = rmrr->devices[i];
 			/* some BIOS lists non-exist devices in DMAR table */
@@ -1761,6 +1811,7 @@
 		iommu = drhd->iommu;
 		free_iommu(iommu);
 	}
+	kfree(g_iommus);
 	return ret;
 }
 
@@ -1909,6 +1960,53 @@
 	return 0;
 }
 
+static void flush_unmaps(void)
+{
+	struct iova *node, *n;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	timer_on = 0;
+
+	/* just flush them all */
+	for (i = 0; i < g_num_of_iommus; i++) {
+		if (test_and_clear_bit(i, g_iommus_to_flush))
+			iommu_flush_iotlb_global(&g_iommus[i], 0);
+	}
+
+	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
+		/* free iova */
+		list_del(&node->list);
+		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
+
+	}
+	list_size = 0;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
+static void flush_unmaps_timeout(unsigned long data)
+{
+	flush_unmaps();
+}
+
+static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
+	iova->dmar = dom;
+	list_add(&iova->list, &unmaps_to_do);
+	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+
+	if (!timer_on) {
+		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
+		timer_on = 1;
+	}
+	list_size++;
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
+}
+
 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
 	size_t size, int dir)
 {
@@ -1936,13 +2034,21 @@
 	dma_pte_clear_range(domain, start_addr, start_addr + size);
 	/* free page tables */
 	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
-
-	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
-			size >> PAGE_SHIFT_4K, 0))
-		iommu_flush_write_buffer(domain->iommu);
-
-	/* free iova */
-	__free_iova(&domain->iovad, iova);
+	if (intel_iommu_strict) {
+		if (iommu_flush_iotlb_psi(domain->iommu,
+			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
+			iommu_flush_write_buffer(domain->iommu);
+		/* free iova */
+		__free_iova(&domain->iovad, iova);
+	} else {
+		add_unmap(domain, iova);
+		/*
+		 * queue up the release of the unmap to save the 1/6th of the
+		 * cpu used up by the iotlb flush operation...
+		 */
+		if (list_size > high_watermark)
+			flush_unmaps();
+	}
 }
 
 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
@@ -2266,6 +2372,10 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
+	high_watermark = 250;
+	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
+	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
+					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
@@ -2281,6 +2391,7 @@
 	printk(KERN_INFO
 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
 
+	init_timer(&unmap_timer);
 	force_iommu = 1;
 	dma_ops = &intel_dma_ops;
 	return 0;
Index: linux-2.6.25-rc2-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.25-rc2-mm1.orig/drivers/pci/iova.h	2008-02-26 11:15:46.000000000 -0800
+++ linux-2.6.25-rc2-mm1/drivers/pci/iova.h	2008-02-29 14:11:52.000000000 -0800
@@ -23,6 +23,8 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
+	struct list_head list;
+	void *dmar;
 };
 
 /* holds all the iova translations for a domain */
Index: linux-2.6.25-rc2-mm1/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.25-rc2-mm1.orig/Documentation/kernel-parameters.txt	2008-02-26 11:16:01.000000000 -0800
+++ linux-2.6.25-rc2-mm1/Documentation/kernel-parameters.txt	2008-02-26 13:44:06.000000000 -0800
@@ -830,6 +830,10 @@
 			than 32 bit addressing. The default is to look
 			for translation below 32 bit and if not available
 			then look in the higher range.
+		strict [Default Off]
+			With this option on every unmap_single operation will
+			result in a hardware IOTLB flush operation as opposed
+			to batching them for performance.
 
 	io_delay=	[X86-32,X86-64] I/O delay method
 		0x80


* Re: [PATCH]iommu-iotlb-flushing
  2008-02-29 23:18   ` [PATCH]iommu-iotlb-flushing mark gross
@ 2008-03-01  5:54     ` Greg KH
  2008-03-01  7:10     ` [PATCH]iommu-iotlb-flushing Grant Grundler
  1 sibling, 0 replies; 11+ messages in thread
From: Greg KH @ 2008-03-01  5:54 UTC (permalink / raw)
  To: mark gross; +Cc: Andrew Morton, lkml, linux-pci

On Fri, Feb 29, 2008 at 03:18:41PM -0800, mark gross wrote:
> > > --- linux-2.6.24-mm1.orig/Documentation/kernel-parameters.txt	2008-02-12 07:12:06.000000000 -0800
> > > +++ linux-2.6.24-mm1/Documentation/kernel-parameters.txt	2008-02-13 11:17:22.000000000 -0800
> > > @@ -822,6 +822,10 @@
> > >  			than 32 bit addressing. The default is to look
> > >  			for translation below 32 bit and if not available
> > >  			then look in the higher range.
> > > +		strict [Default Off]
> > > +			With this option on every unmap_single operation will
> > > +			result in a hardware IOTLB flush operation as opposed
> > > +			to batching them for performance.
> > 
> > boot-time options suck.  Is it not possible to tweak this at runtime?
> 
> Yes, I could easily create a sysfs or debugfs mechanism for turning it
> on / off at run time.  I would like input on the preferred way to do this.
> sysfs or debugfs?

Which is it, a debugging option, or something that anyone would want to
change?  If debugging, make it a debugfs file.  Otherwise sysfs is fine,
but be sure to add a Documentation/ABI/ file for it.

thanks,

greg k-h


* Re: [PATCH]iommu-iotlb-flushing
  2008-02-29 23:18   ` [PATCH]iommu-iotlb-flushing mark gross
  2008-03-01  5:54     ` [PATCH]iommu-iotlb-flushing Greg KH
@ 2008-03-01  7:10     ` Grant Grundler
  2008-03-03 18:34       ` [PATCH]iommu-iotlb-flushing mark gross
  1 sibling, 1 reply; 11+ messages in thread
From: Grant Grundler @ 2008-03-01  7:10 UTC (permalink / raw)
  To: mark gross; +Cc: Andrew Morton, greg, lkml, linux-pci

On Fri, Feb 29, 2008 at 03:18:41PM -0800, mark gross wrote:
> On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> > On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:
> > 
> > > The following patch is for batching up the flushing of the IOTLB for
> > > the DMAR implementation found in the Intel VT-d hardware.  It works by
> > > building a list of to be flushed IOTLB entries and a bitmap list of
> > > which DMAR engine they are from.

I didn't see the original patch - thanks for cc'ing linux-pci.

PA-RISC and IA64 have been doing this for several years now.
See DELAYED_RESOURCE_CNT usage in drivers/parisc/sba_iommu.c
and in arch/ia64/hp/common/sba_iommu.c.

I prototyped the same concept for x86_64 GART support but it didn't
seem to matter on benchmarks since most of the devices I use are
64-bit and don't need to use the IOMMU. IDE/SATA controllers 
are 32-bit, but there is LOTS of overhead in so many other places that
this change made less than a 0.3 difference for them.
(And the decision to use uncached memory for the IO Pdir made this moot).
I'd be happy to post the patch if someone wants to see it though.

> > > This approach recovers 15 to 20% of the performance lost when using the
> > > IOMMU for my netperf udp stream benchmark with small packets.  It can be
> > > disabled with a kernel boot parameter
> > > "intel_iommu=strict".
> > > 
> > > Its use does weaken the IOMMU protections a bit.

One can do a few things to limit how much the protections
are weakened:
1) Invalidate the IO TLB entry and IO Pdir entries (just don't force
  synchronization). At least this was possible on the IOMMUs
  I'm familiar with.

2) Release the IO Pdir entry for re-use once synchronization has been forced.
   Synchronization/flushing of the IO TLB shoot-down is the most expensive
   op AFAICT on the IOMMUs I've worked with.

I don't ever recall finding a bug where the device was DMA'ing to a
buffer again shortly after the unmap call. Most DMA errors are the device
driver not programming the DMA engines correctly.

> > > +	if (intel_iommu_strict) {

I suggest adding an "unlikely()" around this test so the compiler
can generate better code for this and it's clear what your intent is.
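That is, something like (untested):

	if (unlikely(intel_iommu_strict)) {

so the batched path stays the straight-line code.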


I just looked at the implementation of flush_unmaps().
I strongly recommend comparing this implementation with the
DELAYED_RESOURCE_CNT code since this looks like it will need 2X or more
of the CPU cycles to execute for each entry. Use of "list_del()"
is substantially more expensive than a simple linear array.
(Fewer entries per cacheline by 4X maybe?)

Another problem is that every now and then some IO is going to burn a bunch
of CPU cycles and introduce inconsistent performance for particular
unmap operations. One has to trade off amortizing the cost of the
IOMMU flushing against the number of "unusable" IO Pdir entries and more
consistent CPU utilization for each unmap() call.  My gut feeling
is 250 is rather high for high_watermark unless flushing the IO TLB
is extremely expensive.


...
> > boot-time options suck.  Is it not possible to tweak this at runtime?
> 
> Yes, I could easily create a sysfs or debugfs mechanism for turning it
> on / off at run time.  I would like input on the preferred way to do this.
> sysfs or debugfs?

I prefer a compile-time constant since we are talking about fixed costs
for this implementation.  The compiler can do a better job with a constant.
I know this sounds nitpicky, but I can't sufficiently emphasize how
perf-critical DMA map/unmap code is.

If it has to be a variable, I prefer sysfs but don't have a good reason
for that.

> Signed-off-by: <mgross@linux.intel.com>

Ah! Maybe you can get together with Matthew Wilcox (also) and consider
how the IOMMU code might be included in the scsi_ram driver he wrote.
Or maybe just directly use the ram disk (rd.c) instead.

At LSF, I suggested using the IOAT DMA engine (if available) to do real DMA.
Running the IO requests through the IOMMU would expose the added CPU cost
of the IOMMU in its worst case. This doesn't need to go to kernel.org
but might help you isolate perf bottlenecks in this IOMMU code.


> Index: linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c
> ===================================================================
> --- linux-2.6.25-rc2-mm1.orig/drivers/pci/intel-iommu.c	2008-02-26 11:15:46.000000000 -0800
> +++ linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c	2008-02-29 14:36:55.000000000 -0800
...
> +static void flush_unmaps_timeout(unsigned long data);
> +
> +DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

Why bother with the timer?

This just adds more overhead and won't help much to improve
protection.  If someone needs tighter protection, the "strict"
unmapping/flushing parameter should be sufficient to track
down the issues they might be seeing.  And it's perfectly OK
to be sitting on a few "unusable" IOMMU entries.

...
> +static void flush_unmaps(void)
> +{
> +	struct iova *node, *n;
> +	unsigned long flags;
> +	int i;
> +
> +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> +	timer_on = 0;
> +
> +	/* just flush them all */
> +	for (i = 0; i < g_num_of_iommus; i++) {
> +		if (test_and_clear_bit(i, g_iommus_to_flush))
> +			iommu_flush_iotlb_global(&g_iommus[i], 0);
> +	}

When we hit a highwater mark, would it make sense to only
flush the iommu associated with the device in question?

I'm trying to limit the amount of time spent in any single
call to flush_unmaps(). If iommu_flush_iotlb_global() is really
fast (i.e. not 1000s of cycles), then this might still be ok.
But it would certainly make more sense to only flush the
iommu associated with the IO device calling unmap.

> +	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> +		/* free iova */
> +		list_del(&node->list);

Using a linear array would be a lot more efficient than list_del().
One could increment a local index (we are holding the spinlock) and
not touch the data again. See code snippet from sba_iommu.c below.

> +		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> +
> +	}
> +	list_size = 0;
> +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}
...
> +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> +	iova->dmar = dom;
> +	list_add(&iova->list, &unmaps_to_do);
> +	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> +
> +	if (!timer_on) {
> +		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
> +		timer_on = 1;
> +	}
> +	list_size++;
> +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> +}

I'd like to compare the above with code from parisc sba_iommu.c
which does the same thing:
...
        d = &(ioc->saved[ioc->saved_cnt]);
        d->iova = iova;
        d->size = size;
        if (++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT) {
...

(plus spinlock acquire/release)
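
The drain side of that pattern is just a bounded walk of the saved
entries once the counter hits the threshold; roughly (generic names,
not a quote from sba_iommu.c):

	if (++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT) {
		int i;

		for (i = 0; i < ioc->saved_cnt; i++) {
			/* invalidate the IO Pdir entry and return the
			 * range to the allocator for re-use */
			release_range(ioc, ioc->saved[i].iova,
					ioc->saved[i].size);
		}
		force_iotlb_sync(ioc);	/* one expensive sync per batch */
		ioc->saved_cnt = 0;
	}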

map/unmap_single is called _a lot_. It's probably called more often
than the low-level CPU interrupt handler on a busy NIC.

Removing stats gathering from the SBA code yielded an easily
measurable difference in perf. I don't recall exactly what it was.
Apologies for not quantifying the difference when I made the change:
http://lists.parisc-linux.org/pipermail/parisc-linux-cvs/2004-January/033739.html


> @@ -23,6 +23,8 @@
>  	struct rb_node	node;
>  	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
>  	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
> +	struct list_head list;

Can we have a more descriptive name than "list"?

> +	void *dmar;

Why isn't this declared "struct dmar_domain *"?

I'm looking at this usage in the patch:
+               __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);

Given this is all for Intel IOMMUs, I expect a struct could be visible.
But I might be overlooking some higher design here.

thanks and I hope the above helps,
grant

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]iommu-iotlb-flushing
  2008-03-01  7:10     ` [PATCH]iommu-iotlb-flushing Grant Grundler
@ 2008-03-03 18:34       ` mark gross
  2008-03-05 18:23         ` [PATCH]iommu-iotlb-flushing Grant Grundler
  0 siblings, 1 reply; 11+ messages in thread
From: mark gross @ 2008-03-03 18:34 UTC (permalink / raw)
  To: Grant Grundler; +Cc: Andrew Morton, greg, lkml, linux-pci

On Sat, Mar 01, 2008 at 12:10:43AM -0700, Grant Grundler wrote:
> On Fri, Feb 29, 2008 at 03:18:41PM -0800, mark gross wrote:
> > On Sat, Feb 23, 2008 at 12:05:17AM -0800, Andrew Morton wrote:
> > > On Wed, 20 Feb 2008 16:06:23 -0800 mark gross <mgross@linux.intel.com> wrote:
> > > 
> > > > The following patch is for batching up the flushing of the IOTLB for
> > > > the DMAR implementation found in the Intel VT-d hardware.  It works by
> > > > building a list of to be flushed IOTLB entries and a bitmap list of
> > > > which DMAR engine they are from.
> 
> I didn't see the original patch - thanks for cc'ing linux-pci.
> 
> PA-RISC and IA64 have been doing this for several years now.
> See DELAYED_RESOURCE_CNT usage in drivers/parisc/sba_iommu.c
> and in arch/ia64/hp/common/sba_iommu.c.
> 
> I prototyped the same concept for x86_64 GART support but it didn't
> seem to matter on benchmarks since most of the devices I use are
> 64-bit and don't need to use the IOMMU. IDE/SATA controllers 
> are 32-bit but there is LOTS of overhead in so many other places,
> this change made less than 0.3 difference for them.

The Intel DMAR engines with the TLBs do have a measurable overhead
flushing TLB entries on every unmap_single.  This hardware forces a bit
poll on TLB flush completion that adds up at high rates of unmap
operations.  (about 15% on my netperf small packet 32 byte UDP stream
over a 1GigE adapter)

FWIW: I think IOMMU use for avoiding bounce buffers is a thing of the
past.  (at least I have a "show me where it helps" point of view for
modern hardware.  what new hardware needs bounce buffers anymore?)

Also, to be fair to the Intel hardware, the performance overhead is
really only noticeable on small-packet and 10Gig benchmarking.  Most
users will not be able to notice the overhead.

> (And the decision to use uncached memory for the IO Pdir made this moot).
> I'd be happy to post the patch if someone wants to see it though.
> 
> > > > This approach recovers 15 to 20% of the performance lost when using the
> > > > IOMMU for my netperf udp stream benchmark with small packets.  It can be
> > > > disabled with a kernel boot parameter
> > > > "intel_iommu=strict".
> > > > 
> > > > Its use does weaken the IOMMU protections a bit.
> 
> One can do a few things to limit how much the protections
> are weakened:
> 1) Invalidate the IO TLB entry and IO Pdir entries (just don't force
>   syncronization). At least this was possible on the IOMMU's
>   I'm familiar with.

The default is the TLB invalidation. It has a nasty polling loop that
hurts on the small packet netperf benchmark.
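
The shape of that loop is basically "kick the invalidation, then spin
until the engine clears the in-flight bit"; something like the sketch
below (the register and bit names are placeholders, not the actual
intel-iommu definitions):

	/* ask the DMAR engine to invalidate the IOTLB */
	writeq(cmd | INV_REQUEST, iommu->reg + TLB_INV_REG);

	/* ...then busy-wait until it reports completion; this poll is the
	 * per-unmap cost that the deferred flushing amortizes away */
	do {
		cpu_relax();
		cmd = readq(iommu->reg + TLB_INV_REG);
	} while (cmd & INV_REQUEST);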

> 
> 2) release the IO Pdir entry for re-use once syncronization has been forced.
>    Syncronization/flushing of the IO TLB shoot-down is the most expensive
>    ops AFAICT on the IOMMU's I've worked with.

this is what the patch does.

> 
> I don't ever recall finding a bug where the device was DMA'ing to a
> buffer again shortly after unmap call. Most DMA errors are device driver
> not programming the DMA engines correctly.
> 

This makes me feel a bit better.


> > > > +	if (intel_iommu_strict) {
> 
> I suggest adding an "unlikely()" around this test so the compiler
> can generate better code for this and it's clear what your intent is.
> 

Ack

> 
> I just looked at the implementation of flush_unmaps().
> I strongly reccomend comparing this implementation with the
> DELAYED_RESOURCE_CNT since this looks like it will need 2X or more
> of the CPU cycles to execute for each entry. Use of "list_del()"
> is substantially more expensive than a simple linear arrary.
> (Fewer entries per cacheline by 4X maybe?)
>

/me looks, an array of pointers to iova's to clean up makes good sense.
I don't know why I didn't think of this approach first.  Working with
lists is messy and if I can avoid it I usually try.
 
> Another problem is every now and then some IO is going to burn a bunch
> of CPU cycles and introduce inconsistent performance for particular
> unmap operations. One has to tradeoff amortizing the cost of the
> IOMMU flushing with the number of "unusable" IO Pdir entries and more
> consistent CPU utilization for each unmap() call.  My gut feeling
> is 250 is rather high for high_watermark unless flushing the IO TLB
> is extremely expensive.
> 

The 250 high water mark came from 1Gig small-packet netperf UDP
streaming testing.  The 1000 I initially used didn't perform quite as
well.  Nothing scientific went into this number, just some test cases.

> 
> ...
> > > boot-time options suck.  Is it not possible to tweak this at runtime?
> > 
> > Yes, I could easily create a sysfs or debugfs mechanism for turning it
> > on / off at run time.  I would like input on the preferred way to do this.
> > sysfs or debugfs?
> 
> I prefer a compile time constant since we are talking about fixed costs
> for this implementation.  The compiler can do a better job with a constant.
> I know this sounds nit picky but if I can't sufficiently emphasized how
> perf critical DMA map/unmap code is.

Compile-time options lock out OSVs from using the feature.  These
IOMMUs are in desktop systems now.  My job is to make it not suck so
OSVs will leave DMAR enabled in production kernels.  (well, at least
try to, anyway.)

> 
> If it has to be a variable, I prefer sysfs but don't have a good reason
> for that.
> 
> > Signed-off-by: <mgross@linux.intel.com>
> 
> Ah! Maybe you can get together with Matthew Wilcox (also) and consider
> how the IOMMU code might be included in the scsi_ram driver he wrote.
> Or maybe just directly use the ram disk (rd.c) instead.

Yup, Willi has just started to look over my shoulder on some of this.
He pointed me at some OLS papers and whatnot to look at.  I botched my
chance to work face to face with him last week on his last visit to
Oregon but we'll keep talking.

> 
> At LSF, I suggested using the IOAT DMA engine (if available) do real DMA.
> Running the IO requests through the IOMMU would expose the added CPU cost
> of the IOMMU in it's worst case. This doesn't need to go to kernel.org
> but might help you isolate perf bottlenecks in this iommu code.

Oh, the worst cases are easy.  Just run a 10Gig NIC and see what you get.
With a 1Gig NIC you need to drop the packet size down to 32 bytes or
something tiny like that to really notice it.  16K packets
show only a smallish overhead, which makes them iffy as benchmarks because
the CPU utilization is too low (<25%) to be a good measure.

FWIW: throughput isn't affected so much as the CPU overhead.
iommu=strict: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 25% cpu 
with this patch: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 18% cpu 
IOMMU=OFF: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 11% cpu 


> 
> 
> > Index: linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c
> > ===================================================================
> > --- linux-2.6.25-rc2-mm1.orig/drivers/pci/intel-iommu.c	2008-02-26 11:15:46.000000000 -0800
> > +++ linux-2.6.25-rc2-mm1/drivers/pci/intel-iommu.c	2008-02-29 14:36:55.000000000 -0800
> ...
> > +static void flush_unmaps_timeout(unsigned long data);
> > +
> > +DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
> 
> Why bother with the timer?
> 
> This just adds more overhead and won't help much to improve
> protection.  If someone needs tighter protection, the "strict"
> unmapping/flushing parameter should be sufficient to track
> down the issues they might be seeing.  And it's perfectly OK
> to be sitting on a few "unusable" IOMMU entries.
> 

/me thinks about this.  You have a point.  I put in the timer to bound
the time window for any rogue DMA actions.  I still worry about
potentially unbounded times where a stale IOTLB entry could be used.
What do others think?

> ...
> > +static void flush_unmaps(void)
> > +{
> > +	struct iova *node, *n;
> > +	unsigned long flags;
> > +	int i;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> > +	timer_on = 0;
> > +
> > +	/* just flush them all */
> > +	for (i = 0; i < g_num_of_iommus; i++) {
> > +		if (test_and_clear_bit(i, g_iommus_to_flush))
> > +			iommu_flush_iotlb_global(&g_iommus[i], 0);
> > +	}
> 
> When we hit a highwater mark, would be make sense to only
> flush the iommu associated with the device in question?
> 

This is what I do.  The bitmap is a mapping of which DMAR engines (IOMMUs)
need to be flushed.

> I'm trying to limit the amount of time spent in any single
> call to flush_unmaps(). If iommu_flush_iotlb_global() is really
> fast (ie not 1000s of cycles), then this might be still ok.

It's fast, and "global" in this case means global to the passed-in
DMAR engine, not the system.

> But it would certainly make more sense to only flush the
> iommu associated with the IO device calling unmap.
> 
> > +	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
> > +		/* free iova */
> > +		list_del(&node->list);
> 
> Using a linear array would be alot more efficient than list_del().
> One could increment a local index (we are holding the spinlock) and
> not touch the data again. See code snippet from sba_iommu.c below.
> 

You have a point.  The only downside is that the high water mark would
no longer be a runtime tunable as it would be tied to the size of this
array.

> > +		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> > +
> > +	}
> > +	list_size = 0;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> ...
> > +static void add_unmap(struct dmar_domain *dom, struct iova *iova)
> > +{
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&async_umap_flush_lock, flags);
> > +	iova->dmar = dom;
> > +	list_add(&iova->list, &unmaps_to_do);
> > +	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
> > +
> > +	if (!timer_on) {
> > +		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
> > +		timer_on = 1;
> > +	}
> > +	list_size++;
> > +	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
> > +}
> 
> I'd like to compare the above with code from parisc sba_iommu.c
> which does the same thing:
> ...
>         d = &(ioc->saved[ioc->saved_cnt]);
>         d->iova = iova;
>         d->size = size;
>         if (++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT) {
> ...
> 
> (plus spinlock acquire/release)
> 
> map/unmap_single is called _alot_. It's probably called more often
> than the low level CPU interrupt handler on a busy NIC.
>

yes it is.
 
> Removing stats gathering from the SBA code yielded an easily
> measurable difference in perf. I don't recall exactly what it was.
> Apologies for not quantifying the diference when I made the change:
> http://lists.parisc-linux.org/pipermail/parisc-linux-cvs/2004-January/033739.html
> 
> 
> > @@ -23,6 +23,8 @@
> >  	struct rb_node	node;
> >  	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
> >  	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
> > +	struct list_head list;
> 
> Can we have a more descriptive name than "list"?
>

something like "deferred_flush_list" perhaps?
  
> > +	void *dmar;
> 
> Why isn't this declared "struct dmar_domain *"?
> 
> I'm looking at this usage in the patch:
> +               __free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
> 
> Given this is all for intel IOMMUs, I expect a struct could be visible.
> But I might be overlooking some higher design here.

The RB trees need to know which DMAR engine the iovad is part of; the
dmar_domain pointer provides this.  I think the disconnect is that the
Intel VT-d systems have multiple IOMMUs to juggle, where I think you are
assuming there is only one.

> 
> thanks and I hope the above helps,

It helps a lot! Thanks.  I'm going to take a hard look at moving from a
list to an array for batching up the iovas to defer-flush.
> grant

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH]iommu-iotlb-flushing
  2008-03-03 18:34       ` [PATCH]iommu-iotlb-flushing mark gross
@ 2008-03-05 18:23         ` Grant Grundler
  2008-03-05 23:01           ` [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing [PATCH]iommu-iotlb-flushing mark gross
  0 siblings, 1 reply; 11+ messages in thread
From: Grant Grundler @ 2008-03-05 18:23 UTC (permalink / raw)
  To: mark gross; +Cc: Grant Grundler, Andrew Morton, greg, lkml, linux-pci

On Mon, Mar 03, 2008 at 10:34:11AM -0800, mark gross wrote:
> The intel DMAR engines with the TLB's do have a measurable overhead
> flushing TLB entries on every unmapsingle.  This hardware forces a bit
> poll on TLB flush completion that adds up on high rates of unmap
> operations.  (about 15% on my netperf small packet 32 byte UDP stream
> over a 1GigE adapter)

So if we flushed the IOTLB every 16 unmap calls, it should be < 1% penalty?

Using that logic is how I decided the current value of DELAYED_RESOURCE_CNT
on parisc.
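
Roughly, if the synchronous flush accounts for the ~15% CPU hit seen on
the small-packet run, then amortizing one flush over N unmaps leaves
about 15%/N of that cost on the unmap path:

	per-unmap flush cost  ~=  15% / N    (N = 16  ->  ~0.9%)

ignoring the (smaller) fixed cost of managing the batch itself.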

> FWIW: I think IOMMU use for avoiding bounce buffers is a thing of the
> past.

Agreed. PA-RISC is a legacy platform at this point.

> (at least I have a "show me where it helps" point of view for
> modern hardware.  what new hardware needs bounce buffers anymore?)

Otherwise, anything running in a legacy IDE mode is using 32-bit DMA.
So Linux still has plenty of SATA drivers using 32-bit DMA.

All the NICs, FC, Infiniband, et al are PCI-e and thus by definition
64-bit DMA capable.

> Also, to be fair to the Intel hardware the performance overhead is
> really only noticeable on small packed and 10Gig benchmarking.  Most
> users will not be able to notice the overhead.

*nod* - I know. That's why I use pktgen to measure the dma map/unmap overhead.
Note that with solid state storage (should be more visible in the next year
or so), the transaction rate is going to look a lot more like a NIC than
the traditional HBA storage controller. So map/unmap performance will
matter for those configurations too.


> > One can do a few things to limit how much the protections
> > are weakened:
> > 1) Invalidate the IO TLB entry and IO Pdir entries (just don't force
> >   syncronization). At least this was possible on the IOMMU's
> >   I'm familiar with.
> 
> The default is the TLB invalidation. It has a nasty polling loop that
> hurts on the small packet netperf benchmark.
> 
> > 
> > 2) release the IO Pdir entry for re-use once syncronization has been forced.
> >    Syncronization/flushing of the IO TLB shoot-down is the most expensive
> >    ops AFAICT on the IOMMU's I've worked with.
> 
> this is what the patch does.

Ok - I wasn't sure which step was the "synchronize step".

BTW, I hope you (and others from Intel - go willy! ;) can give feedback
to the Intel/AMD chipset designers to ditch this design ASAP.
It clearly sucks.

> > I just looked at the implementation of flush_unmaps().
> > I strongly reccomend comparing this implementation with the
> > DELAYED_RESOURCE_CNT since this looks like it will need 2X or more
> > of the CPU cycles to execute for each entry. Use of "list_del()"
> > is substantially more expensive than a simple linear arrary.
> > (Fewer entries per cacheline by 4X maybe?)
> >
> 
> /me looks, an array of pointers to iova's to clean up makes good sense.
> I don't know why I didn't think of this approach first.  Working with
> lists is messy and if I can avoid it I usually try.

*shrug* I didn't think too much about it on the original implementation
and it happened to work out nicely. Think cacheline "life cycle" when
picking array sizes (or when comparing to linked lists) and you'll
usually end up with the best performing design.

>  
> > Another problem is every now and then some IO is going to burn a bunch
> > of CPU cycles and introduce inconsistent performance for particular
> > unmap operations. One has to tradeoff amortizing the cost of the
> > IOMMU flushing with the number of "unusable" IO Pdir entries and more
> > consistent CPU utilization for each unmap() call.  My gut feeling
> > is 250 is rather high for high_watermark unless flushing the IO TLB
> > is extremely expensive.
> > 
> 
> the 250 high water mark came from 1gig small packet netperf udp
> streaming testing.  The 1000 I initially used didn't perform quite as
> well.  Nothing scientific went into this number just some tests cases.

Well, it can be more scientific. Determine the overhead (with vs without).
Then divide by how many times one can afford to ignore the sync op.
That will (roughly) be the new overhead.

> > ...
> > > > boot-time options suck.  Is it not possible to tweak this at runtime?
> > > 
> > > Yes, I could easily create a sysfs or debugfs mechanism for turning it
> > > on / off at run time.  I would like input on the preferred way to do this.
> > > sysfs or debugfs?
> > 
> > I prefer a compile time constant since we are talking about fixed costs
> > for this implementation.  The compiler can do a better job with a constant.
> > I know this sounds nit picky but if I can't sufficiently emphasized how
> > perf critical DMA map/unmap code is.
> 
> Compile time options lock out OSV's from using the feature.

Huh? I think you misunderstood. Decide how _many_ you want to defer
at compile time and leave the general feature enabled as a runtime option.
(e.g. use_vtd=1 boot param or when booting with Xen or KVM enabled).

> These IOMMU's are in desktop systems now.  My job is to make it not
> suck so OSV's will leave DMAR enabled in production kernels.
> (well at least try to anyway.)

Ok - my suggestion still applies. "compile time constant" only refers
to the number of unmaps the code will defer. e.g. "borrow" DELAYED_RESOURCE_CNT.
I understand the DMAR needs to be enabled at runtime for production.
(I've spent ~5 years dealing with RH/SuSE/Debian IA64-linux kernels...)

If you can reduce the overhead to < 1% for pktgen, TPC-C won't
notice it and I doubt specweb will either.

> > 
> > If it has to be a variable, I prefer sysfs but don't have a good reason
> > for that.
> > 
> > > Signed-off-by: <mgross@linux.intel.com>
> > 
> > Ah! Maybe you can get together with Matthew Wilcox (also) and consider
> > how the IOMMU code might be included in the scsi_ram driver he wrote.
> > Or maybe just directly use the ram disk (rd.c) instead.
> 
> Yup, Willi has just started to look over my shoulder on some of this.
> He pointed me at some OLS papers and whatnot to look at.  I botched my
> chance to work face to face with him last week on his last visit to
> Oregon but we'll keep talking.

Excellent. I'm also happy to answer questions on linux-pci about this.
And willy will be back in Oregon, I'm sure. :)
You can also visit him in Ottawa if you register for OLS this year.

> > At LSF, I suggested using the IOAT DMA engine (if available) do real DMA.
> > Running the IO requests through the IOMMU would expose the added CPU cost
> > of the IOMMU in it's worst case. This doesn't need to go to kernel.org
> > but might help you isolate perf bottlenecks in this iommu code.
> 
> Oh the worst cases are easy.  Just run a 10gig NIC and see what you get.

Well, you still need a consistent benchmark/workload (e.g. pktgen) and
a precise tool to measure the overhead (e.g. oprofile or perfmon2).

Using the IOAT would allow people without 10G NICs to mess around with this.

> With a 1Gig NIC you need to drop the packet size down to 32 bytes or
> something tiny like that to really notice it.  16K packets
> have only a smallish overhead that are iffy as benchmarks because the
> CPU utilization is too low (<25%) to be a good measure.

*nod*

> FWIW : throughput isn't effected so much as the CPU overhead.
> iommu=strict: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 25% cpu 
> with this patch: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 18% cpu 
> IOMMU=OFF: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 11% cpu 

Understood. That's why netperf (see netperf.org) measures "service demand".
Taking CPU away from user space generally results in lower benchmark/app perf.

thanks,
grant

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing  Re: [PATCH]iommu-iotlb-flushing
  2008-03-05 18:23         ` [PATCH]iommu-iotlb-flushing Grant Grundler
@ 2008-03-05 23:01           ` mark gross
  2008-03-08 17:20             ` Grant Grundler
  0 siblings, 1 reply; 11+ messages in thread
From: mark gross @ 2008-03-05 23:01 UTC (permalink / raw)
  To: Grant Grundler; +Cc: Andrew Morton, greg, lkml, linux-pci

On Wed, Mar 05, 2008 at 11:23:15AM -0700, Grant Grundler wrote:
> On Mon, Mar 03, 2008 at 10:34:11AM -0800, mark gross wrote:
> > The intel DMAR engines with the TLB's do have a measurable overhead
> > flushing TLB entries on every unmapsingle.  This hardware forces a bit
> > poll on TLB flush completion that adds up on high rates of unmap
> > operations.  (about 15% on my netperf small packet 32 byte UDP stream
> > over a 1GigE adapter)
> 
> So if we flushed the IOTLB every 16 unmap calls, it should be < 1% penalty?
> 
> Using that logic is how I decided the current calue of DELAYED_RESOURCE_CNT
> on parisc.
> 
> > FWIW: I think IOMMU use for avoiding bounce buffers is a thing of the
> > past.
> 
> Agreed. PA-RISC is a legacy platform at this point.
> 
> > (at least I have a "show me where it helps" point of view for
> > modern hardware.  what new hardware needs bounce buffers anymore?)
> 
> Otherwise, Anything running in a legacy IDE mode is using 32-bit DMA.
> So linux  still has plenty of SATA drivers using 32-bit DMA.
> 
> All the NICs, FC, Infiniband, et al are PCI-e and thus by definition
> 64-bit DMA capable.
> 
> > Also, to be fair to the Intel hardware the performance overhead is
> > really only noticeable on small packed and 10Gig benchmarking.  Most
> > users will not be able to notice the overhead.
> 
> *nod* - I know. That why I use pktgen to measure the dma map/unmap overhead.
> Note that with solid state storage (should be more visible in the next year
> or so), the transaction rate is going to look alot more like a NIC than
> the traditional HBA storage controller. So map/unmap performance will
> matter for those configurations too.

Sweet! Now I have an excuse to get one of those spiffy solid-state disks!

> 
> 
> > > One can do a few things to limit how much the protections
> > > are weakened:
> > > 1) Invalidate the IO TLB entry and IO Pdir entries (just don't force
> > >   syncronization). At least this was possible on the IOMMU's
> > >   I'm familiar with.
> > 
> > The default is the TLB invalidation. It has a nasty polling loop that
> > hurts on the small packet netperf benchmark.
> > 
> > > 
> > > 2) release the IO Pdir entry for re-use once syncronization has been forced.
> > >    Syncronization/flushing of the IO TLB shoot-down is the most expensive
> > >    ops AFAICT on the IOMMU's I've worked with.
> > 
> > this is what the patch does.
> 
> Ok - I wasn't sure which step was the "syncronize step".
> 
> BTW, I hope you (and others from Intel - go willy! ;) can give feedback
> to the Intel/AMD chipset designers to ditch this design ASAP.
> It clearly sucks.
> 

The HW implementation IS evolving.  Especially as the MCH and more of
the chipsets are moved into the CPU package.  It will get better over
time, but the protection will never be "free".

> > > I just looked at the implementation of flush_unmaps().
> > > I strongly reccomend comparing this implementation with the
> > > DELAYED_RESOURCE_CNT since this looks like it will need 2X or more
> > > of the CPU cycles to execute for each entry. Use of "list_del()"
> > > is substantially more expensive than a simple linear arrary.
> > > (Fewer entries per cacheline by 4X maybe?)
> > >
> > 
> > /me looks, an array of pointers to iova's to clean up makes good sense.
> > I don't know why I didn't think of this approach first.  Working with
> > lists is messy and if I can avoid it I usually try.
> 
> *shrug* I didn't think too much about it on the original implementation
> and it happened to work out nicely. Think cacheline "life cycle" when
> picking array sizes (or when comparing to linked lists) and you'll
> usually end up with the best performing design.
> 
> >  
> > > Another problem is every now and then some IO is going to burn a bunch
> > > of CPU cycles and introduce inconsistent performance for particular
> > > unmap operations. One has to tradeoff amortizing the cost of the
> > > IOMMU flushing with the number of "unusable" IO Pdir entries and more
> > > consistent CPU utilization for each unmap() call.  My gut feeling
> > > is 250 is rather high for high_watermark unless flushing the IO TLB
> > > is extremely expensive.
> > > 
> > 
> > the 250 high water mark came from 1gig small packet netperf udp
> > streaming testing.  The 1000 I initially used didn't perform quite as
> > well.  Nothing scientific went into this number just some tests cases.
> 
> Well, it can be more scientific. Determine the overhead (with vs without).
> Then divide by how many times one can afford to ignore the sync op.
> That will be (roughly) be the new overhead.
> 
> > > ...
> > > > > boot-time options suck.  Is it not possible to tweak this at runtime?
> > > > 
> > > > Yes, I could easily create a sysfs or debugfs mechanism for turning it
> > > > on / off at run time.  I would like input on the preferred way to do this.
> > > > sysfs or debugfs?
> > > 
> > > I prefer a compile time constant since we are talking about fixed costs
> > > for this implementation.  The compiler can do a better job with a constant.
> > > I know this sounds nit picky but if I can't sufficiently emphasized how
> > > perf critical DMA map/unmap code is.
> > 
> > Compile time options lock out OSV's from using the feature.
> 
> Huh? I think you misunderstood. Decide how _many_ you want to defer
> at compile time and leave the general feature enabled as a runtime option.
> (e.g. use_vtd=1 boot param or when booting with Xen or KVM enabled).
> 
> > These IOMMU's are in desktop systems now.  My job is to make it not
> > suck so OSV's will leave DMAR enabled in production kernels.
> > (well at least try to anyway.)
> 
> Ok - my suggestion still applies. "compile time constant" only refers
> to the number of unmaps the code will defer. e.g. "borrow" DELAY_RESOURCE_CNT.
> I understand the DMAR needs to be enabled at runtime for production.
> (I've spent ~5 years dealing with RH/SuSE/Debian IA64-linux kernels...)
> 
> If you can reduce the overhead to < 1% for pktgen, TPC-C won't
> notice it and I doubt specweb will either.

Sadly, only a fraction of the overhead is due to the IOTLB
flushes.  I can't wave away the IOVA management overhead with batched
flushes of the IOTLB.

> 
> > > 
> > > If it has to be a variable, I prefer sysfs but don't have a good reason
> > > for that.
> > > 
> > > > Signed-off-by: <mgross@linux.intel.com>
> > > 
> > > Ah! Maybe you can get together with Matthew Wilcox (also) and consider
> > > how the IOMMU code might be included in the scsi_ram driver he wrote.
> > > Or maybe just directly use the ram disk (rd.c) instead.
> > 
> > Yup, Willi has just started to look over my shoulder on some of this.
> > He pointed me at some OLS papers and whatnot to look at.  I botched my
> > chance to work face to face with him last week on his last visit to
> > Oregon but we'll keep talking.
> 
> Excellent. I'm also happy to answer questions on linux-pci about this.
> And willy will be back in oregon I'm sure. :)
> You can also visit him in Ottawa if you register for OLS this year.
> 

I'm not going to OLS this year :(  The travel budget is severely restricted
because of overspending last year.

> > > At LSF, I suggested using the IOAT DMA engine (if available) do real DMA.
> > > Running the IO requests through the IOMMU would expose the added CPU cost
> > > of the IOMMU in it's worst case. This doesn't need to go to kernel.org
> > > but might help you isolate perf bottlenecks in this iommu code.
> > 
> > Oh the worst cases are easy.  Just run a 10gig NIC and see what you get.
> 
> Well, you still need a consistent benchmark/workload (e.g. pktgen) and
> a precise tool to measure the overhead (e.g. oprofile or perfmon2).
> 

I've been using oprofile, and some TSC counters I have in an out-of-tree
patch for instrumenting the code and dumping the cycles per
code-path-of-interest.  It's been pretty effective, but it affects the
throughput of the run.

> Using the IOAT would allow people without 10G NICs to mess around with this.
> 

I'll have to check this out.

> > With a 1Gig NIC you need to drop the packet size down to 32 bytes or
> > something tiny like that to really notice it.  16K packets
> > have only a smallish overhead that are iffy as benchmarks because the
> > CPU utilization is too low (<25%) to be a good measure.
> 
> *nod*
> 
> > FWIW : throughput isn't effected so much as the CPU overhead.
> > iommu=strict: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 25% cpu 
> > with this patch: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 18% cpu 
> > IOMMU=OFF: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 11% cpu 
> 
> Understood. That's why netperf (see netperf.org) measures "service demand".
> Taking CPU away from user space generally results in lower benchmark/app perf.
>

The following patch is an update to use an array instead of a list of
IOVAs in the implementation of deferred IOTLB flushes.  It takes
inspiration from sba_iommu.c.

I like this implementation better as it encapsulates the batch process
within intel-iommu.c, and no longer touches iova.h (which is shared).

Performance data:  Netperf 32byte UDP streaming
2.6.25-rc3-mm1:
IOMMU-strict : 58Mps @ 62% cpu
NO-IOMMU : 71Mbs @ 41% cpu
List-based IOMMU-default-batched-IOTLB flush: 66Mbps @ 57% cpu

with this patch:
IOMMU-strict : 73Mps @ 75% cpu
NO-IOMMU : 74Mbs @ 42% cpu
Array-based IOMMU-default-batched-IOTLB flush: 72Mbps @ 62% cpu

--mgross


Signed-off-by: <mgross@linux.intel.com>


Index: linux-2.6.25-rc3-mm1/drivers/pci/intel-iommu.c
===================================================================
--- linux-2.6.25-rc3-mm1.orig/drivers/pci/intel-iommu.c	2008-03-05 11:34:41.000000000 -0800
+++ linux-2.6.25-rc3-mm1/drivers/pci/intel-iommu.c	2008-03-05 12:50:21.000000000 -0800
@@ -59,8 +59,17 @@
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 
 static struct intel_iommu *g_iommus;
+
+#define HIGH_WATER_MARK 250
+struct deferred_flush_tables {
+	int next;
+	struct iova *iova[HIGH_WATER_MARK];
+	struct dmar_domain *domain[HIGH_WATER_MARK];
+};
+
+static struct deferred_flush_tables *deferred_flush;
+
 /* bitmap for indexing intel_iommus */
-static unsigned long 	*g_iommus_to_flush;
 static int g_num_of_iommus;
 
 static DEFINE_SPINLOCK(async_umap_flush_lock);
@@ -68,10 +77,6 @@
 
 static int timer_on;
 static long list_size;
-static int high_watermark;
-
-static struct dentry *intel_iommu_debug, *debug;
-
 
 static void domain_remove_dev_info(struct dmar_domain *domain);
 
@@ -1692,7 +1697,7 @@
 	struct dmar_rmrr_unit *rmrr;
 	struct pci_dev *pdev;
 	struct intel_iommu *iommu;
-	int nlongs, i, ret, unit = 0;
+	int i, ret, unit = 0;
 
 	/*
 	 * for each drhd
@@ -1711,17 +1716,16 @@
 		 */
 	}
 
-	nlongs = BITS_TO_LONGS(g_num_of_iommus);
-	g_iommus_to_flush = kzalloc(nlongs * sizeof(unsigned long), GFP_KERNEL);
-	if (!g_iommus_to_flush) {
-		printk(KERN_ERR "Intel-IOMMU: "
-			"Allocating bitmap array failed\n");
-		return -ENOMEM;
-	}
-
 	g_iommus = kzalloc(g_num_of_iommus * sizeof(*iommu), GFP_KERNEL);
 	if (!g_iommus) {
-		kfree(g_iommus_to_flush);
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	deferred_flush = kzalloc(g_num_of_iommus *
+		sizeof(struct deferred_flush_tables), GFP_KERNEL);
+	if (!deferred_flush) {
+		kfree(g_iommus);
 		ret = -ENOMEM;
 		goto error;
 	}
@@ -1970,42 +1974,48 @@
 
 static void flush_unmaps(void)
 {
-	struct iova *node, *n;
-	unsigned long flags;
-	int i;
+	int i, j;
 
-	spin_lock_irqsave(&async_umap_flush_lock, flags);
 	timer_on = 0;
 
 	/* just flush them all */
 	for (i = 0; i < g_num_of_iommus; i++) {
-		if (test_and_clear_bit(i, g_iommus_to_flush))
+		if (deferred_flush[i].next) {
 			iommu_flush_iotlb_global(&g_iommus[i], 0);
+			for (j = 0; j < deferred_flush[i].next; j++) {
+				__free_iova(&deferred_flush[i].domain[j]->iovad,
+						deferred_flush[i].iova[j]);
+			}
+			deferred_flush[i].next = 0;
+		}
 	}
 
-	list_for_each_entry_safe(node, n, &unmaps_to_do, list) {
-		/* free iova */
-		list_del(&node->list);
-		__free_iova(&((struct dmar_domain *)node->dmar)->iovad, node);
-
-	}
 	list_size = 0;
-	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
 static void flush_unmaps_timeout(unsigned long data)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&async_umap_flush_lock, flags);
 	flush_unmaps();
+	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
 {
 	unsigned long flags;
+	int next, iommu_id;
 
 	spin_lock_irqsave(&async_umap_flush_lock, flags);
-	iova->dmar = dom;
-	list_add(&iova->list, &unmaps_to_do);
-	set_bit((dom->iommu - g_iommus), g_iommus_to_flush);
+	if (list_size == HIGH_WATER_MARK)
+		flush_unmaps();
+
+	iommu_id = dom->iommu - g_iommus;
+	next = deferred_flush[iommu_id].next;
+	deferred_flush[iommu_id].domain[next] = dom;
+	deferred_flush[iommu_id].iova[next] = iova;
+	deferred_flush[iommu_id].next++;
 
 	if (!timer_on) {
 		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
@@ -2054,8 +2064,6 @@
 		 * queue up the release of the unmap to save the 1/6th of the
 		 * cpu used up by the iotlb flush operation...
 		 */
-		if (list_size > high_watermark)
-			flush_unmaps();
 	}
 }
 
@@ -2380,10 +2388,6 @@
 	if (dmar_table_init())
 		return 	-ENODEV;
 
-	high_watermark = 250;
-	intel_iommu_debug = debugfs_create_dir("intel_iommu", NULL);
-	debug = debugfs_create_u32("high_watermark", S_IWUGO | S_IRUGO,
-					intel_iommu_debug, &high_watermark);
 	iommu_init_mempool();
 	dmar_init_reserved_ranges();
 
Index: linux-2.6.25-rc3-mm1/drivers/pci/iova.h
===================================================================
--- linux-2.6.25-rc3-mm1.orig/drivers/pci/iova.h	2008-03-05 12:49:12.000000000 -0800
+++ linux-2.6.25-rc3-mm1/drivers/pci/iova.h	2008-03-05 12:49:25.000000000 -0800
@@ -24,8 +24,6 @@
 	struct rb_node	node;
 	unsigned long	pfn_hi; /* IOMMU dish out addr hi */
 	unsigned long	pfn_lo; /* IOMMU dish out addr lo */
-	struct list_head list;
-	void *dmar;
 };
 
 /* holds all the iova translations for a domain */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing  Re: [PATCH]iommu-iotlb-flushing
  2008-03-05 23:01           ` [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing [PATCH]iommu-iotlb-flushing mark gross
@ 2008-03-08 17:20             ` Grant Grundler
  0 siblings, 0 replies; 11+ messages in thread
From: Grant Grundler @ 2008-03-08 17:20 UTC (permalink / raw)
  To: mark gross; +Cc: Grant Grundler, Andrew Morton, greg, lkml, linux-pci

On Wed, Mar 05, 2008 at 03:01:57PM -0800, mark gross wrote:
...
> > *nod* - I know. That why I use pktgen to measure the dma map/unmap overhead.
> > Note that with solid state storage (should be more visible in the next year
> > or so), the transaction rate is going to look alot more like a NIC than
> > the traditional HBA storage controller. So map/unmap performance will
> > matter for those configurations too.
> 
> Sweet! now I have an excuse to get one of those spiffy SD-Disks!  

*chuckle* Glad I could be of assistance :P

...
> > Ok - I wasn't sure which step was the "syncronize step".
> > 
> > BTW, I hope you (and others from Intel - go willy! ;) can give feedback
> > to the Intel/AMD chipset designers to ditch this design ASAP.
> > It clearly sucks.
> > 
> 
> The HW implementation IS evolving.  Especially as the MCH and more of
> the chipsets are moved into the CPU package.  It will get better over
> time, but the protection will never be "free".

Agreed. But IO TLB shoot-down/invalidate could be a lot cheaper. Intel knows
how to do it for the CPU MMU (I hope they do, at least, given the IA64
experience). An IO MMU should be no different (well, not too much different). An IOMMU is
like the CPU MMU except it's shared by many IO devices instead of
"one per CPU".

> > If you can reduce the overhead to < 1% for pktgen, TPC-C won't
> > notice it and I doubt specweb will either.
> 
> Sadly only a fraction of the overhead is due to the IOTLB flushes.
> I can't wave away the IOVA management overhead with batched
> flushes of the IOTLB.

Right. IOVA management is CPU intensive.  But stalling on IO TLB flush
synchronization is a major part, an easy target, and should be reduced.
Taking advantage of "warm cache" and other "normal" coding methods
will help minimize the CPU overhead.

....
> I've been using oprofile, and some TSC counters I have in an out-of tree
> patch for instrumenting the code and dumping the cycles per
> code-path-of-interest.  Its been pretty effective, but it affects the
> throughput of the run.

I removed stats from the parisc IOMMU code exactly for that reason.
It was useful for evaluating details of specific algorithms (comparative)
but not for runtime benchmarking. I suggest removing that code.

> > > FWIW : throughput isn't effected so much as the CPU overhead.
> > > iommu=strict: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 25% cpu 
> > > with this patch: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 18% cpu 
> > > IOMMU=OFF: 16K UDP UNIDIRECTIONAL SEND TEST 826Mbps at 11% cpu 
> > 
> > Understood. That's why netperf (see netperf.org) measures "service demand".
> > Taking CPU away from user space generally results in lower benchmark/app perf.
> >
> 
> The following patch is an update to use an array instead of a list of
> IOVA's in the implementation of defered iotlb flushes.  It takes
> inspiration from sba_iommu.c
> 
> I like this implementation better as it encapsulates the batch process
> within intel-iommu.c, and no longer touches iova.h (which is shared)
> 
> Performance data:  Netperf 32byte UDP streaming
> 2.6.25-rc3-mm1:
> IOMMU-strict : 58Mps @ 62% cpu
> NO-IOMMU : 71Mbs @ 41% cpu
> List-based IOMMU-default-batched-IOTLB flush: 66Mbps @ 57% cpu
> 
> with this patch:
> IOMMU-strict : 73Mps @ 75% cpu
> NO-IOMMU : 74Mbs @ 42% cpu
> Array-based IOMMU-default-batched-IOTLB flush: 72Mbps @ 62% cpu

Nice! :)
66/57 == 1.15
72/62 == 1.16
~10% higher throughput with essentially no change in service demand.

But I'm wondering why IOMMU-strict gets better throughput. Something
else is going on here. I suspect better CPU cache utilization and
perhaps lowering the high water mark to 32 would be a test to prove that.

BTW, can you clarify what the units are?
I see "Mps", "Mbs", and "Mbps". Ideally we'd be using
a single unit of measure to compare. "Mpps" would be my preferred one
(Million packets per second) for small, fixed sized packets.

Ditch the "debug" code (stats pr0n) and I'll bet this will go up
a few more percentage points and reduce the service demand.

cheers,
grant

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-03-08 17:21 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-21  0:06 [PATCH]iommu-iotlb-flushing mark gross
2008-02-23  8:05 ` [PATCH]iommu-iotlb-flushing Andrew Morton
2008-02-25 16:28   ` [PATCH]iommu-iotlb-flushing mark gross
2008-02-25 18:40     ` [PATCH]iommu-iotlb-flushing Andrew Morton
2008-02-29 23:18   ` [PATCH]iommu-iotlb-flushing mark gross
2008-03-01  5:54     ` [PATCH]iommu-iotlb-flushing Greg KH
2008-03-01  7:10     ` [PATCH]iommu-iotlb-flushing Grant Grundler
2008-03-03 18:34       ` [PATCH]iommu-iotlb-flushing mark gross
2008-03-05 18:23         ` [PATCH]iommu-iotlb-flushing Grant Grundler
2008-03-05 23:01           ` [PATCH] Use an array instead of a list for deffered intel-iommu iotlb flushing [PATCH]iommu-iotlb-flushing mark gross
2008-03-08 17:20             ` Grant Grundler
