* [PATCH] Enable K8 GART as an IOMMU
@ 2007-02-12 17:47 Langsdorf, Mark
From: Langsdorf, Mark @ 2007-02-12 17:47 UTC
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 739 bytes --]

AMD Opteron and Athlon 64 processors have an AGP
aperture and GART built into the processor.  Native
Linux uses the AGP GART as an IOMMU to improve the
performance of 32-bit-only PCI devices that DMA into
physical memory above 0xffffffff.  This patch provides
a similar capability for Xen dom0.

Most of the code simply migrates the native Linux
aperture.c and pci-gart.c to dom0.  A new MMU
extended op (MMUEXT_UNMAP_REGION) is added to clear
the aperture mapping from the hypervisor's page
tables, which is necessary to prevent cache alias
issues resulting from processor speculation.
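
For reference, the dom0 side issues the new op as shown
in the sketch below (simplified from the gart_iommu_init()
change in pci-gart-xen.c; the check that the aperture was
actually allocated from RAM is omitted here):

    /* Drop dom0's own mapping of the remapped region, then ask
     * Xen to drop its alias as well, so speculative prefetches
     * cannot allocate stale cache lines for the GART area. */
    struct mmuext_op op = {
        .cmd          = MMUEXT_UNMAP_REGION,
        .arg1.mfn     = iommu_bus_base >> PAGE_SHIFT,
        .arg2.nr_ents = iommu_size >> PAGE_SHIFT
    };

    clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base),
                         iommu_size);
    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF));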

My thanks to Ulrich Meis [um@amd64.org], who was 
instrumental in developing and testing this patch. 

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>

[-- Attachment #2: xen-gart.patch --]
[-- Type: application/octet-stream, Size: 72392 bytes --]

# HG changeset patch
# User root@iwillnow.amd.com
# Node ID 502810b11c7972aabebd8e898e45f63189a493e5
# Parent  d39e8c44da34ac257b26cce29122852ef3a930cc
Add support for the Opteron AGP GART/aperture as a simplified IOMMU

diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 16:32:04 2007 -0600
@@ -252,7 +252,7 @@ static void contiguous_bitmap_clear(
 }
 
 /* Protected by balloon_lock. */
-#define MAX_CONTIG_ORDER 9 /* 2MB */
+#define MAX_CONTIG_ORDER 16 /* 256MB */
 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
 static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
 
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/Kconfig
--- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Fri Feb 09 16:32:04 2007 -0600
@@ -434,7 +434,7 @@ config IOMMU
 	default y
 	select SWIOTLB
 	select AGP
-	depends on PCI && !X86_64_XEN
+	depends on PCI
 	help
 	  Support for full DMA access of devices with 32bit memory access only
 	  on systems with more than 3GB. This is usually needed for USB,
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Fri Feb 09 16:32:04 2007 -0600
@@ -59,7 +59,7 @@ alternative-y			+= ../../i386/kernel/alt
 
 ifdef CONFIG_XEN
 time-y				+= ../../i386/kernel/time-xen.o
-pci-dma-y			+= ../../i386/kernel/pci-dma-xen.o
+pci-dma-y			+= pci-dma-xen.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  := ../../i386/kernel/microcode-xen.o
 intel_cacheinfo-y		:= ../../i386/kernel/cpu/intel_cacheinfo-xen.o
 quirks-y			:= ../../i386/kernel/quirks-xen.o
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -97,7 +97,6 @@ static inline int bad_addr(unsigned long
 	return 0;
 } 
 
-#ifndef CONFIG_XEN
 /*
  * This function checks if any part of the range <start,end> is mapped
  * with type.
@@ -116,7 +115,6 @@ e820_any_mapped(unsigned long start, uns
 	} 
 	return 0;
 }
-#endif
 
 /*
  * This function checks if the entire range <start,end> is mapped with type.
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -369,7 +369,7 @@ void __init check_ioapic(void)
 				switch (vendor) { 
 				case PCI_VENDOR_ID_VIA:
 #ifdef CONFIG_IOMMU
-					if ((end_pfn > MAX_DMA32_PFN ||
+					if ((global_need_iommu() ||
 					     force_iommu) &&
 					    !iommu_aperture_allowed) {
 						printk(KERN_INFO
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-swiotlb-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -8,13 +8,10 @@
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 
-#if 0
 int swiotlb __read_mostly;
 EXPORT_SYMBOL(swiotlb);
-#endif
 
 struct dma_mapping_ops swiotlb_dma_ops = {
-#if 0
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
@@ -29,14 +26,12 @@ struct dma_mapping_ops swiotlb_dma_ops =
 	.map_sg = swiotlb_map_sg,
 	.unmap_sg = swiotlb_unmap_sg,
 	.dma_supported = NULL,
-#endif
 };
 
 void pci_swiotlb_init(void)
 {
-#if 0
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
-	if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
+	if (!iommu_detected && !no_iommu)
 	       swiotlb = 1;
 	if (swiotlb_force)
 		swiotlb = 1;
@@ -45,11 +40,4 @@ void pci_swiotlb_init(void)
 		swiotlb_init();
 		dma_ops = &swiotlb_dma_ops;
 	}
-#else
-	swiotlb_init();
-	if (swiotlb) {
-		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
-		dma_ops = &swiotlb_dma_ops;
-	}
-#endif
 }
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -844,6 +844,7 @@ void __init paging_init(void)
 }
 #endif
 
+#ifndef CONFIG_XEN
 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
    from the CPU leading to inconsistent cache lines. address and size
    must be aligned to 2MB boundaries. 
@@ -877,6 +878,39 @@ void __init clear_kernel_mapping(unsigne
 	}
 	__flush_tlb_all();
 } 
+#else
+/* In Xen the direct mapping doesn't use super pages. */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+{
+	unsigned long end = address + size;
+
+	BUG_ON(address & ~PAGE_MASK);
+	BUG_ON(size & ~PAGE_MASK); 
+
+	for (; address < end; address += PAGE_SIZE) { 
+		pgd_t *pgd = pgd_offset_k(address);
+		pud_t *pud;
+		pmd_t *pmd;
+		pte_t *pte;
+
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud))
+			continue; 
+		pmd = pmd_offset(pud, address);
+		if (!pmd || pmd_none(*pmd))
+			continue; 
+		pte = pte_offset_map(pmd, address);
+		if (!pte || pte_none(*pte))
+			continue;
+
+		pte_clear(&init_mm,address,pte);
+
+	}
+	__flush_tlb_all();
+} 
+#endif
 
 /*
  * Memory hotplug specific functions
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 16:32:04 2007 -0600
@@ -62,7 +62,12 @@ static inline int valid_dma_direction(in
 		(dma_direction == DMA_FROM_DEVICE));
 }
 
-#if 0
+#ifdef CONFIG_XEN
+#define global_need_iommu() 1
+#else
+#define global_need_iommu() (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL)>MAX_DMA32_PFN)
+#endif
+
 static inline int dma_mapping_error(dma_addr_t dma_addr)
 {
 	if (dma_ops->mapping_error)
@@ -200,8 +205,5 @@ dma_cache_sync(void *vaddr, size_t size,
 
 extern struct device fallback_dev;
 extern int panic_on_overflow;
-#endif
 
 #endif /* _X8664_DMA_MAPPING_H */
-
-#include <asm-i386/mach-xen/asm/dma-mapping.h>
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/Makefile
--- a/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 10:48:41 2007 +0000
+++ b/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 16:32:04 2007 -0600
@@ -51,8 +51,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_SMP) += percpu_counter.o
 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 
-obj-$(CONFIG_SWIOTLB) += swiotlb.o
-swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
+obj-$(CONFIG_SWIOTLB) += swiotlb-xen.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff -r d39e8c44da34 -r 502810b11c79 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Fri Feb 09 10:48:41 2007 +0000
+++ b/xen/arch/x86/mm.c	Fri Feb 09 16:32:04 2007 -0600
@@ -2094,6 +2094,12 @@ int do_mmuext_op(
                 }
             }
             break;
+
+        case MMUEXT_UNMAP_REGION:
+            map_pages_to_xen(
+                PAGE_OFFSET + (op.arg1.mfn << PAGE_SHIFT),
+                op.arg1.mfn, op.arg2.nr_ents, PAGE_HYPERVISOR & ~_PAGE_PRESENT);
+            break;
 #endif
         
         case MMUEXT_TLB_FLUSH_LOCAL:
diff -r d39e8c44da34 -r 502810b11c79 xen/include/public/xen.h
--- a/xen/include/public/xen.h	Fri Feb 09 10:48:41 2007 +0000
+++ b/xen/include/public/xen.h	Fri Feb 09 16:32:04 2007 -0600
@@ -188,6 +188,10 @@
  * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
  * mfn: Machine frame number of new page-table base to install in MMU
  *      when in user space.
+ *
+ * cmd: MMUEXT_UNMAP_REGION [x86/64 only]
+ * mfn: First machine frame number to unmap from the hypervisor's virtual memory.
+ * nr_ents: Number of mfns to unmap.
  * 
  * cmd: MMUEXT_TLB_FLUSH_LOCAL
  * No additional arguments. Flushes local TLB.
@@ -230,6 +234,7 @@
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_UNMAP_REGION     16
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/aperture-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,284 @@
+/* 
+ * Firmware replacement code.
+ * 
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge. 
+ * If all fails map the aperture over some low memory.  This is cheaper than 
+ * doing bounce buffering. The memory is lost. This is done at early boot 
+ * because only the bootmem allocator can allocate 32+MB. 
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0; 
+
+int fix_aperture __initdata = 1;
+
+/* This code runs before the PCI subsystem is initialized, so just
+   access the northbridge directly. */
+
+static u32 __init allocate_aperture(void) 
+{
+	pg_data_t *nd0 = NODE_DATA(0);
+	u32 aper_size;
+	void *p; 
+
+	if (fallback_aper_order > 7) 
+		fallback_aper_order = 7; 
+	aper_size = (32 * 1024 * 1024) << fallback_aper_order; 
+
+	/* 
+	 * Aperture has to be naturally aligned. This means a 2GB aperture won't
+	 * have much chance of finding a place in the lower 4GB of memory.
+	 * Unfortunately we cannot move it up because that would make the
+	 * IOMMU useless.
+	 */
+	p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); 
+#ifndef CONFIG_XEN
+	if (!p || __pa(p)+aper_size > 0xffffffff) {
+#else
+	if (!p || (xen_create_contiguous_region((unsigned long)p, get_order(aper_size), 31) != 0)||virt_to_bus(p)+aper_size > 0xffffffff) {
+#endif
+		printk("Cannot allocate aperture memory hole (%p,%uK)\n",
+		       p, aper_size>>10);
+		if (p)
+			free_bootmem_node(nd0, __pa(p), aper_size); 
+		return 0;
+	}
+	printk("Mapping aperture over %d KB of RAM @ %lx\n",
+	       aper_size >> 10, virt_to_bus(p)); 
+	return (u32)virt_to_bus(p); 
+}
+
+static int __init aperture_valid(u64 aper_base, u32 aper_size)
+{ 
+	if (!aper_base) 
+		return 0;
+	if (aper_size < 64*1024*1024) { 
+		printk("Aperture too small (%d MB)\n", aper_size>>20);
+		return 0;
+	}
+	if (aper_base + aper_size >= 0xffffffff) { 
+		printk("Aperture beyond 4GB. Ignoring.\n");
+		return 0; 
+	}
+	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+		printk("Aperture pointing to e820 RAM. Ignoring.\n");
+		return 0; 
+	} 
+	return 1;
+} 
+
+/* Find a PCI capability */
+static __u32 __init find_cap(int num, int slot, int func, int cap) 
+{ 
+	u8 pos;
+	int bytes;
+	if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
+		return 0;
+	pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
+	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 
+		u8 id;
+		pos &= ~3; 
+		id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+		if (id == 0xff)
+			break;
+		if (id == cap) 
+			return pos; 
+		pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 
+	} 
+	return 0;
+} 
+
+/* Read a standard AGPv3 bridge header */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+{ 
+	u32 apsize;
+	u32 apsizereg;
+	int nbits;
+	u32 aper_low, aper_hi;
+	u64 aper;
+
+	printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+	apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+	if (apsizereg == 0xffffffff) {
+		printk("APSIZE in AGP bridge unreadable\n");
+		return 0;
+	}
+
+	apsize = apsizereg & 0xfff;
+	/* Some BIOS use weird encodings not in the AGPv3 table. */
+	if (apsize & 0xff) 
+		apsize |= 0xf00; 
+	nbits = hweight16(apsize);
+	*order = 7 - nbits;
+	if ((int)*order < 0) /* < 32MB */
+		*order = 0;
+	
+	aper_low = read_pci_config(num,slot,func, 0x10);
+	aper_hi = read_pci_config(num,slot,func,0x14);
+	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+	printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 
+	       aper, 32 << *order, apsizereg);
+
+	if (!aperture_valid(aper, (32*1024*1024) << *order))
+	    return 0;
+	return (u32)aper; 
+} 
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+   AGP bridge and some BIOSes forget to initialize the Northbridge too.
+   Work around this here.
+
+   Do a PCI bus scan by hand because we're running before the PCI
+   subsystem.
+
+   All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+   generically. It's probably overkill to always scan all slots because
+   the AGP bridges should always be on their own bus in the HT hierarchy,
+   but do it here for future safety. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+	int num, slot, func;
+
+	/* Poor man's PCI discovery */
+	for (num = 0; num < 256; num++) { 
+		for (slot = 0; slot < 32; slot++) { 
+			for (func = 0; func < 8; func++) { 
+				u32 class, cap;
+				u8 type;
+				class = read_pci_config(num,slot,func,
+							PCI_CLASS_REVISION);
+				if (class == 0xffffffff)
+					break; 
+				
+				switch (class >> 16) { 
+				case PCI_CLASS_BRIDGE_HOST:
+				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+					/* AGP bridge? */
+					cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+					if (!cap)
+						break;
+					*valid_agp = 1; 
+					return read_agp(num,slot,func,cap,order);
+				} 
+				
+				/* No multi-function device? */
+				type = read_pci_config_byte(num,slot,func,
+							       PCI_HEADER_TYPE);
+				if (!(type & 0x80))
+					break;
+			} 
+		} 
+	}
+	printk("No AGP bridge found\n"); 
+	return 0;
+}
+
+void __init iommu_hole_init(void) 
+{ 
+	int fix, num; 
+	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+	u64 aper_base, last_aper_base = 0;
+	int valid_agp = 0;
+
+	if (iommu_aperture_disabled || !fix_aperture)
+		return;
+
+	printk("Checking aperture...\n"); 
+
+	fix = 0;
+	for (num = 24; num < 32; num++) {		
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+			continue;
+
+		iommu_detected = 1;
+		iommu_aperture = 1; 
+
+		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 
+		aper_size = (32 * 1024 * 1024) << aper_order; 
+		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+		aper_base <<= 25; 
+
+		printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, 
+		       aper_base, aper_size>>20);
+		
+		if (!aperture_valid(aper_base, aper_size)) {
+			fix = 1; 
+			break; 
+		}
+
+		if ((last_aper_order && aper_order != last_aper_order) ||
+		    (last_aper_base && aper_base != last_aper_base)) {
+			fix = 1;
+			break;
+		}
+		last_aper_order = aper_order;
+		last_aper_base = aper_base;
+	} 
+
+	if (!fix && !fallback_aper_force) 
+		return; 
+
+	if (!fallback_aper_force)
+		aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 
+		
+	if (aper_alloc) { 
+		/* Got the aperture from the AGP bridge */
+	} else if (swiotlb && !valid_agp) {
+		/* Do nothing */
+	} else if ((!no_iommu && global_need_iommu()) ||
+		   force_iommu ||
+		   valid_agp ||
+		   fallback_aper_force) { 
+		printk("Your BIOS doesn't leave an aperture memory hole\n");
+		printk("Please enable the IOMMU option in the BIOS setup\n");
+		printk("This costs you %d MB of RAM\n",
+		       32 << fallback_aper_order);
+
+		aper_order = fallback_aper_order;
+		aper_alloc = allocate_aperture();
+		if (!aper_alloc) { 
+			/* Could disable AGP and IOMMU here, but it's probably
+			   not worth it. But the later users cannot deal with
+			   bad apertures and turning on the aperture over memory
+			   causes very strange problems, so it's better to 
+			   panic early. */
+			panic("Not enough memory for aperture");
+		}
+	} else { 
+		return; 
+	} 
+
+	/* Fix up the north bridges */
+	for (num = 24; num < 32; num++) { 		
+		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+			continue;	
+
+		/* Don't enable translation yet. That is done later. 
+		   Assume this BIOS didn't initialise the GART so 
+		   just overwrite all previous bits */ 
+		write_pci_config(0, num, 3, 0x90, aper_order<<1); 
+		write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 
+	} 
+} 
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-dma-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,328 @@
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/calgary.h>
+#include <xen/balloon.h>
+#include <asm/tlbflush.h>
+
+int iommu_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_merge);
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+int iommu_sac_force __read_mostly = 0;
+EXPORT_SYMBOL(iommu_sac_force);
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly= 0;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+   be probably a smaller DMA mask, but this is bug-to-bug compatible
+   to i386. */
+struct device fallback_dev = {
+	.bus_id = "fallback device",
+	.coherent_dma_mask = DMA_32BIT_MASK,
+	.dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+/* Allocate DMA memory on node near device */
+noinline static void *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
+{
+	struct page *page;
+	int node;
+#ifdef CONFIG_PCI
+	if (dev->bus == &pci_bus_type)
+		node = pcibus_to_node(to_pci_dev(dev)->bus);
+	else
+#endif
+		node = numa_node_id();
+
+	if (node < first_node(node_online_map))
+		node = first_node(node_online_map);
+
+	page = alloc_pages_node(node, gfp, order);
+	return page ? page_address(page) : NULL;
+}
+
+/*
+ * Allocate memory for a coherent mapping.
+ */
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   gfp_t gfp)
+{
+	void *memory;
+	unsigned long dma_mask = 0;
+	u64 bus;
+
+	if (!dev)
+		dev = &fallback_dev;
+	dma_mask = dev->coherent_dma_mask;
+	if (dma_mask == 0)
+		dma_mask = DMA_32BIT_MASK;
+
+	/* Don't invoke OOM killer */
+	gfp |= __GFP_NORETRY;
+
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_coherent. */
+	dma_mask &= *dev->dma_mask;
+
+	/* Why <=? Even when the mask is smaller than 4GB it is often
+	   larger than 16MB and in this case we have a chance of
+	   finding fitting memory in the next higher zone first. If
+	   not retry with true GFP_DMA. -AK */
+	if (dma_mask <= DMA_32BIT_MASK)
+		gfp |= GFP_DMA32;
+
+ again:
+	memory = dma_alloc_pages(dev, gfp, get_order(size));
+	if (memory == NULL)
+		return NULL;
+
+	{
+		int high, mmu;
+		bus = virt_to_bus(memory);
+	        high = (bus + size) >= dma_mask;
+		mmu = high;
+		if (force_iommu && !(gfp & GFP_DMA))
+			mmu = 1;
+		else if (high) {
+			free_pages((unsigned long)memory,
+				   get_order(size));
+
+			/* Don't use the 16MB ZONE_DMA unless absolutely
+			   needed. It's better to use remapping first. */
+			if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
+				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+				goto again;
+			}
+
+			/* Let low level make its own zone decisions */
+			gfp &= ~(GFP_DMA32|GFP_DMA);
+
+			if (dma_ops->alloc_coherent)
+				return dma_ops->alloc_coherent(dev, size,
+							   dma_handle, gfp);
+			return NULL;
+		}
+
+		/* Hardcode 31 address bits for now: aacraid limitation. */
+		if (xen_create_contiguous_region((unsigned long)memory, get_order(size), 31) != 0) {
+			free_pages((unsigned long)memory, get_order(size));
+			return NULL;
+		}
+
+		memset(memory, 0, size);
+		if (!mmu) {
+			*dma_handle = virt_to_bus(memory);
+			return memory;
+		}
+	}
+
+	if (dma_ops->alloc_coherent) {
+		free_pages((unsigned long)memory, get_order(size));
+		gfp &= ~(GFP_DMA|GFP_DMA32);
+		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+	}
+
+	if (dma_ops->map_simple) {
+		*dma_handle = dma_ops->map_simple(dev, memory,
+					      size,
+					      PCI_DMA_BIDIRECTIONAL);
+		if (*dma_handle != bad_dma_address)
+			return memory;
+	}
+
+	if (panic_on_overflow)
+		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
+	free_pages((unsigned long)memory, get_order(size));
+	return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);
+
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t bus)
+{
+	if (dma_ops->unmap_single)
+		dma_ops->unmap_single(dev, bus, size, 0);
+	xen_destroy_contiguous_region((unsigned long) vaddr, get_order(size));
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	if (dma_ops->dma_supported)
+		return dma_ops->dma_supported(dev, mask);
+
+	/* Copied from i386. Doesn't make much sense, because it will
+	   only work for pci_alloc_coherent.
+	   The caller just has to use GFP_DMA in this case. */
+        if (mask < DMA_24BIT_MASK)
+                return 0;
+
+	/* Tell the device to use SAC when IOMMU force is on.  This
+	   allows the driver to use cheaper accesses in some cases.
+
+	   Problem with this is that if we overflow the IOMMU area and
+	   return DAC as fallback address the device may not handle it
+	   correctly.
+
+	   As a special case some controllers have a 39bit address
+	   mode that is as efficient as 32bit (aic79xx). Don't force
+	   SAC for these.  Assume all masks <= 40 bits are of this
+	   type. Normally this doesn't make any difference, but gives
+	   more gentle handling of IOMMU overflow. */
+	if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 mask)
+{
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+	*dev->dma_mask = mask;
+	return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
+         [,forcesac][,fullflush][,nomerge][,biomerge]
+   size  set size of iommu (in bytes)
+   noagp don't initialize the AGP driver and use full aperture.
+   off   don't use the IOMMU
+   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
+   memaper[=order] allocate its own aperture over RAM of size 32MB << order.
+   noforce don't force IOMMU usage. Default.
+   force  Force IOMMU.
+   merge  Do lazy merging. This may improve performance on some block devices.
+          Implies force (experimental)
+   biomerge Do merging at the BIO layer. This is more efficient than merge,
+            but should be only done with very big IOMMUs. Implies merge,force.
+   nomerge Don't do SG merging.
+   forcesac For SAC mode for masks <40bits  (experimental)
+   fullflush Flush IOMMU on each allocation (default)
+   nofullflush Don't use IOMMU fullflush
+   allowed  overwrite iommu off workarounds for specific chipsets.
+   soft	 Use software bounce buffering (default for Intel machines)
+   noaperture Don't touch the aperture for AGP.
+*/
+__init int iommu_setup(char *p)
+{
+    iommu_merge = 1;
+
+    while (*p) {
+	    if (!strncmp(p,"off",3))
+		    no_iommu = 1;
+	    /* gart_parse_options has more force support */
+	    if (!strncmp(p,"force",5))
+		    force_iommu = 1;
+	    if (!strncmp(p,"noforce",7)) {
+		    iommu_merge = 0;
+		    force_iommu = 0;
+	    }
+
+	    if (!strncmp(p, "biomerge",8)) {
+		    iommu_bio_merge = 4096;
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "panic",5))
+		    panic_on_overflow = 1;
+	    if (!strncmp(p, "nopanic",7))
+		    panic_on_overflow = 0;
+	    if (!strncmp(p, "merge",5)) {
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "nomerge",7))
+		    iommu_merge = 0;
+	    if (!strncmp(p, "forcesac",8))
+		    iommu_sac_force = 1;
+
+#ifdef CONFIG_SWIOTLB
+	    if (!strncmp(p, "soft",4))
+		    swiotlb = 1;
+#endif
+
+#ifdef CONFIG_IOMMU
+	    gart_parse_options(p);
+#endif
+
+	    p += strcspn(p, ",");
+	    if (*p == ',')
+		    ++p;
+    }
+    return 1;
+}
+__setup("iommu=", iommu_setup);
+
+void __init pci_iommu_alloc(void)
+{
+	/*
+	 * The order of these functions is important for
+	 * fall-back/fail-over reasons
+	 */
+#ifdef CONFIG_IOMMU
+	iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+	detect_calgary();
+#endif
+
+#ifdef CONFIG_SWIOTLB
+	pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_CALGARY_IOMMU
+	calgary_iommu_init();
+#endif
+
+#ifdef CONFIG_IOMMU
+	gart_iommu_init();
+#endif
+
+	no_iommu_init();
+	return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-gart-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,741 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ * 
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows to use PCI devices that only support 32bit addresses on systems
+ * with more than 4GB. 
+ *
+ * See Documentation/DMA-mapping.txt for the interface specification.
+ * 
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <asm/atomic.h>
+#include <asm/io.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+#include <asm/kdebug.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/k8.h>
+
+unsigned long iommu_bus_base;	/* GART remapping area (physical) */
+static unsigned long iommu_size; 	/* size of remapping area bytes */
+static unsigned long iommu_pages;	/* .. and in pages */
+
+u32 *iommu_gatt_base; 		/* Remapping table */
+
+/* If this is disabled the IOMMU will use an optimized flushing strategy
+   of only flushing when a mapping is reused. With it enabled the GART is
+   flushed for every mapping. Problem is that doing the lazy flush seems to
+   trigger bugs with some popular PCI cards, in particular 3ware (but it has
+   also been seen with Qlogic at least). */
+int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area */ 
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
+
+static u32 gart_unmapped_entry; 
+
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define to_pages(addr,size) \
+	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+#define EMERGENCY_PAGES 32 /* = 128KB */ 
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
+static int need_flush; 		/* global flush state. set for each gart wrap */
+
+static unsigned long alloc_iommu(int size) 
+{ 	
+	unsigned long offset, flags;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);	
+	offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
+	if (offset == -1) {
+		need_flush = 1;
+		offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
+	}
+	if (offset != -1) { 
+		set_bit_string(iommu_gart_bitmap, offset, size); 
+		next_bit = offset+size; 
+		if (next_bit >= iommu_pages) { 
+			next_bit = 0;
+			need_flush = 1;
+		} 
+	} 
+	if (iommu_fullflush)
+		need_flush = 1;
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);      
+	return offset;
+} 
+
+static void free_iommu(unsigned long offset, int size)
+{ 
+	unsigned long flags;
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	__clear_bit_string(iommu_gart_bitmap, offset, size);
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+/* 
+ * Use global flush state to avoid races with multiple flushers.
+ */
+static void flush_gart(void)
+{ 
+	unsigned long flags;
+	spin_lock_irqsave(&iommu_bitmap_lock, flags);
+	if (need_flush) {
+		k8_flush_garts();
+		need_flush = 0;
+	} 
+	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+} 
+
+#ifdef CONFIG_IOMMU_LEAK
+
+#define SET_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = __builtin_return_address(0);
+#define CLEAR_LEAK(x) if (iommu_leak_tab) \
+			iommu_leak_tab[x] = NULL;
+
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static void **iommu_leak_tab; 
+static int leak_trace;
+int iommu_leak_pages = 20; 
+void dump_leak(void)
+{
+	int i;
+	static int dump; 
+	if (dump || !iommu_leak_tab) return;
+	dump = 1;
+	show_stack(NULL,NULL);
+	/* Very crude. dump some from the end of the table too */ 
+	printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 
+	for (i = 0; i < iommu_leak_pages; i+=2) {
+		printk("%lu: ", iommu_pages-i);
+		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]);
+		printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 
+	} 
+	printk("\n");
+}
+#else
+#define SET_LEAK(x)
+#define CLEAR_LEAK(x)
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+	/* 
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this operation properly.
+	 * Return some non mapped prereserved space in the aperture and 
+	 * let the Northbridge deal with it. This will result in garbage
+	 * in the IO operation. When the size exceeds the prereserved space
+	 * memory corruption will occur or random memory will be DMAed 
+	 * out. Hopefully no network devices use single mappings that big.
+	 */ 
+	
+	printk(KERN_ERR 
+  "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
+	       size, dev->bus_id);
+
+	if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+			panic("PCI-DMA: Memory would be corrupted\n");
+		if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 
+			panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n");
+	} 
+
+#ifdef CONFIG_IOMMU_LEAK
+	dump_leak(); 
+#endif
+} 
+
+static inline int need_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	if (force_iommu) 
+		mmu = 1; 
+	return mmu; 
+}
+
+static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{ 
+	u64 mask = *dev->dma_mask;
+	int high = addr + size >= mask;
+	int mmu = high;
+	return mmu; 
+}
+
+/* Map a single contiguous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+				size_t size, int dir)
+{ 
+	unsigned long npages = to_pages(phys_mem, size);
+	unsigned long iommu_page = alloc_iommu(npages);
+	int i;
+	if (iommu_page == -1) {
+		if (!nonforced_iommu(dev, phys_mem, size))
+			return phys_mem; 
+		if (panic_on_overflow)
+			panic("dma_map_area overflow %lu bytes\n", size);
+		iommu_full(dev, size, dir);
+		return bad_dma_address;
+	}
+
+	for (i = 0; i < npages; i++) {
+		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+		SET_LEAK(iommu_page + i);
+		phys_mem += PAGE_SIZE;
+	}
+	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+static dma_addr_t gart_map_simple(struct device *dev, char *buf,
+				 size_t size, int dir)
+{
+	dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
+	flush_gart();
+	return map;
+}
+
+/* Map a single area into the IOMMU */
+dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+{
+	unsigned long phys_mem, bus;
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (!dev)
+		dev = &fallback_dev;
+
+	phys_mem = virt_to_bus(addr); 
+	if (!need_iommu(dev, phys_mem, size))
+		return phys_mem; 
+
+	bus = gart_map_simple(dev, addr, size, dir);
+	return bus; 
+}
+
+/*
+ * Free a DMA mapping.
+ */
+void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
+		      size_t size, int direction)
+{
+	unsigned long iommu_page;
+	int npages;
+	int i;
+
+	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+	    dma_addr >= iommu_bus_base + iommu_size)
+		return;
+	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+	npages = to_pages(dma_addr, size);
+	for (i = 0; i < npages; i++) {
+		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+		CLEAR_LEAK(iommu_page + i);
+	}
+	free_iommu(iommu_page, npages);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ */
+void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+
+	for (i = 0; i < nents; i++) {
+		struct scatterlist *s = &sg[i];
+		if (!s->dma_length || !s->length)
+			break;
+		gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
+	}
+}
+
+/* Fallback for dma_map_sg in case of overflow */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+			       int nents, int dir)
+{
+	int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+	printk(KERN_DEBUG "dma_map_sg overflow\n");
+#endif
+
+ 	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		unsigned long addr = page_to_phys(s->page) + s->offset; 
+		if (nonforced_iommu(dev, addr, s->length)) { 
+			addr = dma_map_area(dev, addr, s->length, dir);
+			if (addr == bad_dma_address) { 
+				if (i > 0) 
+					gart_unmap_sg(dev, sg, i, dir);
+				nents = 0; 
+				sg[0].dma_length = 0;
+				break;
+			}
+		}
+		s->dma_address = addr;
+		s->dma_length = s->length;
+	}
+	flush_gart();
+	return nents;
+}
+
+/* Map multiple scatterlist entries contiguously into the first. */
+static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout, unsigned long pages)
+{
+	unsigned long iommu_start = alloc_iommu(pages);
+	unsigned long iommu_page = iommu_start; 
+	int i;
+
+	if (iommu_start == -1)
+		return -1;
+	
+	for (i = start; i < stopat; i++) {
+		struct scatterlist *s = &sg[i];
+		unsigned long pages, addr;
+		unsigned long phys_addr = s->dma_address;
+		
+		BUG_ON(i > start && s->offset);
+		if (i == start) {
+			*sout = *s; 
+			sout->dma_address = iommu_bus_base;
+			sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+			sout->dma_length = s->length;
+		} else { 
+			sout->dma_length += s->length; 
+		}
+
+		addr = phys_addr;
+		pages = to_pages(s->offset, s->length); 
+		while (pages--) { 
+			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 
+			SET_LEAK(iommu_page);
+			addr += PAGE_SIZE;
+			iommu_page++;
+		}
+	} 
+	BUG_ON(iommu_page - iommu_start != pages);	
+	return 0;
+}
+
+static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+		      struct scatterlist *sout,
+		      unsigned long pages, int need)
+{
+	if (!need) { 
+		BUG_ON(stopat - start != 1);
+		*sout = sg[start]; 
+		sout->dma_length = sg[start].length; 
+		return 0;
+	} 
+	return __dma_map_cont(sg, start, stopat, sout, pages);
+}
+		
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page-aligned sizes into a contiguous mapping.
+ */
+int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
+{
+	int i;
+	int out;
+	int start;
+	unsigned long pages = 0;
+	int need = 0, nextneed;
+
+	BUG_ON(dir == DMA_NONE);
+	if (nents == 0) 
+		return 0;
+
+	if (!dev)
+		dev = &fallback_dev;
+
+	out = 0;
+	start = 0;
+	for (i = 0; i < nents; i++) {
+		struct scatterlist *s = &sg[i];
+		dma_addr_t addr = page_to_phys(s->page) + s->offset;
+		s->dma_address = addr;
+		BUG_ON(s->length == 0); 
+
+		nextneed = need_iommu(dev, addr, s->length); 
+
+		/* Handle the previous not yet processed entries */
+		if (i > start) {
+			struct scatterlist *ps = &sg[i-1];
+			/* Can only merge when the last chunk ends on a page 
+			   boundary and the new one doesn't have an offset. */
+			if (!iommu_merge || !nextneed || !need || s->offset ||
+			    (ps->offset + ps->length) % PAGE_SIZE) { 
+				if (dma_map_cont(sg, start, i, sg+out, pages,
+						 need) < 0)
+					goto error;
+				out++;
+				pages = 0;
+				start = i;	
+			}
+		}
+
+		need = nextneed;
+		pages += to_pages(s->offset, s->length);
+	}
+	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+		goto error;
+	out++;
+	flush_gart();
+	if (out < nents) 
+		sg[out].dma_length = 0; 
+	return out;
+
+error:
+	flush_gart();
+	gart_unmap_sg(dev, sg, nents, dir);
+	/* When it was forced or merged try again in a dumb way */
+	if (force_iommu || iommu_merge) {
+		out = dma_map_sg_nonforce(dev, sg, nents, dir);
+		if (out > 0)
+			return out;
+	}
+	if (panic_on_overflow)
+		panic("dma_map_sg: overflow on %lu pages\n", pages);
+	iommu_full(dev, pages << PAGE_SHIFT, dir);
+	for (i = 0; i < nents; i++)
+		sg[i].dma_address = bad_dma_address;
+	return 0;
+} 
+
+static int no_agp;
+
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{ 
+	unsigned long a; 
+	if (!iommu_size) { 
+		iommu_size = aper_size; 
+		if (!no_agp) 
+			iommu_size /= 2; 
+	} 
+
+	a = aper + iommu_size; 
+	iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a;
+
+	if (iommu_size < 64*1024*1024) 
+		printk(KERN_WARNING
+  "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 
+	
+	return iommu_size;
+} 
+
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 
+{ 
+	unsigned aper_size = 0, aper_base_32;
+	u64 aper_base;
+	unsigned aper_order;
+
+	pci_read_config_dword(dev, 0x94, &aper_base_32); 
+	pci_read_config_dword(dev, 0x90, &aper_order);
+	aper_order = (aper_order >> 1) & 7;	
+
+	aper_base = aper_base_32 & 0x7fff; 
+	aper_base <<= 25;
+
+	aper_size = (32 * 1024 * 1024) << aper_order; 
+	if (aper_base + aper_size >= 0xffffffff || !aper_size)
+		aper_base = 0;
+
+	*size = aper_size;
+	return aper_base;
+} 
+
+/* 
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.  
+ */
+static __init int init_k8_gatt(struct agp_kern_info *info)
+{ 
+	struct pci_dev *dev;
+	void *gatt;
+	unsigned aper_base, new_aper_base;
+	unsigned aper_size, gatt_size, new_aper_size;
+	int i;
+
+	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
+	aper_size = aper_base = info->aper_size = 0;
+	dev = NULL;
+	for (i = 0; i < num_k8_northbridges; i++) {
+		dev = k8_northbridges[i];
+		new_aper_base = read_aperture(dev, &new_aper_size); 
+		if (!new_aper_base) 
+			goto nommu; 
+		
+		if (!aper_base) { 
+			aper_size = new_aper_size;
+			aper_base = new_aper_base;
+		} 
+		if (aper_size != new_aper_size || aper_base != new_aper_base) 
+			goto nommu;
+	}
+	if (!aper_base)
+		goto nommu; 
+	info->aper_base = aper_base;
+	info->aper_size = aper_size>>20; 
+
+	gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 
+	gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 
+	if (!gatt) 
+		panic("Cannot allocate GATT table"); 
+#ifdef CONFIG_XEN
+	if (xen_create_contiguous_region((unsigned long)gatt, get_order(gatt_size), 31))
+		panic("Cannot create a contiguous memory region below 4GB for the GATT table");
+#endif
+	memset(gatt, 0, gatt_size); 
+	agp_gatt_table = gatt;
+
+	for (i = 0; i < num_k8_northbridges; i++) {
+		u32 ctl; 
+		u32 gatt_reg; 
+
+		dev = k8_northbridges[i];
+		gatt_reg = phys_to_machine(__pa(gatt)) >> 12;
+		gatt_reg <<= 4; 
+		pci_write_config_dword(dev, 0x98, gatt_reg);
+		pci_read_config_dword(dev, 0x90, &ctl); 
+
+		ctl |= 1;
+		ctl &= ~((1<<4) | (1<<5));
+
+		pci_write_config_dword(dev, 0x90, ctl); 
+	}
+	flush_gart();
+	
+	printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 
+	return 0;
+
+ nommu:
+ 	/* Should not happen anymore */
+	printk(KERN_ERR "PCI-DMA: init_k8_gatt: More than 4GB of RAM and no IOMMU\n"
+	       KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
+	return -1; 
+} 
+
+extern int agp_amd64_init(void);
+
+static struct dma_mapping_ops gart_dma_ops = {
+	.mapping_error = NULL,
+	.map_single = gart_map_single,
+	.map_simple = gart_map_simple,
+	.unmap_single = gart_unmap_single,
+	.sync_single_for_cpu = NULL,
+	.sync_single_for_device = NULL,
+	.sync_single_range_for_cpu = NULL,
+	.sync_single_range_for_device = NULL,
+	.sync_sg_for_cpu = NULL,
+	.sync_sg_for_device = NULL,
+	.map_sg = gart_map_sg,
+	.unmap_sg = gart_unmap_sg,
+};
+
+void __init gart_iommu_init(void)
+{ 
+	struct agp_kern_info info;
+	unsigned long aper_size;
+	unsigned long iommu_start;
+	unsigned long scratch;
+	long i;
+
+	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
+		printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+		return;
+	}
+
+#ifndef CONFIG_AGP_AMD64
+	no_agp = 1; 
+#else
+	/* Makefile puts PCI initialization via subsys_initcall first. */
+	/* Add other K8 AGP bridge drivers here */
+	no_agp = no_agp || 
+		(agp_amd64_init() < 0) || 
+		(agp_copy_info(agp_bridge, &info) < 0);
+#endif	
+
+	if (swiotlb)
+		return;
+
+	/* Did we detect a different HW IOMMU? */
+	if (iommu_detected && !iommu_aperture)
+		return;
+
+	if (no_iommu ||
+	    (!force_iommu && !global_need_iommu()) ||
+	    !iommu_aperture ||
+	    (no_agp && init_k8_gatt(&info) < 0)) {
+		printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
+		if (global_need_iommu()) {
+			printk(KERN_ERR "WARNING more than 4GB of memory "
+					"but IOMMU not available.\n"
+			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+		}
+		return;
+	}
+
+	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
+	aper_size = info.aper_size * 1024 * 1024;	
+	iommu_size = check_iommu_size(info.aper_base, aper_size); 
+	iommu_pages = iommu_size >> PAGE_SHIFT; 
+
+	iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 
+						    get_order(iommu_pages/8)); 
+	if (!iommu_gart_bitmap) 
+		panic("Cannot allocate iommu bitmap\n"); 
+	memset(iommu_gart_bitmap, 0, iommu_pages/8);
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (leak_trace) { 
+		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 
+				  get_order(iommu_pages*sizeof(void *)));
+		if (iommu_leak_tab) 
+			memset(iommu_leak_tab, 0, iommu_pages * 8); 
+		else
+			printk("PCI-DMA: Cannot allocate leak trace area\n"); 
+	} 
+#endif
+
+	/* 
+	 * Out of IOMMU space handling.
+	 * Reserve some invalid pages at the beginning of the GART. 
+	 */ 
+	set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 
+
+	agp_memory_reserved = iommu_size;	
+	printk(KERN_INFO
+	       "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+	       iommu_size>>20); 
+
+	iommu_start = aper_size - iommu_size;	
+	iommu_bus_base = info.aper_base + iommu_start; 
+	bad_dma_address = iommu_bus_base;
+	iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+	/* 
+	 * Unmap the IOMMU part of the GART. The alias of the page is
+	 * always mapped with cache enabled and there is no full cache
+	 * coherency across the GART remapping. The unmapping avoids
+	 * automatic prefetches from the CPU allocating cache lines in
+	 * there. All CPU accesses are done via the direct mapping to
+	 * the backing memory. The GART address is only used by PCI
+	 * devices. 
+	 */
+#ifndef CONFIG_XEN
+       clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size);
+#else
+       /*
+        * In Xen guests and the hypervisor, the IOMMU region is only mapped
+        * in virtual memory if it has been allocated from RAM.
+        */
+	if (mfn_to_local_pfn(iommu_bus_base>>PAGE_SHIFT)<end_pfn) {
+		struct mmuext_op op = {
+			.cmd = MMUEXT_UNMAP_REGION,
+			.arg1.mfn = iommu_bus_base>>PAGE_SHIFT,
+			.arg2.nr_ents = iommu_size>>PAGE_SHIFT
+		};
+		clear_kernel_mapping((unsigned long)bus_to_virt(iommu_bus_base), iommu_size);
+		BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF));
+	}
+#endif
+
+	/* 
+	 * Try to work around a bug (thanks to BenH):
+	 * set unmapped entries to a scratch page instead of 0.
+	 * Any prefetches that hit unmapped entries won't get a bus abort
+	 * then.
+	 */
+	scratch = get_zeroed_page(GFP_KERNEL); 
+	if (!scratch) 
+		panic("Cannot allocate iommu scratch page");
+	gart_unmapped_entry = GPTE_ENCODE(virt_to_bus(scratch));
+	for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 
+		iommu_gatt_base[i] = gart_unmapped_entry;
+
+	flush_gart();
+	iommu_detected = 1;
+	dma_ops = &gart_dma_ops;
+	printk(KERN_INFO "GART: ops set\n");
+} 
+
+void gart_parse_options(char *p)
+{
+	int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (!strncmp(p,"leak",4)) {
+		leak_trace = 1;
+		p += 4;
+		if (*p == '=') ++p;
+		if (isdigit(*p) && get_option(&p, &arg))
+			iommu_leak_pages = arg;
+	}
+#endif
+	if (isdigit(*p) && get_option(&p, &arg))
+		iommu_size = arg;
+	if (!strncmp(p, "fullflush",8))
+		iommu_fullflush = 1;
+	if (!strncmp(p, "nofullflush",11))
+		iommu_fullflush = 0;
+	if (!strncmp(p,"noagp",5))
+		no_agp = 1;
+	if (!strncmp(p, "noaperture",10))
+		fix_aperture = 0;
+	/* duplicated from pci-dma.c */
+	if (!strncmp(p,"force",5))
+		iommu_aperture_allowed = 1;
+	if (!strncmp(p,"allowed",7))
+		iommu_aperture_allowed = 1;
+	if (!strncmp(p, "memaper", 7)) {
+		fallback_aper_force = 1;
+		p += 7;
+		if (*p == '=') {
+			++p;
+			if (get_option(&p, &arg))
+				fallback_aper_order = arg;
+		}
+	}
+}
diff -r d39e8c44da34 -r 502810b11c79 linux-2.6-xen-sparse/lib/swiotlb-xen.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/linux-2.6-xen-sparse/lib/swiotlb-xen.c	Fri Feb 09 16:32:04 2007 -0600
@@ -0,0 +1,831 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is for IA-64 and EM64T platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 03/05/07 davidm	Switch from PCI-DMA to generic device DMA API.
+ * 00/12/13 davidm	Rename to swiotlb.c and add mark_clean() to avoid
+ *			unnecessary i-cache flushing.
+ * 04/07/.. ak		Better overflow handling. Assorted fixes.
+ * 05/09/10 linville	Add support for syncing ranges, support syncing for
+ *			DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+ */
+
+#include <linux/cache.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+
+#include <asm/io.h>
+#include <asm/dma.h>
+#include <asm/scatterlist.h>
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+
+#define OFFSET(val,align) ((unsigned long)	\
+	                   ( (val) & ( (align) - 1)))
+
+#define SG_ENT_VIRT_ADDRESS(sg)	(page_address((sg)->page) + (sg)->offset)
+#define SG_ENT_PHYS_ADDRESS(SG)	virt_to_bus(SG_ENT_VIRT_ADDRESS(SG))
+
+/*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2.  What is the appropriate value?
+ * The complexity of {map,unmap}_single is linearly dependent on this value.
+ */
+#define IO_TLB_SEGSIZE	128
+
+/*
+ * log of the size of each IO TLB slab.  The number of slabs is command line
+ * controllable.
+ */
+#define IO_TLB_SHIFT 11
+
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with.  Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb.  If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+/*
+ * Enumeration for sync targets
+ */
+enum dma_sync_target {
+	SYNC_FOR_CPU = 0,
+	SYNC_FOR_DEVICE = 1,
+};
+
+int swiotlb_force;
+
+/*
+ * Used to do a quick range check in swiotlb_unmap_single and
+ * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+static char *io_tlb_start, *io_tlb_end;
+
+/*
+ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
+ * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
+ */
+static unsigned long io_tlb_nslabs;
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+static unsigned long io_tlb_overflow = 32*1024;
+
+void *io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+static unsigned char **io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+static int __init
+setup_io_tlb_npages(char *str)
+{
+	if (isdigit(*str)) {
+		io_tlb_nslabs = simple_strtoul(str, &str, 0);
+		/* avoid tail segment of size < IO_TLB_SEGSIZE */
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+	if (*str == ',')
+		++str;
+	if (!strcmp(str, "force"))
+		swiotlb_force = 1;
+	return 1;
+}
+__setup("swiotlb=", setup_io_tlb_npages);
+/* make io_tlb_overflow tunable too? */
+
+/*
+ * Statically reserve bounce buffer space and initialize bounce buffer data
+ * structures for the software IO TLB used to implement the DMA API.
+ */
+void
+swiotlb_init_with_default_size (size_t default_size)
+{
+	unsigned long i;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	if (!io_tlb_start)
+		panic("Cannot allocate SWIOTLB buffer");
+#ifdef CONFIG_XEN
+	if (xen_create_contiguous_region((unsigned long)io_tlb_start, get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT)), 31))
+		panic("Cannot create a contiguous memory region below 4GB for the SWIOTLB buffer");
+#endif
+//TODO ulim: make contiguous
+	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
+
+	/*
+	 * Allocate and initialize the free list array.  This array is used
+	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+	 * between io_tlb_start and io_tlb_end.
+	 */
+	io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+	for (i = 0; i < io_tlb_nslabs; i++)
+ 		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+	io_tlb_index = 0;
+	io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *));
+
+	/*
+	 * Get the overflow emergency buffer
+	 */
+	io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+#ifdef CONFIG_XEN
+	if (io_tlb_overflow_buffer)
+		xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31);
+#endif
+//TODO ulim: make contiguous
+	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
+	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+}
+
+void
+swiotlb_init (void)
+{
+	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */
+}
+
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int
+swiotlb_late_init_with_default_size (size_t default_size)
+{
+	unsigned long i, req_nslabs = io_tlb_nslabs;
+	unsigned int order;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	order = get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	io_tlb_nslabs = SLABS_PER_PAGE << order;
+
+	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+//TODO ulim: make contiguous
+		io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+		                                        order);
+#ifdef CONFIG_XEN
+		if (io_tlb_start && xen_create_contiguous_region((unsigned long)io_tlb_start, order, 31)) {
+			free_pages((unsigned long)io_tlb_start, order);
+			io_tlb_start = NULL;
+		}
+#endif
+		if (io_tlb_start)
+			break;
+		order--;
+	}
+
+	if (!io_tlb_start)
+		goto cleanup1;
+
+	if (order != get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT))) {
+		printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+		       "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+		io_tlb_nslabs = SLABS_PER_PAGE << order;
+	}
+	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
+	memset(io_tlb_start, 0, io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+
+	/*
+	 * Allocate and initialize the free list array.  This array is used
+	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+	 * between io_tlb_start and io_tlb_end.
+	 */
+	io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+	                              get_order(io_tlb_nslabs * sizeof(int)));
+	if (!io_tlb_list)
+		goto cleanup2;
+
+	for (i = 0; i < io_tlb_nslabs; i++)
+ 		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+	io_tlb_index = 0;
+
+	io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL,
+	                           get_order(io_tlb_nslabs * sizeof(char *)));
+	if (!io_tlb_orig_addr)
+		goto cleanup3;
+
+	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *));
+
+	/*
+	 * Get the overflow emergency buffer
+	 */
+	io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+	                                          get_order(io_tlb_overflow));
+#ifdef CONFIG_XEN
+	if (io_tlb_overflow_buffer && xen_create_contiguous_region((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow), 31)) {
+		free_pages((unsigned long)io_tlb_overflow_buffer, get_order(io_tlb_overflow));
+		io_tlb_overflow_buffer = NULL;
+	}
+#endif
+	if (!io_tlb_overflow_buffer)
+		goto cleanup4;
+
+//TODO ulim: make contiguous
+	printk(KERN_INFO "Placing %ldMB software IO TLB between 0x%lx - "
+	       "0x%lx\n", (io_tlb_nslabs * (1 << IO_TLB_SHIFT)) >> 20,
+	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+
+	return 0;
+
+cleanup4:
+	free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+	                                                      sizeof(char *)));
+	io_tlb_orig_addr = NULL;
+cleanup3:
+	free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+	                                                 sizeof(int)));
+	io_tlb_list = NULL;
+	io_tlb_end = NULL;
+cleanup2:
+#ifdef CONFIG_XEN
+	xen_destroy_contiguous_region((unsigned long)io_tlb_start, order);
+#endif
+	free_pages((unsigned long)io_tlb_start, order);
+//TODO ulim: make contiguous
+	io_tlb_start = NULL;
+cleanup1:
+	io_tlb_nslabs = req_nslabs;
+	return -ENOMEM;
+}
+
+static inline int
+address_needs_mapping(struct device *hwdev, dma_addr_t addr)
+{
+	dma_addr_t mask = 0xffffffff;
+	/* If the device has a mask, use it, otherwise default to 32 bits */
+	if (hwdev && hwdev->dma_mask)
+		mask = *hwdev->dma_mask;
+	return (addr & ~mask) != 0;
+}
+
+/*
+ * Allocates bounce buffer and returns its kernel virtual address.
+ */
+static void *
+map_single(struct device *hwdev, char *buffer, size_t size, int dir)
+{
+	unsigned long flags;
+	char *dma_addr;
+	unsigned int nslots, stride, index, wrap;
+	int i;
+
+	/*
+	 * For mappings greater than a page, we limit the stride (and
+	 * hence alignment) to a page size.
+	 */
+	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	if (size > PAGE_SIZE)
+		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+	else
+		stride = 1;
+
+	BUG_ON(!nslots);
+
+	/*
+	 * Find suitable number of IO TLB entries size that will fit this
+	 * request and allocate a buffer from that IO TLB pool.
+	 */
+	spin_lock_irqsave(&io_tlb_lock, flags);
+	{
+		wrap = index = ALIGN(io_tlb_index, stride);
+
+		if (index >= io_tlb_nslabs)
+			wrap = index = 0;
+
+		do {
+			/*
+			 * If we find a slot that indicates we have 'nslots'
+			 * number of contiguous buffers, we allocate the
+			 * buffers from that slot and mark the entries as '0'
+			 * indicating unavailable.
+			 */
+			if (io_tlb_list[index] >= nslots) {
+				int count = 0;
+
+				for (i = index; i < (int) (index + nslots); i++)
+					io_tlb_list[i] = 0;
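+				/*
+				 * Trim the counts of the preceding free run
+				 * so they no longer extend into the slots
+				 * just claimed.
+				 */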
+				for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+					io_tlb_list[i] = ++count;
+				dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+
+				/*
+				 * Update the indices to avoid searching in
+				 * the next round.
+				 */
+				io_tlb_index = ((index + nslots) < io_tlb_nslabs
+						? (index + nslots) : 0);
+
+				goto found;
+			}
+			index += stride;
+			if (index >= io_tlb_nslabs)
+				index = 0;
+		} while (index != wrap);
+
+		spin_unlock_irqrestore(&io_tlb_lock, flags);
+		return NULL;
+	}
+  found:
+	spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+	/*
+	 * Save away the mapping from the original address to the DMA address.
+	 * This is needed when we sync the memory.  Then we sync the buffer if
+	 * needed.
+	 */
+	io_tlb_orig_addr[index] = buffer;
+	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+		memcpy(dma_addr, buffer, size);
+
+	return dma_addr;
+}
+
+/*
+ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+ */
+static void
+unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+{
+	unsigned long flags;
+	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	char *buffer = io_tlb_orig_addr[index];
+
+	/*
+	 * First, sync the memory before unmapping the entry
+	 */
+	if (buffer && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+		/*
+		 * bounce... copy the data back into the original buffer and
+		 * delete the bounce buffer.
+		 */
+		memcpy(buffer, dma_addr, size);
+
+	/*
+	 * Return the buffer to the free list by setting the corresponding
+	 * entries to indicate the number of contiguous entries available.
+	 * While returning the entries to the free list, we merge the entries
+	 * with slots below and above the pool being returned.
+	 */
+	spin_lock_irqsave(&io_tlb_lock, flags);
+	{
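+		/*
+		 * Seed the running count from the free run that follows the
+		 * region being returned, unless that run starts in the next
+		 * IO_TLB_SEGSIZE segment.
+		 */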
+		count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+			 io_tlb_list[index + nslots] : 0);
+		/*
+		 * Step 1: return the slots to the free list, merging the
+		 * slots with the succeeding slots
+		 */
+		for (i = index + nslots - 1; i >= index; i--)
+			io_tlb_list[i] = ++count;
+		/*
+		 * Step 2: merge the returned slots with the preceding slots,
+		 * if available (non zero)
+		 */
+		for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+			io_tlb_list[i] = ++count;
+	}
+	spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+
+static void
+sync_single(struct device *hwdev, char *dma_addr, size_t size,
+	    int dir, int target)
+{
+	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+	char *buffer = io_tlb_orig_addr[index];
+
+	switch (target) {
+	case SYNC_FOR_CPU:
+		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(buffer, dma_addr, size);
+		else
+			BUG_ON(dir != DMA_TO_DEVICE);
+		break;
+	case SYNC_FOR_DEVICE:
+		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(dma_addr, buffer, size);
+		else
+			BUG_ON(dir != DMA_FROM_DEVICE);
+		break;
+	default:
+		BUG();
+	}
+}
+
+void *
+swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+		       dma_addr_t *dma_handle, gfp_t flags)
+{
+	unsigned long dev_addr;
+	void *ret;
+	int order = get_order(size);
+
+	/*
+	 * XXX fix me: the DMA API should pass us an explicit DMA mask
+	 * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32
+	 * bit range instead of a 16MB one).
+	 */
+	flags |= GFP_DMA;
+
+	ret = (void *)__get_free_pages(flags, order);
+	if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) {
+		/*
+		 * The allocated memory isn't reachable by the device.
+		 * Fall back on swiotlb_map_single().
+		 */
+		free_pages((unsigned long) ret, order);
+		ret = NULL;
+	}
+	if (!ret) {
+		/*
+		 * We are either out of memory or the device can't DMA
+		 * to GFP_DMA memory; fall back on
+		 * swiotlb_map_single(), which will grab memory from
+		 * the lowest available address range.
+		 */
+		dma_addr_t handle;
+		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
+		if (swiotlb_dma_mapping_error(handle))
+			return NULL;
+
+		ret = bus_to_virt(handle);
+	}
+
+	memset(ret, 0, size);
+	dev_addr = virt_to_bus(ret);
+
+	/* Confirm address can be DMA'd by device */
+	if (address_needs_mapping(hwdev, dev_addr)) {
+		printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016lx\n",
+		       (unsigned long long)*hwdev->dma_mask, dev_addr);
+		panic("swiotlb_alloc_coherent: allocated memory is out of "
+		      "range for device");
+	}
+	*dma_handle = dev_addr;
+	return ret;
+}
+
+void
+swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+		      dma_addr_t dma_handle)
+{
+	if (!(vaddr >= (void *)io_tlb_start
+                    && vaddr < (void *)io_tlb_end))
+		free_pages((unsigned long) vaddr, get_order(size));
+	else
+		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+		swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE);
+}
+
+static void
+swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+{
+	/*
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this failure properly
+	 * unless they check for dma_mapping_error (most don't).
+	 * When the mapping is small enough, return a static buffer to limit
+	 * the damage, or panic when the transfer is too big.
+	 */
+	printk(KERN_ERR "DMA: Out of SW-IOMMU space for %lu bytes at "
+	       "device %s\n", size, dev ? dev->bus_id : "?");
+
+	if (size > io_tlb_overflow && do_panic) {
+		if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Memory would be corrupted\n");
+		if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Random memory would be DMAed\n");
+	}
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode.  The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t
+swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir)
+{
+	unsigned long dev_addr = virt_to_bus(ptr);
+	void *map;
+
+	BUG_ON(dir == DMA_NONE);
+	/*
+	 * If the pointer passed in happens to be in the device's DMA window,
+	 * we can safely return the device addr and not worry about bounce
+	 * buffering it.
+	 */
+	if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force)
+		return dev_addr;
+
+	/*
+	 * Oh well, have to allocate and map a bounce buffer.
+	 */
+	map = map_single(hwdev, ptr, size, dir);
+	if (!map) {
+		swiotlb_full(hwdev, size, dir, 1);
+		map = io_tlb_overflow_buffer;
+	}
+
+	dev_addr = virt_to_bus(map);
+
+	/*
+	 * Ensure that the address returned is DMA'ble
+	 */
+	if (address_needs_mapping(hwdev, dev_addr))
+		panic("map_single: bounce buffer is not DMA'ble");
+
+	return dev_addr;
+}
+
+/*
+ * Since DMA is i-cache coherent, any (complete) pages that were written via
+ * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
+ * flush them when they get mapped into an executable vm-area.
+ */
+static void
+mark_clean(void *addr, size_t size)
+{
+	unsigned long pg_addr, end;
+
+	pg_addr = PAGE_ALIGN((unsigned long) addr);
+	end = (unsigned long) addr + size;
+	while (pg_addr + PAGE_SIZE <= end) {
+		struct page *page = virt_to_page(pg_addr);
+		set_bit(PG_arch_1, &page->flags);
+		pg_addr += PAGE_SIZE;
+	}
+}
+
+/*
+ * Unmap a single streaming mode DMA translation.  The dma_addr and size must
+ * match what was provided for in a previous swiotlb_map_single call.  All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+void
+swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size,
+		     int dir)
+{
+	char *dma_addr = bus_to_virt(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		unmap_single(hwdev, dma_addr, size, dir);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so.  At the next point you give the dma
+ * address back to the card, you must first perform a
+ * swiotlb_dma_sync_for_device, and then the device again owns the buffer
+ */
+static inline void
+swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+		    size_t size, int dir, int target)
+{
+	char *dma_addr = bus_to_virt(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		sync_single(hwdev, dma_addr, size, dir, target);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, int dir)
+{
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+			       size_t size, int dir)
+{
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+/*
+ * Same as above, but for a sub-range of the mapping.
+ */
+static inline void
+swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
+			  unsigned long offset, size_t size,
+			  int dir, int target)
+{
+	char *dma_addr = bus_to_virt(dev_addr) + offset;
+
+	BUG_ON(dir == DMA_NONE);
+	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
+		sync_single(hwdev, dma_addr, size, dir, target);
+	else if (dir == DMA_FROM_DEVICE)
+		mark_clean(dma_addr, size);
+}
+
+void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				  unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				     unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_DEVICE);
+}
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above swiotlb_map_single
+ * interface.  Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length.  They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * same here.
+ */
+int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+	       int dir)
+{
+	void *addr;
+	unsigned long dev_addr;
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++) {
+		addr = SG_ENT_VIRT_ADDRESS(sg);
+		dev_addr = virt_to_bus(addr);
+		if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) {
+			void *map = map_single(hwdev, addr, sg->length, dir);
+			sg->dma_address = virt_to_bus(map);
+			if (!map) {
+				/* Don't panic here, we expect map_sg users
+				   to do proper error handling. */
+				swiotlb_full(hwdev, sg->length, dir, 0);
+				swiotlb_unmap_sg(hwdev, sg - i, i, dir);
+				sg[0].dma_length = 0;
+				return 0;
+			}
+		} else
+			sg->dma_address = dev_addr;
+		sg->dma_length = sg->length;
+	}
+	return nelems;
+}
+
+/*
+ * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_single() above.
+ */
+void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+		 int dir)
+{
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++)
+		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+			unmap_single(hwdev, (void *) bus_to_virt(sg->dma_address), sg->dma_length, dir);
+		else if (dir == DMA_FROM_DEVICE)
+			mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
+}
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static inline void
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sg,
+		int nelems, int dir, int target)
+{
+	int i;
+
+	BUG_ON(dir == DMA_NONE);
+
+	for (i = 0; i < nelems; i++, sg++)
+		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
+			sync_single(hwdev, (void *) sg->dma_address,
+				    sg->dma_length, dir, target);
+}
+
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			int nelems, int dir)
+{
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+			   int nelems, int dir)
+{
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+
+int
+swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+{
+	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+}
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+swiotlb_dma_supported (struct device *hwdev, u64 mask)
+{
+	return (virt_to_bus (io_tlb_end) - 1) <= mask;
+}
+
+EXPORT_SYMBOL(swiotlb_init);
+EXPORT_SYMBOL(swiotlb_map_single);
+EXPORT_SYMBOL(swiotlb_unmap_single);
+EXPORT_SYMBOL(swiotlb_map_sg);
+EXPORT_SYMBOL(swiotlb_unmap_sg);
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
+EXPORT_SYMBOL(swiotlb_alloc_coherent);
+EXPORT_SYMBOL(swiotlb_free_coherent);
+EXPORT_SYMBOL(swiotlb_dma_supported);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Enable K8 GART as an IOMMU
  2007-02-12 17:47 [PATCH] Enable K8 GART as an IOMMU Langsdorf, Mark
@ 2007-02-13  8:47 ` Jan Beulich
  2007-02-13 23:19   ` Langsdorf, Mark
  2007-02-13 13:42 ` Muli Ben-Yehuda
  1 sibling, 1 reply; 8+ messages in thread
From: Jan Beulich @ 2007-02-13  8:47 UTC (permalink / raw)
  To: Mark Langsdorf; +Cc: xen-devel

>--- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 10:48:41 2007 +0000
>+++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 09 16:32:04 2007 -0600
>@@ -252,7 +252,7 @@ static void contiguous_bitmap_clear(
> }
> 
> /* Protected by balloon_lock. */
>-#define MAX_CONTIG_ORDER 9 /* 2MB */
>+#define MAX_CONTIG_ORDER 16 /* 256MB */
> static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
> static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];

This seems dangerous to me.

>--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 10:48:41 2007 +0000
>+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-mapping.h	Fri Feb 09 16:32:04 2007 -0600
>@@ -62,7 +62,12 @@ static inline int valid_dma_direction(in
> 		(dma_direction == DMA_FROM_DEVICE));
> }
> 
>-#if 0
>+#ifdef CONFIG_XEN
>+#define global_need_iommu() 1
>+#else
>+#define global_need_iommu() (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL)>MAX_DMA32_PFN)
>+#endif
>+
> static inline int dma_mapping_error(dma_addr_t dma_addr)
> {
> 	if (dma_ops->mapping_error)

HYPERVISOR_memory_op() if CONFIG_XEN is undefined?

>--- a/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 10:48:41 2007 +0000
>+++ b/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 16:32:04 2007 -0600
>@@ -51,8 +51,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o
> obj-$(CONFIG_SMP) += percpu_counter.o
> obj-$(CONFIG_AUDIT_GENERIC) += audit.o
> 
>-obj-$(CONFIG_SWIOTLB) += swiotlb.o
>-swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
>+obj-$(CONFIG_SWIOTLB) += swiotlb-xen.o
> 
> hostprogs-y	:= gen_crc32table
> clean-files	:= crc32table.h

This seems very unlikely to have been tested in a native build. You should
use cherry-pick-xen in the file.
I generally welcome moving arch/i386/kernel/swiotlb.c to lib/swiotlb-xen.c,
but would appreciate it if you then also removed the original file. Perhaps
this should be done as a separate patch, so that it'd be clear that the move
itself doesn't change the file in any way; if you need changes to it for
the IOMMU patch, have only those changes in the patch here.

Jan

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Enable K8 GART as an IOMMU
  2007-02-12 17:47 [PATCH] Enable K8 GART as an IOMMU Langsdorf, Mark
  2007-02-13  8:47 ` Jan Beulich
@ 2007-02-13 13:42 ` Muli Ben-Yehuda
  1 sibling, 0 replies; 8+ messages in thread
From: Muli Ben-Yehuda @ 2007-02-13 13:42 UTC (permalink / raw)
  To: Langsdorf, Mark; +Cc: xen-devel

On Mon, Feb 12, 2007 at 11:47:09AM -0600, Langsdorf, Mark wrote:

> AMD Opteron and Athlon 64 processors have an AGP aperture and GART
> built into the processor.  Linux has used the AGP GART as an IOMMU
> to improve the performance of 32-bit only PCI devices when DMA'ing
> into physical memory above 0xffffffff.  This patch provides a
> similar capability for Xen dom0.
>
> Most of the code simply migrates the native Linux aperture.c and
> pci-gart.c to dom0.  A new memory op is added to clear the aperture
> mapping from the hypervisor's page tables, which is necessary to
> prevent cache alias issues resulting from processor speculation.

Could you please split it up into two patches, one of which does the
bulk movement around and the other which adds GART support, to make
reviewing easier?

Thanks,
Muli

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] Enable K8 GART as an IOMMU
  2007-02-13  8:47 ` Jan Beulich
@ 2007-02-13 23:19   ` Langsdorf, Mark
  2007-02-14  9:21     ` Jan Beulich
  2007-02-14 10:04     ` Keir Fraser
  0 siblings, 2 replies; 8+ messages in thread
From: Langsdorf, Mark @ 2007-02-13 23:19 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

> >--- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 
> 09 10:48:41 2007 +0000
> >+++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 
> 09 16:32:04 2007 -0600
> >@@ -252,7 +252,7 @@ static void contiguous_bitmap_clear(
> > }
> > 
> > /* Protected by balloon_lock. */
> >-#define MAX_CONTIG_ORDER 9 /* 2MB */
> >+#define MAX_CONTIG_ORDER 16 /* 256MB */
> > static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
> > static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
> 
> This seems dangerous to me.

We need at least 64MB of contiguous memory for the aperture.


> a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-ma
> pping.h	Fri Feb 09 10:48:41 2007 +0000
> >+++ 
> b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/dma-ma
> pping.h	Fri Feb 09 16:32:04 2007 -0600
> >@@ -62,7 +62,12 @@ static inline int valid_dma_direction(in
> > 		(dma_direction == DMA_FROM_DEVICE));
> > }
> > 
> >-#if 0
> >+#ifdef CONFIG_XEN
> >+#define global_need_iommu() 1
> >+#else
> >+#define global_need_iommu() 
> (HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL)>MAX_DMA32_PFN)
> >+#endif
> >+
> > static inline int dma_mapping_error(dma_addr_t dma_addr)
> > {
> > 	if (dma_ops->mapping_error)
> 
> HYPERVISOR_memory_op() if CONFIG_XEN is undefined?

Will fix.

> >--- a/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 
> 10:48:41 2007 +0000
> >+++ b/linux-2.6-xen-sparse/lib/Makefile	Fri Feb 09 
> 16:32:04 2007 -0600
> >@@ -51,8 +51,7 @@ obj-$(CONFIG_SMP) += percpu_counter.o
> > obj-$(CONFIG_SMP) += percpu_counter.o
> > obj-$(CONFIG_AUDIT_GENERIC) += audit.o
> > 
> >-obj-$(CONFIG_SWIOTLB) += swiotlb.o
> >-swiotlb-$(CONFIG_XEN) := ../arch/i386/kernel/swiotlb.o
> >+obj-$(CONFIG_SWIOTLB) += swiotlb-xen.o
> > 
> > hostprogs-y	:= gen_crc32table
> > clean-files	:= crc32table.h
> 
> This seems very unlikely to have been tested in a native 
> build. You should use cherry-pick-xen in the file.

Okay.

> I generally welcome moving arch/i386/kernel/swiotlb.c to 
> lib/swiotlb-xen.c, but would appreciate if you then also
> removed the original file (and perhaps this should be done
> as a separate patch, so that it'd be clear that the move
> itself doesn't change the file in any way (and if you need 
> changes to it for the IOMMU patch, have only those changes
> in the patch here).

I'll separate the reorg a bit and make it clearer.  Thanks for
looking at this.

-Mark Langsdorf
AMD, Inc.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] Enable K8 GART as an IOMMU
  2007-02-13 23:19   ` Langsdorf, Mark
@ 2007-02-14  9:21     ` Jan Beulich
  2007-02-14 10:04     ` Keir Fraser
  1 sibling, 0 replies; 8+ messages in thread
From: Jan Beulich @ 2007-02-14  9:21 UTC (permalink / raw)
  To: Mark Langsdorf, Keir Fraser; +Cc: xen-devel

>>> "Langsdorf, Mark" <mark.langsdorf@amd.com> 14.02.07 00:19 >>>
>> >--- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 
>> 09 10:48:41 2007 +0000
>> >+++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c	Fri Feb 
>> 09 16:32:04 2007 -0600
>> >@@ -252,7 +252,7 @@ static void contiguous_bitmap_clear(
>> > }
>> > 
>> > /* Protected by balloon_lock. */
>> >-#define MAX_CONTIG_ORDER 9 /* 2MB */
>> >+#define MAX_CONTIG_ORDER 16 /* 256MB */
>> > static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
>> > static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
>> 
>> This seems dangerous to me.
>
>We need at least 64MB of contiguous memory for the aperture.

I think this should then be established by some other means. 256MB
(and even the minimum of 64MB that you say you need) is, for
contiguous memory, almost as good as no limit at all. Among other
reservations I have: how do you intend to make sure the hypervisor
even has a chance of fulfilling a request this big?

But I would certainly like to know Keir's opinion here, too.

Jan

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Enable K8 GART as an IOMMU
  2007-02-13 23:19   ` Langsdorf, Mark
  2007-02-14  9:21     ` Jan Beulich
@ 2007-02-14 10:04     ` Keir Fraser
  2007-02-14 14:33       ` Uli Meis
  1 sibling, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2007-02-14 10:04 UTC (permalink / raw)
  To: Langsdorf, Mark, Jan Beulich; +Cc: xen-devel

On 13/2/07 23:19, "Langsdorf, Mark" <mark.langsdorf@amd.com> wrote:

>>> /* Protected by balloon_lock. */
>>> -#define MAX_CONTIG_ORDER 9 /* 2MB */
>>> +#define MAX_CONTIG_ORDER 16 /* 256MB */
>>> static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
>>> static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
>> 
>> This seems dangerous to me.
> 
> We need at least 64MB of contiguous memory for the aperture.

Not that I know anything much about the K8 GART, but I assume the aperture
is an address range that the GART takes control of and dynamically aliases
other RAM pages into? Is it necessary to burn 64MB of RAM (which is
presumably inaccessible when the GART is turned on)? Will the BIOS not
already have conveniently piointed the aperture into a RAM hole (e.g., just
below 4GB)?

 -- Keir

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Enable K8 GART as an IOMMU
  2007-02-14 10:04     ` Keir Fraser
@ 2007-02-14 14:33       ` Uli Meis
  2007-02-14 14:51         ` Keir Fraser
  0 siblings, 1 reply; 8+ messages in thread
From: Uli Meis @ 2007-02-14 14:33 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Langsdorf, Mark, Jan Beulich

On Wed 14.02.07 10:04, Keir Fraser wrote:
> On 13/2/07 23:19, "Langsdorf, Mark" <mark.langsdorf@amd.com> wrote:
> 
> >>> /* Protected by balloon_lock. */
> >>> -#define MAX_CONTIG_ORDER 9 /* 2MB */
> >>> +#define MAX_CONTIG_ORDER 16 /* 256MB */
> >>> static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
> >>> static multicall_entry_t cr_mcl[1<<MAX_CONTIG_ORDER];
> >> 
> >> This seems dangerous to me.
> > 
> > We need at least 64MB of contiguous memory for the aperture.
> 
> Not that I know anything much about the K8 GART, but I assume the aperture
> is an address range that the GART takes control of and dynamically aliases
> other RAM pages into? Is it necessary to burn 64MB of RAM (which is
> presumably inaccessible when the GART is turned on)? Will the BIOS not
> already have conveniently piointed the aperture into a RAM hole (e.g., just
> below 4GB)?

Usually, yes, the BIOS should allocate the aperture. However, on all
systems I've tested, the BIOS allocated only 32MB (probably because
they had no AGP). Sometimes it would even keep the memory location that
was set on the last boot, potentially pointing to usable RAM.

The patch will only allocate the aperture from RAM if the BIOS reserved
less than 64MB. In that case, it will also make the call to
xen_create_contiguous_region. The memory is lost. There's also a boot
message telling people that.
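
For reference, the BIOS-programmed aperture can be read back from the K8
northbridge (function 3 of device 0x18 for node 0). A rough sketch, assuming
the register layout that the native aperture.c uses:

	u32 ctl  = read_pci_config(0, 0x18, 3, 0x90);	/* aperture control */
	u32 base = read_pci_config(0, 0x18, 3, 0x94);	/* aperture base */
	u32 aper_size = (32 * 1024 * 1024) << ((ctl >> 1) & 7);
	u64 aper_base = ((u64)(base & 0x7fff)) << 25;

The 64MB check above would then be against aper_size.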

Concerning the availability of 64MB of contiguous RAM: if dom0_mem is
unspecified, the hypervisor keeps 1/16th of memory free, which on >4GB
systems (where one needs a GART) is at least 256MB. Therefore, the
allocation should always succeed unless someone is tweaking that
parameter.

Uli

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Enable K8 GART as an IOMMU
  2007-02-14 14:33       ` Uli Meis
@ 2007-02-14 14:51         ` Keir Fraser
  0 siblings, 0 replies; 8+ messages in thread
From: Keir Fraser @ 2007-02-14 14:51 UTC (permalink / raw)
  To: Uli Meis; +Cc: xen-devel, Langsdorf, Mark, Jan Beulich

On 14/2/07 14:33, "Uli Meis" <um@amd64.org> wrote:

> Usually, yes, the BIOS should allocate the aperture. However, on all
> systems I've tested on the BIOS allocated only 32MB (probably because
> they had no AGP). Sometimes it would even keep the memory location that
> was set on the last boot---potentially pointing to usable RAM.
> 
> The patch will only allocate the aperture from RAM if the BIOS reserved
> less than 64MB. In that case, it will also make the call to
> xen_create_contiguous_region. The memory is lost. There's also a boot
> mesage telling people that.

Might it typically be possible to find some free space in the hole just
below 4GB? I don't believe most BIOSes make an effort to keep the RAM
hole small.

Looking back at the particular approach you have taken, expanding the
static argument arrays for this one particular user is not really
desirable. It would be better to do a dynamic GFP_ATOMIC allocation in
the case that order exceeds MAX_CONTIG_ORDER. We want to avoid that in
the general case because GFP_ATOMIC allocations can fail, but this will
be happening at start of day when there should be no memory pressure.
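
A minimal sketch of that fallback, illustrative only and not taken from the
patch, assuming the frame and multicall arrays in hypervisor.c are what
would otherwise have to grow:

	static int alloc_exchange_arrays(unsigned int order,
					 unsigned long **frames,
					 multicall_entry_t **mcl)
	{
		/* Small requests keep using the static arrays. */
		if (order <= MAX_CONTIG_ORDER) {
			*frames = discontig_frames;
			*mcl = cr_mcl;
			return 0;
		}
		/* GFP_ATOMIC may fail, but at start of day there should be
		   no memory pressure. */
		*frames = (unsigned long *)__get_free_pages(GFP_ATOMIC,
			get_order((1UL << order) * sizeof(**frames)));
		*mcl = (multicall_entry_t *)__get_free_pages(GFP_ATOMIC,
			get_order((1UL << order) * sizeof(**mcl)));
		if (*frames && *mcl)
			return 0;
		/* free_pages() ignores a zero address, so partial failures
		   are cleaned up here. */
		free_pages((unsigned long)*frames,
			   get_order((1UL << order) * sizeof(**frames)));
		free_pages((unsigned long)*mcl,
			   get_order((1UL << order) * sizeof(**mcl)));
		return -ENOMEM;
	}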

 -- Keir

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2007-02-14 14:51 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-12 17:47 [PATCH] Enable K8 GART as an IOMMU Langsdorf, Mark
2007-02-13  8:47 ` Jan Beulich
2007-02-13 23:19   ` Langsdorf, Mark
2007-02-14  9:21     ` Jan Beulich
2007-02-14 10:04     ` Keir Fraser
2007-02-14 14:33       ` Uli Meis
2007-02-14 14:51         ` Keir Fraser
2007-02-13 13:42 ` Muli Ben-Yehuda
