* [PATCH 1/7] ARM: provide phys_to_page() to complement page_to_phys()
From: Russell King - ARM Linux @ 2009-11-20 18:01 UTC (permalink / raw)
  To: linux-arm-kernel
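
As an illustration (not part of the patch): the two new macros are
exact inverses of one another.  A minimal sketch, with 'page' standing
in for any valid struct page:

	/* hypothetical sanity check built on the new helpers */
	static void check_page_phys_round_trip(struct page *page)
	{
		unsigned long phys = page_to_phys(page); /* pfn << PAGE_SHIFT */

		BUG_ON(phys_to_page(phys) != page);	 /* and back again */
	}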

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/memory.h |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index cefedf0..e0f8f4a 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -134,6 +134,12 @@
 #define	__phys_to_pfn(paddr)	((paddr) >> PAGE_SHIFT)
 #define	__pfn_to_phys(pfn)	((pfn) << PAGE_SHIFT)
 
+/*
+ * Convert a page to/from a physical address
+ */
+#define page_to_phys(page)	(__pfn_to_phys(page_to_pfn(page)))
+#define phys_to_page(phys)	(pfn_to_page(__phys_to_pfn(phys)))
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -293,11 +299,6 @@ static inline __deprecated void *bus_to_virt(unsigned long x)
 #endif /* !CONFIG_DISCONTIGMEM */
 
 /*
- * For BIO.  "will die".  Kill me when bio_to_phys() and bvec_to_phys() die.
- */
-#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
-
-/*
  * Optional coherency support.  Currently used only by selected
  * Intel XSC3-based systems.
  */
-- 
1.6.2.5

* [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus()
From: Russell King - ARM Linux @ 2009-11-20 18:02 UTC (permalink / raw)
  To: linux-arm-kernel

The non-highmem and the __pfn_to_bus() based versions of page_to_dma()
both compile to the same code, so it's pointless having two different
implementations.  Use the __pfn_to_bus() based version.
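
For reference, the two forms being collapsed (illustrative, not part
of the patch; 'page' is a placeholder for any lowmem page):

	/* old !CONFIG_HIGHMEM variant: goes via the virtual address */
	dma_addr_t a = (dma_addr_t)__virt_to_bus((unsigned long)page_address(page));

	/* new variant: goes via the pfn, so also valid for highmem pages */
	dma_addr_t b = (dma_addr_t)__pfn_to_bus(page_to_pfn(page));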

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/dma-mapping.h |   10 ----------
 arch/arm/include/asm/memory.h      |    3 ++-
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index ff46dfa..5d78eb1 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -15,20 +15,10 @@
  * must not be used by drivers.
  */
 #ifndef __arch_page_to_dma
-
-#if !defined(CONFIG_HIGHMEM)
-static inline dma_addr_t page_to_dma(struct device *dev, struct page *page)
-{
-	return (dma_addr_t)__virt_to_bus((unsigned long)page_address(page));
-}
-#elif defined(__pfn_to_bus)
 static inline dma_addr_t page_to_dma(struct device *dev, struct page *page)
 {
 	return (dma_addr_t)__pfn_to_bus(page_to_pfn(page));
 }
-#else
-#error "this machine class needs to define __arch_page_to_dma to use HIGHMEM"
-#endif
 
 static inline void *dma_to_virt(struct device *dev, dma_addr_t addr)
 {
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index e0f8f4a..9099ada 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -200,7 +200,8 @@ static inline void *phys_to_virt(unsigned long x)
 #ifndef __virt_to_bus
 #define __virt_to_bus	__virt_to_phys
 #define __bus_to_virt	__phys_to_virt
-#define __pfn_to_bus(x)	((x) << PAGE_SHIFT)
+#define __pfn_to_bus(x)	__pfn_to_phys(x)
+#define __bus_to_pfn(x)	__phys_to_pfn(x)
 #endif
 
 static inline __deprecated unsigned long virt_to_bus(void *x)
-- 
1.6.2.5

* [PATCH 3/7] ARM: dma-mapping: provide dma_to_page()
From: Russell King - ARM Linux @ 2009-11-20 18:03 UTC (permalink / raw)
  To: linux-arm-kernel
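
An illustrative note (not part of the patch): dma_to_page() is the
inverse of page_to_dma(), both for the default implementation and for
each __arch_dma_to_page() added below.  A sketch, with 'dev' and
'page' as placeholders:

	dma_addr_t handle = page_to_dma(dev, page);

	BUG_ON(dma_to_page(dev, handle) != page);	/* round trip */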

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/dma-mapping.h          |   10 ++++++++++
 arch/arm/mach-iop13xx/include/mach/memory.h |    2 ++
 arch/arm/mach-ks8695/include/mach/memory.h  |    7 +++++++
 arch/arm/plat-omap/include/mach/memory.h    |    7 +++++++
 4 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 5d78eb1..f06d80c 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -20,6 +20,11 @@ static inline dma_addr_t page_to_dma(struct device *dev, struct page *page)
 	return (dma_addr_t)__pfn_to_bus(page_to_pfn(page));
 }
 
+static inline struct page *dma_to_page(struct device *dev, dma_addr_t addr)
+{
+	return pfn_to_page(__bus_to_pfn(addr));
+}
+
 static inline void *dma_to_virt(struct device *dev, dma_addr_t addr)
 {
 	return (void *)__bus_to_virt(addr);
@@ -35,6 +40,11 @@ static inline dma_addr_t page_to_dma(struct device *dev, struct page *page)
 	return __arch_page_to_dma(dev, page);
 }
 
+static inline struct page *dma_to_page(struct device *dev, dma_addr_t addr)
+{
+	return __arch_dma_to_page(dev, addr);
+}
+
 static inline void *dma_to_virt(struct device *dev, dma_addr_t addr)
 {
 	return __arch_dma_to_virt(dev, addr);
diff --git a/arch/arm/mach-iop13xx/include/mach/memory.h b/arch/arm/mach-iop13xx/include/mach/memory.h
index 42ae29b..25b1da9 100644
--- a/arch/arm/mach-iop13xx/include/mach/memory.h
+++ b/arch/arm/mach-iop13xx/include/mach/memory.h
@@ -64,6 +64,8 @@ static inline unsigned long __lbus_to_virt(dma_addr_t x)
 		(dma_addr_t)page_to_phys(page);				\
 	})
 
+#define __arch_dma_to_page(dev, addr)	phys_to_page(addr)
+
 #endif /* CONFIG_ARCH_IOP13XX */
 #endif /* !ASSEMBLY */
 
diff --git a/arch/arm/mach-ks8695/include/mach/memory.h b/arch/arm/mach-ks8695/include/mach/memory.h
index 76e5308..ffa19aa 100644
--- a/arch/arm/mach-ks8695/include/mach/memory.h
+++ b/arch/arm/mach-ks8695/include/mach/memory.h
@@ -41,6 +41,13 @@ extern struct bus_type platform_bus_type;
 		__dma = __dma - PHYS_OFFSET + KS8695_PCIMEM_PA; \
 	   __dma; })
 
+#define __arch_dma_to_page(dev, x)	\
+	({ dma_addr_t __dma = x;				\
+	   if (!is_lbus_device(dev))				\
+		__dma += PHYS_OFFSET - KS8695_PCIMEM_PA;	\
+	   phys_to_page(__dma);					\
+	})
+
 #endif
 
 #endif
diff --git a/arch/arm/plat-omap/include/mach/memory.h b/arch/arm/plat-omap/include/mach/memory.h
index 9ad41dc..3325f7b 100644
--- a/arch/arm/plat-omap/include/mach/memory.h
+++ b/arch/arm/plat-omap/include/mach/memory.h
@@ -68,6 +68,13 @@
 		__dma = __dma - PHYS_OFFSET + OMAP1510_LB_OFFSET; \
 	   __dma; })
 
+#define __arch_dma_to_page(dev, addr)	\
+	({ dma_addr_t __dma = addr;				\
+	   if (is_lbus_device(dev))				\
+		__dma += PHYS_OFFSET - OMAP1510_LB_OFFSET;	\
+	   phys_to_page(__dma);					\
+	})
+
 #define __arch_dma_to_virt(dev, addr)	({ (void *) (is_lbus_device(dev) ? \
 						lbus_to_virt(addr) : \
 						__phys_to_virt(addr)); })
-- 
1.6.2.5

* [PATCH 4/7] ARM: dma-mapping: split dma_unmap_page() from dma_unmap_single()
From: Russell King - ARM Linux @ 2009-11-20 18:04 UTC (permalink / raw)
  To: linux-arm-kernel

We will need to treat dma_unmap_page() differently from dma_unmap_single().
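
A sketch of the driver-visible pairing this prepares for (illustrative
only; 'dev' and 'page' are placeholders):

	dma_addr_t d = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
	/* ... DMA happens ... */
	dma_unmap_page(dev, d, PAGE_SIZE, DMA_FROM_DEVICE);

Once the two unmap paths diverge, a dma_map_page() must no longer be
torn down with dma_unmap_single().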

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/dmabounce.c        |   21 ++++++++++++++++++---
 arch/arm/include/asm/dma-mapping.h |    8 +++++---
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 734ac91..5a375e5 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -342,6 +342,22 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 }
 EXPORT_SYMBOL(dma_map_single);
 
+/*
+ * see if a mapped address was really a "safe" buffer and if so, copy
+ * the data from the safe buffer back to the unsafe buffer and free up
+ * the safe buffer.  (basically return things back to the way they
+ * should be)
+ */
+void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		enum dma_data_direction dir)
+{
+	dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n",
+		__func__, (void *) dma_addr, size, dir);
+
+	unmap_single(dev, dma_addr, size, dir);
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
 dma_addr_t dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir)
 {
@@ -366,8 +382,7 @@ EXPORT_SYMBOL(dma_map_page);
  * the safe buffer.  (basically return things back to the way they
  * should be)
  */
-
-void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+void dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
 		enum dma_data_direction dir)
 {
 	dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n",
@@ -375,7 +390,7 @@ void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 
 	unmap_single(dev, dma_addr, size, dir);
 }
-EXPORT_SYMBOL(dma_unmap_single);
+EXPORT_SYMBOL(dma_unmap_page);
 
 int dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,
 		unsigned long off, size_t sz, enum dma_data_direction dir)
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index f06d80c..a96300b 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -257,9 +257,11 @@ extern int dma_needs_bounce(struct device*, dma_addr_t, size_t);
  */
 extern dma_addr_t dma_map_single(struct device *, void *, size_t,
 		enum dma_data_direction);
+extern void dma_unmap_single(struct device *, dma_addr_t, size_t,
+		enum dma_data_direction);
 extern dma_addr_t dma_map_page(struct device *, struct page *,
 		unsigned long, size_t, enum dma_data_direction);
-extern void dma_unmap_single(struct device *, dma_addr_t, size_t,
+extern void dma_unmap_page(struct device *, dma_addr_t, size_t,
 		enum dma_data_direction);
 
 /*
@@ -352,7 +354,6 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 {
 	/* nothing to do */
 }
-#endif /* CONFIG_DMABOUNCE */
 
 /**
  * dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
@@ -371,8 +372,9 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir)
 {
-	dma_unmap_single(dev, handle, size, dir);
+	/* nothing to do */
 }
+#endif /* CONFIG_DMABOUNCE */
 
 /**
  * dma_sync_single_range_for_cpu
-- 
1.6.2.5

* [PATCH 5/7] ARM: dma-mapping: use the idea of buffer ownership
From: Russell King - ARM Linux @ 2009-11-20 18:05 UTC (permalink / raw)
  To: linux-arm-kernel

The DMA API has the notion of buffer ownership; make it explicit in the
ARM implementation of this API.  This gives us a set of hooks to allow
us to deal with CPU cache issues arising from non-cache coherent DMA.
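
For reference, the correspondence between API calls and the new
ownership hooks (a summary of the hunks below, not new behaviour):

	dma_map_single(), dma_sync_single_*_for_device()  ->  __dma_single_cpu_to_dev()
	dma_unmap_single(), dma_sync_single_*_for_cpu()   ->  __dma_single_dev_to_cpu()
	dma_map_page(), dma_sync_sg_for_device()          ->  __dma_page_cpu_to_dev()
	dma_unmap_page(), dma_sync_sg_for_cpu()           ->  __dma_page_dev_to_cpu()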

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/dmabounce.c        |    9 ++---
 arch/arm/include/asm/dma-mapping.h |   64 ++++++++++++++++++++++++++---------
 arch/arm/mm/dma-mapping.c          |   13 ++++---
 3 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 5a375e5..bc3e269 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -277,7 +277,7 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 		 * We don't need to sync the DMA buffer since
 		 * it was allocated via the coherent allocators.
 		 */
-		dma_cache_maint(ptr, size, dir);
+		__dma_single_cpu_to_dev(ptr, size, dir);
 	}
 
 	return dma_addr;
@@ -310,15 +310,14 @@ static inline void unmap_single(struct device *dev, dma_addr_t dma_addr,
 			/*
 			 * DMA buffers must have the same cache properties
 			 * as if they were really used for DMA - which means
-			 * data must be written back to RAM.  Note that
-			 * we don't use dmac_flush_range() here for the
-			 * bidirectional case because we know the cache
-			 * lines will be coherent with the data written.
+			 * data must be written back to RAM.
 			 */
 			dmac_clean_range(ptr, ptr + size);
 			outer_clean_range(__pa(ptr), __pa(ptr) + size);
 		}
 		free_safe_buffer(dev->archdata.dmabounce, buf);
+	} else {
+		__dma_single_dev_to_cpu(dma_to_virt(dev, dma_addr), size, dir);
 	}
 }
 
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index a96300b..e850f5c 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -57,20 +57,49 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr)
 #endif
 
 /*
- * DMA-consistent mapping functions.  These allocate/free a region of
- * uncached, unwrite-buffered mapped memory space for use with DMA
- * devices.  This is the "generic" version.  The PCI specific version
- * is in pci.h
- *
- * Note: Drivers should NOT use this function directly, as it will break
- * platforms with CONFIG_DMABOUNCE.
- * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
+ * Private support functions: these are not part of the API and are
+ * liable to change.  Drivers must not use these.
  */
 extern void dma_cache_maint(const void *kaddr, size_t size, int rw);
 extern void dma_cache_maint_page(struct page *page, unsigned long offset,
 				 size_t size, int rw);
 
 /*
+ * The DMA API is built upon the notion of "buffer ownership".  A buffer
+ * is either exclusively owned by the CPU (and therefore may be accessed
+ * by it) or exclusively owned by the DMA device.  These helper functions
+ * represent the transitions between these two ownership states.
+ *
+ * As above, these are private support functions and not part of the API.
+ * Drivers must not use these.
+ */
+static inline void __dma_single_cpu_to_dev(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{
+	if (!arch_is_coherent())
+		dma_cache_maint(kaddr, size, dir);
+}
+
+static inline void __dma_single_dev_to_cpu(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{
+	/* nothing to do */
+}
+
+static inline void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	if (!arch_is_coherent())
+		dma_cache_maint_page(page, off, size, dir);
+}
+
+static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	/* nothing to do */
+}
+
+/*
  * Return whether the given device DMA address mask can be supported
  * properly.  For example, if your device can only drive the low 24-bits
  * during bus mastering, then you would pass 0x00ffffff as the mask
@@ -304,8 +333,7 @@ static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
 {
 	BUG_ON(!valid_dma_direction(dir));
 
-	if (!arch_is_coherent())
-		dma_cache_maint(cpu_addr, size, dir);
+	__dma_single_cpu_to_dev(cpu_addr, size, dir);
 
 	return virt_to_dma(dev, cpu_addr);
 }
@@ -329,8 +357,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 {
 	BUG_ON(!valid_dma_direction(dir));
 
-	if (!arch_is_coherent())
-		dma_cache_maint_page(page, offset, size, dir);
+	__dma_page_cpu_to_dev(page, offset, size, dir);
 
 	return page_to_dma(dev, page) + offset;
 }
@@ -352,7 +379,7 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir)
 {
-	/* nothing to do */
+	__dma_single_dev_to_cpu(dma_to_virt(dev, handle), size, dir);
 }
 
 /**
@@ -372,7 +399,8 @@ static inline void dma_unmap_single(struct device *dev, dma_addr_t handle,
 static inline void dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir)
 {
-	/* nothing to do */
+	__dma_page_dev_to_cpu(dma_to_page(dev, handle), handle & ~PAGE_MASK,
+		size, dir);
 }
 #endif /* CONFIG_DMABOUNCE */
 
@@ -400,7 +428,10 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev,
 {
 	BUG_ON(!valid_dma_direction(dir));
 
-	dmabounce_sync_for_cpu(dev, handle, offset, size, dir);
+	if (!dmabounce_sync_for_cpu(dev, handle, offset, size, dir))
+		return;
+
+	__dma_single_dev_to_cpu(dma_to_virt(dev, handle) + offset, size, dir);
 }
 
 static inline void dma_sync_single_range_for_device(struct device *dev,
@@ -412,8 +443,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev,
 	if (!dmabounce_sync_for_device(dev, handle, offset, size, dir))
 		return;
 
-	if (!arch_is_coherent())
-		dma_cache_maint(dma_to_virt(dev, handle) + offset, size, dir);
+	__dma_single_cpu_to_dev(dma_to_virt(dev, handle) + offset, size, dir);
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index b9590a7..5bd2e0d 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -708,8 +708,12 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	int i;
 
 	for_each_sg(sg, s, nents, i) {
-		dmabounce_sync_for_cpu(dev, sg_dma_address(s), 0,
-					sg_dma_len(s), dir);
+		if (!dmabounce_sync_for_cpu(dev, sg_dma_address(s), 0,
+					    sg_dma_len(s), dir))
+			continue;
+
+		__dma_page_dev_to_cpu(sg_page(s), s->offset,
+				      s->length, dir);
 	}
 }
 EXPORT_SYMBOL(dma_sync_sg_for_cpu);
@@ -732,9 +736,8 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 					sg_dma_len(s), dir))
 			continue;
 
-		if (!arch_is_coherent())
-			dma_cache_maint_page(sg_page(s), s->offset,
-					     s->length, dir);
+		__dma_page_cpu_to_dev(sg_page(s), s->offset,
+				      s->length, dir);
 	}
 }
 EXPORT_SYMBOL(dma_sync_sg_for_device);
-- 
1.6.2.5

* [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses
From: Russell King - ARM Linux @ 2009-11-20 18:06 UTC (permalink / raw)
  To: linux-arm-kernel

Rather than handling all cache maintenance before DMA begins, we
also need to handle cache maintenance upon completion when we
have CPUs which prefetch speculatively.

This renames the dma_cache_maint*() functions, and changes their
parameters to indicate whether we are mapping a DMA buffer.  We
always clean DMA buffers when we map them, but avoid invalidating
them if we are DMA'ing to the device.
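
For a DMA_FROM_DEVICE buffer the resulting sequence is (an
illustrative timeline of the hunks below; 'buf' and 'handle' are
placeholders):

	handle = dma_map_single(dev, buf, size, DMA_FROM_DEVICE);
		/* clean: dirty lines written back before DMA starts */
	/* ... DMA in progress; the CPU may speculatively refill
	   lines covering buf ... */
	dma_unmap_single(dev, handle, size, DMA_FROM_DEVICE);
		/* invalidate: any speculatively fetched stale lines
		   are discarded before the CPU reads the buffer */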

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/dma-mapping.h |   25 +++++++++++++-----
 arch/arm/mm/dma-mapping.c          |   48 +++++++++++------------------------
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index e850f5c..5a86105 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -60,9 +60,9 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr)
  * Private support functions: these are not part of the API and are
  * liable to change.  Drivers must not use these.
  */
-extern void dma_cache_maint(const void *kaddr, size_t size, int rw);
-extern void dma_cache_maint_page(struct page *page, unsigned long offset,
-				 size_t size, int rw);
+extern void __dma_cache_maint(const void *kaddr, size_t size, int map);
+extern void __dma_cache_maint_page(struct page *page, unsigned long offset,
+				   size_t size, int map);
 
 /*
  * The DMA API is built upon the notion of "buffer ownership".  A buffer
@@ -70,6 +70,11 @@ extern void dma_cache_maint_page(struct page *page, unsigned long offset,
  * by it) or exclusively owned by the DMA device.  These helper functions
  * represent the transitions between these two ownership states.
  *
+ * Note, however, that on later ARMs, this notion does not work due to
+ * speculative prefetches.  We model our approach on the assumption that
+ * the CPU does do speculative prefetches, which means we clean caches
+ * before transfers and delay cache invalidation until transfer completion.
+ *
  * As above, these are private support functions and not part of the API.
  * Drivers must not use these.
  */
@@ -77,26 +82,32 @@ static inline void __dma_single_cpu_to_dev(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
 	if (!arch_is_coherent())
-		dma_cache_maint(kaddr, size, dir);
+		__dma_cache_maint(kaddr, size, 1);
 }
 
 static inline void __dma_single_dev_to_cpu(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
-	/* nothing to do */
+	if (!arch_is_coherent())
+		/* don't bother invalidating if DMA to device */
+		if (dir != DMA_TO_DEVICE)
+			__dma_cache_maint(kaddr, size, 0);
 }
 
 static inline void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
 	if (!arch_is_coherent())
-		dma_cache_maint_page(page, off, size, dir);
+		__dma_cache_maint_page(page, off, size, 1);
 }
 
 static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
-	/* nothing to do */
+	if (!arch_is_coherent())
+		/* don't bother invalidating if DMA to device */
+		if (dir != DMA_TO_DEVICE)
+			__dma_cache_maint_page(page, off, size, 0);
 }
 
 /*
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 5bd2e0d..4193904 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -539,58 +539,40 @@ core_initcall(consistent_init);
  * platforms with CONFIG_DMABOUNCE.
  * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
  */
-void dma_cache_maint(const void *start, size_t size, int direction)
+void __dma_cache_maint(const void *start, size_t size, int map)
 {
 	void (*inner_op)(const void *, const void *);
 	void (*outer_op)(unsigned long, unsigned long);
 
 	BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(start + size - 1));
 
-	switch (direction) {
-	case DMA_FROM_DEVICE:		/* invalidate only */
-		inner_op = dmac_inv_range;
-		outer_op = outer_inv_range;
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
+	if (map) {		/* writeback only */
 		inner_op = dmac_clean_range;
 		outer_op = outer_clean_range;
-		break;
-	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
-		inner_op = dmac_flush_range;
-		outer_op = outer_flush_range;
-		break;
-	default:
-		BUG();
+	} else {		/* invalidate only */
+		inner_op = dmac_inv_range;
+		outer_op = outer_inv_range;
 	}
 
 	inner_op(start, start + size);
 	outer_op(__pa(start), __pa(start) + size);
 }
-EXPORT_SYMBOL(dma_cache_maint);
+EXPORT_SYMBOL(__dma_cache_maint);
 
 static void dma_cache_maint_contiguous(struct page *page, unsigned long offset,
-				       size_t size, int direction)
+				       size_t size, int map)
 {
 	void *vaddr;
 	unsigned long paddr;
 	void (*inner_op)(const void *, const void *);
 	void (*outer_op)(unsigned long, unsigned long);
 
-	switch (direction) {
-	case DMA_FROM_DEVICE:		/* invalidate only */
-		inner_op = dmac_inv_range;
-		outer_op = outer_inv_range;
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
+	if (map) {		/* writeback only */
 		inner_op = dmac_clean_range;
 		outer_op = outer_clean_range;
-		break;
-	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
-		inner_op = dmac_flush_range;
-		outer_op = outer_flush_range;
-		break;
-	default:
-		BUG();
+	} else {		/* invalidate only */
+		inner_op = dmac_inv_range;
+		outer_op = outer_inv_range;
 	}
 
 	if (!PageHighMem(page)) {
@@ -609,8 +591,8 @@ static void dma_cache_maint_contiguous(struct page *page, unsigned long offset,
 	outer_op(paddr, paddr + size);
 }
 
-void dma_cache_maint_page(struct page *page, unsigned long offset,
-			  size_t size, int dir)
+void __dma_cache_maint_page(struct page *page, unsigned long offset,
+	size_t size, int map)
 {
 	/*
 	 * A single sg entry may refer to multiple physically contiguous
@@ -628,13 +610,13 @@ void dma_cache_maint_page(struct page *page, unsigned long offset,
 			}
 			len = PAGE_SIZE - offset;
 		}
-		dma_cache_maint_contiguous(page, offset, len, dir);
+		dma_cache_maint_contiguous(page, offset, len, map);
 		offset = 0;
 		page++;
 		left -= len;
 	} while (left);
 }
-EXPORT_SYMBOL(dma_cache_maint_page);
+EXPORT_SYMBOL(__dma_cache_maint_page);
 
 /**
  * dma_map_sg - map a set of SG buffers for streaming mode DMA
-- 
1.6.2.5

* [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
From: Russell King - ARM Linux @ 2009-11-20 18:07 UTC (permalink / raw)
  To: linux-arm-kernel

Since we now clean the DMA buffers on map, there's no need to clean
overlapping cache lines on invalidation anymore.  (Note: the DMA API
prohibits other data sharing the same cache line as a DMA buffer
anyway.)
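
For background (not part of the patch), the removed prologues handled
buffers whose first/last cache lines are shared with other data:

	|<- line ->|<- line ->|<- line ->|
	     [====== DMA buffer ======]
	     ^ partial first line   ^ partial last line

The old dma_inv_range entry points cleaned those partial edge lines
before invalidating so that neighbouring data was not destroyed.  With
map-time cleaning in place, and the DMA API rule quoted above, that
edge cleaning (along with write-buffer drains that were only needed
because of those cleans) can go.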

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-fa.S      |    5 -----
 arch/arm/mm/cache-v4wb.S    |    5 -----
 arch/arm/mm/cache-v6.S      |   13 -------------
 arch/arm/mm/cache-v7.S      |    6 ------
 arch/arm/mm/proc-arm1020.S  |    8 --------
 arch/arm/mm/proc-arm1020e.S |    4 ----
 arch/arm/mm/proc-arm1022.S  |    4 ----
 arch/arm/mm/proc-arm1026.S  |    4 ----
 arch/arm/mm/proc-arm920.S   |    4 ----
 arch/arm/mm/proc-arm922.S   |    4 ----
 arch/arm/mm/proc-arm925.S   |    6 ------
 arch/arm/mm/proc-arm926.S   |    6 ------
 arch/arm/mm/proc-arm946.S   |    7 -------
 arch/arm/mm/proc-feroceon.S |    4 ----
 arch/arm/mm/proc-mohawk.S   |    4 ----
 arch/arm/mm/proc-xsc3.S     |    4 ----
 arch/arm/mm/proc-xscale.S   |    4 ----
 17 files changed, 0 insertions(+), 92 deletions(-)

diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index b63a8f7..1711386 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -157,12 +157,7 @@ ENTRY(fa_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(fa_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	bic	r1, r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2ebc1b3..553931a 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -173,16 +173,11 @@ ENTRY(v4wb_coherent_user_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4wb_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 295e25d..d1dfd87 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -195,20 +195,7 @@ ENTRY(v6_flush_kern_dcache_page)
  *	- end     - virtual end address of region
  */
 ENTRY(v6_dma_inv_range)
-	tst	r0, #D_CACHE_LINE_SIZE - 1
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D line
-#else
-	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
-#endif
-	tst	r1, #D_CACHE_LINE_SIZE - 1
-	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
-#else
-	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
-#endif
 1:
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index e1bd975..893ee59 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -218,13 +218,7 @@ ENDPROC(v7_flush_kern_dcache_page)
 ENTRY(v7_dma_inv_range)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
-	tst	r0, r3
 	bic	r0, r0, r3
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
-
-	tst	r1, r3
-	bic	r1, r1, r3
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D / U line
 1:
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D / U line
 	add	r0, r0, r2
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index d9fb4b9..7bbf624 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -267,15 +267,7 @@ ENTRY(arm1020_flush_kern_dcache_page)
 ENTRY(arm1020_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index 7453b75..d379cb7 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -260,11 +260,7 @@ ENTRY(arm1020e_flush_kern_dcache_page)
 ENTRY(arm1020e_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index 8eb72d7..f5a7949 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -249,11 +249,7 @@ ENTRY(arm1022_flush_kern_dcache_page)
 ENTRY(arm1022_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 3b59f0d..1dc26f8 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -243,11 +243,7 @@ ENTRY(arm1026_flush_kern_dcache_page)
 ENTRY(arm1026_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 2b7c197..078a873 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -239,11 +239,7 @@ ENTRY(arm920_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm920_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index 06a1aa4..22ca857 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -241,11 +241,7 @@ ENTRY(arm922_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm922_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index cb53435..ff04299 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -283,12 +283,6 @@ ENTRY(arm925_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm925_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 1c48487..4b4c717 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -246,12 +246,6 @@ ENTRY(arm926_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm926_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 40c0449..589a61c 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -215,18 +215,11 @@ ENTRY(arm946_flush_kern_dcache_page)
  * (same as arm926)
  */
 ENTRY(arm946_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index d0d7795..b2f264e 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -274,11 +274,7 @@ ENTRY(feroceon_range_flush_kern_dcache_page)
  */
 	.align	5
 ENTRY(feroceon_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 52b5fd7..191ea6d 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -218,10 +218,6 @@ ENTRY(mohawk_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(mohawk_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 2028f37..2c1ac69 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -257,11 +257,7 @@ ENTRY(xsc3_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xsc3_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean L1 D line
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean L1 D line
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate L1 D line
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index f056c28..3170348 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -315,11 +315,7 @@ ENTRY(xscale_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xscale_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
-- 
1.6.2.5

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
From: Russell King - ARM Linux @ 2009-11-20 18:25 UTC (permalink / raw)
  To: linux-arm-kernel

An equivalent patch has been tested by a couple of folk who now
report success.  Here's the refined patch set.  Acks, further
testing, and tested-by stuff welcome.

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
From: Jamie Iles @ 2009-11-21 14:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, Nov 20, 2009 at 06:25:39PM +0000, Russell King - ARM Linux wrote:
> An equivalent patch has been tested by a couple of folk who now
> report success.  Here's the refined patch set.  Acks, further
> testing, and tested-by stuff welcome.
Tested against 2.6.32-rc8 on an ARM1176JZ-S platform and this fixes the issues
that we were seeing.

Tested-by: Jamie Iles <jamie@jamieiles.com>

Jamie

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
From: Rajanikanth H.V @ 2009-11-21 17:56 UTC (permalink / raw)
  To: linux-arm-kernel

Russell,
I think you have missed pasting the refined patch; can you please
resend it?

thanks,
Rajanikanth

On Fri, Nov 20, 2009 at 11:55 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> An equivalent patch has been tested by a couple of folk who now
> report success.  Here's the refined patch set.  Acks, further
> testing, and tested-by stuff welcome.
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
From: Russell King - ARM Linux @ 2009-11-21 18:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, Nov 21, 2009 at 11:26:18PM +0530, Rajanikanth H.V wrote:
> Russell,
> I think you have missed pasting the refined patch; can you please
> resend it?

You appear to have missed reading the word "set".  Patches 1 to 7
are the refined patch set.

* [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support
From: Russell King - ARM Linux @ 2009-11-21 19:35 UTC (permalink / raw)
  To: linux-arm-kernel

Here are the follow-on patches for cleaning up the DMA support code and
arranging for coherent DMA allocations to be mapped using the ARMv7
'memory' attribute.

* [PATCH 01/10] ARM: dma-mapping: split out vmregion code from dma coherent mapping code
From: Russell King - ARM Linux @ 2009-11-21 19:37 UTC (permalink / raw)
  To: linux-arm-kernel
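
A usage sketch of the extracted interface (illustrative only;
MY_START/MY_END are placeholder bounds - the DMA code itself uses
CONSISTENT_BASE/CONSISTENT_END):

	static struct arm_vmregion_head head = {
		.vm_lock  = __SPIN_LOCK_UNLOCKED(&head.vm_lock),
		.vm_list  = LIST_HEAD_INIT(head.vm_list),
		.vm_start = MY_START,
		.vm_end   = MY_END,
	};

	struct arm_vmregion *c = arm_vmregion_alloc(&head, size, GFP_KERNEL);
	if (c) {
		/* [c->vm_start, c->vm_end) is reserved until freed */
		arm_vmregion_free(&head,
			arm_vmregion_find_remove(&head, c->vm_start));
	}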

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/Makefile      |    2 +-
 arch/arm/mm/dma-mapping.c |  132 +++++----------------------------------------
 arch/arm/mm/vmregion.c    |  131 ++++++++++++++++++++++++++++++++++++++++++++
 arch/arm/mm/vmregion.h    |   29 ++++++++++
 4 files changed, 174 insertions(+), 120 deletions(-)
 create mode 100644 arch/arm/mm/vmregion.c
 create mode 100644 arch/arm/mm/vmregion.h

diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 055cb2a..42352e7 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -6,7 +6,7 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   iomap.o
 
 obj-$(CONFIG_MMU)		+= fault-armv.o flush.o ioremap.o mmap.o \
-				   pgd.o mmu.o
+				   pgd.o mmu.o vmregion.o
 
 ifneq ($(CONFIG_MMU),y)
 obj-y				+= nommu.o
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 4193904..536dfb2 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -68,106 +68,16 @@ static u64 get_coherent_dma_mask(struct device *dev)
  * These are the page tables (2MB each) covering uncached, DMA consistent allocations
  */
 static pte_t *consistent_pte[NUM_CONSISTENT_PTES];
-static DEFINE_SPINLOCK(consistent_lock);
 
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- *  struct vm_struct {
- *    struct vm_region	region;
- *    unsigned long	flags;
- *    struct page	**pages;
- *    unsigned int	nr_pages;
- *    unsigned long	phys_addr;
- *  };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- *  struct vm_region vmalloc_head = {
- *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
- *	.vm_start	= VMALLOC_START,
- *	.vm_end		= VMALLOC_END,
- *  };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct arm_vm_region {
-	struct list_head	vm_list;
-	unsigned long		vm_start;
-	unsigned long		vm_end;
-	struct page		*vm_pages;
-	int			vm_active;
-};
+#include "vmregion.h"
 
-static struct arm_vm_region consistent_head = {
+static struct arm_vmregion_head consistent_head = {
+	.vm_lock	= __SPIN_LOCK_UNLOCKED(&consistent_head.vm_lock),
 	.vm_list	= LIST_HEAD_INIT(consistent_head.vm_list),
 	.vm_start	= CONSISTENT_BASE,
 	.vm_end		= CONSISTENT_END,
 };
 
-static struct arm_vm_region *
-arm_vm_region_alloc(struct arm_vm_region *head, size_t size, gfp_t gfp)
-{
-	unsigned long addr = head->vm_start, end = head->vm_end - size;
-	unsigned long flags;
-	struct arm_vm_region *c, *new;
-
-	new = kmalloc(sizeof(struct arm_vm_region), gfp);
-	if (!new)
-		goto out;
-
-	spin_lock_irqsave(&consistent_lock, flags);
-
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if ((addr + size) < addr)
-			goto nospc;
-		if ((addr + size) <= c->vm_start)
-			goto found;
-		addr = c->vm_end;
-		if (addr > end)
-			goto nospc;
-	}
-
- found:
-	/*
-	 * Insert this entry _before_ the one we found.
-	 */
-	list_add_tail(&new->vm_list, &c->vm_list);
-	new->vm_start = addr;
-	new->vm_end = addr + size;
-	new->vm_active = 1;
-
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	return new;
-
- nospc:
-	spin_unlock_irqrestore(&consistent_lock, flags);
-	kfree(new);
- out:
-	return NULL;
-}
-
-static struct arm_vm_region *arm_vm_region_find(struct arm_vm_region *head, unsigned long addr)
-{
-	struct arm_vm_region *c;
-	
-	list_for_each_entry(c, &head->vm_list, vm_list) {
-		if (c->vm_active && c->vm_start == addr)
-			goto out;
-	}
-	c = NULL;
- out:
-	return c;
-}
-
 #ifdef CONFIG_HUGETLB_PAGE
 #error ARM Coherent DMA allocator does not (yet) support huge TLB
 #endif
@@ -177,7 +87,7 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	    pgprot_t prot)
 {
 	struct page *page;
-	struct arm_vm_region *c;
+	struct arm_vmregion *c;
 	unsigned long order;
 	u64 mask = get_coherent_dma_mask(dev);
 	u64 limit;
@@ -191,13 +101,9 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	if (!mask)
 		goto no_page;
 
-	/*
-	 * Sanity check the allocation size.
-	 */
 	size = PAGE_ALIGN(size);
 	limit = (mask + 1) & ~mask;
-	if ((limit && size >= limit) ||
-	    size >= (CONSISTENT_END - CONSISTENT_BASE)) {
+	if (limit && size >= limit) {
 		printk(KERN_WARNING "coherent allocation too big "
 		       "(requested %#x mask %#llx)\n", size, mask);
 		goto no_page;
@@ -226,7 +132,7 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	/*
 	 * Allocate a virtual address in the consistent mapping region.
 	 */
-	c = arm_vm_region_alloc(&consistent_head, size,
+	c = arm_vmregion_alloc(&consistent_head, size,
 			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
 	if (c) {
 		pte_t *pte;
@@ -349,15 +255,12 @@ static int dma_mmap(struct device *dev, struct vm_area_struct *vma,
 {
 	int ret = -ENXIO;
 #ifdef CONFIG_MMU
-	unsigned long flags, user_size, kern_size;
-	struct arm_vm_region *c;
+	unsigned long user_size, kern_size;
+	struct arm_vmregion *c;
 
 	user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 
-	spin_lock_irqsave(&consistent_lock, flags);
-	c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr);
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
+	c = arm_vmregion_find(&consistent_head, (unsigned long)cpu_addr);
 	if (c) {
 		unsigned long off = vma->vm_pgoff;
 
@@ -399,8 +302,8 @@ EXPORT_SYMBOL(dma_mmap_writecombine);
 #ifdef CONFIG_MMU
 void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
 {
-	struct arm_vm_region *c;
-	unsigned long flags, addr;
+	struct arm_vmregion *c;
+	unsigned long addr;
 	pte_t *ptep;
 	int idx;
 	u32 off;
@@ -417,14 +320,10 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 
 	size = PAGE_ALIGN(size);
 
-	spin_lock_irqsave(&consistent_lock, flags);
-	c = arm_vm_region_find(&consistent_head, (unsigned long)cpu_addr);
+	c = arm_vmregion_find_remove(&consistent_head, (unsigned long)cpu_addr);
 	if (!c)
 		goto no_area;
 
-	c->vm_active = 0;
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
 	if ((c->vm_end - c->vm_start) != size) {
 		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
 		       __func__, c->vm_end - c->vm_start, size);
@@ -470,15 +369,10 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 
 	flush_tlb_kernel_range(c->vm_start, c->vm_end);
 
-	spin_lock_irqsave(&consistent_lock, flags);
-	list_del(&c->vm_list);
-	spin_unlock_irqrestore(&consistent_lock, flags);
-
-	kfree(c);
+	arm_vmregion_free(&consistent_head, c);
 	return;
 
  no_area:
-	spin_unlock_irqrestore(&consistent_lock, flags);
 	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
 	       __func__, cpu_addr);
 	dump_stack();
diff --git a/arch/arm/mm/vmregion.c b/arch/arm/mm/vmregion.c
new file mode 100644
index 0000000..19e09bd
--- /dev/null
+++ b/arch/arm/mm/vmregion.c
@@ -0,0 +1,131 @@
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include "vmregion.h"
+
+/*
+ * VM region handling support.
+ *
+ * This should become something generic, handling VM region allocations for
+ * vmalloc and similar (ioremap, module space, etc).
+ *
+ * I envisage vmalloc()'s supporting vm_struct becoming:
+ *
+ *  struct vm_struct {
+ *    struct vmregion	region;
+ *    unsigned long	flags;
+ *    struct page	**pages;
+ *    unsigned int	nr_pages;
+ *    unsigned long	phys_addr;
+ *  };
+ *
+ * get_vm_area() would then call vmregion_alloc with an appropriate
+ * struct vmregion head (eg):
+ *
+ *  struct vmregion vmalloc_head = {
+ *	.vm_list	= LIST_HEAD_INIT(vmalloc_head.vm_list),
+ *	.vm_start	= VMALLOC_START,
+ *	.vm_end		= VMALLOC_END,
+ *  };
+ *
+ * However, vmalloc_head.vm_start is variable (typically, it is dependent on
+ * the amount of RAM found at boot time.)  I would imagine that get_vm_area()
+ * would have to initialise this each time prior to calling vmregion_alloc().
+ */
+
+struct arm_vmregion *
+arm_vmregion_alloc(struct arm_vmregion_head *head, size_t size, gfp_t gfp)
+{
+	unsigned long addr = head->vm_start, end = head->vm_end - size;
+	unsigned long flags;
+	struct arm_vmregion *c, *new;
+
+	if (head->vm_end - head->vm_start < size) {
+		printk(KERN_WARNING "%s: allocation too big (requested %#x)\n",
+			__func__, size);
+		goto out;
+	}
+
+	new = kmalloc(sizeof(struct arm_vmregion), gfp);
+	if (!new)
+		goto out;
+
+	spin_lock_irqsave(&head->vm_lock, flags);
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if ((addr + size) < addr)
+			goto nospc;
+		if ((addr + size) <= c->vm_start)
+			goto found;
+		addr = c->vm_end;
+		if (addr > end)
+			goto nospc;
+	}
+
+ found:
+	/*
+	 * Insert this entry _before_ the one we found.
+	 */
+	list_add_tail(&new->vm_list, &c->vm_list);
+	new->vm_start = addr;
+	new->vm_end = addr + size;
+	new->vm_active = 1;
+
+	spin_unlock_irqrestore(&head->vm_lock, flags);
+	return new;
+
+ nospc:
+	spin_unlock_irqrestore(&head->vm_lock, flags);
+	kfree(new);
+ out:
+	return NULL;
+}
+
+static struct arm_vmregion *__arm_vmregion_find(struct arm_vmregion_head *head, unsigned long addr)
+{
+	struct arm_vmregion *c;
+
+	list_for_each_entry(c, &head->vm_list, vm_list) {
+		if (c->vm_active && c->vm_start == addr)
+			goto out;
+	}
+	c = NULL;
+ out:
+	return c;
+}
+
+struct arm_vmregion *arm_vmregion_find(struct arm_vmregion_head *head, unsigned long addr)
+{
+	struct arm_vmregion *c;
+	unsigned long flags;
+
+	spin_lock_irqsave(&head->vm_lock, flags);
+	c = __arm_vmregion_find(head, addr);
+	spin_unlock_irqrestore(&head->vm_lock, flags);
+	return c;
+}
+
+struct arm_vmregion *arm_vmregion_find_remove(struct arm_vmregion_head *head, unsigned long addr)
+{
+	struct arm_vmregion *c;
+	unsigned long flags;
+
+	spin_lock_irqsave(&head->vm_lock, flags);
+	c = __arm_vmregion_find(head, addr);
+	if (c)
+		c->vm_active = 0;
+	spin_unlock_irqrestore(&head->vm_lock, flags);
+	return c;
+}
+
+void arm_vmregion_free(struct arm_vmregion_head *head, struct arm_vmregion *c)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&head->vm_lock, flags);
+	list_del(&c->vm_list);
+	spin_unlock_irqrestore(&head->vm_lock, flags);
+
+	kfree(c);
+}
diff --git a/arch/arm/mm/vmregion.h b/arch/arm/mm/vmregion.h
new file mode 100644
index 0000000..6b2cdbd
--- /dev/null
+++ b/arch/arm/mm/vmregion.h
@@ -0,0 +1,29 @@
+#ifndef VMREGION_H
+#define VMREGION_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+
+struct page;
+
+struct arm_vmregion_head {
+	spinlock_t		vm_lock;
+	struct list_head	vm_list;
+	unsigned long		vm_start;
+	unsigned long		vm_end;
+};
+
+struct arm_vmregion {
+	struct list_head	vm_list;
+	unsigned long		vm_start;
+	unsigned long		vm_end;
+	struct page		*vm_pages;
+	int			vm_active;
+};
+
+struct arm_vmregion *arm_vmregion_alloc(struct arm_vmregion_head *, size_t, gfp_t);
+struct arm_vmregion *arm_vmregion_find(struct arm_vmregion_head *, unsigned long);
+struct arm_vmregion *arm_vmregion_find_remove(struct arm_vmregion_head *, unsigned long);
+void arm_vmregion_free(struct arm_vmregion_head *, struct arm_vmregion *);
+
+#endif
-- 
1.6.2.5

* [PATCH 02/10] ARM: dma-mapping: functions to allocate/free a coherent buffer
From: Russell King - ARM Linux @ 2009-11-21 19:37 UTC (permalink / raw)
  To: linux-arm-kernel

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |  110 +++++++++++++++++++++++++++------------------
 1 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 536dfb2..a39ec89 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -63,6 +63,68 @@ static u64 get_coherent_dma_mask(struct device *dev)
 	return mask;
 }
 
+/*
+ * Allocate a DMA buffer for 'dev' of size 'size' using the
+ * specified gfp mask.  Note that 'size' must be page aligned.
+ */
+static struct page *__dma_alloc_buffer(struct device *dev, size_t size, gfp_t gfp)
+{
+	unsigned long order = get_order(size);
+	struct page *page, *p, *e;
+	void *ptr;
+	u64 mask = get_coherent_dma_mask(dev);
+
+#ifdef CONFIG_DMA_API_DEBUG
+	u64 limit = (mask + 1) & ~mask;
+	if (limit && size >= limit) {
+		dev_warn(dev, "coherent allocation too big (requested %#x mask %#llx)\n",
+			size, mask);
+		return NULL;
+	}
+#endif
+
+	if (!mask)
+		return NULL;
+
+	if (mask < 0xffffffffULL)
+		gfp |= GFP_DMA;
+
+	page = alloc_pages(gfp, order);
+	if (!page)
+		return NULL;
+
+	/*
+	 * Now split the huge page and free the excess pages
+	 */
+	split_page(page, order);
+	for (p = page + (size >> PAGE_SHIFT), e = page + (1 << order); p < e; p++)
+		__free_page(p);
+
+	/*
+	 * Ensure that the allocated pages are zeroed, and that any data
+	 * lurking in the kernel direct-mapped region is invalidated.
+	 */
+	ptr = page_address(page);
+	memset(ptr, 0, size);
+	dmac_flush_range(ptr, ptr + size);
+	outer_flush_range(__pa(ptr), __pa(ptr) + size);
+
+	return page;
+}
+
+/*
+ * Free a DMA buffer.  'size' must be page aligned.
+ */
+static void __dma_free_buffer(struct page *page, size_t size)
+{
+	struct page *e = page + (size >> PAGE_SHIFT);
+
+	while (page < e) {
+		__free_page(page);
+		page++;
+	}
+}
+
 #ifdef CONFIG_MMU
 /*
  * These are the page tables (2MB each) covering uncached, DMA consistent allocations
@@ -88,9 +150,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 {
 	struct page *page;
 	struct arm_vmregion *c;
-	unsigned long order;
-	u64 mask = get_coherent_dma_mask(dev);
-	u64 limit;
 
 	if (!consistent_pte[0]) {
 		printk(KERN_ERR "%s: not initialised\n", __func__);
@@ -98,53 +157,25 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		return NULL;
 	}
 
-	if (!mask)
-		goto no_page;
-
 	size = PAGE_ALIGN(size);
-	limit = (mask + 1) & ~mask;
-	if (limit && size >= limit) {
-		printk(KERN_WARNING "coherent allocation too big "
-		       "(requested %#x mask %#llx)\n", size, mask);
-		goto no_page;
-	}
-
-	order = get_order(size);
-
-	if (mask < 0xffffffffULL)
-		gfp |= GFP_DMA;
 
-	page = alloc_pages(gfp, order);
+	page = __dma_alloc_buffer(dev, size, gfp);
 	if (!page)
 		goto no_page;
 
 	/*
-	 * Invalidate any data that might be lurking in the
-	 * kernel direct-mapped region for device DMA.
-	 */
-	{
-		void *ptr = page_address(page);
-		memset(ptr, 0, size);
-		dmac_flush_range(ptr, ptr + size);
-		outer_flush_range(__pa(ptr), __pa(ptr) + size);
-	}
-
-	/*
 	 * Allocate a virtual address in the consistent mapping region.
 	 */
 	c = arm_vmregion_alloc(&consistent_head, size,
 			    gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
 	if (c) {
 		pte_t *pte;
-		struct page *end = page + (1 << order);
 		int idx = CONSISTENT_PTE_INDEX(c->vm_start);
 		u32 off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1);
 
 		pte = consistent_pte[idx] + off;
 		c->vm_pages = page;
 
-		split_page(page, order);
-
 		/*
 		 * Set the "dma handle"
 		 */
@@ -167,19 +198,11 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 			}
 		} while (size -= PAGE_SIZE);
 
-		/*
-		 * Free the otherwise unused pages.
-		 */
-		while (page < end) {
-			__free_page(page);
-			page++;
-		}
-
 		return (void *)c->vm_start;
 	}
 
 	if (page)
-		__free_pages(page, order);
+		__dma_free_buffer(page, size);
  no_page:
 	*handle = ~0;
 	return NULL;
@@ -357,12 +380,9 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 				 * x86 does not mark the pages reserved...
 				 */
 				ClearPageReserved(page);
-
-				__free_page(page);
 				continue;
 			}
 		}
-
 		printk(KERN_CRIT "%s: bad page in kernel page table\n",
 		       __func__);
 	} while (size -= PAGE_SIZE);
@@ -370,6 +390,8 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 	flush_tlb_kernel_range(c->vm_start, c->vm_end);
 
 	arm_vmregion_free(&consistent_head, c);
+
+	__dma_free_buffer(dma_to_page(dev, handle), size);
 	return;
 
  no_area:
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 03/10] ARM: dma-mapping: fix coherent arch dma_alloc_coherent()
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
  2009-11-21 19:37   ` [PATCH 01/10] ARM: dma-mapping: split out vmregion code from dma coherent mapping code Russell King - ARM Linux
  2009-11-21 19:37   ` [PATCH 02/10] ARM: dma-mapping: functions to allocate/free a coherent buffer Russell King - ARM Linux
@ 2009-11-21 19:37   ` Russell King - ARM Linux
  2009-11-21 19:38   ` [PATCH 04/10] ARM: dma-mapping: fix nommu dma_alloc_coherent() Russell King - ARM Linux
                     ` (7 subsequent siblings)
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:37 UTC (permalink / raw)
  To: linux-arm-kernel

The coherent architecture dma_alloc_coherent was using kmalloc/kfree to
manage the memory.  dma_alloc_coherent() is expected to work with a
granularity of a page, so this is wrong.  Fix it by using the helper
functions now provided.
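
For illustration, the change in granularity (condensed from the hunk
below; error handling omitted, and __dma_alloc_buffer() /
page_to_dma() are the helpers introduced by the preceding patches):

	/* before: sub-page granularity; the buffer may share
	 * cache lines with unrelated data */
	void *virt = kmalloc(size, gfp);
	*handle = virt_to_dma(dev, virt);
	return virt;

	/* after: whole pages, as the DMA API requires */
	struct page *page = __dma_alloc_buffer(dev, PAGE_ALIGN(size), gfp);
	*handle = page_to_dma(dev, page);
	return page_address(page);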

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   18 ++++++++++--------
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index a39ec89..49e0284 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -246,14 +246,16 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gf
 		return memory;
 
 	if (arch_is_coherent()) {
-		void *virt;
+		struct page *page;
 
-		virt = kmalloc(size, gfp);
-		if (!virt)
+		page = __dma_alloc_buffer(dev, PAGE_ALIGN(size), gfp);
+		if (!page) {
+			*handle = ~0;
 			return NULL;
-		*handle =  virt_to_dma(dev, virt);
+		}
 
-		return virt;
+		*handle = page_to_dma(dev, page);
+		return page_address(page);
 	}
 
 	return __dma_alloc(dev, size, handle, gfp,
@@ -336,13 +338,13 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
 		return;
 
+	size = PAGE_ALIGN(size);
+
 	if (arch_is_coherent()) {
-		kfree(cpu_addr);
+		__dma_free_buffer(dma_to_page(dev, handle), size);
 		return;
 	}
 
-	size = PAGE_ALIGN(size);
-
 	c = arm_vmregion_find_remove(&consistent_head, (unsigned long)cpu_addr);
 	if (!c)
 		goto no_area;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 04/10] ARM: dma-mapping: fix nommu dma_alloc_coherent()
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (2 preceding siblings ...)
  2009-11-21 19:37   ` [PATCH 03/10] ARM: dma-mapping: fix coherent arch dma_alloc_coherent() Russell King - ARM Linux
@ 2009-11-21 19:38   ` Russell King - ARM Linux
  2009-11-21 19:38   ` [PATCH 05/10] ARM: dma-mapping: factor dma_free_coherent() common code Russell King - ARM Linux
                     ` (6 subsequent siblings)
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:38 UTC (permalink / raw)
  To: linux-arm-kernel

The nommu version of dma_alloc_coherent was using kmalloc/kfree to manage
the memory.  dma_alloc_coherent() is expected to work with a granularity
of a page, so this is wrong.  Fix it by using the helper functions now
provided.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   25 +++++++++----------------
 1 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 49e0284..44be5ad 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -212,24 +212,17 @@ static void *
 __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	    pgprot_t prot)
 {
-	void *virt;
-	u64 mask = get_coherent_dma_mask(dev);
-
-	if (!mask)
-		goto error;
+	struct page *page;
 
-	if (mask < 0xffffffffULL)
-		gfp |= GFP_DMA;
-	virt = kmalloc(size, gfp);
-	if (!virt)
-		goto error;
+	*handle = ~0;
+	size = PAGE_ALIGN(size);
 
-	*handle =  virt_to_dma(dev, virt);
-	return virt;
+	page = __dma_alloc_buffer(dev, size, gfp);
+	if (!page)
+		return NULL;
 
-error:
-	*handle = ~0;
-	return NULL;
+	*handle = page_to_dma(dev, page);
+	return page_address(page);
 }
 #endif	/* CONFIG_MMU */
 
@@ -406,7 +399,7 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 {
 	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
 		return;
-	kfree(cpu_addr);
+	__dma_free_buffer(dma_to_page(dev, handle), PAGE_ALIGN(size));
 }
 #endif	/* CONFIG_MMU */
 EXPORT_SYMBOL(dma_free_coherent);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 05/10] ARM: dma-mapping: factor dma_free_coherent() common code
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (3 preceding siblings ...)
  2009-11-21 19:38   ` [PATCH 04/10] ARM: dma-mapping: fix nommu dma_alloc_coherent() Russell King - ARM Linux
@ 2009-11-21 19:38   ` Russell King - ARM Linux
  2009-11-21 19:38   ` [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section Russell King - ARM Linux
                     ` (5 subsequent siblings)
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:38 UTC (permalink / raw)
  To: linux-arm-kernel

We effectively have three implementations of dma_free_coherent() mixed up
in the code: the incoherent MMU, coherent MMU and noMMU versions.

The coherent MMU and noMMU versions are actually functionally identical.
The incoherent MMU version is almost the same, but with the additional
step of unmapping the secondary mapping.

Separate out this additional step into __dma_free_remap() and simplify
the resulting dma_free_coherent() code.
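
Condensed from the hunks below (with the dma_release_from_coherent()
early return omitted), the common path then reads:

	void dma_free_coherent(struct device *dev, size_t size,
			       void *cpu_addr, dma_addr_t handle)
	{
		size = PAGE_ALIGN(size);
		if (!arch_is_coherent())
			__dma_free_remap(cpu_addr, size);
		__dma_free_buffer(dma_to_page(dev, handle), size);
	}

with __dma_free_remap() defined away to a no-op on !CONFIG_MMU.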

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |  141 ++++++++++++++++++++++-----------------------
 1 files changed, 68 insertions(+), 73 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 44be5ad..2e025f6 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -207,7 +207,70 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	*handle = ~0;
 	return NULL;
 }
+
+static void __dma_free_remap(void *cpu_addr, size_t size)
+{
+	struct arm_vmregion *c;
+	unsigned long addr;
+	pte_t *ptep;
+	int idx;
+	u32 off;
+
+	c = arm_vmregion_find_remove(&consistent_head, (unsigned long)cpu_addr);
+	if (!c) {
+		printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
+		       __func__, cpu_addr);
+		dump_stack();
+		return;
+	}
+
+	if ((c->vm_end - c->vm_start) != size) {
+		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
+		       __func__, c->vm_end - c->vm_start, size);
+		dump_stack();
+		size = c->vm_end - c->vm_start;
+	}
+
+	idx = CONSISTENT_PTE_INDEX(c->vm_start);
+	off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1);
+	ptep = consistent_pte[idx] + off;
+	addr = c->vm_start;
+	do {
+		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
+		unsigned long pfn;
+
+		ptep++;
+		addr += PAGE_SIZE;
+		off++;
+		if (off >= PTRS_PER_PTE) {
+			off = 0;
+			ptep = consistent_pte[++idx];
+		}
+
+		if (!pte_none(pte) && pte_present(pte)) {
+			pfn = pte_pfn(pte);
+
+			if (pfn_valid(pfn)) {
+				struct page *page = pfn_to_page(pfn);
+
+				/*
+				 * x86 does not mark the pages reserved...
+				 */
+				ClearPageReserved(page);
+				continue;
+			}
+		}
+		printk(KERN_CRIT "%s: bad page in kernel page table\n",
+		       __func__);
+	} while (size -= PAGE_SIZE);
+
+	flush_tlb_kernel_range(c->vm_start, c->vm_end);
+
+	arm_vmregion_free(&consistent_head, c);
+}
+
 #else	/* !CONFIG_MMU */
+
 static void *
 __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	    pgprot_t prot)
@@ -224,6 +287,9 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	*handle = page_to_dma(dev, page);
 	return page_address(page);
 }
+
+#define __dma_free_remap(addr, size)	do { } while (0)
+
 #endif	/* CONFIG_MMU */
 
 /*
@@ -317,15 +383,8 @@ EXPORT_SYMBOL(dma_mmap_writecombine);
  * free a page as defined by the above mapping.
  * Must not be called with IRQs disabled.
  */
-#ifdef CONFIG_MMU
 void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
 {
-	struct arm_vmregion *c;
-	unsigned long addr;
-	pte_t *ptep;
-	int idx;
-	u32 off;
-
 	WARN_ON(irqs_disabled());
 
 	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
@@ -333,75 +392,11 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 
 	size = PAGE_ALIGN(size);
 
-	if (arch_is_coherent()) {
-		__dma_free_buffer(dma_to_page(dev, handle), size);
-		return;
-	}
-
-	c = arm_vmregion_find_remove(&consistent_head, (unsigned long)cpu_addr);
-	if (!c)
-		goto no_area;
-
-	if ((c->vm_end - c->vm_start) != size) {
-		printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
-		       __func__, c->vm_end - c->vm_start, size);
-		dump_stack();
-		size = c->vm_end - c->vm_start;
-	}
-
-	idx = CONSISTENT_PTE_INDEX(c->vm_start);
-	off = CONSISTENT_OFFSET(c->vm_start) & (PTRS_PER_PTE-1);
-	ptep = consistent_pte[idx] + off;
-	addr = c->vm_start;
-	do {
-		pte_t pte = ptep_get_and_clear(&init_mm, addr, ptep);
-		unsigned long pfn;
-
-		ptep++;
-		addr += PAGE_SIZE;
-		off++;
-		if (off >= PTRS_PER_PTE) {
-			off = 0;
-			ptep = consistent_pte[++idx];
-		}
-
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-
-				/*
-				 * x86 does not mark the pages reserved...
-				 */
-				ClearPageReserved(page);
-				continue;
-			}
-		}
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
-	} while (size -= PAGE_SIZE);
-
-	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
-	arm_vmregion_free(&consistent_head, c);
+	if (!arch_is_coherent())
+		__dma_free_remap(cpu_addr, size);
 
 	__dma_free_buffer(dma_to_page(dev, handle), size);
-	return;
-
- no_area:
-	printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
-	       __func__, cpu_addr);
-	dump_stack();
 }
-#else	/* !CONFIG_MMU */
-void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
-{
-	if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
-		return;
-	__dma_free_buffer(dma_to_page(dev, handle), PAGE_ALIGN(size));
-}
-#endif	/* CONFIG_MMU */
 EXPORT_SYMBOL(dma_free_coherent);
 
 /*
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (4 preceding siblings ...)
  2009-11-21 19:38   ` [PATCH 05/10] ARM: dma-mapping: factor dma_free_coherent() common code Russell King - ARM Linux
@ 2009-11-21 19:38   ` Russell King - ARM Linux
  2009-11-22  0:05     ` Greg Ungerer
  2009-11-21 19:38   ` [PATCH 07/10] ARM: dma-mapping: clean up coherent arch dma allocation Russell King - ARM Linux
                     ` (4 subsequent siblings)
  10 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:38 UTC (permalink / raw)
  To: linux-arm-kernel

No point wrapping the contents of this function with #ifdef CONFIG_MMU
when we can place it and the core_initcall() entirely within the

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   78 ++++++++++++++++++++++-----------------------
 1 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 2e025f6..7e8f150 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -144,6 +144,44 @@ static struct arm_vmregion_head consistent_head = {
 #error ARM Coherent DMA allocator does not (yet) support huge TLB
 #endif
 
+/*
+ * Initialise the consistent memory allocation.
+ */
+static int __init consistent_init(void)
+{
+	int ret = 0;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	int i = 0;
+	u32 base = CONSISTENT_BASE;
+
+	do {
+		pgd = pgd_offset(&init_mm, base);
+		pmd = pmd_alloc(&init_mm, pgd, base);
+		if (!pmd) {
+			printk(KERN_ERR "%s: no pmd tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+		WARN_ON(!pmd_none(*pmd));
+
+		pte = pte_alloc_kernel(pmd, base);
+		if (!pte) {
+			printk(KERN_ERR "%s: no pte tables\n", __func__);
+			ret = -ENOMEM;
+			break;
+		}
+
+		consistent_pte[i++] = pte;
+		base += (1 << PGDIR_SHIFT);
+	} while (base < CONSISTENT_END);
+
+	return ret;
+}
+
+core_initcall(consistent_init);
+
 static void *
 __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	    pgprot_t prot)
@@ -400,46 +438,6 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
 EXPORT_SYMBOL(dma_free_coherent);
 
 /*
- * Initialise the consistent memory allocation.
- */
-static int __init consistent_init(void)
-{
-	int ret = 0;
-#ifdef CONFIG_MMU
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte;
-	int i = 0;
-	u32 base = CONSISTENT_BASE;
-
-	do {
-		pgd = pgd_offset(&init_mm, base);
-		pmd = pmd_alloc(&init_mm, pgd, base);
-		if (!pmd) {
-			printk(KERN_ERR "%s: no pmd tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-		WARN_ON(!pmd_none(*pmd));
-
-		pte = pte_alloc_kernel(pmd, base);
-		if (!pte) {
-			printk(KERN_ERR "%s: no pte tables\n", __func__);
-			ret = -ENOMEM;
-			break;
-		}
-
-		consistent_pte[i++] = pte;
-		base += (1 << PGDIR_SHIFT);
-	} while (base < CONSISTENT_END);
-#endif	/* !CONFIG_MMU */
-
-	return ret;
-}
-
-core_initcall(consistent_init);
-
-/*
  * Make an area consistent for devices.
  * Note: Drivers should NOT use this function directly, as it will break
  * platforms with CONFIG_DMABOUNCE.
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 07/10] ARM: dma-mapping: clean up coherent arch dma allocation
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (5 preceding siblings ...)
  2009-11-21 19:38   ` [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section Russell King - ARM Linux
@ 2009-11-21 19:38   ` Russell King - ARM Linux
  2009-11-21 19:39   ` [PATCH 08/10] ARM: dma-mapping: Factor out noMMU dma buffer allocation code Russell King - ARM Linux
                     ` (3 subsequent siblings)
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:38 UTC (permalink / raw)
  To: linux-arm-kernel

IXP23xx added support for coherent DMA architectures via a special
case in dma_alloc_coherent().  This special case is a subset of what
goes on in __dma_alloc(), and there is no reason why
dma_alloc_writecombine() should not be given the same treatment
(except, maybe, that IXP23xx doesn't use it).

We can better deal with this by moving the arch_is_coherent() test
inside __dma_alloc() and killing the code duplication.
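
After the move, the start of __dma_alloc() reads roughly (condensed
from the hunk below):

	page = __dma_alloc_buffer(dev, size, gfp);
	if (!page)
		goto no_page;

	if (arch_is_coherent()) {
		*handle = page_to_dma(dev, page);
		return page_address(page);
	}

	/* otherwise fall through to the remapping path */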

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   31 ++++++++++++-------------------
 1 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7e8f150..dac44f0 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -189,18 +189,24 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	struct page *page;
 	struct arm_vmregion *c;
 
-	if (!consistent_pte[0]) {
-		printk(KERN_ERR "%s: not initialised\n", __func__);
-		dump_stack();
-		return NULL;
-	}
-
 	size = PAGE_ALIGN(size);
 
 	page = __dma_alloc_buffer(dev, size, gfp);
 	if (!page)
 		goto no_page;
 
+	if (arch_is_coherent()) {
+		*handle = page_to_dma(dev, page);
+		return page_address(page);
+	}
+
+	if (!consistent_pte[0]) {
+		printk(KERN_ERR "%s: not initialised\n", __func__);
+		dump_stack();
+		__dma_free_buffer(page, size);
+		return NULL;
+	}
+
 	/*
 	 * Allocate a virtual address in the consistent mapping region.
 	 */
@@ -342,19 +348,6 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gf
 	if (dma_alloc_from_coherent(dev, size, handle, &memory))
 		return memory;
 
-	if (arch_is_coherent()) {
-		struct page *page;
-
-		page = __dma_alloc_buffer(dev, PAGE_ALIGN(size), gfp);
-		if (!page) {
-			*handle = ~0;
-			return NULL;
-		}
-
-		*handle = page_to_dma(dev, page);
-		return page_address(page);
-	}
-
 	return __dma_alloc(dev, size, handle, gfp,
 			   pgprot_noncached(pgprot_kernel));
 }
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 08/10] ARM: dma-mapping: Factor out noMMU dma buffer allocation code
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (6 preceding siblings ...)
  2009-11-21 19:38   ` [PATCH 07/10] ARM: dma-mapping: clean up coherent arch dma allocation Russell King - ARM Linux
@ 2009-11-21 19:39   ` Russell King - ARM Linux
  2009-11-21 19:39   ` [PATCH 09/10] ARM: dma-mapping: get rid of setting/clearing the reserved page bit Russell King - ARM Linux
                     ` (2 subsequent siblings)
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:39 UTC (permalink / raw)
  To: linux-arm-kernel

This entirely separates the DMA coherent buffer remapping code from
the allocation code, and gets rid of the duplicate copy in the !MMU
section.
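
The trick (see the hunk below) is that on !CONFIG_MMU the remap step
collapses to macros, so a single __dma_alloc() body serves both
configurations:

	#define __dma_alloc_remap(page, size, gfp, prot)	page_address(page)
	#define __dma_free_remap(addr, size)			do { } while (0)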

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   45 +++++++++++++++------------------------------
 1 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index dac44f0..f3c0fc8 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -183,27 +183,13 @@ static int __init consistent_init(void)
 core_initcall(consistent_init);
 
 static void *
-__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
-	    pgprot_t prot)
+__dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot)
 {
-	struct page *page;
 	struct arm_vmregion *c;
 
-	size = PAGE_ALIGN(size);
-
-	page = __dma_alloc_buffer(dev, size, gfp);
-	if (!page)
-		goto no_page;
-
-	if (arch_is_coherent()) {
-		*handle = page_to_dma(dev, page);
-		return page_address(page);
-	}
-
 	if (!consistent_pte[0]) {
 		printk(KERN_ERR "%s: not initialised\n", __func__);
 		dump_stack();
-		__dma_free_buffer(page, size);
 		return NULL;
 	}
 
@@ -220,11 +206,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		pte = consistent_pte[idx] + off;
 		c->vm_pages = page;
 
-		/*
-		 * Set the "dma handle"
-		 */
-		*handle = page_to_dma(dev, page);
-
 		do {
 			BUG_ON(!pte_none(*pte));
 
@@ -244,11 +225,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 
 		return (void *)c->vm_start;
 	}
-
-	if (page)
-		__dma_free_buffer(page, size);
- no_page:
-	*handle = ~0;
 	return NULL;
 }
 
@@ -315,11 +291,17 @@ static void __dma_free_remap(void *cpu_addr, size_t size)
 
 #else	/* !CONFIG_MMU */
 
+#define __dma_alloc_remap(page, size, gfp, prot)	page_address(page)
+#define __dma_free_remap(addr, size)			do { } while (0)
+
+#endif	/* CONFIG_MMU */
+
 static void *
 __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	    pgprot_t prot)
 {
 	struct page *page;
+	void *addr;
 
 	*handle = ~0;
 	size = PAGE_ALIGN(size);
@@ -328,13 +310,16 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 	if (!page)
 		return NULL;
 
-	*handle = page_to_dma(dev, page);
-	return page_address(page);
-}
+	if (!arch_is_coherent())
+		addr = __dma_alloc_remap(page, size, gfp, prot);
+	else
+		addr = page_address(page);
 
-#define __dma_free_remap(addr, size)	do { } while (0)
+	if (addr)
+		*handle = page_to_dma(dev, page);
 
-#endif	/* CONFIG_MMU */
+	return addr;
+}
 
 /*
  * Allocate DMA-coherent memory space and return both the kernel remapped
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 09/10] ARM: dma-mapping: get rid of setting/clearing the reserved page bit
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (7 preceding siblings ...)
  2009-11-21 19:39   ` [PATCH 08/10] ARM: dma-mapping: Factor out noMMU dma buffer allocation code Russell King - ARM Linux
@ 2009-11-21 19:39   ` Russell King - ARM Linux
  2009-11-21 19:39   ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute Russell King - ARM Linux
  2009-11-23  7:10   ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Greg Ungerer
  10 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:39 UTC (permalink / raw)
  To: linux-arm-kernel

It's unnecessary; x86 doesn't do it, and ALSA doesn't require it
anymore.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c |   22 +++-------------------
 1 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index f3c0fc8..9807035 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -209,10 +209,6 @@ __dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot)
 		do {
 			BUG_ON(!pte_none(*pte));
 
-			/*
-			 * x86 does not mark the pages reserved...
-			 */
-			SetPageReserved(page);
 			set_pte_ext(pte, mk_pte(page, prot), 0);
 			page++;
 			pte++;
@@ -267,21 +263,9 @@ static void __dma_free_remap(void *cpu_addr, size_t size)
 			ptep = consistent_pte[++idx];
 		}
 
-		if (!pte_none(pte) && pte_present(pte)) {
-			pfn = pte_pfn(pte);
-
-			if (pfn_valid(pfn)) {
-				struct page *page = pfn_to_page(pfn);
-
-				/*
-				 * x86 does not mark the pages reserved...
-				 */
-				ClearPageReserved(page);
-				continue;
-			}
-		}
-		printk(KERN_CRIT "%s: bad page in kernel page table\n",
-		       __func__);
+		if (pte_none(pte) || !pte_present(pte))
+			printk(KERN_CRIT "%s: bad page in kernel page table\n",
+			       __func__);
 	} while (size -= PAGE_SIZE);
 
 	flush_tlb_kernel_range(c->vm_start, c->vm_end);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (8 preceding siblings ...)
  2009-11-21 19:39   ` [PATCH 09/10] ARM: dma-mapping: get rid of setting/clearing the reserved page bit Russell King - ARM Linux
@ 2009-11-21 19:39   ` Russell King - ARM Linux
  2009-11-23 12:21     ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain'memory' attribute Catalin Marinas
  2009-11-23  7:10   ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Greg Ungerer
  10 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-21 19:39 UTC (permalink / raw)
  To: linux-arm-kernel

On ARMv7, it is invalid to map the same physical address multiple times
with different memory types.  Since system RAM is already mapped as
'memory', subsequent remapping of it must retain this attribute.

However, the DMA mapping code remaps it as "strongly ordered".  Fix
this by introducing
'pgprot_dmacoherent()' which provides the necessary page table bits for
DMA mappings.
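
For example, dma_alloc_coherent() now requests its remapping with:

	__dma_alloc(dev, size, handle, gfp,
		    pgprot_dmacoherent(pgprot_kernel));

which on ARMv7 yields non-cacheable, non-executable "normal memory"
page table bits (and strongly ordered, as before, on earlier
architectures).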

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/pgtable.h |   14 ++++++++++++--
 arch/arm/include/asm/system.h  |   19 ++++++++++++-------
 arch/arm/mm/dma-mapping.c      |    4 ++--
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 201ccaa..1139768 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -304,13 +304,23 @@ PTE_BIT_FUNC(mkyoung,   |= L_PTE_YOUNG);
 
 static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
 
+#define __pgprot_modify(prot,mask,bits)		\
+	__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
+
 /*
  * Mark the prot value as uncacheable and unbufferable.
  */
 #define pgprot_noncached(prot) \
-	__pgprot((pgprot_val(prot) & ~L_PTE_MT_MASK) | L_PTE_MT_UNCACHED)
+	__pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_UNCACHED)
 #define pgprot_writecombine(prot) \
-	__pgprot((pgprot_val(prot) & ~L_PTE_MT_MASK) | L_PTE_MT_BUFFERABLE)
+	__pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_BUFFERABLE)
+#if __LINUX_ARM_ARCH__ >= 7
+#define pgprot_dmacoherent(prot) \
+	__pgprot_modify(prot, L_PTE_MT_MASK|L_PTE_EXEC, L_PTE_MT_BUFFERABLE)
+#else
+#define pgprot_dmacoherent(prot) \
+	__pgprot_modify(prot, L_PTE_MT_MASK|L_PTE_EXEC, L_PTE_MT_UNCACHED)
+#endif
 
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define pmd_present(pmd)	(pmd_val(pmd))
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index d65b2f5..058e7e9 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -138,21 +138,26 @@ extern unsigned int user_debug;
 #define dmb() __asm__ __volatile__ ("" : : : "memory")
 #endif
 
-#ifndef CONFIG_SMP
+#if __LINUX_ARM_ARCH__ >= 7 || defined(CONFIG_SMP)
+#define mb()		dmb()
+#define rmb()		dmb()
+#define wmb()		dmb()
+#else
 #define mb()	do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
 #define rmb()	do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
 #define wmb()	do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
+#endif
+
+#ifndef CONFIG_SMP
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
 #else
-#define mb()		dmb()
-#define rmb()		dmb()
-#define wmb()		dmb()
-#define smp_mb()	dmb()
-#define smp_rmb()	dmb()
-#define smp_wmb()	dmb()
+#define smp_mb()	mb()
+#define smp_rmb()	rmb()
+#define smp_wmb()	wmb()
 #endif
+
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 9807035..7c1d2c7 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -318,7 +318,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gf
 		return memory;
 
 	return __dma_alloc(dev, size, handle, gfp,
-			   pgprot_noncached(pgprot_kernel));
+			   pgprot_dmacoherent(pgprot_kernel));
 }
 EXPORT_SYMBOL(dma_alloc_coherent);
 
@@ -366,7 +366,7 @@ static int dma_mmap(struct device *dev, struct vm_area_struct *vma,
 int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
 		      void *cpu_addr, dma_addr_t dma_addr, size_t size)
 {
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_page_prot = pgprot_dmacoherent(vma->vm_page_prot);
 	return dma_mmap(dev, vma, cpu_addr, dma_addr, size);
 }
 EXPORT_SYMBOL(dma_mmap_coherent);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section
  2009-11-21 19:38   ` [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section Russell King - ARM Linux
@ 2009-11-22  0:05     ` Greg Ungerer
  2009-11-22  0:16       ` Russell King - ARM Linux
  0 siblings, 1 reply; 65+ messages in thread
From: Greg Ungerer @ 2009-11-22  0:05 UTC (permalink / raw)
  To: linux-arm-kernel


Hi Russell,

Russell King - ARM Linux wrote:
> No point wrapping the contents of this function with #ifdef CONFIG_MMU
> when we can place it and the core_initcall() entirely within the

Lost the end of the comments here?

Regards
Greg



> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> ---
>  arch/arm/mm/dma-mapping.c |   78 ++++++++++++++++++++++-----------------------
>  1 files changed, 38 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
> index 2e025f6..7e8f150 100644
> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c
> @@ -144,6 +144,44 @@ static struct arm_vmregion_head consistent_head = {
>  #error ARM Coherent DMA allocator does not (yet) support huge TLB
>  #endif
>  
> +/*
> + * Initialise the consistent memory allocation.
> + */
> +static int __init consistent_init(void)
> +{
> +	int ret = 0;
> +	pgd_t *pgd;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +	int i = 0;
> +	u32 base = CONSISTENT_BASE;
> +
> +	do {
> +		pgd = pgd_offset(&init_mm, base);
> +		pmd = pmd_alloc(&init_mm, pgd, base);
> +		if (!pmd) {
> +			printk(KERN_ERR "%s: no pmd tables\n", __func__);
> +			ret = -ENOMEM;
> +			break;
> +		}
> +		WARN_ON(!pmd_none(*pmd));
> +
> +		pte = pte_alloc_kernel(pmd, base);
> +		if (!pte) {
> +			printk(KERN_ERR "%s: no pte tables\n", __func__);
> +			ret = -ENOMEM;
> +			break;
> +		}
> +
> +		consistent_pte[i++] = pte;
> +		base += (1 << PGDIR_SHIFT);
> +	} while (base < CONSISTENT_END);
> +
> +	return ret;
> +}
> +
> +core_initcall(consistent_init);
> +
>  static void *
>  __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
>  	    pgprot_t prot)
> @@ -400,46 +438,6 @@ void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr
>  EXPORT_SYMBOL(dma_free_coherent);
>  
>  /*
> - * Initialise the consistent memory allocation.
> - */
> -static int __init consistent_init(void)
> -{
> -	int ret = 0;
> -#ifdef CONFIG_MMU
> -	pgd_t *pgd;
> -	pmd_t *pmd;
> -	pte_t *pte;
> -	int i = 0;
> -	u32 base = CONSISTENT_BASE;
> -
> -	do {
> -		pgd = pgd_offset(&init_mm, base);
> -		pmd = pmd_alloc(&init_mm, pgd, base);
> -		if (!pmd) {
> -			printk(KERN_ERR "%s: no pmd tables\n", __func__);
> -			ret = -ENOMEM;
> -			break;
> -		}
> -		WARN_ON(!pmd_none(*pmd));
> -
> -		pte = pte_alloc_kernel(pmd, base);
> -		if (!pte) {
> -			printk(KERN_ERR "%s: no pte tables\n", __func__);
> -			ret = -ENOMEM;
> -			break;
> -		}
> -
> -		consistent_pte[i++] = pte;
> -		base += (1 << PGDIR_SHIFT);
> -	} while (base < CONSISTENT_END);
> -#endif	/* !CONFIG_MMU */
> -
> -	return ret;
> -}
> -
> -core_initcall(consistent_init);
> -
> -/*
>   * Make an area consistent for devices.
>   * Note: Drivers should NOT use this function directly, as it will break
>   * platforms with CONFIG_DMABOUNCE.


-- 
------------------------------------------------------------------------
Greg Ungerer  --  Principal Engineer        EMAIL:     gerg at snapgear.com
SnapGear Group, McAfee                      PHONE:       +61 7 3435 2888
8 Gardner Close                             FAX:         +61 7 3217 5323
Milton, QLD, 4064, Australia                WEB: http://www.SnapGear.com

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section
  2009-11-22  0:05     ` Greg Ungerer
@ 2009-11-22  0:16       ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-22  0:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Sun, Nov 22, 2009 at 10:05:40AM +1000, Greg Ungerer wrote:
>
> Hi Russell,
>
> Russell King - ARM Linux wrote:
>> No point wrapping the contents of this function with #ifdef CONFIG_MMU
>> when we can place it and the core_initcall() entirely within the
>
> Lost the end of the comments here?

Yes, it should have one more line.  "existing conditional block."

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-20 18:07 ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate Russell King - ARM Linux
@ 2009-11-22 22:16   ` Nicolas Pitre
  2009-11-23 10:26     ` Russell King - ARM Linux
  2009-11-23 12:09   ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines " Catalin Marinas
  2009-11-23 13:38   ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines " Russell King - ARM Linux
  2 siblings, 1 reply; 65+ messages in thread
From: Nicolas Pitre @ 2009-11-22 22:16 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 20 Nov 2009, Russell King - ARM Linux wrote:

> Since we now clean the DMA buffers on map, there's no need to clean
> overlapping cache lines on invalidation anymore.  (Note: the DMA API
> prohibits other data sharing the same cache line as a DMA buffer
> anyway.)
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> ---
[...]

> diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
> index d0d7795..b2f264e 100644
> --- a/arch/arm/mm/proc-feroceon.S
> +++ b/arch/arm/mm/proc-feroceon.S
> @@ -274,11 +274,7 @@ ENTRY(feroceon_range_flush_kern_dcache_page)
>   */
>  	.align	5
>  ENTRY(feroceon_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1

The code in feroceon_range_dma_inv_range is likely to require the same 
cleanup.


Nicolas

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support
  2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
                     ` (9 preceding siblings ...)
  2009-11-21 19:39   ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute Russell King - ARM Linux
@ 2009-11-23  7:10   ` Greg Ungerer
  2009-11-23 10:26     ` Russell King - ARM Linux
  10 siblings, 1 reply; 65+ messages in thread
From: Greg Ungerer @ 2009-11-23  7:10 UTC (permalink / raw)
  To: linux-arm-kernel


Hi Russell,

Russell King - ARM Linux wrote:
> Here's the follow-on patches for cleaning up the DMA support code and
> arranging for coherent DMA allocations to be mapped using the ARMv7
> 'memory' attribute.

I looked at these from a noMMU point of view, and they look good to me.
If you want it, you have an:

Acked-by:  Greg Ungerer <gerg@uclinux.org>

Regards
Greg



------------------------------------------------------------------------
Greg Ungerer  --  Principal Engineer        EMAIL:     gerg at snapgear.com
SnapGear Group, McAfee                      PHONE:       +61 7 3435 2888
8 Gardner Close                             FAX:         +61 7 3217 5323
Milton, QLD, 4064, Australia                WEB: http://www.SnapGear.com

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-22 22:16   ` Nicolas Pitre
@ 2009-11-23 10:26     ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 10:26 UTC (permalink / raw)
  To: linux-arm-kernel

Since we now clean the DMA buffers on map, there's no need to clean
overlapping cache lines on invalidation anymore.  (Note: the DMA API
prohibits other data sharing the same cache line as a DMA buffer
anyway.)
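
As a hypothetical C rendering of the per-CPU assembly below (line
size and cache operations vary per core; invalidate_dcache_line() is
a made-up stand-in for the "mcr p15, 0, rX, c7, c6, 1" instruction),
the invalidate path becomes simply:

	start &= ~(CACHE_DLINESIZE - 1);	/* align down only */
	while (start < end) {
		invalidate_dcache_line(start);
		start += CACHE_DLINESIZE;
	}

with the old "clean any partially overlapped first/last line before
invalidating" prologue removed.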

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-fa.S      |    5 -----
 arch/arm/mm/cache-v4wb.S    |    5 -----
 arch/arm/mm/cache-v6.S      |   13 -------------
 arch/arm/mm/cache-v7.S      |    6 ------
 arch/arm/mm/proc-arm1020.S  |    8 --------
 arch/arm/mm/proc-arm1020e.S |    4 ----
 arch/arm/mm/proc-arm1022.S  |    4 ----
 arch/arm/mm/proc-arm1026.S  |    4 ----
 arch/arm/mm/proc-arm920.S   |    4 ----
 arch/arm/mm/proc-arm922.S   |    4 ----
 arch/arm/mm/proc-arm925.S   |    6 ------
 arch/arm/mm/proc-arm926.S   |    6 ------
 arch/arm/mm/proc-arm946.S   |    7 -------
 arch/arm/mm/proc-feroceon.S |    8 --------
 arch/arm/mm/proc-mohawk.S   |    4 ----
 arch/arm/mm/proc-xsc3.S     |    4 ----
 arch/arm/mm/proc-xscale.S   |    4 ----
 17 files changed, 0 insertions(+), 96 deletions(-)

diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index b63a8f7..1711386 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -157,12 +157,7 @@ ENTRY(fa_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(fa_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	bic	r1, r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2ebc1b3..553931a 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -173,16 +173,11 @@ ENTRY(v4wb_coherent_user_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4wb_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 295e25d..d1dfd87 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -195,20 +195,7 @@ ENTRY(v6_flush_kern_dcache_page)
  *	- end     - virtual end address of region
  */
 ENTRY(v6_dma_inv_range)
-	tst	r0, #D_CACHE_LINE_SIZE - 1
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D line
-#else
-	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
-#endif
-	tst	r1, #D_CACHE_LINE_SIZE - 1
-	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
-#else
-	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
-#endif
 1:
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index e1bd975..893ee59 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -218,13 +218,7 @@ ENDPROC(v7_flush_kern_dcache_page)
 ENTRY(v7_dma_inv_range)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
-	tst	r0, r3
 	bic	r0, r0, r3
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
-
-	tst	r1, r3
-	bic	r1, r1, r3
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D / U line
 1:
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D / U line
 	add	r0, r0, r2
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index d9fb4b9..7bbf624 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -267,15 +267,7 @@ ENTRY(arm1020_flush_kern_dcache_page)
 ENTRY(arm1020_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index 7453b75..d379cb7 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -260,11 +260,7 @@ ENTRY(arm1020e_flush_kern_dcache_page)
 ENTRY(arm1020e_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index 8eb72d7..f5a7949 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -249,11 +249,7 @@ ENTRY(arm1022_flush_kern_dcache_page)
 ENTRY(arm1022_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 3b59f0d..1dc26f8 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -243,11 +243,7 @@ ENTRY(arm1026_flush_kern_dcache_page)
 ENTRY(arm1026_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 2b7c197..078a873 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -239,11 +239,7 @@ ENTRY(arm920_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm920_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index 06a1aa4..22ca857 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -241,11 +241,7 @@ ENTRY(arm922_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm922_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index cb53435..ff04299 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -283,12 +283,6 @@ ENTRY(arm925_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm925_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 1c48487..4b4c717 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -246,12 +246,6 @@ ENTRY(arm926_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm926_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 40c0449..589a61c 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -215,18 +215,11 @@ ENTRY(arm946_flush_kern_dcache_page)
  * (same as arm926)
  */
 ENTRY(arm946_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index d0d7795..1262b92 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -274,11 +274,7 @@ ENTRY(feroceon_range_flush_kern_dcache_page)
  */
 	.align	5
 ENTRY(feroceon_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
@@ -289,10 +285,6 @@ ENTRY(feroceon_dma_inv_range)
 	.align	5
 ENTRY(feroceon_range_dma_inv_range)
 	mrs	r2, cpsr
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 	cmp	r1, r0
 	subne	r1, r1, #1			@ top address is inclusive
 	orr	r3, r2, #PSR_I_BIT
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 52b5fd7..191ea6d 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -218,10 +218,6 @@ ENTRY(mohawk_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(mohawk_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 2028f37..2c1ac69 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -257,11 +257,7 @@ ENTRY(xsc3_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xsc3_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean L1 D line
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean L1 D line
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate L1 D line
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index f056c28..3170348 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -315,11 +315,7 @@ ENTRY(xscale_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xscale_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support
  2009-11-23  7:10   ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Greg Ungerer
@ 2009-11-23 10:26     ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 10:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 23, 2009 at 05:10:30PM +1000, Greg Ungerer wrote:
> I looked at these from a noMMU point of view, and they look good to me.
> If you want it, you have an:
>
> Acked-by:  Greg Ungerer <gerg@uclinux.org>

Thanks.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
  2009-11-21 14:00 ` [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes Jamie Iles
@ 2009-11-23 10:29   ` Russell King - ARM Linux
  2010-01-13  6:37     ` muni anda
  0 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 10:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, Nov 21, 2009 at 02:00:37PM +0000, Jamie Iles wrote:
> On Fri, Nov 20, 2009 at 06:25:39PM +0000, Russell King - ARM Linux wrote:
> > An equivalent patch has been tested by a couple of folk who now
> > report success.  Here's the refined patch set.  Acks, further
> > testing, and tested-by stuff welcome.
> Tested against 2.6.32-rc8 on an ARM1176JZ-S platform and this fixes the issues
> that we were seeing.
> 
> Tested-By: Jamie Iles <jamie@jamieiles.com>

Thanks Jamie.  We're a little too late for 2.6.32, but it'll be merged
during the post-2.6.32 window.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses
  2009-11-20 18:06 ` [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses Russell King - ARM Linux
@ 2009-11-23 12:03   ` Catalin Marinas
  2009-11-23 12:08     ` Russell King - ARM Linux
  2009-11-23 13:38   ` [PATCH v2 06/17] " Russell King - ARM Linux
  1 sibling, 1 reply; 65+ messages in thread
From: Catalin Marinas @ 2009-11-23 12:03 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2009-11-20 at 18:06 +0000, Russell King - ARM Linux wrote:
> Rather than handling all cache maintenance before DMA begins, we
> also need to handle cache maintenance upon completion when we
> have CPUs which prefetch speculatively.
> 
> This renames the dma_cache_maint*() functions, and changes their
> parameters to indicate whether we are mapping a DMA buffer.  We
> always clean DMA buffers when we map them, but avoid invalidating
> them if we are DMA'ing to the device.
[...]
> --- a/arch/arm/include/asm/dma-mapping.h
> +++ b/arch/arm/include/asm/dma-mapping.h
[...]
> @@ -70,6 +70,11 @@ extern void dma_cache_maint_page(struct page *page, unsigned long offset,
>   * by it) or exclusively owned by the DMA device.  These helper functions
>   * represent the transitions between these two ownership states.
>   *
> + * Note, however, that on later ARMs, this notion does not work due to
> + * speculative prefetches.  We model our approach on the assumption that
> + * the CPU does do speculative prefetches, which means we clean caches
> + * before transfers and delay cache invalidation until transfer completion.
> + *
>   * As above, these are private support functions and not part of the API.
>   * Drivers must not use these.
>   */
> @@ -77,26 +82,32 @@ static inline void __dma_single_cpu_to_dev(const void *kaddr, size_t size,
>         enum dma_data_direction dir)
>  {
>         if (!arch_is_coherent())
> -               dma_cache_maint(kaddr, size, dir);
> +               __dma_cache_maint(kaddr, size, 1);
>  }
[...]
>  static inline void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
>         size_t size, enum dma_data_direction dir)
>  {
>         if (!arch_is_coherent())
> -               dma_cache_maint_page(page, off, size, dir);
> +               __dma_cache_maint_page(page, off, size, 1);
>  }

Does this mean that the direction information is lost after this point?
My only issue with this is that in the FROM_DEVICE case we use a clean
operation before the transfer even though an invalidate would be enough
and probably faster.

But I agree that it's a nice simplification.

> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c
> @@ -539,58 +539,40 @@ core_initcall(consistent_init);
>   * platforms with CONFIG_DMABOUNCE.
>   * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
>   */
> -void dma_cache_maint(const void *start, size_t size, int direction)
> +void __dma_cache_maint(const void *start, size_t size, int map)
>  {
>         void (*inner_op)(const void *, const void *);
>         void (*outer_op)(unsigned long, unsigned long);
> 
>         BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(start + size - 1));
> 
> -       switch (direction) {
> -       case DMA_FROM_DEVICE:           /* invalidate only */
> -               inner_op = dmac_inv_range;
> -               outer_op = outer_inv_range;
> -               break;
> -       case DMA_TO_DEVICE:             /* writeback only */
> +       if (map) {              /* writeback only */
>                 inner_op = dmac_clean_range;
>                 outer_op = outer_clean_range;
> -               break;
> -       case DMA_BIDIRECTIONAL:         /* writeback and invalidate */
> -               inner_op = dmac_flush_range;
> -               outer_op = outer_flush_range;
> -               break;
> -       default:
> -               BUG();
> +       } else {                /* invalidate only */
> +               inner_op = dmac_inv_range;
> +               outer_op = outer_inv_range;
>         }
> 
>         inner_op(start, start + size);
>         outer_op(__pa(start), __pa(start) + size);
>  }

Since we have real issues with speculative accesses, for the !map case,
we need to invalidate the outer cache before the inner cache. I think it
just makes sense to avoid the inner_op and outer_op pointers.
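
In other words, something along these lines (just a sketch of the
ordering I mean, not a tested patch):

	if (map) {		/* cpu -> device: writeback */
		dmac_clean_range(start, start + size);
		outer_clean_range(__pa(start), __pa(start) + size);
	} else {		/* device -> cpu: invalidate */
		/* outer first, so the inner cache cannot refill
		   stale lines from L2 behind our back */
		outer_inv_range(__pa(start), __pa(start) + size);
		dmac_inv_range(start, start + size);
	}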

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 1/7] ARM: provide phys_to_page() to complement page_to_phys()
  2009-11-20 18:01 ` [PATCH 1/7] ARM: provide phys_to_page() to complement page_to_phys() Russell King - ARM Linux
@ 2009-11-23 12:05   ` Catalin Marinas
  0 siblings, 0 replies; 65+ messages in thread
From: Catalin Marinas @ 2009-11-23 12:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2009-11-20 at 18:01 +0000, Russell King - ARM Linux wrote:
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus()
  2009-11-20 18:02 ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Russell King - ARM Linux
@ 2009-11-23 12:05   ` Catalin Marinas
  2009-12-12 14:01   ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Anders Grafström
  1 sibling, 0 replies; 65+ messages in thread
From: Catalin Marinas @ 2009-11-23 12:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2009-11-20 at 18:02 +0000, Russell King - ARM Linux wrote:
> The non-highmem() and the __pfn_to_bus() based page_to_dma() both
> compile to the same code, so it's pointless having these two different
> approaches.  Use the __pfn_to_bus() based version.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses
  2009-11-23 12:03   ` Catalin Marinas
@ 2009-11-23 12:08     ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 12:08 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 23, 2009 at 12:03:02PM +0000, Catalin Marinas wrote:
> Does this mean that the direction information is lost after this point?
> My only issue with this is that in the FROM_DEVICE case we use a clean
> operation before the transfer even though an invalidate would be enough
> and probably faster.

Yes we do.  In the long run, I'd like to change dmac_*_range to be
dmac_map_range and dmac_unmap_range, taking the direction argument.
The problem at the moment is that the highmem page handling gets rather
messy and less efficient if we go that far.
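
Roughly what I have in mind (hypothetical names; nothing in this
series implements it yet):

	/* hypothetical interface sketch, not part of this series */
	extern void dmac_map_range(const void *start, size_t size,
				   enum dma_data_direction dir);
	extern void dmac_unmap_range(const void *start, size_t size,
				     enum dma_data_direction dir);

Each per-CPU cache implementation could then decide for itself which
clean/invalidate combination each direction needs at map and unmap
time.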

> > --- a/arch/arm/mm/dma-mapping.c
> > +++ b/arch/arm/mm/dma-mapping.c
> > @@ -539,58 +539,40 @@ core_initcall(consistent_init);
> >   * platforms with CONFIG_DMABOUNCE.
> >   * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
> >   */
> > -void dma_cache_maint(const void *start, size_t size, int direction)
> > +void __dma_cache_maint(const void *start, size_t size, int map)
> >  {
> >         void (*inner_op)(const void *, const void *);
> >         void (*outer_op)(unsigned long, unsigned long);
> > 
> >         BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(start + size - 1));
> > 
> > -       switch (direction) {
> > -       case DMA_FROM_DEVICE:           /* invalidate only */
> > -               inner_op = dmac_inv_range;
> > -               outer_op = outer_inv_range;
> > -               break;
> > -       case DMA_TO_DEVICE:             /* writeback only */
> > +       if (map) {              /* writeback only */
> >                 inner_op = dmac_clean_range;
> >                 outer_op = outer_clean_range;
> > -               break;
> > -       case DMA_BIDIRECTIONAL:         /* writeback and invalidate */
> > -               inner_op = dmac_flush_range;
> > -               outer_op = outer_flush_range;
> > -               break;
> > -       default:
> > -               BUG();
> > +       } else {                /* invalidate only */
> > +               inner_op = dmac_inv_range;
> > +               outer_op = outer_inv_range;
> >         }
> > 
> >         inner_op(start, start + size);
> >         outer_op(__pa(start), __pa(start) + size);
> >  }
> 
> Since we have real issues with speculative accesses, for the !map case,
> we need to invalidate the outer cache before the inner cache. I think it
> just makes sense to avoid the inner_op and outer_op pointers.

Yes, good catch.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines on invalidate
  2009-11-20 18:07 ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate Russell King - ARM Linux
  2009-11-22 22:16   ` Nicolas Pitre
@ 2009-11-23 12:09   ` Catalin Marinas
  2009-11-23 13:30     ` Russell King - ARM Linux
  2009-11-23 13:38   ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines " Russell King - ARM Linux
  2 siblings, 1 reply; 65+ messages in thread
From: Catalin Marinas @ 2009-11-23 12:09 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2009-11-20 at 18:07 +0000, Russell King - ARM Linux wrote:
> Since we now clean the DMA buffers on map, there's no need to clean
> overlapping cache lines on invalidation anymore.  (Note: the DMA API
> prohibits other data sharing the same cache line as a DMA buffer
> anyway.)

If patch 6/7 cleans before the transfer (I made a comment on performance
but haven't run any benchmarks), it makes sense to have this
simplification.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute
  2009-11-21 19:39   ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute Russell King - ARM Linux
@ 2009-11-23 12:21     ` Catalin Marinas
  0 siblings, 0 replies; 65+ messages in thread
From: Catalin Marinas @ 2009-11-23 12:21 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, 2009-11-21 at 19:39 +0000, Russell King - ARM Linux wrote:
> On ARMv7, it is invalid to map the same physical address multiple times
> with different memory types.  Since system RAM is already mapped as
> 'memory', subsequent remapping of it must retain this attribute.
> 
> However, DMA memory maps it as "strongly ordered".  Fix this by introducing
> 'pgprot_dmacoherent()' which provides the necessary page table bits for
> DMA mappings.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines on invalidate
  2009-11-23 12:09   ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines " Catalin Marinas
@ 2009-11-23 13:30     ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 13:30 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 23, 2009 at 12:09:12PM +0000, Catalin Marinas wrote:
> On Fri, 2009-11-20 at 18:07 +0000, Russell King - ARM Linux wrote:
> > Since we now clean the DMA buffers on map, there's no need to clean
> > overlapping cache lines on invalidation anymore.  (Note: the DMA API
> > prohibits other data sharing the same cache line as a DMA buffer
> > anyway.)
> 
> If patch 6/7 cleans before the transfer (I made a comment on performance
> but haven't run any benchmarks), it makes sense to have this
> simplification.
> 
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

However, it (and the updated patch I sent in reply to Nicolas) misses
the L2 functions.

I'll post an updated set of the entire 17 patches shortly.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-20 18:06 ` [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses Russell King - ARM Linux
  2009-11-23 12:03   ` Catalin Marinas
@ 2009-11-23 13:38   ` Russell King - ARM Linux
  2009-11-23 15:25     ` saeed bishara
  1 sibling, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 13:38 UTC (permalink / raw)
  To: linux-arm-kernel

Rather than handling all cache maintenance before DMA begins, we
also need to handle cache maintenance upon completion when we
have CPUs which prefetch speculatively.

This renames the dma_cache_maint*() functions, and changes their
parameters to indicate whether we are mapping a DMA buffer.  We
always clean DMA buffers when we map them, but avoid invalidating
them if we are DMA'ing to the device.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Tested-By: Jamie Iles <jamie@jamieiles.com>
---
 arch/arm/include/asm/dma-mapping.h |   32 ++++++++--
 arch/arm/mm/dma-mapping.c          |  115 +++++++++++++++++++-----------------
 2 files changed, 86 insertions(+), 61 deletions(-)

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index e850f5c..ef9a11d 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -60,9 +60,8 @@ static inline dma_addr_t virt_to_dma(struct device *dev, void *addr)
  * Private support functions: these are not part of the API and are
  * liable to change.  Drivers must not use these.
  */
-extern void dma_cache_maint(const void *kaddr, size_t size, int rw);
-extern void dma_cache_maint_page(struct page *page, unsigned long offset,
-				 size_t size, int rw);
+extern void __dma_cache_maint_page(struct page *page, unsigned long offset,
+				   size_t size, void (*op)(const void *, const void *));
 
 /*
  * The DMA API is built upon the notion of "buffer ownership".  A buffer
@@ -70,33 +69,52 @@ extern void dma_cache_maint_page(struct page *page, unsigned long offset,
  * by it) or exclusively owned by the DMA device.  These helper functions
  * represent the transitions between these two ownership states.
  *
+ * Note, however, that on later ARMs, this notion does not work due to
+ * speculative prefetches.  We model our approach on the assumption that
+ * the CPU does do speculative prefetches, which means we clean caches
+ * before transfers and delay cache invalidation until transfer completion.
+ *
  * As above, these are private support functions and not part of the API.
  * Drivers must not use these.
  */
 static inline void __dma_single_cpu_to_dev(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
+	extern void ___dma_single_cpu_to_dev(const void *, size_t,
+		enum dma_data_direction);
+
 	if (!arch_is_coherent())
-		dma_cache_maint(kaddr, size, dir);
+		___dma_single_cpu_to_dev(kaddr, size, dir);
 }
 
 static inline void __dma_single_dev_to_cpu(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
-	/* nothing to do */
+	extern void ___dma_single_dev_to_cpu(const void *, size_t,
+		enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_single_dev_to_cpu(kaddr, size, dir);
 }
 
 static inline void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
+	extern void ___dma_page_cpu_to_dev(struct page *, unsigned long,
+		size_t, enum dma_data_direction);
+
 	if (!arch_is_coherent())
-		dma_cache_maint_page(page, off, size, dir);
+		___dma_page_cpu_to_dev(page, off, size, dir);
 }
 
 static inline void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
-	/* nothing to do */
+	extern void ___dma_page_dev_to_cpu(struct page *, unsigned long,
+		size_t, enum dma_data_direction);
+
+	if (!arch_is_coherent())
+		___dma_page_dev_to_cpu(page, off, size, dir);
 }
 
 /*
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 5bd2e0d..80a4a57 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -539,78 +539,58 @@ core_initcall(consistent_init);
  * platforms with CONFIG_DMABOUNCE.
  * Use the driver DMA support - see dma-mapping.h (dma_sync_*)
  */
-void dma_cache_maint(const void *start, size_t size, int direction)
+void ___dma_single_cpu_to_dev(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
 {
-	void (*inner_op)(const void *, const void *);
-	void (*outer_op)(unsigned long, unsigned long);
-
-	BUG_ON(!virt_addr_valid(start) || !virt_addr_valid(start + size - 1));
-
-	switch (direction) {
-	case DMA_FROM_DEVICE:		/* invalidate only */
-		inner_op = dmac_inv_range;
-		outer_op = outer_inv_range;
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		inner_op = dmac_clean_range;
-		outer_op = outer_clean_range;
-		break;
-	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
-		inner_op = dmac_flush_range;
-		outer_op = outer_flush_range;
-		break;
-	default:
-		BUG();
+	unsigned long paddr;
+
+	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
+
+	paddr = __pa(kaddr);
+	if (dir == DMA_FROM_DEVICE) {
+		outer_inv_range(paddr, paddr + size);
+		dmac_inv_range(kaddr, kaddr + size);
+	} else {
+		dmac_clean_range(kaddr, kaddr + size);
+		outer_clean_range(paddr, paddr + size);
 	}
+}
+EXPORT_SYMBOL(___dma_single_cpu_to_dev);
 
-	inner_op(start, start + size);
-	outer_op(__pa(start), __pa(start) + size);
+void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
+	enum dma_data_direction dir)
+{
+	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
+
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+		unsigned long paddr = __pa(kaddr);
+		outer_inv_range(paddr, paddr + size);
+		dmac_inv_range(kaddr, kaddr + size);
+	}
 }
-EXPORT_SYMBOL(dma_cache_maint);
+EXPORT_SYMBOL(___dma_single_dev_to_cpu);
 
 static void dma_cache_maint_contiguous(struct page *page, unsigned long offset,
-				       size_t size, int direction)
+		       size_t size, void (*op)(const void *, const void *))
 {
 	void *vaddr;
-	unsigned long paddr;
-	void (*inner_op)(const void *, const void *);
-	void (*outer_op)(unsigned long, unsigned long);
-
-	switch (direction) {
-	case DMA_FROM_DEVICE:		/* invalidate only */
-		inner_op = dmac_inv_range;
-		outer_op = outer_inv_range;
-		break;
-	case DMA_TO_DEVICE:		/* writeback only */
-		inner_op = dmac_clean_range;
-		outer_op = outer_clean_range;
-		break;
-	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
-		inner_op = dmac_flush_range;
-		outer_op = outer_flush_range;
-		break;
-	default:
-		BUG();
-	}
 
 	if (!PageHighMem(page)) {
 		vaddr = page_address(page) + offset;
-		inner_op(vaddr, vaddr + size);
+		op(vaddr, vaddr + size);
 	} else {
 		vaddr = kmap_high_get(page);
 		if (vaddr) {
 			vaddr += offset;
-			inner_op(vaddr, vaddr + size);
+			op(vaddr, vaddr + size);
 			kunmap_high(page);
 		}
 	}
-
-	paddr = page_to_phys(page) + offset;
-	outer_op(paddr, paddr + size);
 }
 
-void dma_cache_maint_page(struct page *page, unsigned long offset,
-			  size_t size, int dir)
+void __dma_cache_maint_page(struct page *page, unsigned long offset,
+	size_t size, void (*op)(const void *, const void *))
 {
 	/*
 	 * A single sg entry may refer to multiple physically contiguous
@@ -628,13 +608,40 @@ void dma_cache_maint_page(struct page *page, unsigned long offset,
 			}
 			len = PAGE_SIZE - offset;
 		}
-		dma_cache_maint_contiguous(page, offset, len, dir);
+		dma_cache_maint_contiguous(page, offset, len, op);
 		offset = 0;
 		page++;
 		left -= len;
 	} while (left);
 }
-EXPORT_SYMBOL(dma_cache_maint_page);
+
+void ___dma_page_cpu_to_dev(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr = page_to_phys(page) + off;
+
+	if (dir == DMA_FROM_DEVICE) {
+		outer_inv_range(paddr, paddr + size);
+		__dma_cache_maint_page(page, off, size, dmac_inv_range);
+	} else {
+		__dma_cache_maint_page(page, off, size, dmac_clean_range);
+		outer_clean_range(paddr, paddr + size);
+	}
+}
+EXPORT_SYMBOL(___dma_page_cpu_to_dev);
+
+void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
+	size_t size, enum dma_data_direction dir)
+{
+	unsigned long paddr = page_to_phys(page) + off;
+
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+		outer_inv_range(paddr, paddr + size);
+		__dma_cache_maint_page(page, off, size, dmac_inv_range);
+	}
+}
+EXPORT_SYMBOL(___dma_page_dev_to_cpu);
 
 /**
  * dma_map_sg - map a set of SG buffers for streaming mode DMA
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-20 18:07 ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate Russell King - ARM Linux
  2009-11-22 22:16   ` Nicolas Pitre
  2009-11-23 12:09   ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines " Catalin Marinas
@ 2009-11-23 13:38   ` Russell King - ARM Linux
  2009-11-23 19:35     ` Nicolas Pitre
  2009-11-25 12:19     ` Catalin Marinas
  2 siblings, 2 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 13:38 UTC (permalink / raw)
  To: linux-arm-kernel

Since we now clean the DMA buffers on map, there's no need to clean
overlapping cache lines on invalidation anymore.  (Note: the DMA API
prohibits other data sharing the same cache line as a DMA buffer
anyway.)

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Tested-By: Jamie Iles <jamie@jamieiles.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/mm/cache-fa.S          |    5 -----
 arch/arm/mm/cache-feroceon-l2.c |   19 +++----------------
 arch/arm/mm/cache-l2x0.c        |   12 +-----------
 arch/arm/mm/cache-v4wb.S        |    5 -----
 arch/arm/mm/cache-v6.S          |   13 -------------
 arch/arm/mm/cache-v7.S          |    6 ------
 arch/arm/mm/proc-arm1020.S      |    8 --------
 arch/arm/mm/proc-arm1020e.S     |    4 ----
 arch/arm/mm/proc-arm1022.S      |    4 ----
 arch/arm/mm/proc-arm1026.S      |    4 ----
 arch/arm/mm/proc-arm920.S       |    4 ----
 arch/arm/mm/proc-arm922.S       |    4 ----
 arch/arm/mm/proc-arm925.S       |    6 ------
 arch/arm/mm/proc-arm926.S       |    6 ------
 arch/arm/mm/proc-arm946.S       |    7 -------
 arch/arm/mm/proc-feroceon.S     |    8 --------
 arch/arm/mm/proc-mohawk.S       |    4 ----
 arch/arm/mm/proc-xsc3.S         |    4 ----
 arch/arm/mm/proc-xscale.S       |    4 ----
 19 files changed, 4 insertions(+), 123 deletions(-)

diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
index b63a8f7..1711386 100644
--- a/arch/arm/mm/cache-fa.S
+++ b/arch/arm/mm/cache-fa.S
@@ -157,12 +157,7 @@ ENTRY(fa_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(fa_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	bic	r1, r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index 6e77c04..6834e07 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -163,26 +163,13 @@ static unsigned long calc_range_end(unsigned long start, unsigned long end)
 
 static void feroceon_l2_inv_range(unsigned long start, unsigned long end)
 {
-	/*
-	 * Clean and invalidate partial first cache line.
-	 */
-	if (start & (CACHE_LINE_SIZE - 1)) {
-		l2_clean_inv_pa(start & ~(CACHE_LINE_SIZE - 1));
-		start = (start | (CACHE_LINE_SIZE - 1)) + 1;
-	}
-
-	/*
-	 * Clean and invalidate partial last cache line.
-	 */
-	if (start < end && end & (CACHE_LINE_SIZE - 1)) {
-		l2_clean_inv_pa(end & ~(CACHE_LINE_SIZE - 1));
-		end &= ~(CACHE_LINE_SIZE - 1);
-	}
+	start &= ~(CACHE_LINE_SIZE - 1);
+	end = (end + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
 
 	/*
 	 * Invalidate all full cache lines between 'start' and 'end'.
 	 */
-	while (start < end) {
+	while (start != end) {
 		unsigned long range_end = calc_range_end(start, end);
 		l2_inv_pa_range(start, range_end - CACHE_LINE_SIZE);
 		start = range_end;
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index b480f1d..a726b57 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -57,17 +57,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
 {
 	unsigned long addr;
 
-	if (start & (CACHE_LINE_SIZE - 1)) {
-		start &= ~(CACHE_LINE_SIZE - 1);
-		sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1);
-		start += CACHE_LINE_SIZE;
-	}
-
-	if (end & (CACHE_LINE_SIZE - 1)) {
-		end &= ~(CACHE_LINE_SIZE - 1);
-		sync_writel(end, L2X0_CLEAN_INV_LINE_PA, 1);
-	}
-
+	start &= ~(CACHE_LINE_SIZE - 1);
 	for (addr = start; addr < end; addr += CACHE_LINE_SIZE)
 		sync_writel(addr, L2X0_INV_LINE_PA, 1);
 	cache_sync();
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2ebc1b3..553931a 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -173,16 +173,11 @@ ENTRY(v4wb_coherent_user_range)
  *	- end	 - virtual end address
  */
 ENTRY(v4wb_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 295e25d..d1dfd87 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -195,20 +195,7 @@ ENTRY(v6_flush_kern_dcache_page)
  *	- end     - virtual end address of region
  */
 ENTRY(v6_dma_inv_range)
-	tst	r0, #D_CACHE_LINE_SIZE - 1
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D line
-#else
-	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
-#endif
-	tst	r1, #D_CACHE_LINE_SIZE - 1
-	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
-#ifdef HARVARD_CACHE
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
-#else
-	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
-#endif
 1:
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index e1bd975..893ee59 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -218,13 +218,7 @@ ENDPROC(v7_flush_kern_dcache_page)
 ENTRY(v7_dma_inv_range)
 	dcache_line_size r2, r3
 	sub	r3, r2, #1
-	tst	r0, r3
 	bic	r0, r0, r3
-	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
-
-	tst	r1, r3
-	bic	r1, r1, r3
-	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D / U line
 1:
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D / U line
 	add	r0, r0, r2
diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
index d9fb4b9..7bbf624 100644
--- a/arch/arm/mm/proc-arm1020.S
+++ b/arch/arm/mm/proc-arm1020.S
@@ -267,15 +267,7 @@ ENTRY(arm1020_flush_kern_dcache_page)
 ENTRY(arm1020_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, ip, c7, c10, 4
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
index 7453b75..d379cb7 100644
--- a/arch/arm/mm/proc-arm1020e.S
+++ b/arch/arm/mm/proc-arm1020e.S
@@ -260,11 +260,7 @@ ENTRY(arm1020e_flush_kern_dcache_page)
 ENTRY(arm1020e_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
index 8eb72d7..f5a7949 100644
--- a/arch/arm/mm/proc-arm1022.S
+++ b/arch/arm/mm/proc-arm1022.S
@@ -249,11 +249,7 @@ ENTRY(arm1022_flush_kern_dcache_page)
 ENTRY(arm1022_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
index 3b59f0d..1dc26f8 100644
--- a/arch/arm/mm/proc-arm1026.S
+++ b/arch/arm/mm/proc-arm1026.S
@@ -243,11 +243,7 @@ ENTRY(arm1026_flush_kern_dcache_page)
 ENTRY(arm1026_dma_inv_range)
 	mov	ip, #0
 #ifndef CONFIG_CPU_DCACHE_DISABLE
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
index 2b7c197..078a873 100644
--- a/arch/arm/mm/proc-arm920.S
+++ b/arch/arm/mm/proc-arm920.S
@@ -239,11 +239,7 @@ ENTRY(arm920_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm920_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
index 06a1aa4..22ca857 100644
--- a/arch/arm/mm/proc-arm922.S
+++ b/arch/arm/mm/proc-arm922.S
@@ -241,11 +241,7 @@ ENTRY(arm922_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm922_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
index cb53435..ff04299 100644
--- a/arch/arm/mm/proc-arm925.S
+++ b/arch/arm/mm/proc-arm925.S
@@ -283,12 +283,6 @@ ENTRY(arm925_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm925_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
index 1c48487..4b4c717 100644
--- a/arch/arm/mm/proc-arm926.S
+++ b/arch/arm/mm/proc-arm926.S
@@ -246,12 +246,6 @@ ENTRY(arm926_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(arm926_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
index 40c0449..589a61c 100644
--- a/arch/arm/mm/proc-arm946.S
+++ b/arch/arm/mm/proc-arm946.S
@@ -215,18 +215,11 @@ ENTRY(arm946_flush_kern_dcache_page)
  * (same as arm926)
  */
 ENTRY(arm946_dma_inv_range)
-#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
-#endif
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
 	blo	1b
-	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
 	mov	pc, lr
 
 /*
diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
index d0d7795..1262b92 100644
--- a/arch/arm/mm/proc-feroceon.S
+++ b/arch/arm/mm/proc-feroceon.S
@@ -274,11 +274,7 @@ ENTRY(feroceon_range_flush_kern_dcache_page)
  */
 	.align	5
 ENTRY(feroceon_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
 	bic	r0, r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
 	cmp	r0, r1
@@ -289,10 +285,6 @@ ENTRY(feroceon_dma_inv_range)
 	.align	5
 ENTRY(feroceon_range_dma_inv_range)
 	mrs	r2, cpsr
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 	cmp	r1, r0
 	subne	r1, r1, #1			@ top address is inclusive
 	orr	r3, r2, #PSR_I_BIT
diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
index 52b5fd7..191ea6d 100644
--- a/arch/arm/mm/proc-mohawk.S
+++ b/arch/arm/mm/proc-mohawk.S
@@ -218,10 +218,6 @@ ENTRY(mohawk_flush_kern_dcache_page)
  * (same as v4wb)
  */
 ENTRY(mohawk_dma_inv_range)
-	tst	r0, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHE_DLINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 	bic	r0, r0, #CACHE_DLINESIZE - 1
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHE_DLINESIZE
diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
index 2028f37..2c1ac69 100644
--- a/arch/arm/mm/proc-xsc3.S
+++ b/arch/arm/mm/proc-xsc3.S
@@ -257,11 +257,7 @@ ENTRY(xsc3_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xsc3_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean L1 D line
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean L1 D line
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate L1 D line
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index f056c28..3170348 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -315,11 +315,7 @@ ENTRY(xscale_flush_kern_dcache_page)
  *	- end	 - virtual end address
  */
 ENTRY(xscale_dma_inv_range)
-	tst	r0, #CACHELINESIZE - 1
 	bic	r0, r0, #CACHELINESIZE - 1
-	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
-	tst	r1, #CACHELINESIZE - 1
-	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
 1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
 	add	r0, r0, #CACHELINESIZE
 	cmp	r0, r1
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-23 13:38   ` [PATCH v2 06/17] " Russell King - ARM Linux
@ 2009-11-23 15:25     ` saeed bishara
  2009-11-23 17:27       ` Russell King - ARM Linux
  0 siblings, 1 reply; 65+ messages in thread
From: saeed bishara @ 2009-11-23 15:25 UTC (permalink / raw)
  To: linux-arm-kernel

> +	if (dir == DMA_FROM_DEVICE) {
> +		outer_inv_range(paddr, paddr + size);
> +		dmac_inv_range(kaddr, kaddr + size);
It's not clear why the outer cache is invalidated before the inner
cache, and I think it may be incorrect: how can you be sure that a
dirty line in the inner cache can't be moved down to the outer cache?
> +	} else {
> +		dmac_clean_range(kaddr, kaddr + size);
> +		outer_clean_range(paddr, paddr + size);
>  	}
> +}
> +EXPORT_SYMBOL(___dma_single_cpu_to_dev);
>
> -	inner_op(start, start + size);
> -	outer_op(__pa(start), __pa(start) + size);
> +void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
> +	enum dma_data_direction dir)
> +{
> +	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
> +
> +	/* don't bother invalidating if DMA to device */
> +	if (dir != DMA_TO_DEVICE) {
> +		unsigned long paddr = __pa(kaddr);
> +		outer_inv_range(paddr, paddr + size);
> +		dmac_inv_range(kaddr, kaddr + size);
> +	}
That code implies that the cache invalidate will also be applied on
CPUs that don't suffer from the speculative prefetch issue, right?
>  }
> -EXPORT_SYMBOL(dma_cache_maint);
> +EXPORT_SYMBOL(___dma_single_dev_to_cpu);
>
>  static void dma_cache_maint_contiguous(struct page *page, unsigned long offset,
> -				       size_t size, int direction)
> +		       size_t size, void (*op)(const void *, const void *))
>  {
>  	void *vaddr;
> -	unsigned long paddr;
> -	void (*inner_op)(const void *, const void *);
> -	void (*outer_op)(unsigned long, unsigned long);
> -
> -	switch (direction) {
> -	case DMA_FROM_DEVICE:		/* invalidate only */
> -		inner_op = dmac_inv_range;
> -		outer_op = outer_inv_range;
> -		break;
> -	case DMA_TO_DEVICE:		/* writeback only */
> -		inner_op = dmac_clean_range;
> -		outer_op = outer_clean_range;
> -		break;
> -	case DMA_BIDIRECTIONAL:		/* writeback and invalidate */
> -		inner_op = dmac_flush_range;
> -		outer_op = outer_flush_range;
> -		break;
> -	default:
> -		BUG();
> -	}
>
>  	if (!PageHighMem(page)) {
>  		vaddr = page_address(page) + offset;
> -		inner_op(vaddr, vaddr + size);
> +		op(vaddr, vaddr + size);
>  	} else {
>  		vaddr = kmap_high_get(page);
>  		if (vaddr) {
>  			vaddr += offset;
> -			inner_op(vaddr, vaddr + size);
> +			op(vaddr, vaddr + size);
>  			kunmap_high(page);
>  		}
>  	}
> -
> -	paddr = page_to_phys(page) + offset;
> -	outer_op(paddr, paddr + size);
>  }
>
> -void dma_cache_maint_page(struct page *page, unsigned long offset,
> -			  size_t size, int dir)
> +void __dma_cache_maint_page(struct page *page, unsigned long offset,
> +	size_t size, void (*op)(const void *, const void *))
>  {
>  	/*
>  	 * A single sg entry may refer to multiple physically contiguous
> @@ -628,13 +608,40 @@ void dma_cache_maint_page(struct page *page, unsigned long offset,
>  			}
>  			len = PAGE_SIZE - offset;
>  		}
> -		dma_cache_maint_contiguous(page, offset, len, dir);
> +		dma_cache_maint_contiguous(page, offset, len, op);
>  		offset = 0;
>  		page++;
>  		left -= len;
>  	} while (left);
>  }
> -EXPORT_SYMBOL(dma_cache_maint_page);
> +
> +void ___dma_page_cpu_to_dev(struct page *page, unsigned long off,
> +	size_t size, enum dma_data_direction dir)
> +{
> +	unsigned long paddr = page_to_phys(page) + off;
> +
> +	if (dir == DMA_FROM_DEVICE) {
> +		outer_inv_range(paddr, paddr + size);
> +		__dma_cache_maint_page(page, off, size, dmac_inv_range);
> +	} else {
> +		__dma_cache_maint_page(page, off, size, dmac_clean_range);
> +		outer_clean_range(paddr, paddr + size);
> +	}
> +}
> +EXPORT_SYMBOL(___dma_page_cpu_to_dev);
> +
> +void ___dma_page_dev_to_cpu(struct page *page, unsigned long off,
> +	size_t size, enum dma_data_direction dir)
> +{
> +	unsigned long paddr = page_to_phys(page) + off;
> +
> +	/* don't bother invalidating if DMA to device */
> +	if (dir != DMA_TO_DEVICE) {
> +		outer_inv_range(paddr, paddr + size);
> +		__dma_cache_maint_page(page, off, size, dmac_inv_range);
> +	}
> +}
> +EXPORT_SYMBOL(___dma_page_dev_to_cpu);
>
>  /**
>  * dma_map_sg - map a set of SG buffers for streaming mode DMA
> --
> 1.6.2.5

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-23 15:25     ` saeed bishara
@ 2009-11-23 17:27       ` Russell King - ARM Linux
  2009-11-24 15:46         ` saeed bishara
  2009-11-24 16:47         ` Catalin Marinas
  0 siblings, 2 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-23 17:27 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> > +	if (dir == DMA_FROM_DEVICE) {
> > +		outer_inv_range(paddr, paddr + size);
> > +		dmac_inv_range(kaddr, kaddr + size);
> It's not clear why the outer cache is invalidated before the inner
> cache, and I think it may be incorrect: how can you be sure that a
> dirty line in the inner cache can't be moved down to the outer cache?

I think you have a point, but also see Catalin's point.  When we are
invalidating, we have to invalidate the outer caches before the inner
caches, otherwise there is a danger that the inner caches could
prefetch from the outer between the two operations.

I think that the only thing we can do is as my previous version did,
and always clean the caches at this point - first the inner and then
the outer caches.

The only other alternative would be to invalidate the outer cache,
then the inner cache, followed again by the outer cache.  I don't
currently have a feeling for how expensive that would be against
just cleaning them.
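
That is, for the unmap-time invalidate, something like this (a sketch
only, untested):

	/* sketch: outer/inner/outer invalidation, untested */
	outer_inv_range(paddr, paddr + size);	/* discard stale outer lines */
	dmac_inv_range(kaddr, kaddr + size);	/* discard inner lines */
	outer_inv_range(paddr, paddr + size);	/* catch lines evicted from
						   the inner cache meanwhile */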

Catalin, any thoughts?


> > +void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
> > +	enum dma_data_direction dir)
> > +{
> > +	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
> > +
> > +	/* don't bother invalidating if DMA to device */
> > +	if (dir != DMA_TO_DEVICE) {
> > +		unsigned long paddr = __pa(kaddr);
> > +		outer_inv_range(paddr, paddr + size);
> > +		dmac_inv_range(kaddr, kaddr + size);
> > +	}
> That code implies that the cache invalidate will also be applied on
> CPUs that don't suffer from the speculative prefetch issue, right?

Yes, but we need to do this for bidirectional mappings as well (which
would've only been cleaned.)

What would be useful is if someone could run some tests on the performance
impact of this (eg, measuring the DMA performance of a SATA hard drive)
and comparing the resulting performance.

I _could_ do that, but it would mean taking down my network and running
the tests on a fairly old machine.

If it does turn out to be a problem, we can change the 'inv_range' and
'clean_range' functions to be 'dev_to_cpu' and 'cpu_to_dev' functions,
and push the decisions about what to do at each point down into the
per-CPU assembly.
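
To illustrate (a hypothetical sketch, assuming the existing entries
are renamed rather than kept):

	/* hypothetical: would replace dmac_inv_range/dmac_clean_range */
	extern void dmac_cpu_to_dev(const void *start, size_t size,
				    enum dma_data_direction dir);
	extern void dmac_dev_to_cpu(const void *start, size_t size,
				    enum dma_data_direction dir);

A CPU without speculative prefetch could then make its dev_to_cpu
entry a no-op where the mapping was already invalidated at map time.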

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-23 13:38   ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines " Russell King - ARM Linux
@ 2009-11-23 19:35     ` Nicolas Pitre
  2009-11-25 12:19     ` Catalin Marinas
  1 sibling, 0 replies; 65+ messages in thread
From: Nicolas Pitre @ 2009-11-23 19:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 23 Nov 2009, Russell King - ARM Linux wrote:

> Since we now clean the DMA buffers on map, there's no need to clean
> overlapping cache lines on invalidation anymore.  (Note: the DMA API
> prohibits other data sharing the same cache line as a DMA buffer
> anyway.)
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> Tested-By: Jamie Iles <jamie@jamieiles.com>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Acked-by: Nicolas Pitre <nico@marvell.com>


> ---
>  arch/arm/mm/cache-fa.S          |    5 -----
>  arch/arm/mm/cache-feroceon-l2.c |   19 +++----------------
>  arch/arm/mm/cache-l2x0.c        |   12 +-----------
>  arch/arm/mm/cache-v4wb.S        |    5 -----
>  arch/arm/mm/cache-v6.S          |   13 -------------
>  arch/arm/mm/cache-v7.S          |    6 ------
>  arch/arm/mm/proc-arm1020.S      |    8 --------
>  arch/arm/mm/proc-arm1020e.S     |    4 ----
>  arch/arm/mm/proc-arm1022.S      |    4 ----
>  arch/arm/mm/proc-arm1026.S      |    4 ----
>  arch/arm/mm/proc-arm920.S       |    4 ----
>  arch/arm/mm/proc-arm922.S       |    4 ----
>  arch/arm/mm/proc-arm925.S       |    6 ------
>  arch/arm/mm/proc-arm926.S       |    6 ------
>  arch/arm/mm/proc-arm946.S       |    7 -------
>  arch/arm/mm/proc-feroceon.S     |    8 --------
>  arch/arm/mm/proc-mohawk.S       |    4 ----
>  arch/arm/mm/proc-xsc3.S         |    4 ----
>  arch/arm/mm/proc-xscale.S       |    4 ----
>  19 files changed, 4 insertions(+), 123 deletions(-)
> 
> diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S
> index b63a8f7..1711386 100644
> --- a/arch/arm/mm/cache-fa.S
> +++ b/arch/arm/mm/cache-fa.S
> @@ -157,12 +157,7 @@ ENTRY(fa_flush_kern_dcache_page)
>   *	- end	 - virtual end address
>   */
>  ENTRY(fa_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	bic	r1, r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
> index 6e77c04..6834e07 100644
> --- a/arch/arm/mm/cache-feroceon-l2.c
> +++ b/arch/arm/mm/cache-feroceon-l2.c
> @@ -163,26 +163,13 @@ static unsigned long calc_range_end(unsigned long start, unsigned long end)
>  
>  static void feroceon_l2_inv_range(unsigned long start, unsigned long end)
>  {
> -	/*
> -	 * Clean and invalidate partial first cache line.
> -	 */
> -	if (start & (CACHE_LINE_SIZE - 1)) {
> -		l2_clean_inv_pa(start & ~(CACHE_LINE_SIZE - 1));
> -		start = (start | (CACHE_LINE_SIZE - 1)) + 1;
> -	}
> -
> -	/*
> -	 * Clean and invalidate partial last cache line.
> -	 */
> -	if (start < end && end & (CACHE_LINE_SIZE - 1)) {
> -		l2_clean_inv_pa(end & ~(CACHE_LINE_SIZE - 1));
> -		end &= ~(CACHE_LINE_SIZE - 1);
> -	}
> +	start &= ~(CACHE_LINE_SIZE - 1);
> +	end = (end + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
>  
>  	/*
>  	 * Invalidate all full cache lines between 'start' and 'end'.
>  	 */
> -	while (start < end) {
> +	while (start != end) {
>  		unsigned long range_end = calc_range_end(start, end);
>  		l2_inv_pa_range(start, range_end - CACHE_LINE_SIZE);
>  		start = range_end;
> diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
> index b480f1d..a726b57 100644
> --- a/arch/arm/mm/cache-l2x0.c
> +++ b/arch/arm/mm/cache-l2x0.c
> @@ -57,17 +57,7 @@ static void l2x0_inv_range(unsigned long start, unsigned long end)
>  {
>  	unsigned long addr;
>  
> -	if (start & (CACHE_LINE_SIZE - 1)) {
> -		start &= ~(CACHE_LINE_SIZE - 1);
> -		sync_writel(start, L2X0_CLEAN_INV_LINE_PA, 1);
> -		start += CACHE_LINE_SIZE;
> -	}
> -
> -	if (end & (CACHE_LINE_SIZE - 1)) {
> -		end &= ~(CACHE_LINE_SIZE - 1);
> -		sync_writel(end, L2X0_CLEAN_INV_LINE_PA, 1);
> -	}
> -
> +	start &= ~(CACHE_LINE_SIZE - 1);
>  	for (addr = start; addr < end; addr += CACHE_LINE_SIZE)
>  		sync_writel(addr, L2X0_INV_LINE_PA, 1);
>  	cache_sync();
> diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
> index 2ebc1b3..553931a 100644
> --- a/arch/arm/mm/cache-v4wb.S
> +++ b/arch/arm/mm/cache-v4wb.S
> @@ -173,16 +173,11 @@ ENTRY(v4wb_coherent_user_range)
>   *	- end	 - virtual end address
>   */
>  ENTRY(v4wb_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
>  	blo	1b
> -	mcr	p15, 0, r0, c7, c10, 4		@ drain write buffer
>  	mov	pc, lr
>  
>  /*
> diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
> index 295e25d..d1dfd87 100644
> --- a/arch/arm/mm/cache-v6.S
> +++ b/arch/arm/mm/cache-v6.S
> @@ -195,20 +195,7 @@ ENTRY(v6_flush_kern_dcache_page)
>   *	- end     - virtual end address of region
>   */
>  ENTRY(v6_dma_inv_range)
> -	tst	r0, #D_CACHE_LINE_SIZE - 1
>  	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
> -#ifdef HARVARD_CACHE
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D line
> -#else
> -	mcrne	p15, 0, r0, c7, c11, 1		@ clean unified line
> -#endif
> -	tst	r1, #D_CACHE_LINE_SIZE - 1
> -	bic	r1, r1, #D_CACHE_LINE_SIZE - 1
> -#ifdef HARVARD_CACHE
> -	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D line
> -#else
> -	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
> -#endif
>  1:
>  #ifdef HARVARD_CACHE
>  	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index e1bd975..893ee59 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -218,13 +218,7 @@ ENDPROC(v7_flush_kern_dcache_page)
>  ENTRY(v7_dma_inv_range)
>  	dcache_line_size r2, r3
>  	sub	r3, r2, #1
> -	tst	r0, r3
>  	bic	r0, r0, r3
> -	mcrne	p15, 0, r0, c7, c14, 1		@ clean & invalidate D / U line
> -
> -	tst	r1, r3
> -	bic	r1, r1, r3
> -	mcrne	p15, 0, r1, c7, c14, 1		@ clean & invalidate D / U line
>  1:
>  	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D / U line
>  	add	r0, r0, r2
> diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S
> index d9fb4b9..7bbf624 100644
> --- a/arch/arm/mm/proc-arm1020.S
> +++ b/arch/arm/mm/proc-arm1020.S
> @@ -267,15 +267,7 @@ ENTRY(arm1020_flush_kern_dcache_page)
>  ENTRY(arm1020_dma_inv_range)
>  	mov	ip, #0
>  #ifndef CONFIG_CPU_DCACHE_DISABLE
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, ip, c7, c10, 4
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, ip, c7, c10, 4
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
> -	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S
> index 7453b75..d379cb7 100644
> --- a/arch/arm/mm/proc-arm1020e.S
> +++ b/arch/arm/mm/proc-arm1020e.S
> @@ -260,11 +260,7 @@ ENTRY(arm1020e_flush_kern_dcache_page)
>  ENTRY(arm1020e_dma_inv_range)
>  	mov	ip, #0
>  #ifndef CONFIG_CPU_DCACHE_DISABLE
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S
> index 8eb72d7..f5a7949 100644
> --- a/arch/arm/mm/proc-arm1022.S
> +++ b/arch/arm/mm/proc-arm1022.S
> @@ -249,11 +249,7 @@ ENTRY(arm1022_flush_kern_dcache_page)
>  ENTRY(arm1022_dma_inv_range)
>  	mov	ip, #0
>  #ifndef CONFIG_CPU_DCACHE_DISABLE
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S
> index 3b59f0d..1dc26f8 100644
> --- a/arch/arm/mm/proc-arm1026.S
> +++ b/arch/arm/mm/proc-arm1026.S
> @@ -243,11 +243,7 @@ ENTRY(arm1026_flush_kern_dcache_page)
>  ENTRY(arm1026_dma_inv_range)
>  	mov	ip, #0
>  #ifndef CONFIG_CPU_DCACHE_DISABLE
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S
> index 2b7c197..078a873 100644
> --- a/arch/arm/mm/proc-arm920.S
> +++ b/arch/arm/mm/proc-arm920.S
> @@ -239,11 +239,7 @@ ENTRY(arm920_flush_kern_dcache_page)
>   * (same as v4wb)
>   */
>  ENTRY(arm920_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S
> index 06a1aa4..22ca857 100644
> --- a/arch/arm/mm/proc-arm922.S
> +++ b/arch/arm/mm/proc-arm922.S
> @@ -241,11 +241,7 @@ ENTRY(arm922_flush_kern_dcache_page)
>   * (same as v4wb)
>   */
>  ENTRY(arm922_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S
> index cb53435..ff04299 100644
> --- a/arch/arm/mm/proc-arm925.S
> +++ b/arch/arm/mm/proc-arm925.S
> @@ -283,12 +283,6 @@ ENTRY(arm925_flush_kern_dcache_page)
>   * (same as v4wb)
>   */
>  ENTRY(arm925_dma_inv_range)
> -#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
> -	tst	r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
> -#endif
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
> diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S
> index 1c48487..4b4c717 100644
> --- a/arch/arm/mm/proc-arm926.S
> +++ b/arch/arm/mm/proc-arm926.S
> @@ -246,12 +246,6 @@ ENTRY(arm926_flush_kern_dcache_page)
>   * (same as v4wb)
>   */
>  ENTRY(arm926_dma_inv_range)
> -#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
> -	tst	r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
> -#endif
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
> diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S
> index 40c0449..589a61c 100644
> --- a/arch/arm/mm/proc-arm946.S
> +++ b/arch/arm/mm/proc-arm946.S
> @@ -215,18 +215,11 @@ ENTRY(arm946_flush_kern_dcache_page)
>   * (same as arm926)
>   */
>  ENTRY(arm946_dma_inv_range)
> -#ifndef CONFIG_CPU_DCACHE_WRITETHROUGH
> -	tst	r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
> -#endif
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
>  	blo	1b
> -	mcr	p15, 0, r0, c7, c10, 4		@ drain WB
>  	mov	pc, lr
>  
>  /*
> diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
> index d0d7795..1262b92 100644
> --- a/arch/arm/mm/proc-feroceon.S
> +++ b/arch/arm/mm/proc-feroceon.S
> @@ -274,11 +274,7 @@ ENTRY(feroceon_range_flush_kern_dcache_page)
>   */
>  	.align	5
>  ENTRY(feroceon_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
>  	cmp	r0, r1
> @@ -289,10 +285,6 @@ ENTRY(feroceon_dma_inv_range)
>  	.align	5
>  ENTRY(feroceon_range_dma_inv_range)
>  	mrs	r2, cpsr
> -	tst	r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  	cmp	r1, r0
>  	subne	r1, r1, #1			@ top address is inclusive
>  	orr	r3, r2, #PSR_I_BIT
> diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S
> index 52b5fd7..191ea6d 100644
> --- a/arch/arm/mm/proc-mohawk.S
> +++ b/arch/arm/mm/proc-mohawk.S
> @@ -218,10 +218,6 @@ ENTRY(mohawk_flush_kern_dcache_page)
>   * (same as v4wb)
>   */
>  ENTRY(mohawk_dma_inv_range)
> -	tst	r0, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHE_DLINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  	bic	r0, r0, #CACHE_DLINESIZE - 1
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHE_DLINESIZE
> diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S
> index 2028f37..2c1ac69 100644
> --- a/arch/arm/mm/proc-xsc3.S
> +++ b/arch/arm/mm/proc-xsc3.S
> @@ -257,11 +257,7 @@ ENTRY(xsc3_flush_kern_dcache_page)
>   *	- end	 - virtual end address
>   */
>  ENTRY(xsc3_dma_inv_range)
> -	tst	r0, #CACHELINESIZE - 1
>  	bic	r0, r0, #CACHELINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean L1 D line
> -	tst	r1, #CACHELINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean L1 D line
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate L1 D line
>  	add	r0, r0, #CACHELINESIZE
>  	cmp	r0, r1
> diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
> index f056c28..3170348 100644
> --- a/arch/arm/mm/proc-xscale.S
> +++ b/arch/arm/mm/proc-xscale.S
> @@ -315,11 +315,7 @@ ENTRY(xscale_flush_kern_dcache_page)
>   *	- end	 - virtual end address
>   */
>  ENTRY(xscale_dma_inv_range)
> -	tst	r0, #CACHELINESIZE - 1
>  	bic	r0, r0, #CACHELINESIZE - 1
> -	mcrne	p15, 0, r0, c7, c10, 1		@ clean D entry
> -	tst	r1, #CACHELINESIZE - 1
> -	mcrne	p15, 0, r1, c7, c10, 1		@ clean D entry
>  1:	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D entry
>  	add	r0, r0, #CACHELINESIZE
>  	cmp	r0, r1
> -- 
> 1.6.2.5
> 

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-23 17:27       ` Russell King - ARM Linux
@ 2009-11-24 15:46         ` saeed bishara
  2009-11-24 16:22           ` Russell King - ARM Linux
  2009-11-24 16:47         ` Catalin Marinas
  1 sibling, 1 reply; 65+ messages in thread
From: saeed bishara @ 2009-11-24 15:46 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, Nov 23, 2009 at 7:27 PM, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
>> > +	if (dir == DMA_FROM_DEVICE) {
>> > +		outer_inv_range(paddr, paddr + size);
>> > +		dmac_inv_range(kaddr, kaddr + size);
>> It's not clear why the outer cache is invalidated before the inner
>> cache, and I think it may be incorrect: how can you be sure that a
>> dirty line in the inner cache can't be moved down to the outer cache?
>
> I think you have a point, but also see Catalin's point.  When we are
> invalidating, we have to invalidate the outer caches before the inner
> caches, otherwise there is a danger that the inner caches could
> prefetch from the outer between the two operations.
>
> I think that the only thing we can do is as my previous version did,
> and always clean the caches at this point - first the inner and then
> the outer caches.
This makes sense. If a specific CPU needs the outer cache to be
invalidated before the inner one, then that had better be done in a
separate patch, as this is a different issue and has nothing to do with
the speculative prefetch.
>
> The only other alternative would be to invalidate the outer cache,
> then the inner cache, followed again by the outer cache.  I don't
> currently have a feeling for how expensive that would be against
> just cleaning them.
>
> Catalin, any thoughts?
>
>
>> > +void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
>> > +       enum dma_data_direction dir)
>> > +{
>> > +       BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
>> > +
>> > +       /* don't bother invalidating if DMA to device */
>> > +       if (dir != DMA_TO_DEVICE) {
>> > +               unsigned long paddr = __pa(kaddr);
>> > +               outer_inv_range(paddr, paddr + size);
>> > +               dmac_inv_range(kaddr, kaddr + size);
>> > +       }
>> that code implies that the cache invalidate will also be applied for CPUs
>> that don't suffer from the speculative prefetch issue, right?
>
> Yes, but we need to do this for bidirectional mappings as well (which
> would've only been cleaned.)
>
> What would be useful is if someone could run some tests on the performance
> impact of this (eg, measuring the DMA performance of a SATA hard drive)
> and comparing the resulting performance.
>
> I _could_ do that, but it would mean taking down my network and running
> the tests on a fairly old machine.
I want to help with that testing, is there any git tree where I can
pull the code from?
>
> If it does turn out to be a problem, we can change the 'inv_range' and
> 'clean_range' functions to be 'dev_to_cpu' and 'cpu_to_dev' functions,
> and push the decisions about what to do at each point down into the
> per-CPU assembly.
The thing is that inv_range (or dev_to_cpu) is called both before and
after the DMA operation, and for systems that don't have speculative
prefetch, we don't want that function to be called at all after the
DMA operation.
>
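As a minimal sketch of that asymmetry (note: cpu_has_speculative_prefetch()
is a made-up predicate here, not a real kernel API - the real decision
would live in the per-CPU code):

static void dma_dev_to_cpu(const void *kaddr, size_t size,
		enum dma_data_direction dir)
{
	unsigned long paddr = __pa(kaddr);

	/* nothing to do after the DMA on CPUs that never speculate */
	if (dir == DMA_TO_DEVICE || !cpu_has_speculative_prefetch())
		return;

	/* outer first, so the inner cache cannot refill from stale L2 data */
	outer_inv_range(paddr, paddr + size);
	dmac_inv_range(kaddr, kaddr + size);
}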

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 15:46         ` saeed bishara
@ 2009-11-24 16:22           ` Russell King - ARM Linux
  2009-11-24 16:40             ` saeed bishara
                               ` (2 more replies)
  0 siblings, 3 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-24 16:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 24, 2009 at 05:46:02PM +0200, saeed bishara wrote:
> On Mon, Nov 23, 2009 at 7:27 PM, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> >> > +       if (dir == DMA_FROM_DEVICE) {
> >> > +               outer_inv_range(paddr, paddr + size);
> >> > +               dmac_inv_range(kaddr, kaddr + size);
> >> it's not clear why outer cache invalidated before inner cache, and, I
> >> think that it may be incorrect, how can you be sure that a dirty line
> >> that is in inner cache can't be moved down to outer cache?
> >
> > I think you have a point, but also see Catalin's point.  When we are
> > invalidating, we have to invalidate the outer caches before the inner
> > caches, otherwise there is a danger that the inner caches could
> > prefetch from the outer between the two operations.
> >
> > I think that the only thing we can do is as my previous version did,
> > and always clean the caches at this point - first the inner and then
> > the outer caches.
>
> This makes sense. If a specific CPU needs the outer cache to be
> invalidated before the inner one, then that had better be done in a
> separate patch, as this is a different issue and has nothing to do with
> the speculative prefetch.

That will mean redoing most of the patch series, which won't be pretty.

> > Yes, but we need to do this for bidirectional mappings as well (which
> > would've only been cleaned.)
> >
> > What would be useful is if someone could run some tests on the performance
> > impact of this (eg, measuring the DMA performance of a SATA hard drive)
> > and comparing the resulting performance.
> >
> > I _could_ do that, but it would mean taking down my network and running
> > the tests on a fairly old machine.
>
> I want to help with that testing, is there any git tree where I can
> pull the code from?

There isn't, because apparently under git rules, as soon as I publish it
in git form it must be fixed.  So I'm keeping the stuff private and only
issuing patches on this mailing list.

> > If it does turn out to be a problem, we can change the 'inv_range' and
> > 'clean_range' functions to be 'dev_to_cpu' and 'cpu_to_dev' functions,
> > and push the decisions about what to do at each point down into the
> > per-CPU assembly.
>
> The thing is that inv_range (or dev_to_cpu) is called both before and
> after the DMA operation, and for systems that don't have speculative
> prefetch, we don't want that function to be called at all after the
> DMA operation.

Well, the numbers based on trivial looping tests on Versatile/PB926
(in other words, it's not a real test) are:

--- original ---
Test: Null Function: 108.000ns
Test: Map 4K DMA_FROM_DEVICE: 10393.000ns
Test: Unmap 4K DMA_FROM_DEVICE: 107.000ns 
Test: Map+Unmap 4K DMA_FROM_DEVICE: 10388.999ns
Test: Map 4K DMA_TO_DEVICE: 10374.000ns
Test: Unmap 4K DMA_TO_DEVICE: 106.000ns  
Test: Map+Unmap 4K DMA_TO_DEVICE: 10374.000ns
Test: Map 32K DMA_FROM_DEVICE: 78653.998ns
Test: Unmap 32K DMA_FROM_DEVICE: 106.000ns
Test: Map+Unmap 32K DMA_FROM_DEVICE: 78654.003ns
Test: Map 32K DMA_TO_DEVICE: 78639.984ns
Test: Unmap 32K DMA_TO_DEVICE: 107.000ns
Test: Map+Unmap 32K DMA_TO_DEVICE: 78640.041ns

--- replacement ---
Test: Null Function: 109.000ns
Test: Map 4K DMA_FROM_DEVICE: 10196.000ns
Test: Unmap 4K DMA_FROM_DEVICE: 10185.999ns
Test: Map+Unmap 4K DMA_FROM_DEVICE: 20290.999ns
Test: Map 4K DMA_TO_DEVICE: 10164.000ns
Test: Unmap 4K DMA_TO_DEVICE: 327.000ns
Test: Map+Unmap 4K DMA_TO_DEVICE: 10454.000ns
Test: Map 32K DMA_FROM_DEVICE: 78457.995ns
Test: Unmap 32K DMA_FROM_DEVICE: 78473.016ns  
Test: Map+Unmap 32K DMA_FROM_DEVICE: 156819.917ns
Test: Map 32K DMA_TO_DEVICE: 78454.342ns
Test: Unmap 32K DMA_TO_DEVICE: 325.995ns
Test: Map+Unmap 32K DMA_TO_DEVICE: 78715.267ns

which shows minimal impact for the DMA_TO_DEVICE case, but a major impact
on the DMA_FROM_DEVICE case, which I don't think we can live with.

So I think it's time to re-think this.
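For reference, the sort of trivial timing loop behind numbers like these
might look roughly as follows (a sketch only, not the actual test code;
'dev' and 'buf' are assumed to be set up by the caller):

/* time 1000 map+unmap cycles of one 4K buffer, report ns per cycle */
static void time_map_unmap(struct device *dev, void *buf)
{
	ktime_t t0, t1;
	dma_addr_t handle;
	int i;

	t0 = ktime_get();
	for (i = 0; i < 1000; i++) {
		handle = dma_map_single(dev, buf, SZ_4K, DMA_FROM_DEVICE);
		dma_unmap_single(dev, handle, SZ_4K, DMA_FROM_DEVICE);
	}
	t1 = ktime_get();
	pr_info("Map+Unmap 4K DMA_FROM_DEVICE: %lldns\n",
		ktime_to_ns(ktime_sub(t1, t0)) / 1000);
}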

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:22           ` Russell King - ARM Linux
@ 2009-11-24 16:40             ` saeed bishara
  2009-11-24 16:48               ` Catalin Marinas
  2009-11-24 17:02               ` Russell King - ARM Linux
  2009-11-24 19:12             ` Nicolas Pitre
  2009-11-25 12:12             ` Mark Brown
  2 siblings, 2 replies; 65+ messages in thread
From: saeed bishara @ 2009-11-24 16:40 UTC (permalink / raw)
  To: linux-arm-kernel

Russell,
this patch series fixes the cache coherency issues due to speculative
prefetch, but what about the case where the cpu prefetches an
address from I/O registers (when the cpu is in manager mode)?

saeed

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-23 17:27       ` Russell King - ARM Linux
  2009-11-24 15:46         ` saeed bishara
@ 2009-11-24 16:47         ` Catalin Marinas
  2009-11-24 20:00           ` Russell King - ARM Linux
  1 sibling, 1 reply; 65+ messages in thread
From: Catalin Marinas @ 2009-11-24 16:47 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 2009-11-23 at 17:27 +0000, Russell King - ARM Linux wrote:
> On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> > > +       if (dir == DMA_FROM_DEVICE) {
> > > +               outer_inv_range(paddr, paddr + size);
> > > +               dmac_inv_range(kaddr, kaddr + size);
> > it's not clear why outer cache invalidated before inner cache, and, I
> > think that it may be incorrect, how can you be sure that a dirty line
> > that is in inner cache can't be moved down to outer cache?
> 
> I think you have a point, but also see Catalin's point.  When we are
> invalidating, we have to invalidate the outer caches before the inner
> caches, otherwise there is a danger that the inner caches could
> prefetch from the outer between the two operations.

Good point. In the cpu_to_dev case, we can invalidate the inner cache
before the outer cache as it may be faster than cleaning. We also don't
care about speculative fetches at this point, only about random cache
line evictions.

In the dev_to_cpu case, we must invalidate the outer cache before the
inner cache.

Otherwise I'm ok with the patch.
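To spell the two orderings out in C (an illustration of the rule being
agreed here, not the final patch code):

/* cpu_to_dev, DMA_FROM_DEVICE (invalidate path): inner first is fine -
 * the device has written nothing yet, so a speculative refill between
 * the two steps is harmless */
static void maint_cpu_to_dev(const void *kaddr, size_t size)
{
	unsigned long paddr = __pa(kaddr);

	dmac_inv_range(kaddr, kaddr + size);	/* inner */
	outer_inv_range(paddr, paddr + size);	/* then outer */
}

/* dev_to_cpu (unmap): outer first, otherwise the inner cache could
 * speculatively refill from stale outer-cache data in between */
static void maint_dev_to_cpu(const void *kaddr, size_t size)
{
	unsigned long paddr = __pa(kaddr);

	outer_inv_range(paddr, paddr + size);	/* outer */
	dmac_inv_range(kaddr, kaddr + size);	/* then inner */
}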

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:40             ` saeed bishara
@ 2009-11-24 16:48               ` Catalin Marinas
  2009-11-24 17:02               ` Russell King - ARM Linux
  1 sibling, 0 replies; 65+ messages in thread
From: Catalin Marinas @ 2009-11-24 16:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2009-11-24 at 16:40 +0000, saeed bishara wrote:
> this patch series fixes the cache coherency issues due to speculative
> prefetch, but what about the case where the cpu prefetches an
> address from I/O registers (when the cpu is in manager mode)?

I have a patch and happy to repost it - removing the domain switching
from the kernel.

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:40             ` saeed bishara
  2009-11-24 16:48               ` Catalin Marinas
@ 2009-11-24 17:02               ` Russell King - ARM Linux
  1 sibling, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-24 17:02 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 24, 2009 at 06:40:39PM +0200, saeed bishara wrote:
> this patch series fixes the cache coherency issues due to speculative
> prefetch, but what about the case where the cpu prefetches an
> address from I/O registers (when the cpu is in manager mode)?

That's an entirely separate issue which has nothing to do with DMA mapping.

At this point, I'm sitting on soo many patches that I really don't want to
tackle any more ARMv7 issues for the upcoming merge window - certainly not
until we get some of these patches finalized and merged into my 'devel'
branch.

Any of these series which aren't merged into my 'devel' branch within the
next 24 hours will not be going in the next merge window.

If there is a -rc9, it should be on Thursday, and then there will be
about one week before the merge window opens.  If there isn't going to
be a -rc9, the merge window should open on Thursday.

It is bad form to suddenly spring a load of patches into linux-next at
short notice.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:22           ` Russell King - ARM Linux
  2009-11-24 16:40             ` saeed bishara
@ 2009-11-24 19:12             ` Nicolas Pitre
  2009-11-25 12:12             ` Mark Brown
  2 siblings, 0 replies; 65+ messages in thread
From: Nicolas Pitre @ 2009-11-24 19:12 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 24 Nov 2009, Russell King - ARM Linux wrote:

> > I want to help with that testing, is there any git tree where I can
> > pull the code from?
> 
> There isn't, because apparently under git rules, as soon as I publish it
> in git form it must be fixed.  So I'm keeping the stuff private and only
> issuing patches on this mailing list.

I think you should be perfectly fine publishing some Git branches with
the explicit indication that such branches are unstable and volatile.
After all, that's exactly what the high-profile linux-next repository is
about, and everyone should already know that basing any work on top of
linux-next is a good way to screw oneself.  And even then there is 'git
rebase' to let you move your stuff onto a stable base if needed.

A very few individuals apparently have difficulties with the distinction
between a stable and an unstable Git branch, which should be their own
problem.  That shouldn't prevent the rest of the world from using Git in
such a way.  Even Linus is not against a daily-rebasing linux-next.

You still can expand your Git branch namespace for clarity.  Branches 
such as those are quite unambiguous:

	unstable/wip/dma_mapping_v1
	unstable/wip/dma_mapping_v2

Etc.


Nicolas

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:47         ` Catalin Marinas
@ 2009-11-24 20:00           ` Russell King - ARM Linux
  2009-11-24 20:35             ` Nicolas Pitre
  2009-11-25 12:14             ` Catalin Marinas
  0 siblings, 2 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-24 20:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 24, 2009 at 04:47:41PM +0000, Catalin Marinas wrote:
> On Mon, 2009-11-23 at 17:27 +0000, Russell King - ARM Linux wrote:
> > On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> > > > +       if (dir == DMA_FROM_DEVICE) {
> > > > +               outer_inv_range(paddr, paddr + size);
> > > > +               dmac_inv_range(kaddr, kaddr + size);
> > > it's not clear why outer cache invalidated before inner cache, and, I
> > > think that it may be incorrect, how can you be sure that a dirty line
> > > that is in inner cache can't be moved down to outer cache?
> > 
> > I think you have a point, but also see Catalin's point.  When we are
> > invalidating, we have to invalidate the outer caches before the inner
> > caches, otherwise there is a danger that the inner caches could
> > prefetch from the outer between the two operations.
> 
> Good point. In the cpu_to_dev case, we can invalidate the inner cache
> before the outer cache as it may be faster than cleaning. We also don't
> care about speculative fetches at this point, only about random cache
> line evictions.
> 
> In the dev_to_cpu case, we must invalidate the outer cache before the
> inner cache.
> 
> Otherwise I'm ok with the patch.

Having looked into pushing the 3rd argument down into per-CPU code, I've
come up against two problems:

1. the dmabounce (hacky, I wish it'd die) code wants to be able to place
   the cache in the same state as if the buffer was used for DMA - it
   does this by calling dmac_clean_range() and outer_clean_range().

2. various MTD map drivers want the ARM private flush_ioremap_region,
   which is built on top of dmac_inv_range.

You can't reliably implement either of these two functions using the new
dmac_(map|unmap)_range interfaces without making the callers knowledgeable
about the implementation of the map/unmap range interfaces.
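For illustration, the dmabounce usage in point 1 amounts to something like
the following sketch (not the driver's literal code), which is why a bare
clean primitive has to stay visible to callers:

/* put the cache in the same state as if the buffer had been mapped for
 * DMA: clean the inner cache, then the outer */
static void bounce_emulate_map(const void *kaddr, size_t size)
{
	unsigned long paddr = __pa(kaddr);

	dmac_clean_range(kaddr, kaddr + size);
	outer_clean_range(paddr, paddr + size);
}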

This means we currently end up with five DMA operation pointers in the
cpu_cache_fns:

        void (*dma_map_range)(const void *, size_t, int);
        void (*dma_unmap_range)(const void *, size_t, int);
        void (*dma_inv_range)(const void *, const void *);
        void (*dma_clean_range)(const void *, const void *);
        void (*dma_flush_range)(const void *, const void *);

which is rather excessive, especially as two will no longer have anything
to do with DMA.

I'm tempted to say that for 2.6.33, we'll take the performance hit for CPUs
older than ARMv7 of needlessly doing DMA cache maintenance on both map and
unmap, and we'll try to sort it out for 2.6.34.

The alternative is that we persist with streaming DMA being broken on ARMv7
for 2.6.33.

(This is why I like interfaces designed around purpose, rather than used
according to their function.  Function-defined interfaces are a major PITA
if you need to go back and modify stuff - they make it much harder.)

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 20:00           ` Russell King - ARM Linux
@ 2009-11-24 20:35             ` Nicolas Pitre
  2009-11-24 21:01               ` Russell King - ARM Linux
  2009-11-25 12:14             ` Catalin Marinas
  1 sibling, 1 reply; 65+ messages in thread
From: Nicolas Pitre @ 2009-11-24 20:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 24 Nov 2009, Russell King - ARM Linux wrote:

> On Tue, Nov 24, 2009 at 04:47:41PM +0000, Catalin Marinas wrote:
> > On Mon, 2009-11-23 at 17:27 +0000, Russell King - ARM Linux wrote:
> > > On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> > > > > +       if (dir == DMA_FROM_DEVICE) {
> > > > > +               outer_inv_range(paddr, paddr + size);
> > > > > +               dmac_inv_range(kaddr, kaddr + size);
> > > > it's not clear why outer cache invalidated before inner cache, and, I
> > > > think that it may be incorrect, how can you be sure that a dirty line
> > > > that is in inner cache can't be moved down to outer cache?
> > > 
> > > I think you have a point, but also see Catalin's point.  When we are
> > > invalidating, we have to invalidate the outer caches before the inner
> > > caches, otherwise there is a danger that the inner caches could
> > > prefetch from the outer between the two operations.
> > 
> > Good point. In the cpu_to_dev case, we can invalidate the inner cache
> > before the outer cache as it may be faster than cleaning. We also don't
> > care about speculative fetches at this point, only about random cache
> > line evictions.
> > 
> > In the dev_to_cpu case, we must invalidate the outer cache before the
> > inner cache.
> > 
> > Otherwise I'm ok with the patch.
> 
> Having looked into pushing the 3rd argument down into per-CPU code, I've
> come up against two problems:
> 
> 1. the dmabounce (hacky, I wish it'd die) code wants to be able to place
>    the cache in the same state as if the buffer was used for DMA - it
>    does this by calling dmac_clean_range() and outer_clean_range().
> 
> 2. various MTD map drivers want the ARM private flush_ioremap_region,
>    which is built on top of dmac_inv_range.
> 
> You can't reliably implement either of these two functions using the new
> dmac_(map|unmap)_range interfaces without making the callers knowledgeable
> about the implementation of the map/unmap range interfaces.
> 
> This means we currently end up with five DMA operation pointers in the
> cpu_cache_fns:
> 
>         void (*dma_map_range)(const void *, size_t, int);
>         void (*dma_unmap_range)(const void *, size_t, int);
>         void (*dma_inv_range)(const void *, const void *);
>         void (*dma_clean_range)(const void *, const void *);
>         void (*dma_flush_range)(const void *, const void *);
> 
> which is rather excessive, especially as two will no longer have anything
> to do with DMA.
> 
> I'm tempted to say that for 2.6.33, we'll take the performance hit for CPUs
> > older than ARMv7 of needlessly doing DMA cache maintenance on both map and
> unmap, and we'll try to sort it out for 2.6.34.
> 
> The alternative is that we persist with streaming DMA being broken on ARMv7
> for 2.6.33.

What about simply having those two problematic cases above implemented
on top of the dmac_(map|unmap)_range interfaces, even if that implies
knowledge of inner implementation details, and sticking a big fat warning
beside them until they're properly sorted out?  That would affect a much
smaller set of users compared to all pre-ARMv7 users.


Nicolas

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 20:35             ` Nicolas Pitre
@ 2009-11-24 21:01               ` Russell King - ARM Linux
  2009-11-24 21:46                 ` saeed bishara
  0 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-24 21:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 24, 2009 at 03:35:13PM -0500, Nicolas Pitre wrote:
> On Tue, 24 Nov 2009, Russell King - ARM Linux wrote:
> > Having looked into pushing the 3rd argument down into per-CPU code, I've
> > come up against two problems:
> > 
> > 1. the dmabounce (hacky, I wish it'd die) code wants to be able to place
> >    the cache in the same state as if the buffer was used for DMA - it
> >    does this by calling dmac_clean_range() and outer_clean_range().
> > 
> > 2. various MTD map drivers want the ARM private flush_ioremap_region,
> >    which is built on top of dmac_inv_range.
> > 
> > You can't reliably implement either of these two functions using the new
> > dmac_(map|unmap)_range interfaces without making the callers knowledgeable
> > about the implementation of the map/unmap range interfaces.
> > 
> > This means we currently end up with five DMA operation pointers in the
> > cpu_cache_fns:
> > 
> >         void (*dma_map_range)(const void *, size_t, int);
> >         void (*dma_unmap_range)(const void *, size_t, int);
> >         void (*dma_inv_range)(const void *, const void *);
> >         void (*dma_clean_range)(const void *, const void *);
> >         void (*dma_flush_range)(const void *, const void *);
> > 
> > which is rather excessive, especially as two will no longer have anything
> > to do with DMA.
> > 
> > I'm tempted to say that for 2.6.33, we'll take the performance hit for CPUs
> > older than ARMv7 of needlessly doing DMA cache maintenance on both map and
> > unmap, and we'll try to sort it out for 2.6.34.
> > 
> > The alternative is that we persist with streaming DMA being broken on ARMv7
> > for 2.6.33.
> 
> What about simply having those two problematic cases above implemented
> on top of the dmac_(map|unmap)_range interfaces, even if that implies
> knowledge of inner implementation details, and sticking a big fat warning
> beside them until they're properly sorted out?  That would affect a much
> smaller set of users compared to all pre-ARMv7 users.

My point was it doesn't work.  Not only will you have to select which
of the map/unmap functions you require, but also the 3rd argument to
get the desired behaviour depending on the CPU in use.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 21:01               ` Russell King - ARM Linux
@ 2009-11-24 21:46                 ` saeed bishara
  0 siblings, 0 replies; 65+ messages in thread
From: saeed bishara @ 2009-11-24 21:46 UTC (permalink / raw)
  To: linux-arm-kernel

btw, will it be necessary to invalidate Harvard caches when doing unmap?
If a speculative prefetch ends up filling a cache line into the icache,
then there is no need to invalidate it.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 16:22           ` Russell King - ARM Linux
  2009-11-24 16:40             ` saeed bishara
  2009-11-24 19:12             ` Nicolas Pitre
@ 2009-11-25 12:12             ` Mark Brown
  2 siblings, 0 replies; 65+ messages in thread
From: Mark Brown @ 2009-11-25 12:12 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 24, 2009 at 04:22:50PM +0000, Russell King - ARM Linux wrote:

> There isn't, because apparently under git rules, as soon as I publish it
> in git form it must be fixed.  So I'm keeping the stuff private and only
> issuing patches on this mailing list.

Rebasing is only a big deal if it's a branch other people are supposed
to be working off, or (to a lesser extent) on a "final submission"
branch for reviewed and stable patches.  A branch that's used for work
in progress where the patches are being constantly revised isn't a
problem here.

The issue with branches other people are working off is that it's
painful for them to have to constantly rebase or otherwise update for
your history.  The issue with final submission branches is that rebasing
those means that the history no longer shows the tree that was actually
reviewed and accepted (and hopefully tested) which makes history
analysis to locate breakages such as bisection less useful.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-24 20:00           ` Russell King - ARM Linux
  2009-11-24 20:35             ` Nicolas Pitre
@ 2009-11-25 12:14             ` Catalin Marinas
  2009-11-25 16:26               ` Russell King - ARM Linux
  1 sibling, 1 reply; 65+ messages in thread
From: Catalin Marinas @ 2009-11-25 12:14 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2009-11-24 at 20:00 +0000, Russell King - ARM Linux wrote:
> On Tue, Nov 24, 2009 at 04:47:41PM +0000, Catalin Marinas wrote:
> > On Mon, 2009-11-23 at 17:27 +0000, Russell King - ARM Linux wrote:
> > > On Mon, Nov 23, 2009 at 05:25:06PM +0200, saeed bishara wrote:
> > > > > +       if (dir == DMA_FROM_DEVICE) {
> > > > > +               outer_inv_range(paddr, paddr + size);
> > > > > +               dmac_inv_range(kaddr, kaddr + size);
> > > > it's not clear why outer cache invalidated before inner cache, and, I
> > > > think that it may be incorrect, how can you be sure that a dirty line
> > > > that is in inner cache can't be moved down to outer cache?
> > >
> > > I think you have a point, but also see Catalin's point.  When we are
> > > invalidating, we have to invalidate the outer caches before the inner
> > > caches, otherwise there is a danger that the inner caches could
> > > prefetch from the outer between the two operations.
> >
> > Good point. In the cpu_to_dev case, we can invalidate the inner cache
> > before the outer cache as it may be faster than cleaning. We also don't
> > care about speculative fetches at this point, only about random cache
> > line evictions.
> >
> > In the dev_to_cpu case, we must invalidate the outer cache before the
> > inner cache.
> >
> > Otherwise I'm ok with the patch.
> 
> Having looked into pushing the 3rd argument down into per-CPU code, I've
> come up against two problems:
> 
> 1. the dmabounce (hacky, I wish it'd die) code wants to be able to place
>    the cache in the same state as if the buffer was used for DMA - it
>    does this by calling dmac_clean_range() and outer_clean_range().
> 
> 2. various MTD map drivers want the ARM private flush_ioremap_region,
>    which is built on top of dmac_inv_range.
> 
> You can't reliably implement either of these two functions using the new
> dmac_(map|unmap)_range interfaces without making the callers knowledgeable
> about the implementation of the map/unmap range interfaces.
> 
> This means we currently end up with five DMA operation pointers in the
> cpu_cache_fns:
> 
>         void (*dma_map_range)(const void *, size_t, int);
>         void (*dma_unmap_range)(const void *, size_t, int);
>         void (*dma_inv_range)(const void *, const void *);
>         void (*dma_clean_range)(const void *, const void *);
>         void (*dma_flush_range)(const void *, const void *);
> 
> which is rather excessive, especially as two will no longer have anything
> to do with DMA.

BTW, it would really help if you publish a volatile git branch
containing these patches. There's nothing wrong with it. Even the git
tool maintainer has a branch called "pu" for experimental stuff.

I'm getting confused. Can we not just implement a generic dma_map_range
and dma_unmap_range in terms of dma_clean_range and dma_inv_range as
low-level CPU support functions?

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-23 13:38   ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines " Russell King - ARM Linux
  2009-11-23 19:35     ` Nicolas Pitre
@ 2009-11-25 12:19     ` Catalin Marinas
  2009-11-25 16:31       ` Russell King - ARM Linux
  1 sibling, 1 reply; 65+ messages in thread
From: Catalin Marinas @ 2009-11-25 12:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 2009-11-23 at 13:38 +0000, Russell King - ARM Linux wrote:
> Since we now clean the DMA buffers on map, 

Is this still valid with the latest patch you sent?

> there's no need to clean
> overlapping cache lines on invalidation anymore.  (Note: the DMA API
> prohibits other data sharing the same cache line as a DMA buffer
> anyway.)

Why was this cleaning ever needed if the DMA API prohibits cache line
sharing?

Even if we do cleaning on map, having shared cache lines is broken since
you may do an invalidate on unmap but the shared cache line may have
been dirtied.

-- 
Catalin

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-25 12:14             ` Catalin Marinas
@ 2009-11-25 16:26               ` Russell King - ARM Linux
  2009-11-26 13:21                 ` Ronen Shitrit
  0 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-25 16:26 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 25, 2009 at 12:14:39PM +0000, Catalin Marinas wrote:
> I'm getting confused. Can we not just implement a generic dma_map_range
> and dma_unmap_range in terms of dma_clean_range and dma_inv_range as
> low-level CPU support functions?

That depends if you want to create something like:

static void dma_unmap_range(const void *kaddr, size_t size, enum dma_data_direction dir)
{
	if ((cpu_is_xxx() || cpu_is_yyy()...) && dir != DMA_TO_DEVICE) {
		unsigned long paddr = __pa(kaddr);

		outer_inv_range(paddr, paddr + size);
		dma_inv_range(kaddr, kaddr + size);
	}
}

static void dma_map_range(const void *kaddr, size_t size, enum dma_data_direction dir)
{
	unsigned long paddr = __pa(kaddr);

	if (cpu_is_xxx() || cpu_is_yyy()) {
		if (dir == DMA_TO_DEVICE) {
			dma_clean_range(kaddr, kaddr + size);
			outer_clean_range(paddr, paddr + size);
		} else {
			dma_inv_range(kaddr, kaddr + size);
			outer_inv_range(paddr, paddr + size);
		}
	} else {
		switch (dir) {
		case DMA_TO_DEVICE:
			dma_clean_range(kaddr, kaddr + size);
			outer_clean_range(paddr, paddr + size);
			break;
		case DMA_FROM_DEVICE:
			dma_inv_range(kaddr, kaddr + size);
			outer_inv_range(paddr, paddr + size);
			break;
		case DMA_BIDIRECTIONAL:
			dma_flush_range(kaddr, kaddr + size);
			outer_flush_range(paddr, paddr + size);
			break;
		}
	}
}

This is utterly stupid, since the functions we're calling are already
CPU specific, and adds extra needless overhead where its plainly not
necessary - just because we have two stupid cases, one where the API
is being mis-used and one where its being used in hacky code.

I'd rather have the decisions about what to do on map/unmap entirely
in CPU specific code, but first we need to eliminate the two hacks.
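A sketch of that direction - purpose-named per-CPU hooks instead of
function-named ones (the names here are illustrative; mainline later
settled on similar dma_map_area/dma_unmap_area entries):

struct cpu_cache_fns {
	/* ... existing entries ... */
	void (*dma_map_area)(const void *, size_t, int);
	void (*dma_unmap_area)(const void *, size_t, int);
	void (*dma_flush_range)(const void *, const void *);
};

/* generic code then only states intent; each CPU's implementation decides
 * internally whether, say, unmap needs to do any work at all */
static inline void __dma_map_area(const void *kaddr, size_t size, int dir)
{
	cpu_cache.dma_map_area(kaddr, size, dir);
}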

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-25 12:19     ` Catalin Marinas
@ 2009-11-25 16:31       ` Russell King - ARM Linux
  2009-11-25 17:02         ` [PATCH v2 07/17] ARM: dma-mapping: no need to cleanoverlapping " Catalin Marinas
  2009-11-25 17:34         ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping " Russell King - ARM Linux
  0 siblings, 2 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-25 16:31 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 25, 2009 at 12:19:42PM +0000, Catalin Marinas wrote:
> On Mon, 2009-11-23 at 13:38 +0000, Russell King - ARM Linux wrote:
> > Since we now clean the DMA buffers on map, 
> 
> Is this still valid with the latest patch you sent?

Again, welcome to why the DMA functions end up being screwed over on
ARMv7... and why I think we should hold off merging this until
post-2.6.33, when we've had time to properly sort out issues like
this.

> > there's no need to clean
> > overlapping cache lines on invalidation anymore.  (Note: the DMA API
> > prohibits other data sharing the same cache line as a DMA buffer
> > anyway.)
> 
> Why was this cleaning ever needed if the DMA API prohibits cache line
> sharing?
>
> Even if we do cleaning on map, having shared cache lines is broken since
> you may do an invalidate on unmap but the shared cache line may have
> been dirtied.

It happens, particularly with SCSI.

And congratulations on realising why it's not permitted.  I do wish
other people would see the light and not do things like DMA directly
to the kernel stack and similar tricks.
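As a concrete (made-up) illustration of the kind of sharing that breaks -
a DMA buffer embedded next to CPU-written fields:

/* broken: cmd_buf can share a cache line with 'status'; if the CPU
 * dirties 'status' during the DMA, the invalidate on unmap discards the
 * update (or an eviction writes back on top of the freshly DMA'd data) */
struct foo {
	int status;
	char cmd_buf[16];
};

/* one fix: give the DMA buffer its own cache line */
struct foo_fixed {
	int status;
	char cmd_buf[16] ____cacheline_aligned;
};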

In fact, I've made the decision.  I'm not merging this for the upcoming
merge window.  It will wait another cycle to give us time to properly
sort this problem out in a sane way, rather than rushing through with
a half baked solution.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to cleanoverlapping cache lines on invalidate
  2009-11-25 16:31       ` Russell King - ARM Linux
@ 2009-11-25 17:02         ` Catalin Marinas
  2009-11-25 17:34         ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping " Russell King - ARM Linux
  1 sibling, 0 replies; 65+ messages in thread
From: Catalin Marinas @ 2009-11-25 17:02 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 2009-11-25 at 16:31 +0000, Russell King - ARM Linux wrote:
> In fact, I've made the decision.  I'm not merging this for the upcoming
> merge window.  It will wait another cycle to give us time to properly
> sort this problem out in a sane way, rather than rushing through with
> a half baked solution.

That would be a good idea. But please try to publish a volatile branch
somewhere for testing this easily (maybe before getting into -next).

-- 
Catalin

P.S. I'm off for the rest of the week with very limited e-mail access.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate
  2009-11-25 16:31       ` Russell King - ARM Linux
  2009-11-25 17:02         ` [PATCH v2 07/17] ARM: dma-mapping: no need to cleanoverlapping " Catalin Marinas
@ 2009-11-25 17:34         ` Russell King - ARM Linux
  1 sibling, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-11-25 17:34 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 25, 2009 at 04:31:19PM +0000, Russell King - ARM Linux wrote:
> In fact, I've made the decision.  I'm not merging this for the upcoming
> merge window.  It will wait another cycle to give us time to properly
> sort this problem out in a sane way, rather than rushing through with
> a half baked solution.

Actually, I'll merge the first four patches, since they don't cause any
major changes to the existing API, and just clean up what we already have.
However, the rest are currently too volatile to merge.

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses
  2009-11-25 16:26               ` Russell King - ARM Linux
@ 2009-11-26 13:21                 ` Ronen Shitrit
  0 siblings, 0 replies; 65+ messages in thread
From: Ronen Shitrit @ 2009-11-26 13:21 UTC (permalink / raw)
  To: linux-arm-kernel

I was just about to start working on a specific unmap for Kirkwood when I noticed this ongoing work...
Kirkwood (_ARMv5_) also uses speculative prefetch and needs the below unmap invalidate fix.

Regards

-----Original Message-----
From: linux-arm-kernel-bounces@lists.infradead.org [mailto:linux-arm-kernel-bounces at lists.infradead.org] On Behalf Of Russell King - ARM Linux
Sent: Wednesday, November 25, 2009 6:27 PM
To: Catalin Marinas
Cc: saeed bishara; linux-arm-kernel at lists.infradead.org
Subject: Re: [PATCH v2 06/17] ARM: dma-mapping: fix for speculative accesses

On Wed, Nov 25, 2009 at 12:14:39PM +0000, Catalin Marinas wrote:
> I'm getting confused. Can we not just implement a generic dma_map_range
> and dma_unmap_range in terms of dma_clean_range and dma_inv_range as
> low-level CPU support functions?

That depends if you want to create something like:

static void dma_unmap_range(const void *kaddr, size_t size, enum dma_data_direction dir)
{
	if ((cpu_is_xxx() || cpu_is_yyy()...) && dir != DMA_TO_DEVICE) {
		unsigned long paddr = __pa(kaddr);

		outer_inv_range(paddr, paddr + size);
		dma_inv_range(kaddr, kaddr + size);
	}
}

static void dma_map_range(const void *kaddr, size_t size, enum dma_data_direction dir)
{
	unsigned long paddr = __pa(kaddr);

	if (cpu_is_xxx() || cpu_is_yyy()) {
		if (dir == DMA_TO_DEVICE) {
			dma_clean_range(kaddr, kaddr + size);
			outer_clean_range(paddr, paddr + size);
		} else {
			dma_inv_range(kaddr, kaddr + size);
			outer_inv_range(paddr, paddr + size);
		}
	} else {
		switch (dir) {
		case DMA_TO_DEVICE:
			dma_clean_range(kaddr, kaddr + size);
			outer_clean_range(paddr, paddr + size);
			break;
		case DMA_FROM_DEVICE:
			dma_inv_range(kaddr, kaddr + size);
			outer_inv_range(paddr, paddr + size);
			break;
		case DMA_BIDIRECTIONAL:
			dma_flush_range(kaddr, kaddr + size);
			outer_flush_range(paddr, paddr + size);
			break;
		}
	}
}

This is utterly stupid, since the functions we're calling are already
CPU specific, and it adds extra needless overhead where it's plainly not
necessary - just because we have two stupid cases, one where the API
is being mis-used and one where it's being used in hacky code.

I'd rather have the decisions about what to do on map/unmap entirely
in CPU specific code, but first we need to eliminate the two hacks.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus()
  2009-11-20 18:02 ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Russell King - ARM Linux
  2009-11-23 12:05   ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and__pfn_to_bus() Catalin Marinas
@ 2009-12-12 14:01   ` Anders Grafström
  2009-12-12 14:37     ` Russell King - ARM Linux
  1 sibling, 1 reply; 65+ messages in thread
From: Anders Grafström @ 2009-12-12 14:01 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux wrote:
> The non-highmem() and the __pfn_to_bus() based page_to_dma() both
> compile to the same code, so its pointless having these two different
> approaches.  Use the __pfn_to_bus() based version.
> 
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> ---
>  arch/arm/include/asm/dma-mapping.h |   10 ----------
>  arch/arm/include/asm/memory.h      |    3 ++-
>  2 files changed, 2 insertions(+), 11 deletions(-)

footbridge doesn't build after this.
Could something like the patch below be in order?

diff --git a/arch/arm/mach-footbridge/include/mach/memory.h b/arch/arm/mach-footbridge/include/mach/memory.h
index cb16e59..cd23276 100644
--- a/arch/arm/mach-footbridge/include/mach/memory.h
+++ b/arch/arm/mach-footbridge/include/mach/memory.h
@@ -51,6 +51,9 @@ extern unsigned long __bus_to_virt(unsigned long);
 
 #endif
 
+#define __pfn_to_bus(x)		__pfn_to_phys(x)
+#define __bus_to_pfn(x)		__phys_to_pfn(x)
+
 /*
  * Cache flushing area.
  */

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus()
  2009-12-12 14:01   ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Anders Grafström
@ 2009-12-12 14:37     ` Russell King - ARM Linux
  2009-12-14 18:47       ` Anders Grafström
  0 siblings, 1 reply; 65+ messages in thread
From: Russell King - ARM Linux @ 2009-12-12 14:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, Dec 12, 2009 at 03:01:31PM +0100, Anders Grafström wrote:
> Russell King - ARM Linux wrote:
> > The non-highmem() and the __pfn_to_bus() based page_to_dma() both
> > compile to the same code, so its pointless having these two different
> > approaches.  Use the __pfn_to_bus() based version.
> > 
> > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> > ---
> >  arch/arm/include/asm/dma-mapping.h |   10 ----------
> >  arch/arm/include/asm/memory.h      |    3 ++-
> >  2 files changed, 2 insertions(+), 11 deletions(-)
> 
> footbridge doesn't build after this.
> Could something like the patch below be in order?

No, because bus addresses on footbridge are not physical addresses.  This
should solve the problem:

diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c
index b97f529..41febc7 100644
--- a/arch/arm/mach-footbridge/common.c
+++ b/arch/arm/mach-footbridge/common.c
@@ -201,6 +201,11 @@ void __init footbridge_map_io(void)
 
 #ifdef CONFIG_FOOTBRIDGE_ADDIN
 
+static inline unsigned long fb_bus_sdram_offset(void)
+{
+	return *CSR_PCISDRAMBASE & 0xfffffff0;
+}
+
 /*
  * These two functions convert virtual addresses to PCI addresses and PCI
  * addresses to virtual addresses.  Note that it is only legal to use these
@@ -210,14 +215,13 @@ unsigned long __virt_to_bus(unsigned long res)
 {
 	WARN_ON(res < PAGE_OFFSET || res >= (unsigned long)high_memory);
 
-	return (res - PAGE_OFFSET) + (*CSR_PCISDRAMBASE & 0xfffffff0);
+	return res + (fb_bus_sdram_offset() - PAGE_OFFSET);
 }
 EXPORT_SYMBOL(__virt_to_bus);
 
 unsigned long __bus_to_virt(unsigned long res)
 {
-	res -= (*CSR_PCISDRAMBASE & 0xfffffff0);
-	res += PAGE_OFFSET;
+	res = res - (fb_bus_sdram_offset() - PAGE_OFFSET);
 
 	WARN_ON(res < PAGE_OFFSET || res >= (unsigned long)high_memory);
 
@@ -225,4 +229,16 @@ unsigned long __bus_to_virt(unsigned long res)
 }
 EXPORT_SYMBOL(__bus_to_virt);
 
+unsigned long __pfn_to_bus(unsigned long pfn)
+{
+	return __pfn_to_phys(pfn) + (fb_bus_sdram_offset() - PHYS_OFFSET);
+}
+EXPORT_SYMBOL(__pfn_to_bus);
+
+unsigned long __bus_to_pfn(unsigned long bus)
+{
+	return __phys_to_pfn(bus - (fb_bus_sdram_offset() - PHYS_OFFSET));
+}
+EXPORT_SYMBOL(__bus_to_pfn);
+
 #endif
diff --git a/arch/arm/mach-footbridge/include/mach/memory.h b/arch/arm/mach-footbridge/include/mach/memory.h
index cb16e59..8d64f45 100644
--- a/arch/arm/mach-footbridge/include/mach/memory.h
+++ b/arch/arm/mach-footbridge/include/mach/memory.h
@@ -29,6 +29,8 @@
 #ifndef __ASSEMBLY__
 extern unsigned long __virt_to_bus(unsigned long);
 extern unsigned long __bus_to_virt(unsigned long);
+extern unsigned long __pfn_to_bus(unsigned long);
+extern unsigned long __bus_to_pfn(unsigned long);
 #endif
 #define __virt_to_bus	__virt_to_bus
 #define __bus_to_virt	__bus_to_virt
@@ -36,14 +38,15 @@ extern unsigned long __bus_to_virt(unsigned long);
 #elif defined(CONFIG_FOOTBRIDGE_HOST)
 
 /*
- * The footbridge is programmed to expose the system RAM at the corresponding
- * address.  So, if PAGE_OFFSET is 0xc0000000, RAM appears at 0xe0000000.
- * If 0x80000000, then its exposed at 0xa0000000 on the bus. etc.
- * The only requirement is that the RAM isn't placed at bus address 0 which
+ * The footbridge is programmed to expose the system RAM at 0xe0000000.
+ * The requirement is that the RAM isn't placed at bus address 0, which
  * would clash with VGA cards.
  */
-#define __virt_to_bus(x)	((x) - 0xe0000000)
-#define __bus_to_virt(x)	((x) + 0xe0000000)
+#define BUS_OFFSET		0xe0000000
+#define __virt_to_bus(x)	((x) + (BUS_OFFSET - PAGE_OFFSET))
+#define __bus_to_virt(x)	((x) - (BUS_OFFSET - PAGE_OFFSET))
+#define __pfn_to_bus(x)		(__pfn_to_phys(x) + (BUS_OFFSET - PHYS_OFFSET))
+#define __bus_to_pfn(x)		__phys_to_pfn((x) - (BUS_OFFSET - PHYS_OFFSET))
 
 #else
 

^ permalink raw reply related	[flat|nested] 65+ messages in thread

* [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus()
  2009-12-12 14:37     ` Russell King - ARM Linux
@ 2009-12-14 18:47       ` Anders Grafström
  0 siblings, 0 replies; 65+ messages in thread
From: Anders Grafström @ 2009-12-14 18:47 UTC (permalink / raw)
  To: linux-arm-kernel

Russell King - ARM Linux wrote:
> No, because bus addresses on footbridge are not physical addresses.  This
> should solve the problem:
> 
> diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c
> index b97f529..41febc7 100644
> --- a/arch/arm/mach-footbridge/common.c
> +++ b/arch/arm/mach-footbridge/common.c

Appears to be good. I did a quick test run and saw no problem.
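For the record, a quick sanity check of the new helpers could be as simple
as this round-trip test (hypothetical code, not what was actually run):

static int __init bus_xlate_test(void)
{
	unsigned long pfn = __phys_to_pfn(PHYS_OFFSET);

	/* pfn -> bus -> pfn must be the identity mapping */
	WARN_ON(__bus_to_pfn(__pfn_to_bus(pfn)) != pfn);
	return 0;
}
late_initcall(bus_xlate_test);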

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
  2009-11-23 10:29   ` Russell King - ARM Linux
@ 2010-01-13  6:37     ` muni anda
  2010-01-13  8:42       ` Russell King - ARM Linux
  0 siblings, 1 reply; 65+ messages in thread
From: muni anda @ 2010-01-13  6:37 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Russell,

> We're a little too late for 2.6.32, but it'll be merged
> during the post 2.6.32 window.

Did these "Cortex speculative prefetch" changes make it into ARM git?
Your changes have been working on our end for quite some time, but we
would like to cherry-pick or merge the upstream commits.

Thanks
-Muni

^ permalink raw reply	[flat|nested] 65+ messages in thread

* [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes
  2010-01-13  6:37     ` muni anda
@ 2010-01-13  8:42       ` Russell King - ARM Linux
  0 siblings, 0 replies; 65+ messages in thread
From: Russell King - ARM Linux @ 2010-01-13  8:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Jan 12, 2010 at 10:37:27PM -0800, muni anda wrote:
> Hi Russell,
> 
> > We're a little too late for 2.6.32, but it'll be merged
> > during the post 2.6.32 window.
> 
> Did these "Cortex speculative prefetch" changes make it into ARM git?
> Your changes have been working on our end for quite some time, but we
> would like to cherry-pick or merge the upstream commits.

No, it's far too large to merge during the -rc period.  It is going to
have to wait for the next merge window.

^ permalink raw reply	[flat|nested] 65+ messages in thread

end of thread, other threads:[~2010-01-13  8:42 UTC | newest]

Thread overview: 65+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-20 18:25 [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes Russell King - ARM Linux
2009-11-20 18:01 ` [PATCH 1/7] ARM: provide phys_to_page() to complement page_to_phys() Russell King - ARM Linux
2009-11-23 12:05   ` Catalin Marinas
2009-11-20 18:02 ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Russell King - ARM Linux
2009-11-23 12:05   ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and__pfn_to_bus() Catalin Marinas
2009-12-12 14:01   ` [PATCH 2/7] ARM: dma-mapping: simplify page_to_dma() and __pfn_to_bus() Anders Grafström
2009-12-12 14:37     ` Russell King - ARM Linux
2009-12-14 18:47       ` Anders Grafström
2009-11-20 18:03 ` [PATCH 3/7] ARM: dma-mapping: provide dma_to_page() Russell King - ARM Linux
2009-11-20 18:04 ` [PATCH 4/7] ARM: dma-mapping: split dma_unmap_page() from dma_unmap_single() Russell King - ARM Linux
2009-11-20 18:05 ` [PATCH 5/7] ARM: dma-mapping: use the idea of buffer ownership Russell King - ARM Linux
2009-11-20 18:06 ` [PATCH 6/7] ARM: dma-mapping: fix for speculative accesses Russell King - ARM Linux
2009-11-23 12:03   ` Catalin Marinas
2009-11-23 12:08     ` Russell King - ARM Linux
2009-11-23 13:38   ` [PATCH v2 06/17] " Russell King - ARM Linux
2009-11-23 15:25     ` saeed bishara
2009-11-23 17:27       ` Russell King - ARM Linux
2009-11-24 15:46         ` saeed bishara
2009-11-24 16:22           ` Russell King - ARM Linux
2009-11-24 16:40             ` saeed bishara
2009-11-24 16:48               ` Catalin Marinas
2009-11-24 17:02               ` Russell King - ARM Linux
2009-11-24 19:12             ` Nicolas Pitre
2009-11-25 12:12             ` Mark Brown
2009-11-24 16:47         ` Catalin Marinas
2009-11-24 20:00           ` Russell King - ARM Linux
2009-11-24 20:35             ` Nicolas Pitre
2009-11-24 21:01               ` Russell King - ARM Linux
2009-11-24 21:46                 ` saeed bishara
2009-11-25 12:14             ` Catalin Marinas
2009-11-25 16:26               ` Russell King - ARM Linux
2009-11-26 13:21                 ` Ronen Shitrit
2009-11-20 18:07 ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cache lines on invalidate Russell King - ARM Linux
2009-11-22 22:16   ` Nicolas Pitre
2009-11-23 10:26     ` Russell King - ARM Linux
2009-11-23 12:09   ` [PATCH 7/7] ARM: dma-mapping: no need to clean overlapping cachelines " Catalin Marinas
2009-11-23 13:30     ` Russell King - ARM Linux
2009-11-23 13:38   ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping cache lines " Russell King - ARM Linux
2009-11-23 19:35     ` Nicolas Pitre
2009-11-25 12:19     ` Catalin Marinas
2009-11-25 16:31       ` Russell King - ARM Linux
2009-11-25 17:02         ` [PATCH v2 07/17] ARM: dma-mapping: no need to cleanoverlapping " Catalin Marinas
2009-11-25 17:34         ` [PATCH v2 07/17] ARM: dma-mapping: no need to clean overlapping " Russell King - ARM Linux
2009-11-21 14:00 ` [PATCH 0/7] dma-mapping: Cortex A9 speculative prefetch fixes Jamie Iles
2009-11-23 10:29   ` Russell King - ARM Linux
2010-01-13  6:37     ` muni anda
2010-01-13  8:42       ` Russell King - ARM Linux
2009-11-21 17:56 ` Rajanikanth H.V
2009-11-21 18:57   ` Russell King - ARM Linux
2009-11-21 19:35 ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Russell King - ARM Linux
2009-11-21 19:37   ` [PATCH 01/10] ARM: dma-mapping: split out vmregion code from dma coherent mapping code Russell King - ARM Linux
2009-11-21 19:37   ` [PATCH 02/10] ARM: dma-mapping: functions to allocate/free a coherent buffer Russell King - ARM Linux
2009-11-21 19:37   ` [PATCH 03/10] ARM: dma-mapping: fix coherent arch dma_alloc_coherent() Russell King - ARM Linux
2009-11-21 19:38   ` [PATCH 04/10] ARM: dma-mapping: fix nommu dma_alloc_coherent() Russell King - ARM Linux
2009-11-21 19:38   ` [PATCH 05/10] ARM: dma-mapping: factor dma_free_coherent() common code Russell King - ARM Linux
2009-11-21 19:38   ` [PATCH 06/10] ARM: dma-mapping: move consistent_init into CONFIG_MMU section Russell King - ARM Linux
2009-11-22  0:05     ` Greg Ungerer
2009-11-22  0:16       ` Russell King - ARM Linux
2009-11-21 19:38   ` [PATCH 07/10] ARM: dma-mapping: clean up coherent arch dma allocation Russell King - ARM Linux
2009-11-21 19:39   ` [PATCH 08/10] ARM: dma-mapping: Factor out noMMU dma buffer allocation code Russell King - ARM Linux
2009-11-21 19:39   ` [PATCH 09/10] ARM: dma-mapping: get rid of setting/clearing the reserved page bit Russell King - ARM Linux
2009-11-21 19:39   ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain 'memory' attribute Russell King - ARM Linux
2009-11-23 12:21     ` [PATCH 10/10] ARM: dma-mapping: switch ARMv7 DMA mappings to retain'memory' attribute Catalin Marinas
2009-11-23  7:10   ` [PATCH 0/10] dma-mapping: cleanup coherent/writealloc dma allocations and ARMv7 memory support Greg Ungerer
2009-11-23 10:26     ` Russell King - ARM Linux
