* [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
       [not found] <CGME20200818075050epcas2p15c780650f5f6b4a54ce731c273d24c98@epcas2p1.samsung.com>
@ 2020-08-18  7:43 ` Cho KyongHo
       [not found]   ` <CGME20200818075051epcas2p316edad6edd3df59444c08d392b075ea8@epcas2p3.samsung.com>
  2020-08-18  8:28   ` [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync Will Deacon
  0 siblings, 2 replies; 10+ messages in thread
From: Cho KyongHo @ 2020-08-18  7:43 UTC (permalink / raw)
  To: joro, catalin.marinas, will
  Cc: iommu, linux-kernel, linux-arm-kernel, m.szyprowski,
	robin.murphy, janghyuck.kim, hyesoo.yu, Cho KyongHo

Cache maintenance operations on most CPU architectures need a memory
barrier afterwards so that DMA masters observe the memory region
correctly. The problem is that memory barriers are very expensive, and
dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}() issue one barrier
per sg entry. On some CPU micro-architectures, a single memory barrier
consumes more time than a cache clean of 4KiB, and the problem gets
worse as the number of CPU cores grows.
This patch introduces arch_sync_dma_for_device_relaxed() and
arch_sync_dma_for_cpu_relaxed(), which do not include the memory
barrier. Callers of these functions must therefore explicitly call
arch_sync_barrier_for_device() or arch_sync_barrier_for_cpu(),
respectively, to ensure that the view of memory is consistent between
the CPUs and the DMA masters.
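
As a sketch of the intended calling convention (illustrative only; the
real users are the dma-iommu and dma-direct changes below):

  static void sync_sg_for_device(struct scatterlist *sgl, int nelems,
  				 enum dma_data_direction dir)
  {
  	struct scatterlist *sg;
  	int i;

  	/* Cache maintenance only, no barrier per entry ... */
  	for_each_sg(sgl, sg, nelems, i)
  		arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);

  	/* ... then a single barrier publishes it all to the device. */
  	arch_sync_barrier_for_device(dir);
  }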

Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
---
 drivers/iommu/dma-iommu.c       |  6 +++--
 include/linux/dma-direct.h      | 29 +++++++++++++++++-----
 include/linux/dma-noncoherent.h | 54 +++++++++++++++++++++++++++++++++++++++++
 kernel/dma/Kconfig              |  8 ++++++
 kernel/dma/direct.c             | 25 +++++++++++++++----
 5 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5141d49..4f9c9cb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -705,7 +705,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
 		return;
 
 	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_for_cpu_relaxed(sg_phys(sg), sg->length, dir);
+	arch_sync_barrier_for_cpu(dir);
 }
 
 static void iommu_dma_sync_sg_for_device(struct device *dev,
@@ -719,7 +720,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
 		return;
 
 	for_each_sg(sgl, sg, nelems, i)
-		arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
+		arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);
+	arch_sync_barrier_for_device(dir);
 }
 
 static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 6e87225..f5b1fee 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -152,7 +152,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
 
-static inline dma_addr_t dma_direct_map_page(struct device *dev,
+static inline dma_addr_t __dma_direct_map_page(struct device *dev,
 		struct page *page, unsigned long offset, size_t size,
 		enum dma_data_direction dir, unsigned long attrs)
 {
@@ -172,20 +172,37 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 		return DMA_MAPPING_ERROR;
 	}
 
-	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-		arch_sync_dma_for_device(phys, size, dir);
 	return dma_addr;
 }
 
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+static inline dma_addr_t dma_direct_map_page(struct device *dev,
+		struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction dir, unsigned long attrs)
+{
+	dma_addr_t dma_addr = __dma_direct_map_page(dev, page, offset, size, dir, attrs);
+
+	if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		arch_sync_dma_for_device(page_to_phys(page) + offset, size, dir);
+
+	return dma_addr;
+}
+
+static inline void __dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	phys_addr_t phys = dma_to_phys(dev, addr);
 
+	if (unlikely(is_swiotlb_buffer(phys)))
+		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+}
+
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
 	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
 
-	if (unlikely(is_swiotlb_buffer(phys)))
-		swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+	__dma_direct_unmap_page(dev, addr, size, dir, attrs);
 }
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index ca09a4e..0a31e6c 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -73,23 +73,77 @@ static inline void arch_dma_cache_sync(struct device *dev, void *vaddr,
 #endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+void arch_sync_dma_for_device_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_device_relaxed(paddr, size, dir);
+	arch_sync_barrier_for_device(dir);
+}
+#else
+#define arch_sync_dma_for_device_relaxed arch_sync_dma_for_device
+
 void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED */
 #else
+static inline void arch_sync_dma_for_device_relaxed(phys_addr_t paddr,
+		size_t size, enum dma_data_direction dir)
+{
+}
+
 static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 }
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+}
 #endif /* ARCH_HAS_SYNC_DMA_FOR_DEVICE */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir);
+
+static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+	arch_sync_dma_for_cpu_relaxed(paddr, size, dir);
+	arch_sync_barrier_for_cpu(dir);
+}
+#else
+#define arch_sync_dma_for_cpu_relaxed arch_sync_dma_for_cpu
+
 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir);
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
+#endif /* ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED */
 #else
+static inline void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
+		enum dma_data_direction dir)
+{
+}
+
 static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 }
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+}
 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */
 
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 847a9d1..d6fe727f1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -59,6 +59,14 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
 	bool
 	select NEED_DMA_MAP_STATE
 
+config ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+	bool
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
+
+config ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
+	bool
+	select ARCH_HAS_SYNC_DMA_FOR_CPU
+
 config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
 	bool
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index db6ef07a..52e5fd1 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -321,9 +321,12 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir, SYNC_FOR_DEVICE);
 
 		if (!dev_is_dma_coherent(dev))
-			arch_sync_dma_for_device(paddr, sg->length,
+			arch_sync_dma_for_device_relaxed(paddr, sg->length,
 					dir);
 	}
+
+	if (!dev_is_dma_coherent(dev))
+		arch_sync_barrier_for_device(dir);
 }
 #endif
 
@@ -340,15 +343,17 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
 		if (!dev_is_dma_coherent(dev))
-			arch_sync_dma_for_cpu(paddr, sg->length, dir);
+			arch_sync_dma_for_cpu_relaxed(paddr, sg->length, dir);
 
 		if (unlikely(is_swiotlb_buffer(paddr)))
 			swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
 					SYNC_FOR_CPU);
 	}
 
-	if (!dev_is_dma_coherent(dev))
+	if (!dev_is_dma_coherent(dev)) {
+		arch_sync_barrier_for_cpu(dir);
 		arch_sync_dma_for_cpu_all();
+	}
 }
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
@@ -357,8 +362,11 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_direct_sync_sg_for_cpu(dev, sgl, nents, dir);
+
 	for_each_sg(sgl, sg, nents, i)
-		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
+		__dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
 #endif
@@ -370,13 +378,20 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	struct scatterlist *sg;
 
 	for_each_sg(sgl, sg, nents, i) {
-		sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+		sg->dma_address = __dma_direct_map_page(dev, sg_page(sg),
 				sg->offset, sg->length, dir, attrs);
 		if (sg->dma_address == DMA_MAPPING_ERROR)
 			goto out_unmap;
 		sg_dma_len(sg) = sg->length;
 	}
 
+	if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+		for_each_sg(sgl, sg, nents, i)
+			arch_sync_dma_for_device_relaxed(dma_to_phys(dev, sg_dma_address(sg)),
+							 sg->length, dir);
+		arch_sync_barrier_for_device(dir);
+	}
+
 	return nents;
 
 out_unmap:
-- 
2.7.4



* [PATCH 2/2] arm64: dma-mapping: add relaxed DMA sync
       [not found]   ` <CGME20200818075051epcas2p316edad6edd3df59444c08d392b075ea8@epcas2p3.samsung.com>
@ 2020-08-18  7:43     ` Cho KyongHo
  0 siblings, 0 replies; 10+ messages in thread
From: Cho KyongHo @ 2020-08-18  7:43 UTC (permalink / raw)
  To: joro, catalin.marinas, will
  Cc: iommu, linux-kernel, linux-arm-kernel, m.szyprowski,
	robin.murphy, janghyuck.kim, hyesoo.yu, Cho KyongHo

__dma_[un]map_area() implements the cache maintenance operations for
DMA on arm64. The 'dsb sy' in those routines guarantees that the view
of the given memory area is consistent for all memory observers, so it
is required.
However, dma_sync_sg_for_{device|cpu}() and dma_[un]map_sg() call
__dma_[un]map_area() nents times, so the 'dsb sy' instruction is
executed the same number of times. We have observed that 'dsb sy'
consumes more time than cleaning or invalidating a 4KiB area.
arch_sync_dma_for_{device|cpu}_relaxed() and
arch_sync_barrier_for_{device|cpu}() were introduced by commit
6a9356234 ("dma-mapping: introduce relaxed version of dma sync") to
reduce redundant memory barriers in the sg versions of the DMA sync
API. Implementing the relaxed version of the DMA sync API dramatically
improves the performance of dma_sync_sg_for_{device|cpu}().
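
Note that single-buffer users are unaffected: with the options selected
above, the non-relaxed wrapper from the generic header (added by the
previous patch, not new arm64 code) still pairs every maintenance call
with a barrier:

  static inline void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
  		enum dma_data_direction dir)
  {
  	arch_sync_dma_for_device_relaxed(paddr, size, dir);	/* cache maintenance */
  	arch_sync_barrier_for_device(dir);			/* dsb(sy) on arm64 */
  }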

Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
---
 arch/arm64/Kconfig                 |  4 ++--
 arch/arm64/include/asm/assembler.h | 33 ++++++++++++++++++++++++++++++++-
 arch/arm64/include/asm/barrier.h   | 13 +++++++++++++
 arch/arm64/mm/cache.S              | 34 +++++++++++-----------------------
 arch/arm64/mm/dma-mapping.c        |  4 ++--
 include/linux/dma-noncoherent.h    |  1 +
 6 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d23283..4fc7ef4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -31,8 +31,8 @@ config ARM64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
-	select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-	select ARCH_HAS_SYNC_DMA_FOR_CPU
+	select ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED
+	select ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 54d1811..1f87d98 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -345,6 +345,33 @@ alternative_endif
 	.endm
 
 /*
+ * Macro to perform a data cache invalidation for the interval
+ * [kaddr, kaddr + size)
+ *
+ *	kaddr:		starting virtual address of the region
+ *	size:		size of the region
+ *	Corrupts:	kaddr, size, tmp1, tmp2
+ */
+	.macro __dcache_inv_by_line kaddr, size, tmp1, tmp2
+	add	\size, \size, \kaddr
+	dcache_line_size \tmp1, \tmp2
+	sub	\tmp2, \tmp1, #1
+	tst	\size, \tmp2		// end cache line aligned?
+	bic	\size, \size, \tmp2
+	b.eq	9997f
+	dc	civac, \size		// clean & invalidate D / U line
+9997:	tst	\kaddr, \tmp2		// start cache line aligned?
+	bic	\kaddr, \kaddr, \tmp2
+	b.eq	9998f
+	dc	civac, \kaddr		// clean & invalidate D / U line
+	b	9999f
+9998:	dc	ivac, \kaddr		// invalidate D / U line
+9999:	add	\kaddr, \kaddr, \tmp1
+	cmp	\kaddr, \size
+	b.lo	9998b
+	.endm
+
+/*
  * Macro to perform a data cache maintenance for the interval
  * [kaddr, kaddr + size)
  *
@@ -362,7 +389,7 @@ alternative_else
 alternative_endif
 	.endm
 
-	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	.macro __dcache_by_line_op op, kaddr, size, tmp1, tmp2
 	dcache_line_size \tmp1, \tmp2
 	add	\size, \kaddr, \size
 	sub	\tmp2, \tmp1, #1
@@ -388,6 +415,10 @@ alternative_endif
 	add	\kaddr, \kaddr, \tmp1
 	cmp	\kaddr, \size
 	b.lo	9998b
+	.endm
+
+	.macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2
+	__dcache_by_line_op \op, \kaddr, \size, \tmp1, \tmp2
 	dsb	\domain
 	.endm
 
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index fb4c275..96bbbf6 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -167,6 +167,19 @@ do {									\
 
 #include <asm-generic/barrier.h>
 
+#include <linux/dma-direction.h>
+
+static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
+{
+	dsb(sy);
+}
+
+static inline void arch_sync_barrier_for_cpu(enum dma_data_direction dir)
+{
+	if (dir == DMA_FROM_DEVICE)
+		dsb(sy);
+}
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __ASM_BARRIER_H */
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 2d881f3..7180256 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -138,34 +138,20 @@ SYM_FUNC_END(__clean_dcache_area_pou)
  *	- kaddr   - kernel address
  *	- size    - size in question
  */
-SYM_FUNC_START_LOCAL(__dma_inv_area)
 SYM_FUNC_START_PI(__inval_dcache_area)
-	/* FALLTHROUGH */
+	__dcache_inv_by_line x0, x1, x2, x3
+	dsb	sy
+	ret
+SYM_FUNC_END_PI(__inval_dcache_area)
 
 /*
  *	__dma_inv_area(start, size)
  *	- start   - virtual start address of region
  *	- size    - size in question
  */
-	add	x1, x1, x0
-	dcache_line_size x2, x3
-	sub	x3, x2, #1
-	tst	x1, x3				// end cache line aligned?
-	bic	x1, x1, x3
-	b.eq	1f
-	dc	civac, x1			// clean & invalidate D / U line
-1:	tst	x0, x3				// start cache line aligned?
-	bic	x0, x0, x3
-	b.eq	2f
-	dc	civac, x0			// clean & invalidate D / U line
-	b	3f
-2:	dc	ivac, x0			// invalidate D / U line
-3:	add	x0, x0, x2
-	cmp	x0, x1
-	b.lo	2b
-	dsb	sy
+SYM_FUNC_START_LOCAL(__dma_inv_area)
+	__dcache_inv_by_line x0, x1, x2, x3
 	ret
-SYM_FUNC_END_PI(__inval_dcache_area)
 SYM_FUNC_END(__dma_inv_area)
 
 /*
@@ -177,16 +163,18 @@ SYM_FUNC_END(__dma_inv_area)
  *	- kaddr   - kernel address
  *	- size    - size in question
  */
-SYM_FUNC_START_LOCAL(__dma_clean_area)
 SYM_FUNC_START_PI(__clean_dcache_area_poc)
-	/* FALLTHROUGH */
+	dcache_by_line_op cvac, sy, x0, x1, x2, x3
+	ret
+SYM_FUNC_END_PI(__clean_dcache_area_poc)
 
 /*
  *	__dma_clean_area(start, size)
  *	- start   - virtual start address of region
  *	- size    - size in question
  */
-	dcache_by_line_op cvac, sy, x0, x1, x2, x3
+SYM_FUNC_START_LOCAL(__dma_clean_area)
+	__dcache_by_line_op cvac, x0, x1, x2, x3
 	ret
 SYM_FUNC_END_PI(__clean_dcache_area_poc)
 SYM_FUNC_END(__dma_clean_area)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6c45350..12943b3 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -13,13 +13,13 @@
 
 #include <asm/cacheflush.h>
 
-void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+void arch_sync_dma_for_device_relaxed(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 	__dma_map_area(phys_to_virt(paddr), size, dir);
 }
 
-void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+void arch_sync_dma_for_cpu_relaxed(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
 	__dma_unmap_area(phys_to_virt(paddr), size, dir);
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 0a31e6c..f88fc0f 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -5,6 +5,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/pgtable.h>
 
+#include <asm/barrier.h>
 #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H
 #include <asm/dma-coherence.h>
 #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
-- 
2.7.4



* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18  7:43 ` [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync Cho KyongHo
       [not found]   ` <CGME20200818075051epcas2p316edad6edd3df59444c08d392b075ea8@epcas2p3.samsung.com>
@ 2020-08-18  8:28   ` Will Deacon
  2020-08-18  8:37     ` Christoph Hellwig
  2020-08-18  9:37     ` Cho KyongHo
  1 sibling, 2 replies; 10+ messages in thread
From: Will Deacon @ 2020-08-18  8:28 UTC (permalink / raw)
  To: Cho KyongHo
  Cc: joro, catalin.marinas, iommu, linux-kernel, linux-arm-kernel,
	m.szyprowski, robin.murphy, janghyuck.kim, hyesoo.yu

On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> Cache maintenance operations in the most of CPU architectures needs
> memory barrier after the cache maintenance for the DMAs to view the
> region of the memory correctly. The problem is that memory barrier is
> very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> involves the memory barrier per every single cache sg entry. In some
> CPU micro-architecture, a single memory barrier consumes more time than
> cache clean on 4KiB. It becomes more serious if the number of CPU cores
> are larger.

Have you got higher-level performance data for this change? It's more likely
that the DSB is what actually forces the prior cache maintenance to
complete, so it's important to look at the bigger picture, not just the
apparent relative cost of these instructions.

Also, it's a miracle that non-coherent DMA even works, so I'm not sure
that we should be complicating the implementation like this to try to
make it "fast".

Will


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18  8:28   ` [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync Will Deacon
@ 2020-08-18  8:37     ` Christoph Hellwig
  2020-08-18  9:46       ` Cho KyongHo
  2020-08-18  9:37     ` Cho KyongHo
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2020-08-18  8:37 UTC (permalink / raw)
  To: Will Deacon
  Cc: Cho KyongHo, joro, catalin.marinas, iommu, linux-kernel,
	linux-arm-kernel, m.szyprowski, robin.murphy, janghyuck.kim,
	hyesoo.yu

On Tue, Aug 18, 2020 at 09:28:53AM +0100, Will Deacon wrote:
> On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> > Cache maintenance operations in the most of CPU architectures needs
> > memory barrier after the cache maintenance for the DMAs to view the
> > region of the memory correctly. The problem is that memory barrier is
> > very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> > involves the memory barrier per every single cache sg entry. In some
> > CPU micro-architecture, a single memory barrier consumes more time than
> > cache clean on 4KiB. It becomes more serious if the number of CPU cores
> > are larger.
> 
> Have you got higher-level performance data for this change? It's more likely
> that the DSB is what actually forces the prior cache maintenance to
> complete, so it's important to look at the bigger picture, not just the
> apparent relative cost of these instructions.
> 
> Also, it's a miracle that non-coherent DMA even works, so I'm not sure
> that we should be complicating the implementation like this to try to
> make it "fast".

And without an in-tree user, and not just any user but one that actually
matters and can show how this is correct, the whole proposal is a
complete nonstarter.


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18  8:28   ` [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync Will Deacon
  2020-08-18  8:37     ` Christoph Hellwig
@ 2020-08-18  9:37     ` Cho KyongHo
  2020-08-18 10:07       ` Will Deacon
  1 sibling, 1 reply; 10+ messages in thread
From: Cho KyongHo @ 2020-08-18  9:37 UTC (permalink / raw)
  To: Will Deacon
  Cc: joro, catalin.marinas, iommu, linux-kernel, linux-arm-kernel,
	m.szyprowski, robin.murphy, janghyuck.kim, hyesoo.yu


On Tue, Aug 18, 2020 at 09:28:53AM +0100, Will Deacon wrote:
> On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> > Cache maintenance operations in the most of CPU architectures needs
> > memory barrier after the cache maintenance for the DMAs to view the
> > region of the memory correctly. The problem is that memory barrier is
> > very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> > involves the memory barrier per every single cache sg entry. In some
> > CPU micro-architecture, a single memory barrier consumes more time than
> > cache clean on 4KiB. It becomes more serious if the number of CPU cores
> > are larger.
> 
> Have you got higher-level performance data for this change? It's more likely
> that the DSB is what actually forces the prior cache maintenance to
> complete,

This patch does not skip the necessary DSB after cache maintenance. It
just removes the repeated dsb per sg entry and issues the dsb only once,
after cache maintenance on all sg entries has completed.
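
Roughly, the change under discussion is (simplified from the patch,
illustration only):

  /* before: one barrier per sg entry */
  for_each_sg(sgl, sg, nents, i)
  	arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);	/* CMO + dsb(sy) */

  /* after: one barrier for the whole list */
  for_each_sg(sgl, sg, nents, i)
  	arch_sync_dma_for_device_relaxed(sg_phys(sg), sg->length, dir);	/* CMO only */
  arch_sync_barrier_for_device(dir);					/* single dsb(sy) */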

> so it's important to look at the bigger picture, not just the
> apparent relative cost of these instructions.
> 
If by the bigger picture you mean the performance impact of this patch
on a complete user scenario, we are evaluating it in some
latency-sensitive scenarios. But I wonder whether a performance gain in
a platform/SoC-specific scenario is also persuasive.

> Also, it's a miracle that non-coherent DMA even works,

I am sorry, Will, I don't understand this. Can you let me know what you
mean by the above sentence?

> so I'm not sure
> that we should be complicating the implementation like this to try to
> make it "fast".
> 
I agree that this patch makes the implementation of the DMA API a bit
more complex, but I don't think it complicates it seriously.

> Will
> 

Thank you.


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18  8:37     ` Christoph Hellwig
@ 2020-08-18  9:46       ` Cho KyongHo
  0 siblings, 0 replies; 10+ messages in thread
From: Cho KyongHo @ 2020-08-18  9:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Will Deacon, joro, catalin.marinas, iommu, linux-kernel,
	linux-arm-kernel, m.szyprowski, robin.murphy, janghyuck.kim,
	hyesoo.yu


On Tue, Aug 18, 2020 at 09:37:20AM +0100, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 09:28:53AM +0100, Will Deacon wrote:
> > On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> > > Cache maintenance operations in the most of CPU architectures needs
> > > memory barrier after the cache maintenance for the DMAs to view the
> > > region of the memory correctly. The problem is that memory barrier is
> > > very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> > > involves the memory barrier per every single cache sg entry. In some
> > > CPU micro-architecture, a single memory barrier consumes more time than
> > > cache clean on 4KiB. It becomes more serious if the number of CPU cores
> > > are larger.
> > 
> > Have you got higher-level performance data for this change? It's more likely
> > that the DSB is what actually forces the prior cache maintenance to
> > complete, so it's important to look at the bigger picture, not just the
> > apparent relative cost of these instructions.
> > 
> > Also, it's a miracle that non-coherent DMA even works, so I'm not sure
> > that we should be complicating the implementation like this to try to
> > make it "fast".
> 
> And without not just an important in-tree user but one that actually
> matters and can show how this is correct the whole proposal is complete
> nonstarter.
> 
The patch introduces the new kernel configuration options
ARCH_HAS_SYNC_DMA_FOR_DEVICE_RELAXED and ARCH_HAS_SYNC_DMA_FOR_CPU_RELAXED
so that the rest of the system is not affected. I also confirmed that
the patch does not break other architectures, including arm and x86,
which do not select the new options.
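
For reference, architectures that do not select the new options fall
back to the existing behaviour through the following (reproduced from
the header change in this patch):

  #define arch_sync_dma_for_device_relaxed arch_sync_dma_for_device

  static inline void arch_sync_barrier_for_device(enum dma_data_direction dir)
  {
  }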

Would you let me know what else I should check to confirm that this
patch is correct?

Thank you.


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18  9:37     ` Cho KyongHo
@ 2020-08-18 10:07       ` Will Deacon
  2020-08-18 16:10         ` Christoph Hellwig
  2020-08-19  1:24         ` Cho KyongHo
  0 siblings, 2 replies; 10+ messages in thread
From: Will Deacon @ 2020-08-18 10:07 UTC (permalink / raw)
  To: Cho KyongHo
  Cc: joro, catalin.marinas, iommu, linux-kernel, linux-arm-kernel,
	m.szyprowski, robin.murphy, janghyuck.kim, hyesoo.yu

On Tue, Aug 18, 2020 at 06:37:39PM +0900, Cho KyongHo wrote:
> On Tue, Aug 18, 2020 at 09:28:53AM +0100, Will Deacon wrote:
> > On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> > > Cache maintenance operations in the most of CPU architectures needs
> > > memory barrier after the cache maintenance for the DMAs to view the
> > > region of the memory correctly. The problem is that memory barrier is
> > > very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> > > involves the memory barrier per every single cache sg entry. In some
> > > CPU micro-architecture, a single memory barrier consumes more time than
> > > cache clean on 4KiB. It becomes more serious if the number of CPU cores
> > > are larger.
> > 
> > Have you got higher-level performance data for this change? It's more likely
> > that the DSB is what actually forces the prior cache maintenance to
> > complete,
> 
> This patch does not skip necessary DSB after cache maintenance. It just
> remove repeated dsb per every single sg entry and call dsb just once
> after cache maintenance on all sg entries is completed.

Yes, I realise that, but what I'm saying is that a big part of your
justification for this change is:

  | The problem is that memory barrier is very expensive and dma_[un]map_sg()
  | and dma_sync_sg_for_{device|cpu}() involves the memory barrier per every
  | single cache sg entry. In some CPU micro-architecture, a single memory
  | barrier consumes more time than cache clean on 4KiB.

and my point is that the DSB is likely completing the cache maintenance,
so as cache maintenance instructions retire faster in the micro-architecture,
the DSB becomes absolutely slower. In other words, it doesn't make much
sense to me to compare the cost of the DSB with the cost of the cache
maintenance; what matters more is the cost of the high-level unmap()
operation for the sglist.

> > so it's important to look at the bigger picture, not just the
> > apparent relative cost of these instructions.
> > 
> If you mean bigger picture is the performance impact of this patch to a
> complete user scenario, we are evaluating it in some latency sensitve
> scenario. But I wonder if a performance gain in a platform/SoC specific
> scenario is also persuasive.

Latency is fine too, but phrasing the numbers (and we really need those)
in terms of things like "The interrupt response time for this in-tree
driver is improved by xxx ns (yy %) after this change" or "Throughput
for this in-tree driver goes from xxx mb/s to yyy mb/s" would be really
helpful.

> > Also, it's a miracle that non-coherent DMA even works,
> 
> I am sorry, Will. I don't understand this. Can you let me know what do
> you mena with the above sentence?

Non-coherent DMA sucks for software. For the most part, Linux does a nice
job of hiding this from device drivers, and I think _that_ is the primary
concern, rather than performance. If performance is a problem, then the
solution is cache coherence or a shared non-cacheable buffer (rather than
the streaming API).
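
For concreteness, a minimal sketch of that alternative using the
standard coherent DMA API (illustration only, not part of the patch
under discussion):

  dma_addr_t dma_handle;
  void *buf = dma_alloc_coherent(dev, SZ_4K, &dma_handle, GFP_KERNEL);

  if (buf) {
  	/* CPU and device share this buffer; no dma_sync_*() calls needed. */
  	/* ... use buf on the CPU, hand dma_handle to the device ... */
  	dma_free_coherent(dev, SZ_4K, buf, dma_handle);
  }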

> > so I'm not sure
> > that we should be complicating the implementation like this to try to
> > make it "fast".
> > 
> I agree that this patch makes the implementation of dma API a bit more
> but I don't think this does not impact its complication seriously.

It's death by a thousand cuts; this patch further fragments the architecture
backends and leads to arm64-specific behaviour which consequently won't get
well tested by anybody else. Now, it might be worth it, but there's not
enough information here to make that call.

Will


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18 10:07       ` Will Deacon
@ 2020-08-18 16:10         ` Christoph Hellwig
  2020-08-19  2:01           ` Cho KyongHo
  2020-08-19  1:24         ` Cho KyongHo
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2020-08-18 16:10 UTC (permalink / raw)
  To: Will Deacon
  Cc: Cho KyongHo, janghyuck.kim, catalin.marinas, linux-kernel,
	hyesoo.yu, iommu, robin.murphy, linux-arm-kernel

On Tue, Aug 18, 2020 at 11:07:57AM +0100, Will Deacon wrote:
> > > so I'm not sure
> > > that we should be complicating the implementation like this to try to
> > > make it "fast".
> > > 
> > I agree that this patch makes the implementation of dma API a bit more
> > but I don't think this does not impact its complication seriously.
> 
> It's death by a thousand cuts; this patch further fragments the architecture
> backends and leads to arm64-specific behaviour which consequently won't get
> well tested by anybody else. Now, it might be worth it, but there's not
> enough information here to make that call.

So it turns out I misread the series (*cough*, crazy long lines,
*cough*), and it does not actually expose a new API as I thought, but
it still makes a total mess of the internal interface.  It turns out
that on the for cpu side we already have arch_sync_dma_for_cpu_all,
which should do all that is needed.  We could do the equivalent for
the to device side, but only IFF there really is a major benefit for
something that actually is mainstream and matters.


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18 10:07       ` Will Deacon
  2020-08-18 16:10         ` Christoph Hellwig
@ 2020-08-19  1:24         ` Cho KyongHo
  1 sibling, 0 replies; 10+ messages in thread
From: Cho KyongHo @ 2020-08-19  1:24 UTC (permalink / raw)
  To: Will Deacon
  Cc: joro, catalin.marinas, iommu, linux-kernel, linux-arm-kernel,
	m.szyprowski, robin.murphy, janghyuck.kim, hyesoo.yu


On Tue, Aug 18, 2020 at 11:07:57AM +0100, Will Deacon wrote:
> On Tue, Aug 18, 2020 at 06:37:39PM +0900, Cho KyongHo wrote:
> > On Tue, Aug 18, 2020 at 09:28:53AM +0100, Will Deacon wrote:
> > > On Tue, Aug 18, 2020 at 04:43:10PM +0900, Cho KyongHo wrote:
> > > > Cache maintenance operations in the most of CPU architectures needs
> > > > memory barrier after the cache maintenance for the DMAs to view the
> > > > region of the memory correctly. The problem is that memory barrier is
> > > > very expensive and dma_[un]map_sg() and dma_sync_sg_for_{device|cpu}()
> > > > involves the memory barrier per every single cache sg entry. In some
> > > > CPU micro-architecture, a single memory barrier consumes more time than
> > > > cache clean on 4KiB. It becomes more serious if the number of CPU cores
> > > > are larger.
> > > 
> > > Have you got higher-level performance data for this change? It's more likely
> > > that the DSB is what actually forces the prior cache maintenance to
> > > complete,
> > 
> > This patch does not skip necessary DSB after cache maintenance. It just
> > remove repeated dsb per every single sg entry and call dsb just once
> > after cache maintenance on all sg entries is completed.
> 
> Yes, I realise that, but what I'm saying is that a big part of your
> justification for this change is:
> 
>   | The problem is that memory barrier is very expensive and dma_[un]map_sg()
>   | and dma_sync_sg_for_{device|cpu}() involves the memory barrier per every
>   | single cache sg entry. In some CPU micro-architecture, a single memory
>   | barrier consumes more time than cache clean on 4KiB.
> 
> and my point is that the DSB is likely completing the cache maintenance,
> so as cache maintenance instructions retire faster in the micro-architecture,
> the DSB becomes absolutely slower. In other words, it doesn't make much
> sense to me to compare the cost of the DSB with the cost of the cache
> maintenance; what matters more is the code of the high-level unmap()
> operation for the sglist.
> 
I now understand your point. But I still believe that the repeated DSB
in the middle of the cache maintenance wastes CPU cycles. Avoiding that
redundancy adds some complexity to the implementation of the DMA API,
but I think it is worthwhile.

> > > so it's important to look at the bigger picture, not just the
> > > apparent relative cost of these instructions.
> > > 
> > If you mean bigger picture is the performance impact of this patch to a
> > complete user scenario, we are evaluating it in some latency sensitve
> > scenario. But I wonder if a performance gain in a platform/SoC specific
> > scenario is also persuasive.
> 
> Latency is fine too, but phrasing the numbers (and we really need those)
> in terms of things like "The interrupt response time for this in-tree
> driver is improved by xxx ns (yy %) after this change" or "Throughput
> for this in-tree driver goes from xxx mb/s to yyy mb/s" would be really
> helpful.
> 

Unfortunately, we have no in-tree driver to show the performance gain.
Instead, we evaluated the speed of dma_sync_sg_for_device() to see the
improvement from this patch. For example, a Cortex-A55 in our 2-cluster,
big-mid-little system gains 28% (130.9 usec -> 94.5 usec) when
dma_sync_sg_for_device(dev, sg, nents, DMA_TO_DEVICE) runs with
nents = 256 and each sg entry 4KiB long.
Let me describe the detailed performance results in the next patch
series, which will also include fixes to errors in the commit messages.
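
For reference, the numbers came from a simple timing wrapper of roughly
the following form (a hypothetical sketch, not the exact harness we used):

  /* hypothetical timing sketch around the sg sync call */
  ktime_t t0 = ktime_get();

  /* nents = 256, each sg entry 4KiB long */
  dma_sync_sg_for_device(dev, sg, nents, DMA_TO_DEVICE);

  pr_info("sync_sg_for_device: %lld ns\n",
  	ktime_to_ns(ktime_sub(ktime_get(), t0)));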

> > > Also, it's a miracle that non-coherent DMA even works,
> > 
> > I am sorry, Will. I don't understand this. Can you let me know what do
> > you mena with the above sentence?
> 
> Non-coherent DMA sucks for software.

I agree. But due to the hardware cost, proposals for cache-coherent DMA
are always challenging.

> For the most part, Linux does a nice
> job of hiding this from device drivers, and I think _that_ is the primary
> concern, rather than performance. If performance is a problem, then the
> solution is cache coherence or a shared non-cacheable buffer (rather than
> the streaming API).

We are also trying to use non-cacheable buffers for the non-coherent
DMA masters, but the problem with non-cacheable buffers is CPU access
speed.
> 
> > > so I'm not sure
> > > that we should be complicating the implementation like this to try to
> > > make it "fast".
> > > 
> > I agree that this patch makes the implementation of dma API a bit more
> > but I don't think this does not impact its complication seriously.
> 
> It's death by a thousand cuts; this patch further fragments the architecture
> backends and leads to arm64-specific behaviour which consequently won't get
> well tested by anybody else. Now, it might be worth it, but there's not
> enough information here to make that call.
> Will
> 


* Re: [PATCH 1/2] dma-mapping: introduce relaxed version of dma sync
  2020-08-18 16:10         ` Christoph Hellwig
@ 2020-08-19  2:01           ` Cho KyongHo
  0 siblings, 0 replies; 10+ messages in thread
From: Cho KyongHo @ 2020-08-19  2:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Will Deacon, janghyuck.kim, catalin.marinas, linux-kernel,
	hyesoo.yu, iommu, robin.murphy, linux-arm-kernel


On Tue, Aug 18, 2020 at 05:10:06PM +0100, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 11:07:57AM +0100, Will Deacon wrote:
> > > > so I'm not sure
> > > > that we should be complicating the implementation like this to try to
> > > > make it "fast".
> > > > 
> > > I agree that this patch makes the implementation of dma API a bit more
> > > but I don't think this does not impact its complication seriously.
> > 
> > It's death by a thousand cuts; this patch further fragments the architecture
> > backends and leads to arm64-specific behaviour which consequently won't get
> > well tested by anybody else. Now, it might be worth it, but there's not
> > enough information here to make that call.
> 
> So it turns out I misread the series (*cough*, crazy long lines,
> *cough*), and it does not actually expose a new API as I thought, but
> it still makes a total mess of the internal interface.  It turns out
> that on the for cpu side we already have arch_sync_dma_for_cpu_all,
> which should do all that is needed.  We could do the equivalent for
> the to device side, but only IFF there really is a major benefit for
> something that actually is mainstream and matters.
> 
Indeed, arch_sync_dma_for_cpu_all() is already called where the new
internal API arch_sync_barrier_for_cpu() should be called. I had just
thought it was a special hook for MIPS.
In the next version of the patch series, I will consider using
arch_sync_dma_for_cpu_all() and introducing a 'for_device' counterpart,
along with some performance data to show the benefit of the change.
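
Roughly along these lines (a rough sketch only, swiotlb handling
omitted; arch_sync_dma_for_device_all() is a hypothetical new hook
mirroring the existing arch_sync_dma_for_cpu_all()):

  void dma_direct_sync_sg_for_device(struct device *dev,
  		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
  {
  	struct scatterlist *sg;
  	int i;

  	/* per-entry maintenance, assuming the arch drops its per-call barrier */
  	for_each_sg(sgl, sg, nents, i)
  		if (!dev_is_dma_coherent(dev))
  			arch_sync_dma_for_device(dma_to_phys(dev, sg_dma_address(sg)),
  						 sg->length, dir);

  	if (!dev_is_dma_coherent(dev))
  		arch_sync_dma_for_device_all();	/* hypothetical: one barrier for all */
  }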

Thank you for the proposal.

KyongHo
