LKML Archive on lore.kernel.org
 help / Atom feed
* [RFC] avoid indirect calls for DMA direct mappings
@ 2018-12-06 15:37 Christoph Hellwig
  2018-12-06 15:37 ` [PATCH] dma-mapping: bypass indirect calls for dma-direct Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 15:37 UTC (permalink / raw)
  To: iommu, Linus Torvalds, Jesper Dangaard Brouer
  Cc: Tariq Toukan, Ilias Apalodimas, Toke Høiland-Jørgensen,
	Robin Murphy, linux-kernel

Hi all,

a while ago Jesper reported major performance regressions due to the
spectre v2 mitigations in his XDP forwarding workloads.  A large part
of that is due to the DMA mapping API indirect calls.

It turns out that the most common implementation of the DMA API is the
direct mapping case, and now that we have merged almost all duplicate
implementations of that into a single generic one it is easily feasible to
use direct calls for this fast path.

This patch adds a check if we are using dma_direct_ops in each fast path
DMA operation, and just uses a direct call instead.  For the XDP workload
this increases the number of packets per second from 7,438,283 to
9,610,088, so it provides a very significant speedup.

Note that the patch depends on a lot of work either queued up in the
DMA mapping tree, or still out on the list for review, so to actually
try the patch you probably want this git tree:


    git://git.infradead.org/users/hch/misc.git dma-direct-calls

Gitweb:

    http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] dma-mapping: bypass indirect calls for dma-direct
  2018-12-06 15:37 [RFC] avoid indirect calls for DMA direct mappings Christoph Hellwig
@ 2018-12-06 15:37 ` Christoph Hellwig
  2018-12-06 17:40   ` Jesper Dangaard Brouer
  2018-12-06 17:43 ` [RFC] avoid indirect calls for DMA direct mappings Jesper Dangaard Brouer
  2018-12-06 18:28 ` Linus Torvalds
  2 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 15:37 UTC (permalink / raw)
  To: iommu, Linus Torvalds, Jesper Dangaard Brouer
  Cc: Tariq Toukan, Ilias Apalodimas, Toke Høiland-Jørgensen,
	Robin Murphy, linux-kernel

Avoid expensive indirect calls in the fast path DMA mapping
operations by directly calling the dma_direct_* ops if we are using
the directly mapped DMA operations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-direct.h  |  17 -----
 include/linux/dma-mapping.h | 138 +++++++++++++++++++++++++++++++-----
 kernel/dma/direct.c         |  13 ++--
 3 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 3b0a3ea3876d..b7338702592a 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -60,22 +60,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
-		enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
 int dma_direct_supported(struct device *dev, u64 mask);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8916499d2805..648ca7da3bb3 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -221,6 +221,69 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 }
 #endif
 
+static inline bool dma_is_direct(const struct dma_map_ops *ops)
+{
+	return IS_ENABLED(CONFIG_DMA_DIRECT_OPS) && ops == &dma_direct_ops;
+}
+
+/*
+ * All the dma_direct_* declarations are here just for the indirect call bypass,
+ * and must not be used directly by drivers!
+ */
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+}
+static inline void dma_direct_unmap_sg(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      size_t size,
 					      enum dma_data_direction dir,
@@ -231,9 +294,12 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_map_single(dev, ptr, size);
-	addr = ops->map_page(dev, virt_to_page(ptr),
-			     offset_in_page(ptr), size,
-			     dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
+	else
+		addr = ops->map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
 	debug_dma_map_page(dev, virt_to_page(ptr),
 			   offset_in_page(ptr), size,
 			   dir, addr, true);
@@ -248,8 +314,12 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
-		ops->unmap_page(dev, addr, size, dir, attrs);
+	if (ops->unmap_page) {
+		if (dma_is_direct(ops))
+			dma_direct_unmap_page(dev, addr, size, dir, attrs);
+		else
+			ops->unmap_page(dev, addr, size, dir, attrs);
+	}
 	debug_dma_unmap_page(dev, addr, size, dir, true);
 }
 
@@ -265,7 +335,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 	int ents;
 
 	BUG_ON(!valid_dma_direction(dir));
-	ents = ops->map_sg(dev, sg, nents, dir, attrs);
+	if (dma_is_direct(ops))
+		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+	else
+		ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
 	debug_dma_map_sg(dev, sg, nents, ents, dir);
 
@@ -280,8 +353,12 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (ops->unmap_sg)
-		ops->unmap_sg(dev, sg, nents, dir, attrs);
+	if (ops->unmap_sg) {
+		if (dma_is_direct(ops))
+			dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+		else
+			ops->unmap_sg(dev, sg, nents, dir, attrs);
+	}
 }
 
 static inline dma_addr_t dma_map_page_attrs(struct device *dev,
@@ -294,7 +371,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
-	addr = ops->map_page(dev, page, offset, size, dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+	else
+		addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
 	return addr;
@@ -308,8 +388,12 @@ static inline void dma_unmap_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
-		ops->unmap_page(dev, addr, size, dir, attrs);
+	if (ops->unmap_page) {
+		if (dma_is_direct(ops))
+			dma_direct_unmap_page(dev, addr, size, dir, attrs);
+		else
+			ops->unmap_page(dev, addr, size, dir, attrs);
+	}
 	debug_dma_unmap_page(dev, addr, size, dir, false);
 }
 
@@ -355,8 +439,12 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu)
-		ops->sync_single_for_cpu(dev, addr, size, dir);
+	if (ops->sync_single_for_cpu) {
+		if (dma_is_direct(ops))
+			dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+		else
+			ops->sync_single_for_cpu(dev, addr, size, dir);
+	}
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
 
@@ -374,8 +462,12 @@ static inline void dma_sync_single_for_device(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device)
-		ops->sync_single_for_device(dev, addr, size, dir);
+	if (ops->sync_single_for_device) {
+		if (dma_is_direct(ops))
+			dma_direct_sync_single_for_device(dev, addr, size, dir);
+		else
+			ops->sync_single_for_device(dev, addr, size, dir);
+	}
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
 
@@ -393,8 +485,12 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_cpu)
-		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+	if (ops->sync_sg_for_cpu) {
+		if (dma_is_direct(ops))
+			dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+		else
+			ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+	}
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
 
@@ -405,8 +501,12 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_device)
-		ops->sync_sg_for_device(dev, sg, nelems, dir);
+	if (ops->sync_sg_for_device) {
+		if (dma_is_direct(ops))
+			dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+		else
+			ops->sync_sg_for_device(dev, sg, nelems, dir);
+	}
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 372c1955454a..cd535e7c67d7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -223,6 +223,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(dev, paddr, size, dir);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_device);
 
 void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -240,6 +241,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir);
 	}
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
 #endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
@@ -258,6 +260,7 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
 	if (is_swiotlb_buffer(paddr))
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
 
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -277,6 +280,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu_all(dev);
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
 
 void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -289,6 +293,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 	if (is_swiotlb_buffer(phys))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
+EXPORT_SYMBOL(dma_direct_unmap_page);
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -300,11 +305,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
-#else
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-}
+EXPORT_SYMBOL(dma_direct_unmap_sg);
 #endif
 
 static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
@@ -331,6 +332,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		arch_sync_dma_for_device(dev, phys, size, dir);
 	return dma_addr;
 }
+EXPORT_SYMBOL(dma_direct_map_page);
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
@@ -352,6 +354,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
 	return 0;
 }
+EXPORT_SYMBOL(dma_direct_map_sg);
 
 /*
  * Because 32-bit DMA masks are so common we expect every architecture to be
-- 
2.19.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dma-mapping: bypass indirect calls for dma-direct
  2018-12-06 15:37 ` [PATCH] dma-mapping: bypass indirect calls for dma-direct Christoph Hellwig
@ 2018-12-06 17:40   ` Jesper Dangaard Brouer
  2018-12-06 18:35     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2018-12-06 17:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: iommu, Linus Torvalds, Tariq Toukan, Ilias Apalodimas,
	Toke Høiland-Jørgensen, Robin Murphy, linux-kernel,
	brouer

On Thu,  6 Dec 2018 07:37:20 -0800
Christoph Hellwig <hch@lst.de> wrote:

> Avoid expensive indirect calls in the fast path DMA mapping
> operations by directly calling the dma_direct_* ops if we are using
> the directly mapped DMA operations.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  include/linux/dma-direct.h  |  17 -----
>  include/linux/dma-mapping.h | 138 +++++++++++++++++++++++++++++++-----
>  kernel/dma/direct.c         |  13 ++--
>  3 files changed, 127 insertions(+), 41 deletions(-)

Hi Christoph,

Is this patch to be applied on top of the other (41) patches I just
tested with my XDP forward workload?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 15:37 [RFC] avoid indirect calls for DMA direct mappings Christoph Hellwig
  2018-12-06 15:37 ` [PATCH] dma-mapping: bypass indirect calls for dma-direct Christoph Hellwig
@ 2018-12-06 17:43 ` Jesper Dangaard Brouer
  2018-12-06 18:29   ` Nadav Amit
  2018-12-06 18:28 ` Linus Torvalds
  2 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2018-12-06 17:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: iommu, Linus Torvalds, Tariq Toukan, Ilias Apalodimas,
	Toke Høiland-Jørgensen, Robin Murphy, linux-kernel,
	brouer

On Thu,  6 Dec 2018 07:37:19 -0800
Christoph Hellwig <hch@lst.de> wrote:

> Hi all,
> 
> a while ago Jesper reported major performance regressions due to the
> spectre v2 mitigations in his XDP forwarding workloads.  A large part
> of that is due to the DMA mapping API indirect calls.
> 
> It turns out that the most common implementation of the DMA API is the
> direct mapping case, and now that we have merged almost all duplicate
> implementations of that into a single generic one it is easily feasible to
> use direct calls for this fast path.
> 
> This patch adds a check if we are using dma_direct_ops in each fast path
> DMA operation, and just uses a direct call instead.  For the XDP workload
> this increases the number of packets per second from 7,438,283 to
> 9,610,088, so it provides a very significant speedup.

Full test report avail here:
 https://github.com/xdp-project/xdp-project/blob/master/areas/dma/dma01_test_hellwig_direct_dma.org


> Note that the patch depends on a lot of work either queued up in the
> DMA mapping tree, or still out on the list for review, so to actually
> try the patch you probably want this git tree:
> 
> 
>     git://git.infradead.org/users/hch/misc.git dma-direct-calls
> 
> Gitweb:
> 
>     http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 15:37 [RFC] avoid indirect calls for DMA direct mappings Christoph Hellwig
  2018-12-06 15:37 ` [PATCH] dma-mapping: bypass indirect calls for dma-direct Christoph Hellwig
  2018-12-06 17:43 ` [RFC] avoid indirect calls for DMA direct mappings Jesper Dangaard Brouer
@ 2018-12-06 18:28 ` Linus Torvalds
  2018-12-06 18:30   ` Linus Torvalds
  2018-12-06 18:43   ` Christoph Hellwig
  2 siblings, 2 replies; 17+ messages in thread
From: Linus Torvalds @ 2018-12-06 18:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: iommu, brouer, tariqt, ilias.apalodimas, toke, robin.murphy,
	Linux List Kernel Mailing

On Thu, Dec 6, 2018 at 7:37 AM Christoph Hellwig <hch@lst.de> wrote:
>
> This patch adds a check if we are using dma_direct_ops in each fast path
> DMA operation, and just uses a direct call instead.  For the XDP workload
> this increases the number of packets per second from 7,438,283 to
> 9,610,088, so it provides a very significant speedup.

May I suggest re-doing the logic a bit?

You do things like this:

+       if (ops->unmap_page) {
+               if (dma_is_direct(ops))
+                       dma_direct_unmap_page(dev, addr, size, dir, attrs);
+               else
+                       ops->unmap_page(dev, addr, size, dir, attrs);
+       }

which is counterproductive because it checks for the fast case *after*
you've done a load (and a check) that is entirely pointless for the
fast case.

Put another way, you made the fast case unnecessarily slow.

If this fast case matters (and the whole point of the series is that
it *does* matter), then you should make sure that

 (a) the dma_is_direct() function/macro uses "likely()" for the test

and

 (b) the code is organized like this instead:

+       if (dma_is_direct(ops))
+               dma_direct_unmap_page(dev, addr, size, dir, attrs);
+       else if (ops->unmap_page)
+               ops->unmap_page(dev, addr, size, dir, attrs);

because now you avoid that unnecessary conditional for the common
case, and you hopefully never even access the pointer (because you
know what's behind it: the direct ops cases that you're
short-circuiting).

In fact, as a further micro-optimization it might be a good idea to
just specify that the dma_is_direct() ops is a special pointer
(perhaps even just say that "NULL means it's direct"), because that
then makes the fast-case test much simpler (avoids a whole nasty
constant load, and testing for NULL in particular is often much
better).

But that further micro-optimization absolutely *requires* that the ops
pointer test comes first. So making that ordering change is not only
"better code generation for the fast case to avoid extra cache
accesses", it also allows future optimizations.

                     Linus

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 17:43 ` [RFC] avoid indirect calls for DMA direct mappings Jesper Dangaard Brouer
@ 2018-12-06 18:29   ` Nadav Amit
  2018-12-06 18:45     ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Nadav Amit @ 2018-12-06 18:29 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Christoph Hellwig, Toke Høiland-Jørgensen,
	Linus Torvalds, Ilias Apalodimas, LKML, iommu, Robin Murphy,
	Tariq Toukan, Josh Poimboeuf

> On Dec 6, 2018, at 9:43 AM, Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> On Thu,  6 Dec 2018 07:37:19 -0800
> Christoph Hellwig <hch@lst.de> wrote:
> 
>> Hi all,
>> 
>> a while ago Jesper reported major performance regressions due to the
>> spectre v2 mitigations in his XDP forwarding workloads.  A large part
>> of that is due to the DMA mapping API indirect calls.
>> 
>> It turns out that the most common implementation of the DMA API is the
>> direct mapping case, and now that we have merged almost all duplicate
>> implementations of that into a single generic one it is easily feasible to
>> use direct calls for this fast path.
>> 
>> This patch adds a check if we are using dma_direct_ops in each fast path
>> DMA operation, and just uses a direct call instead.  For the XDP workload
>> this increases the number of packets per second from 7,438,283 to
>> 9,610,088, so it provides a very significant speedup.
> 
> Full test report avail here:
> https://github.com/xdp-project/xdp-project/blob/master/areas/dma/dma01_test_hellwig_direct_dma.org
> 
> 
>> Note that the patch depends on a lot of work either queued up in the
>> DMA mapping tree, or still out on the list for review, so to actually
>> try the patch you probably want this git tree:
>> 
>> 
>>    git://git.infradead.org/users/hch/misc.git dma-direct-calls
>> 
>> Gitweb:
>> 
>>    http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls

Did you happen to see my RFC for "automatic" indirect call promotion? 

	https://lkml.org/lkml/2018/10/18/175

I hope to get v1 based on Josh responses next week.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:28 ` Linus Torvalds
@ 2018-12-06 18:30   ` Linus Torvalds
  2018-12-06 18:43   ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Linus Torvalds @ 2018-12-06 18:30 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: iommu, brouer, tariqt, ilias.apalodimas, toke, robin.murphy,
	Linux List Kernel Mailing

On Thu, Dec 6, 2018 at 10:28 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Put another way, you made the fast case unnecessarily slow.

Side note: the code seems to be a bit confused about it, because
*some* cases test the fast case first, and some do it after they've
already accessed the pointer for the slow case.

So even aside from the performance and code generation issue (and
possible future "use a special bit pattern for the fast case"), it
would be good for _consistency_ to just always do the fast-case test
first.

                Linus

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] dma-mapping: bypass indirect calls for dma-direct
  2018-12-06 17:40   ` Jesper Dangaard Brouer
@ 2018-12-06 18:35     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 18:35 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Christoph Hellwig, iommu, Linus Torvalds, Tariq Toukan,
	Ilias Apalodimas, Toke Høiland-Jørgensen, Robin Murphy,
	linux-kernel

On Thu, Dec 06, 2018 at 06:40:27PM +0100, Jesper Dangaard Brouer wrote:
> Is this patch to be applied on top of the other (41) patches I just
> tested with my XDP forward workload?

This is just the last patch of the git tree you tested.  All the
others were sent out separately already.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:28 ` Linus Torvalds
  2018-12-06 18:30   ` Linus Torvalds
@ 2018-12-06 18:43   ` Christoph Hellwig
  2018-12-06 18:51     ` Linus Torvalds
  2018-12-06 18:54     ` Robin Murphy
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 18:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Hellwig, iommu, brouer, tariqt, ilias.apalodimas, toke,
	robin.murphy, Linux List Kernel Mailing

On Thu, Dec 06, 2018 at 10:28:22AM -0800, Linus Torvalds wrote:
> which is counterproductive because it checks for the fast case *after*
> you've done a load (and a check) that is entirely pointless for the
> fast case.
> 
> Put another way, you made the fast case unnecessarily slow.
> 
> If this fast case matters (and the whole point of the series is that
> it *does* matter), then you should make sure that
> 
>  (a) the dma_is_direct() function/macro uses "likely()" for the test

I'm a little worried about that.  Yes, for the common case it is
likely.  But if you run a setup where you, say, always have an iommu
it is not; in fact it is never called in that case, but we only know
that at runtime.

>  (b) the code is organized like this instead:
> 
> +       if (dma_is_direct(ops))
> +               dma_direct_unmap_page(dev, addr, size, dir, attrs);
> +       else if (ops->unmap_page)
> +               ops->unmap_page(dev, addr, size, dir, attrs);
> 
> because now you avoid that unnecessary conditional for the common
> case, and you hopefully never even access the pointer (because you
> know what's behind it: the direct ops cases that you're
> short-circuiting).

Agreed on that, I've fixed it up now (current patch attached below).

> In fact, as a further micro-optimization it might be a good idea to
> just specify that the dma_is_direct() ops is a special pointer
> (perhaps even just say that "NULL means it's direct"), because that
> then makes the fast-case test much simpler (avoids a whole nasty
> constant load, and testing for NULL in particular is often much
> better).

So I wanted to do the NULL case, and it would be much nicer.  But the
arm folks went to great lengths to make sure they don't have a default
set of dma ops and require it to be explicitly set on every device
to catch cases where people don't set things up properly and I didn't
want to piss them off.  But maybe I should just go for it and see who
screams, as the benefit is pretty obvious.

---
From 89cc8ab03798177339ad916efabd320e792ee034 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 3 Dec 2018 16:49:29 +0100
Subject: dma-mapping: bypass indirect calls for dma-direct

Avoid expensive indirect calls in the fast path DMA mapping
operations by directly calling the dma_direct_* ops if we are using
the directly mapped DMA operations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-direct.h  |  17 ------
 include/linux/dma-mapping.h | 110 ++++++++++++++++++++++++++++++++----
 kernel/dma/direct.c         |  13 +++--
 3 files changed, 106 insertions(+), 34 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 3b0a3ea3876d..b7338702592a 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -60,22 +60,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
-dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
-		unsigned long offset, size_t size, enum dma_data_direction dir,
-		unsigned long attrs);
-void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir, unsigned long attrs);
-int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
-		enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs);
-void dma_direct_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
-void dma_direct_sync_single_for_cpu(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir);
-void dma_direct_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
 int dma_direct_supported(struct device *dev, u64 mask);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8916499d2805..98e4dd67c906 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -221,6 +221,69 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 }
 #endif
 
+static inline bool dma_is_direct(const struct dma_map_ops *ops)
+{
+	return IS_ENABLED(CONFIG_DMA_DIRECT_OPS) && ops == &dma_direct_ops;
+}
+
+/*
+ * All the dma_direct_* declarations are here just for the indirect call bypass,
+ * and must not be used directly by drivers!
+ */
+dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, enum dma_data_direction dir,
+		unsigned long attrs);
+int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
+		enum dma_data_direction dir, unsigned long attrs);
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
+    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
+    defined(CONFIG_SWIOTLB)
+void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
+		int nents, enum dma_data_direction dir, unsigned long attrs);
+void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir);
+void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
+#else
+static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+}
+static inline void dma_direct_unmap_sg(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
+		unsigned long attrs)
+{
+}
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
+{
+}
+#endif
+
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      size_t size,
 					      enum dma_data_direction dir,
@@ -231,9 +294,12 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_map_single(dev, ptr, size);
-	addr = ops->map_page(dev, virt_to_page(ptr),
-			     offset_in_page(ptr), size,
-			     dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
+	else
+		addr = ops->map_page(dev, virt_to_page(ptr),
+				offset_in_page(ptr), size, dir, attrs);
 	debug_dma_map_page(dev, virt_to_page(ptr),
 			   offset_in_page(ptr), size,
 			   dir, addr, true);
@@ -248,7 +314,9 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
 	debug_dma_unmap_page(dev, addr, size, dir, true);
 }
@@ -265,7 +333,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 	int ents;
 
 	BUG_ON(!valid_dma_direction(dir));
-	ents = ops->map_sg(dev, sg, nents, dir, attrs);
+	if (dma_is_direct(ops))
+		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
+	else
+		ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
 	debug_dma_map_sg(dev, sg, nents, ents, dir);
 
@@ -280,7 +351,9 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (ops->unmap_sg)
+	if (dma_is_direct(ops))
+		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+	else if (ops->unmap_sg)
 		ops->unmap_sg(dev, sg, nents, dir, attrs);
 }
 
@@ -294,7 +367,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	dma_addr_t addr;
 
 	BUG_ON(!valid_dma_direction(dir));
-	addr = ops->map_page(dev, page, offset, size, dir, attrs);
+	if (dma_is_direct(ops))
+		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+	else
+		addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
 
 	return addr;
@@ -308,7 +384,9 @@ static inline void dma_unmap_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page)
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
 	debug_dma_unmap_page(dev, addr, size, dir, false);
 }
@@ -355,7 +433,9 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu)
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+	else if (ops->sync_single_for_cpu)
 		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
@@ -374,7 +454,9 @@ static inline void dma_sync_single_for_device(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device)
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_device(dev, addr, size, dir);
+	else if (ops->sync_single_for_device)
 		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
@@ -393,7 +475,9 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_cpu)
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_cpu)
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
@@ -405,7 +489,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_device)
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_device)
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 372c1955454a..cd535e7c67d7 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -223,6 +223,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_device(dev, paddr, size, dir);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_device);
 
 void dma_direct_sync_sg_for_device(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -240,6 +241,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
 					dir);
 	}
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
 #endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
@@ -258,6 +260,7 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
 	if (is_swiotlb_buffer(paddr))
 		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
 }
+EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
 
 void dma_direct_sync_sg_for_cpu(struct device *dev,
 		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
@@ -277,6 +280,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
 	if (!dev_is_dma_coherent(dev))
 		arch_sync_dma_for_cpu_all(dev);
 }
+EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
 
 void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
@@ -289,6 +293,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
 	if (is_swiotlb_buffer(phys))
 		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
+EXPORT_SYMBOL(dma_direct_unmap_page);
 
 void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		int nents, enum dma_data_direction dir, unsigned long attrs)
@@ -300,11 +305,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
 		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 			     attrs);
 }
-#else
-void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
-		int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-}
+EXPORT_SYMBOL(dma_direct_unmap_sg);
 #endif
 
 static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
@@ -331,6 +332,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		arch_sync_dma_for_device(dev, phys, size, dir);
 	return dma_addr;
 }
+EXPORT_SYMBOL(dma_direct_map_page);
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs)
@@ -352,6 +354,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
 	return 0;
 }
+EXPORT_SYMBOL(dma_direct_map_sg);
 
 /*
  * Because 32-bit DMA masks are so common we expect every architecture to be
-- 
2.19.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:29   ` Nadav Amit
@ 2018-12-06 18:45     ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 18:45 UTC (permalink / raw)
  To: Nadav Amit
  Cc: Jesper Dangaard Brouer, Christoph Hellwig,
	Toke Høiland-Jørgensen, Linus Torvalds,
	Ilias Apalodimas, LKML, iommu, Robin Murphy, Tariq Toukan,
	Josh Poimboeuf

On Thu, Dec 06, 2018 at 10:29:35AM -0800, Nadav Amit wrote:
> Did you happen to see my RFC for "automatic" indirect call promotion? 
> 
> 	https://lkml.org/lkml/2018/10/18/175
> 
> I hope to get v1 based on Josh responses next week.

I'll take a look when you repost it.  Although I'm always a little wary
of under the hood magic like this..

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:43   ` Christoph Hellwig
@ 2018-12-06 18:51     ` Linus Torvalds
  2018-12-06 18:54     ` Robin Murphy
  1 sibling, 0 replies; 17+ messages in thread
From: Linus Torvalds @ 2018-12-06 18:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: iommu, brouer, tariqt, ilias.apalodimas, toke, robin.murphy,
	Linux List Kernel Mailing

On Thu, Dec 6, 2018 at 10:43 AM Christoph Hellwig <hch@lst.de> wrote:
>
> >
> >  (a) the dma_is_direct() function/macro uses "likely()" for the test
>
> I'm a little worried about that.  Yes, for the common case it is
> likely.  But if you run a setup where you say always have an iommu
> it is not, in fact it is never called in that case, but we only know
> that at runtime.

Note that "likely()" doesn't have any really huge overhead - it just
makes the compiler move the unlikely case out-of-line.

Compared to the overhead of the indirect branch, it's simply not a
huge deal, it's more a mispredict and cache layout issue.

So marking something "likely()" when it isn't doesn't really penalize
things too much. It's not like an exception or anything like that,
it's really just a marker for better code layout.

                  Linus

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:43   ` Christoph Hellwig
  2018-12-06 18:51     ` Linus Torvalds
@ 2018-12-06 18:54     ` Robin Murphy
  2018-12-06 20:00       ` Christoph Hellwig
  1 sibling, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2018-12-06 18:54 UTC (permalink / raw)
  To: Christoph Hellwig, Linus Torvalds
  Cc: iommu, brouer, tariqt, ilias.apalodimas, toke, Linux List Kernel Mailing

On 06/12/2018 18:43, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 10:28:22AM -0800, Linus Torvalds wrote:
>> which is counterproductive because it checks for the fast case *after*
>> you've done a load (and a check) that is entirely pointless for the
>> fast case.
>>
>> Put another way, you made the fast case unnecessarily slow.
>>
>> If this fast case matters (and the whole point of the series is that
>> it *does* matter), then you should make sure that
>>
>>   (a) the dma_is_direct() function/macro uses "likely()" for the test
> 
> I'm a little worried about that.  Yes, for the common case it is
> likely.  But if you run a setup where you say always have an iommu
> it is not, in fact it is never called in that case, but we only know
> that at runtime.
> 
>>   (b) the code is organized like this instead:
>>
>> +       if (dma_is_direct(ops))
>> +               dma_direct_unmap_page(dev, addr, size, dir, attrs);
>> +       else if (ops->unmap_page)
>> +               ops->unmap_page(dev, addr, size, dir, attrs);
>>
>> because now you avoid that unnecessary conditional for the common
>> case, and you hopefully never even access the pointer (because you
>> know what's behind it: the direct ops cases that you're
>> short-circuiting).
> 
> Agreed on that, I've fixed it up now (current patch attached below).
> 
>> In fact, as a further micro-optimization it might be a good idea to
>> just specify that the dma_is_direct() ops is a special pointer
>> (perhaps even just say that "NULL means it's direct"), because that
>> then makes the fast-case test much simpler (avoids a whole nasty
>> constant load, and testing for NULL in particular is often much
>> better).
> 
> So I wanted to do the NULL case, and it would be much nicer.  But the
> arm folks went to great length to make sure they don't have a default
> set of dma ops and require it to be explicitly set on every device
> to catch cases where people don't set things up properly and I didn't
> want to piss them off..  But maybe I should just go for it and see who
> screams, as the benefit is pretty obvious.

I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at 
the point we detected the ACPI properties are wrong - that shouldn't be 
too much of a headache to go back to.

Robin.

> 
> ---
>  From 89cc8ab03798177339ad916efabd320e792ee034 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Mon, 3 Dec 2018 16:49:29 +0100
> Subject: dma-mapping: bypass indirect calls for dma-direct
> 
> Avoid expensive indirect calls in the fast path DMA mapping
> operations by directly calling the dma_direct_* ops if we are using
> the directly mapped DMA operations.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   include/linux/dma-direct.h  |  17 ------
>   include/linux/dma-mapping.h | 110 ++++++++++++++++++++++++++++++++----
>   kernel/dma/direct.c         |  13 +++--
>   3 files changed, 106 insertions(+), 34 deletions(-)
> 
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index 3b0a3ea3876d..b7338702592a 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -60,22 +60,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
>   struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>   		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
>   void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
> -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> -		unsigned long offset, size_t size, enum dma_data_direction dir,
> -		unsigned long attrs);
> -void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> -		size_t size, enum dma_data_direction dir, unsigned long attrs);
> -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> -		enum dma_data_direction dir, unsigned long attrs);
> -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> -		int nents, enum dma_data_direction dir, unsigned long attrs);
> -void dma_direct_sync_single_for_device(struct device *dev,
> -		dma_addr_t addr, size_t size, enum dma_data_direction dir);
> -void dma_direct_sync_sg_for_device(struct device *dev,
> -		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
> -void dma_direct_sync_single_for_cpu(struct device *dev,
> -		dma_addr_t addr, size_t size, enum dma_data_direction dir);
> -void dma_direct_sync_sg_for_cpu(struct device *dev,
> -		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
>   int dma_direct_supported(struct device *dev, u64 mask);
>   #endif /* _LINUX_DMA_DIRECT_H */
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 8916499d2805..98e4dd67c906 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -221,6 +221,69 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
>   }
>   #endif
>   
> +static inline bool dma_is_direct(const struct dma_map_ops *ops)
> +{
> +	return IS_ENABLED(CONFIG_DMA_DIRECT_OPS) && ops == &dma_direct_ops;
> +}
> +
> +/*
> + * All the dma_direct_* declarations are here just for the indirect call bypass,
> + * and must not be used directly by drivers!
> + */
> +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
> +		unsigned long offset, size_t size, enum dma_data_direction dir,
> +		unsigned long attrs);
> +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
> +		enum dma_data_direction dir, unsigned long attrs);
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
> +    defined(CONFIG_SWIOTLB)
> +void dma_direct_sync_single_for_device(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_device(struct device *dev,
> +		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
> +#else
> +static inline void dma_direct_sync_single_for_device(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_device(struct device *dev,
> +		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
> +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
> +    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
> +    defined(CONFIG_SWIOTLB)
> +void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> +		size_t size, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> +		int nents, enum dma_data_direction dir, unsigned long attrs);
> +void dma_direct_sync_single_for_cpu(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir);
> +void dma_direct_sync_sg_for_cpu(struct device *dev,
> +		struct scatterlist *sgl, int nents, enum dma_data_direction dir);
> +#else
> +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
> +		size_t size, enum dma_data_direction dir, unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_unmap_sg(struct device *dev,
> +		struct scatterlist *sgl, int nents, enum dma_data_direction dir,
> +		unsigned long attrs)
> +{
> +}
> +static inline void dma_direct_sync_single_for_cpu(struct device *dev,
> +		dma_addr_t addr, size_t size, enum dma_data_direction dir)
> +{
> +}
> +static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
> +		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> +{
> +}
> +#endif
> +
>   static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
>   					      size_t size,
>   					      enum dma_data_direction dir,
> @@ -231,9 +294,12 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
>   
>   	BUG_ON(!valid_dma_direction(dir));
>   	debug_dma_map_single(dev, ptr, size);
> -	addr = ops->map_page(dev, virt_to_page(ptr),
> -			     offset_in_page(ptr), size,
> -			     dir, attrs);
> +	if (dma_is_direct(ops))
> +		addr = dma_direct_map_page(dev, virt_to_page(ptr),
> +				offset_in_page(ptr), size, dir, attrs);
> +	else
> +		addr = ops->map_page(dev, virt_to_page(ptr),
> +				offset_in_page(ptr), size, dir, attrs);
>   	debug_dma_map_page(dev, virt_to_page(ptr),
>   			   offset_in_page(ptr), size,
>   			   dir, addr, true);
> @@ -248,7 +314,9 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->unmap_page)
> +	if (dma_is_direct(ops))
> +		dma_direct_unmap_page(dev, addr, size, dir, attrs);
> +	else if (ops->unmap_page)
>   		ops->unmap_page(dev, addr, size, dir, attrs);
>   	debug_dma_unmap_page(dev, addr, size, dir, true);
>   }
> @@ -265,7 +333,10 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
>   	int ents;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	ents = ops->map_sg(dev, sg, nents, dir, attrs);
> +	if (dma_is_direct(ops))
> +		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
> +	else
> +		ents = ops->map_sg(dev, sg, nents, dir, attrs);
>   	BUG_ON(ents < 0);
>   	debug_dma_map_sg(dev, sg, nents, ents, dir);
>   
> @@ -280,7 +351,9 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
>   
>   	BUG_ON(!valid_dma_direction(dir));
>   	debug_dma_unmap_sg(dev, sg, nents, dir);
> -	if (ops->unmap_sg)
> +	if (dma_is_direct(ops))
> +		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
> +	else if (ops->unmap_sg)
>   		ops->unmap_sg(dev, sg, nents, dir, attrs);
>   }
>   
> @@ -294,7 +367,10 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
>   	dma_addr_t addr;
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	addr = ops->map_page(dev, page, offset, size, dir, attrs);
> +	if (dma_is_direct(ops))
> +		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
> +	else
> +		addr = ops->map_page(dev, page, offset, size, dir, attrs);
>   	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
>   
>   	return addr;
> @@ -308,7 +384,9 @@ static inline void dma_unmap_page_attrs(struct device *dev,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->unmap_page)
> +	if (dma_is_direct(ops))
> +		dma_direct_unmap_page(dev, addr, size, dir, attrs);
> +	else if (ops->unmap_page)
>   		ops->unmap_page(dev, addr, size, dir, attrs);
>   	debug_dma_unmap_page(dev, addr, size, dir, false);
>   }
> @@ -355,7 +433,9 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->sync_single_for_cpu)
> +	if (dma_is_direct(ops))
> +		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
> +	else if (ops->sync_single_for_cpu)
>   		ops->sync_single_for_cpu(dev, addr, size, dir);
>   	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
>   }
> @@ -374,7 +454,9 @@ static inline void dma_sync_single_for_device(struct device *dev,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->sync_single_for_device)
> +	if (dma_is_direct(ops))
> +		dma_direct_sync_single_for_device(dev, addr, size, dir);
> +	else if (ops->sync_single_for_device)
>   		ops->sync_single_for_device(dev, addr, size, dir);
>   	debug_dma_sync_single_for_device(dev, addr, size, dir);
>   }
> @@ -393,7 +475,9 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->sync_sg_for_cpu)
> +	if (dma_is_direct(ops))
> +		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
> +	else if (ops->sync_sg_for_cpu)
>   		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
>   	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
>   }
> @@ -405,7 +489,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
>   	const struct dma_map_ops *ops = get_dma_ops(dev);
>   
>   	BUG_ON(!valid_dma_direction(dir));
> -	if (ops->sync_sg_for_device)
> +	if (dma_is_direct(ops))
> +		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
> +	else if (ops->sync_sg_for_device)
>   		ops->sync_sg_for_device(dev, sg, nelems, dir);
>   	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
>   
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 372c1955454a..cd535e7c67d7 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -223,6 +223,7 @@ void dma_direct_sync_single_for_device(struct device *dev,
>   	if (!dev_is_dma_coherent(dev))
>   		arch_sync_dma_for_device(dev, paddr, size, dir);
>   }
> +EXPORT_SYMBOL(dma_direct_sync_single_for_device);
>   
>   void dma_direct_sync_sg_for_device(struct device *dev,
>   		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> @@ -240,6 +241,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>   					dir);
>   	}
>   }
> +EXPORT_SYMBOL(dma_direct_sync_sg_for_device);
>   #endif
>   
>   #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
> @@ -258,6 +260,7 @@ void dma_direct_sync_single_for_cpu(struct device *dev,
>   	if (is_swiotlb_buffer(paddr))
>   		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
>   }
> +EXPORT_SYMBOL(dma_direct_sync_single_for_cpu);
>   
>   void dma_direct_sync_sg_for_cpu(struct device *dev,
>   		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
> @@ -277,6 +280,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
>   	if (!dev_is_dma_coherent(dev))
>   		arch_sync_dma_for_cpu_all(dev);
>   }
> +EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu);
>   
>   void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
>   		size_t size, enum dma_data_direction dir, unsigned long attrs)
> @@ -289,6 +293,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
>   	if (is_swiotlb_buffer(phys))
>   		swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
>   }
> +EXPORT_SYMBOL(dma_direct_unmap_page);
>   
>   void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
>   		int nents, enum dma_data_direction dir, unsigned long attrs)
> @@ -300,11 +305,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
>   		dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
>   			     attrs);
>   }
> -#else
> -void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
> -		int nents, enum dma_data_direction dir, unsigned long attrs)
> -{
> -}
> +EXPORT_SYMBOL(dma_direct_unmap_sg);
>   #endif
>   
>   static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
> @@ -331,6 +332,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
>   		arch_sync_dma_for_device(dev, phys, size, dir);
>   	return dma_addr;
>   }
> +EXPORT_SYMBOL(dma_direct_map_page);
>   
>   int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   		enum dma_data_direction dir, unsigned long attrs)
> @@ -352,6 +354,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   	dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
>   	return 0;
>   }
> +EXPORT_SYMBOL(dma_direct_map_sg);
>   
>   /*
>    * Because 32-bit DMA masks are so common we expect every architecture to be
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 18:54     ` Robin Murphy
@ 2018-12-06 20:00       ` Christoph Hellwig
  2018-12-06 20:24         ` Robin Murphy
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-06 20:00 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Christoph Hellwig, Linus Torvalds, iommu, brouer, tariqt,
	ilias.apalodimas, toke, Linux List Kernel Mailing

On Thu, Dec 06, 2018 at 06:54:17PM +0000, Robin Murphy wrote:
> I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at 
> the point we detected the ACPI properties are wrong - that shouldn't be too 
> much of a headache to go back to.

Ok.  I've cooked up a patch to use NULL as the go direct marker.
This cleans up a few things nicely, but also means we now need to
do the bypass scheme for all ops, not just the fast path.  But we
probably should just move the slow path ops out of line anyway,
so I'm not worried about it.  This has survived some very basic
testing on x86, and really needs to be cleaned up and split into
multiple patches..

The other nice thing this would allow is removing dma_direct_ops
entirely, which means we can simplify a few things even further.

This patch is relative to what I sent out before and the git tree,
and survives some very basic x86 testing.

---
From 9318145675791833f752cba22484a0b4b4387f1b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Dec 2018 10:52:35 -0800
Subject: dma-direct: use NULL as the bypass indicator

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/alpha/include/asm/dma-mapping.h |   2 +-
 arch/arm/include/asm/dma-mapping.h   |   2 +-
 arch/arm/mm/dma-mapping-nommu.c      |  12 +--
 arch/arm64/include/asm/dma-mapping.h |   8 +-
 arch/arm64/mm/dma-mapping.c          |  89 -------------------
 arch/ia64/hp/common/hwsw_iommu.c     |   2 +-
 arch/ia64/hp/common/sba_iommu.c      |   4 +-
 arch/ia64/kernel/dma-mapping.c       |   1 -
 arch/ia64/kernel/pci-dma.c           |   2 +-
 arch/mips/include/asm/dma-mapping.h  |   2 +-
 arch/parisc/kernel/setup.c           |   4 -
 arch/sparc/include/asm/dma-mapping.h |   4 +-
 arch/x86/kernel/pci-dma.c            |   2 +-
 drivers/base/platform.c              |   2 +
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c  |   2 +-
 drivers/iommu/amd_iommu.c            |  13 +--
 drivers/pci/controller/vmd.c         |   4 +-
 include/asm-generic/dma-mapping.h    |   2 +-
 include/linux/dma-direct.h           |   6 --
 include/linux/dma-mapping.h          | 125 ++++++++++++++-------------
 include/linux/dma-noncoherent.h      |   7 --
 kernel/dma/Kconfig                   |   2 +-
 kernel/dma/direct.c                  |   3 +
 23 files changed, 92 insertions(+), 208 deletions(-)

diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h
index 8beeafd4f68e..c9203468d4fe 100644
--- a/arch/alpha/include/asm/dma-mapping.h
+++ b/arch/alpha/include/asm/dma-mapping.h
@@ -7,7 +7,7 @@ extern const struct dma_map_ops alpha_pci_ops;
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_ALPHA_JENSEN
-	return &dma_direct_ops;
+	return NULL; /* use direct mapping */
 #else
 	return &alpha_pci_ops;
 #endif
diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index 965b7c846ecb..31d3b96f0f4b 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -18,7 +18,7 @@ extern const struct dma_map_ops arm_coherent_dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_direct_ops;
+	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : NULL;
 }
 
 #ifdef __arch_page_to_dma
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 712416ecd8e6..378763003079 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -209,16 +209,9 @@ const struct dma_map_ops arm_nommu_dma_ops = {
 };
 EXPORT_SYMBOL(arm_nommu_dma_ops);
 
-static const struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
-{
-	return coherent ? &dma_direct_ops : &arm_nommu_dma_ops;
-}
-
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	const struct dma_map_ops *dma_ops;
-
 	if (IS_ENABLED(CONFIG_CPU_V7M)) {
 		/*
 		 * Cache support for v7m is optional, so can be treated as
@@ -234,7 +227,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
 	}
 
-	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
-
-	set_dma_ops(dev, dma_ops);
+	if (!dev->archdata.dma_coherent)
+		set_dma_ops(dev, &arm_nommu_dma_ops);
 }
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index c41f3fb1446c..95dbf3ef735a 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -24,15 +24,9 @@
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
-extern const struct dma_map_ops dummy_dma_ops;
-
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	/*
-	 * We expect no ISA devices, and all other DMA masters are expected to
-	 * have someone call arch_setup_dma_ops at device creation time.
-	 */
-	return &dummy_dma_ops;
+	return NULL;
 }
 
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index e4effbb243b1..95eda81e3f2d 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -89,92 +89,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_IOMMU_DMA */
 
-/********************************************
- * The following APIs are for dummy DMA ops *
- ********************************************/
-
-static void *__dummy_alloc(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flags,
-			   unsigned long attrs)
-{
-	return NULL;
-}
-
-static void __dummy_free(struct device *dev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle,
-			 unsigned long attrs)
-{
-}
-
-static int __dummy_mmap(struct device *dev,
-			struct vm_area_struct *vma,
-			void *cpu_addr, dma_addr_t dma_addr, size_t size,
-			unsigned long attrs)
-{
-	return -ENXIO;
-}
-
-static dma_addr_t __dummy_map_page(struct device *dev, struct page *page,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	return DMA_MAPPING_ERROR;
-}
-
-static void __dummy_unmap_page(struct device *dev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-}
-
-static int __dummy_map_sg(struct device *dev, struct scatterlist *sgl,
-			  int nelems, enum dma_data_direction dir,
-			  unsigned long attrs)
-{
-	return 0;
-}
-
-static void __dummy_unmap_sg(struct device *dev,
-			     struct scatterlist *sgl, int nelems,
-			     enum dma_data_direction dir,
-			     unsigned long attrs)
-{
-}
-
-static void __dummy_sync_single(struct device *dev,
-				dma_addr_t dev_addr, size_t size,
-				enum dma_data_direction dir)
-{
-}
-
-static void __dummy_sync_sg(struct device *dev,
-			    struct scatterlist *sgl, int nelems,
-			    enum dma_data_direction dir)
-{
-}
-
-static int __dummy_dma_supported(struct device *hwdev, u64 mask)
-{
-	return 0;
-}
-
-const struct dma_map_ops dummy_dma_ops = {
-	.alloc                  = __dummy_alloc,
-	.free                   = __dummy_free,
-	.mmap                   = __dummy_mmap,
-	.map_page               = __dummy_map_page,
-	.unmap_page             = __dummy_unmap_page,
-	.map_sg                 = __dummy_map_sg,
-	.unmap_sg               = __dummy_unmap_sg,
-	.sync_single_for_cpu    = __dummy_sync_single,
-	.sync_single_for_device = __dummy_sync_single,
-	.sync_sg_for_cpu        = __dummy_sync_sg,
-	.sync_sg_for_device     = __dummy_sync_sg,
-	.dma_supported          = __dummy_dma_supported,
-};
-EXPORT_SYMBOL(dummy_dma_ops);
-
 static int __init arm64_dma_init(void)
 {
 	WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
@@ -548,9 +462,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 			const struct iommu_ops *iommu, bool coherent)
 {
-	if (!dev->dma_ops)
-		dev->dma_ops = &dma_direct_ops;
-
 	dev->dma_coherent = coherent;
 	__iommu_setup_dma_ops(dev, dma_base, size, iommu);
 
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index f40ca499b246..8840ed97712f 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev)
 const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
 {
 	if (use_swiotlb(dev))
-		return &dma_direct_ops;
+		return NULL;
 	return &sba_dma_ops;
 }
 EXPORT_SYMBOL(hwsw_dma_get_ops);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 5ee74820a0f6..5a361e51cb1e 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2078,7 +2078,7 @@ sba_init(void)
 	 * a successful kdump kernel boot is to use the swiotlb.
 	 */
 	if (is_kdump_kernel()) {
-		dma_ops = &dma_direct_ops;
+		dma_ops = NULL;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to initialize software I/O TLB:"
 				  " Try machvec=dig boot option");
@@ -2100,7 +2100,7 @@ sba_init(void)
 		 * If we didn't find something sba_iommu can claim, we
 		 * need to setup the swiotlb and switch to the dig machvec.
 		 */
-		dma_ops = &dma_direct_ops;
+		dma_ops = NULL;
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to find SBA IOMMU or initialize "
 			      "software I/O TLB: Try machvec=dig boot option");
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 80cd3e1ea95a..ad7d9963de34 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -36,7 +36,6 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
 
 void __init swiotlb_dma_init(void)
 {
-	dma_ops = &dma_direct_ops;
 	swiotlb_init(1);
 }
 #endif
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
index fe988c49f01c..a4d61278417a 100644
--- a/arch/ia64/kernel/pci-dma.c
+++ b/arch/ia64/kernel/pci-dma.c
@@ -48,7 +48,7 @@ void __init pci_iommu_alloc(void)
 #ifdef CONFIG_IA64_GENERIC
 		printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
 		machvec_init("dig");
-		swiotlb_dma_init();
+		swiotlb_init(1);
 #else
 		panic("Unable to find Intel IOMMU");
 #endif /* CONFIG_IA64_GENERIC */
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index 69f914667f3e..20dfaad3a55d 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -11,7 +11,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 #if defined(CONFIG_MACH_JAZZ)
 	return &jazz_dma_ops;
 #else
-	return &dma_direct_ops;
+	return NULL;
 #endif
 }
 
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index cd227f1cf629..54818cd78bd0 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -99,10 +99,6 @@ void __init dma_ops_init(void)
 
 	case pcxl2:
 		pa7300lc_init();
-	case pcxl: /* falls through */
-	case pcxs:
-	case pcxt:
-		hppa_dma_ops = &dma_direct_ops;
 		break;
 	default:
 		break;
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index b0bb2fcaf1c9..59f5a0f17316 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -14,11 +14,11 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
 #ifdef CONFIG_SPARC_LEON
 	if (sparc_cpu_model == sparc_leon)
-		return &dma_direct_ops;
+		return NULL;
 #endif
 #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
 	if (bus == &pci_bus_type)
-		return &dma_direct_ops;
+		return NULL;
 #endif
 	return dma_ops;
 }
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f4562fcec681..d460998ae828 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -17,7 +17,7 @@
 
 static bool disable_dac_quirk __read_mostly;
 
-const struct dma_map_ops *dma_ops = &dma_direct_ops;
+const struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 #ifdef CONFIG_IOMMU_DEBUG
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 41b91af95afb..1a659225d01a 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1203,6 +1203,8 @@ u64 dma_get_required_mask(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
+	if (dma_is_direct(ops))
+		return dma_direct_get_required_mask(dev);
 	if (ops->get_required_mask)
 		return ops->get_required_mask(dev);
 	return dma_default_get_required_mask(dev);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 61a84b958d67..50637f372e9f 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -581,7 +581,7 @@ static int vmw_dma_select_mode(struct vmw_private *dev_priv)
 
 	dev_priv->map_mode = vmw_dma_map_populate;
 
-	if (dma_ops->sync_single_for_cpu)
+	if (dma_ops && dma_ops->sync_single_for_cpu)
 		dev_priv->map_mode = vmw_dma_alloc_coherent;
 #ifdef CONFIG_SWIOTLB
 	if (swiotlb_nr_tbl() == 0)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c5d6c7c42b0a..567221cca13c 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2184,7 +2184,7 @@ static int amd_iommu_add_device(struct device *dev)
 				dev_name(dev));
 
 		iommu_ignore_device(dev);
-		dev->dma_ops = &dma_direct_ops;
+		dev->dma_ops = NULL;
 		goto out;
 	}
 	init_iommu_group(dev);
@@ -2770,17 +2770,6 @@ int __init amd_iommu_init_dma_ops(void)
 	swiotlb        = (iommu_pass_through || sme_me_mask) ? 1 : 0;
 	iommu_detected = 1;
 
-	/*
-	 * In case we don't initialize SWIOTLB (actually the common case
-	 * when AMD IOMMU is enabled and SME is not active), make sure there
-	 * are global dma_ops set as a fall-back for devices not handled by
-	 * this driver (for example non-PCI devices). When SME is active,
-	 * make sure that swiotlb variable remains set so the global dma_ops
-	 * continue to be SWIOTLB.
-	 */
-	if (!swiotlb)
-		dma_ops = &dma_direct_ops;
-
 	if (amd_iommu_unmap_flush)
 		pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
 	else
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 98ce79eac128..a63568c7172d 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -309,7 +309,9 @@ static struct device *to_vmd_dev(struct device *dev)
 
 static const struct dma_map_ops *vmd_dma_ops(struct device *dev)
 {
-	return get_dma_ops(to_vmd_dev(dev));
+	const struct dma_map_ops *ops = get_dma_ops(to_vmd_dev(dev));
+
+	return ops ? ops : &dma_direct_ops;
 }
 
 static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 880a292d792f..c13f46109e88 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -4,7 +4,7 @@
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
-	return &dma_direct_ops;
+	return NULL;
 }
 
 #endif /* _ASM_GENERIC_DMA_MAPPING_H */
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index b7338702592a..b2012ba1c436 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -48,11 +48,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 	return __sme_clr(__dma_to_phys(dev, daddr));
 }
 
-u64 dma_direct_get_required_mask(struct device *dev);
-void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, unsigned long attrs);
-void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
-		dma_addr_t dma_addr, unsigned long attrs);
 void *dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@@ -60,5 +55,4 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
 struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs);
 void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page);
-int dma_direct_supported(struct device *dev, u64 mask);
 #endif /* _LINUX_DMA_DIRECT_H */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 648ca7da3bb3..c45b649330a8 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -223,7 +223,7 @@ static inline const struct dma_map_ops *get_dma_ops(struct device *dev)
 
 static inline bool dma_is_direct(const struct dma_map_ops *ops)
 {
-	return IS_ENABLED(CONFIG_DMA_DIRECT_OPS) && ops == &dma_direct_ops;
+	return likely(IS_ENABLED(CONFIG_DMA_DIRECT_OPS) && ops == NULL);
 }
 
 /*
@@ -235,6 +235,12 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
 		unsigned long attrs);
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
+u64 dma_direct_get_required_mask(struct device *dev);
+void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		gfp_t gfp, unsigned long attrs);
+void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
+		dma_addr_t dma_addr, unsigned long attrs);
+int dma_direct_supported(struct device *dev, u64 mask);
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_SWIOTLB)
@@ -284,6 +290,14 @@ static inline void dma_direct_sync_sg_for_cpu(struct device *dev,
 }
 #endif
 
+#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
+void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+		enum dma_data_direction direction);
+#else
+#define arch_dma_cache_sync NULL
+#endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
+
+
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 					      size_t size,
 					      enum dma_data_direction dir,
@@ -314,12 +328,10 @@ static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page) {
-		if (dma_is_direct(ops))
-			dma_direct_unmap_page(dev, addr, size, dir, attrs);
-		else
-			ops->unmap_page(dev, addr, size, dir, attrs);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, attrs);
 	debug_dma_unmap_page(dev, addr, size, dir, true);
 }
 
@@ -353,12 +365,10 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (ops->unmap_sg) {
-		if (dma_is_direct(ops))
-			dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
-		else
-			ops->unmap_sg(dev, sg, nents, dir, attrs);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
+	else if (ops->unmap_sg)
+		ops->unmap_sg(dev, sg, nents, dir, attrs);
 }
 
 static inline dma_addr_t dma_map_page_attrs(struct device *dev,
@@ -388,12 +398,10 @@ static inline void dma_unmap_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_page) {
-		if (dma_is_direct(ops))
-			dma_direct_unmap_page(dev, addr, size, dir, attrs);
-		else
-			ops->unmap_page(dev, addr, size, dir, attrs);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_unmap_page(dev, addr, size, dir, attrs);
+	else if (ops->unmap_page)
+		ops->unmap_page(dev, addr, size, dir, attrs);
 	debug_dma_unmap_page(dev, addr, size, dir, false);
 }
 
@@ -412,7 +420,7 @@ static inline dma_addr_t dma_map_resource(struct device *dev,
 	BUG_ON(pfn_valid(PHYS_PFN(phys_addr)));
 
 	addr = phys_addr;
-	if (ops->map_resource)
+	if (ops && ops->map_resource)
 		addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
 
 	debug_dma_map_resource(dev, phys_addr, size, dir, addr);
@@ -427,7 +435,7 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->unmap_resource)
+	if (ops && ops->unmap_resource)
 		ops->unmap_resource(dev, addr, size, dir, attrs);
 	debug_dma_unmap_resource(dev, addr, size, dir);
 }
@@ -439,12 +447,10 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_cpu) {
-		if (dma_is_direct(ops))
-			dma_direct_sync_single_for_cpu(dev, addr, size, dir);
-		else
-			ops->sync_single_for_cpu(dev, addr, size, dir);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_cpu(dev, addr, size, dir);
+	else if (ops->sync_single_for_cpu)
+		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
 
@@ -462,12 +468,10 @@ static inline void dma_sync_single_for_device(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_single_for_device) {
-		if (dma_is_direct(ops))
-			dma_direct_sync_single_for_device(dev, addr, size, dir);
-		else
-			ops->sync_single_for_device(dev, addr, size, dir);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_sync_single_for_device(dev, addr, size, dir);
+	else if (ops->sync_single_for_device)
+		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
 
@@ -485,12 +489,10 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_cpu) {
-		if (dma_is_direct(ops))
-			dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
-		else
-			ops->sync_sg_for_cpu(dev, sg, nelems, dir);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_cpu)
+		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
 
@@ -501,12 +503,10 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->sync_sg_for_device) {
-		if (dma_is_direct(ops))
-			dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
-		else
-			ops->sync_sg_for_device(dev, sg, nelems, dir);
-	}
+	if (dma_is_direct(ops))
+		dma_direct_sync_sg_for_device(dev, sg, nelems, dir);
+	else if (ops->sync_sg_for_device)
+		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 
 }
@@ -522,11 +522,15 @@ static inline void
 dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 		enum dma_data_direction dir)
 {
+#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (ops->cache_sync)
+	if (dma_is_direct(ops))
+		arch_dma_cache_sync(dev, vaddr, size, dir);
+	else if (ops->cache_sync)
 		ops->cache_sync(dev, vaddr, size, dir);
+#endif
 }
 
 extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
@@ -565,8 +569,8 @@ dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr,
 	       dma_addr_t dma_addr, size_t size, unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->mmap)
+
+	if (!dma_is_direct(ops) && ops->mmap)
 		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
 	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
 }
@@ -583,8 +587,8 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 		      unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	BUG_ON(!ops);
-	if (ops->get_sgtable)
+
+	if (!dma_is_direct(ops) && ops->get_sgtable)
 		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
 					attrs);
 	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
@@ -604,7 +608,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	void *cpu_addr;
 
-	BUG_ON(!ops);
 	WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
 
 	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
@@ -615,10 +618,13 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 
 	if (!arch_dma_alloc_attrs(&dev))
 		return NULL;
-	if (!ops->alloc)
-		return NULL;
 
-	cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+	if (dma_is_direct(ops))
+		cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
+	else if (ops->alloc)
+		cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+	else
+		return NULL;
 	debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
 	return cpu_addr;
 }
@@ -629,8 +635,6 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
-	BUG_ON(!ops);
-
 	if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr))
 		return;
 	/*
@@ -642,11 +646,14 @@ static inline void dma_free_attrs(struct device *dev, size_t size,
 	 */
 	WARN_ON(irqs_disabled());
 
-	if (!ops->free || !cpu_addr)
+	if (!cpu_addr)
 		return;
 
 	debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-	ops->free(dev, size, cpu_addr, dma_handle, attrs);
+	if (dma_is_direct(ops))
+		dma_direct_free(dev, size, cpu_addr, dma_handle, attrs);
+	else if (ops->free)
+		ops->free(dev, size, cpu_addr, dma_handle, attrs);
 }
 
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
@@ -682,6 +689,8 @@ static inline int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
+	if (dma_is_direct(ops))
+		return dma_direct_supported(dev, mask);
 	if (!ops)
 		return 0;
 	if (!ops->dma_supported)
diff --git a/include/linux/dma-noncoherent.h b/include/linux/dma-noncoherent.h
index 306557331d7d..51c8db377c17 100644
--- a/include/linux/dma-noncoherent.h
+++ b/include/linux/dma-noncoherent.h
@@ -34,13 +34,6 @@ pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot,
 # define arch_dma_mmap_pgprot(dev, prot, attrs)	pgprot_noncached(prot)
 #endif
 
-#ifdef CONFIG_DMA_NONCOHERENT_CACHE_SYNC
-void arch_dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-		enum dma_data_direction direction);
-#else
-#define arch_dma_cache_sync NULL
-#endif /* CONFIG_DMA_NONCOHERENT_CACHE_SYNC */
-
 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE
 void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
 		size_t size, enum dma_data_direction dir);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 41c3b1df70eb..a73ef291499a 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -36,7 +36,7 @@ config ARCH_HAS_DMA_MMAP_PGPROT
 	bool
 
 config DMA_DIRECT_OPS
-	bool
+	def_bool y
 	depends on HAS_DMA
 
 config DMA_NONCOHERENT_CACHE_SYNC
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index cd535e7c67d7..119a981a3899 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -200,6 +200,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 		return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 	return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
 }
+EXPORT_SYMBOL(dma_direct_alloc);
 
 void dma_direct_free(struct device *dev, size_t size,
 		void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
@@ -209,6 +210,7 @@ void dma_direct_free(struct device *dev, size_t size,
 	else
 		dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
 }
+EXPORT_SYMBOL(dma_direct_free);
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
     defined(CONFIG_SWIOTLB)
@@ -375,6 +377,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
 
 	return mask >= phys_to_dma(dev, min_mask);
 }
+EXPORT_SYMBOL(dma_direct_supported);
 
 const struct dma_map_ops dma_direct_ops = {
 	.alloc			= dma_direct_alloc,
-- 
2.19.1


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 20:00       ` Christoph Hellwig
@ 2018-12-06 20:24         ` Robin Murphy
  2018-12-07  1:21           ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2018-12-06 20:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Linus Torvalds, iommu, brouer, tariqt, ilias.apalodimas, toke,
	Linux List Kernel Mailing

On 06/12/2018 20:00, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 06:54:17PM +0000, Robin Murphy wrote:
>> I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at
>> the point we detected the ACPI properties are wrong - that shouldn't be too
>> much of a headache to go back to.
> 
> Ok.  I've cooked up a patch to use NULL as the go direct marker.
> This cleans up a few things nicely, but also means we now need to
> do the bypass scheme for all ops, not just the fast path.  But we
> probably should just move the slow path ops out of line anyway,
> so I'm not worried about it.  This has survived some very basic
> testing on x86, and really needs to be cleaned up and split into
> multiple patches..

I've also just finished hacking something up to keep the arm64 status 
quo - I'll need to actually test it tomorrow, but the overall diff looks 
like the below.

Robin.

----->8-----
diff --git a/arch/arm64/include/asm/dma-mapping.h 
b/arch/arm64/include/asm/dma-mapping.h
index 0626c3a93730..fe6287f68adb 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -19,15 +19,13 @@
  #include <linux/types.h>
  #include <linux/vmalloc.h>

-extern const struct dma_map_ops dummy_dma_ops;
-
  static inline const struct dma_map_ops *get_arch_dma_ops(struct 
bus_type *bus)
  {
  	/*
  	 * We expect no ISA devices, and all other DMA masters are expected to
  	 * have someone call arch_setup_dma_ops at device creation time.
  	 */
-	return &dummy_dma_ops;
+	return &dma_dummy_ops;
  }

  void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 17f8fc784de2..574aee2d04e7 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -255,98 +255,6 @@ static int __init atomic_pool_init(void)
  	return -ENOMEM;
  }

-/********************************************
- * The following APIs are for dummy DMA ops *
- ********************************************/
-
-static void *__dummy_alloc(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, gfp_t flags,
-			   unsigned long attrs)
-{
-	return NULL;
-}
-
-static void __dummy_free(struct device *dev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle,
-			 unsigned long attrs)
-{
-}
-
-static int __dummy_mmap(struct device *dev,
-			struct vm_area_struct *vma,
-			void *cpu_addr, dma_addr_t dma_addr, size_t size,
-			unsigned long attrs)
-{
-	return -ENXIO;
-}
-
-static dma_addr_t __dummy_map_page(struct device *dev, struct page *page,
-				   unsigned long offset, size_t size,
-				   enum dma_data_direction dir,
-				   unsigned long attrs)
-{
-	return 0;
-}
-
-static void __dummy_unmap_page(struct device *dev, dma_addr_t dev_addr,
-			       size_t size, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-}
-
-static int __dummy_map_sg(struct device *dev, struct scatterlist *sgl,
-			  int nelems, enum dma_data_direction dir,
-			  unsigned long attrs)
-{
-	return 0;
-}
-
-static void __dummy_unmap_sg(struct device *dev,
-			     struct scatterlist *sgl, int nelems,
-			     enum dma_data_direction dir,
-			     unsigned long attrs)
-{
-}
-
-static void __dummy_sync_single(struct device *dev,
-				dma_addr_t dev_addr, size_t size,
-				enum dma_data_direction dir)
-{
-}
-
-static void __dummy_sync_sg(struct device *dev,
-			    struct scatterlist *sgl, int nelems,
-			    enum dma_data_direction dir)
-{
-}
-
-static int __dummy_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
-{
-	return 1;
-}
-
-static int __dummy_dma_supported(struct device *hwdev, u64 mask)
-{
-	return 0;
-}
-
-const struct dma_map_ops dummy_dma_ops = {
-	.alloc                  = __dummy_alloc,
-	.free                   = __dummy_free,
-	.mmap                   = __dummy_mmap,
-	.map_page               = __dummy_map_page,
-	.unmap_page             = __dummy_unmap_page,
-	.map_sg                 = __dummy_map_sg,
-	.unmap_sg               = __dummy_unmap_sg,
-	.sync_single_for_cpu    = __dummy_sync_single,
-	.sync_single_for_device = __dummy_sync_single,
-	.sync_sg_for_cpu        = __dummy_sync_sg,
-	.sync_sg_for_device     = __dummy_sync_sg,
-	.mapping_error          = __dummy_mapping_error,
-	.dma_supported          = __dummy_dma_supported,
-};
-EXPORT_SYMBOL(dummy_dma_ops);
-
  static int __init arm64_dma_init(void)
  {
  	WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index bd1c59fb0e17..b75ae34ed188 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1456,6 +1456,11 @@ int acpi_dma_configure(struct device *dev, enum 
dev_dma_attr attr)
  	const struct iommu_ops *iommu;
  	u64 dma_addr = 0, size = 0;

+	if (attr == DEV_DMA_NOT_SUPPORTED) {
+		set_dma_ops(dev, &dma_dummy_ops);
+		return 0;
+	}
+
  	iort_dma_setup(dev, &dma_addr, &size);

  	iommu = iort_iommu_configure(dev);
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 41b91af95afb..c6daca875c17 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1138,8 +1138,7 @@ int platform_dma_configure(struct device *dev)
  		ret = of_dma_configure(dev, dev->of_node, true);
  	} else if (has_acpi_companion(dev)) {
  		attr = acpi_get_dma_attr(to_acpi_device_node(dev->fwnode));
-		if (attr != DEV_DMA_NOT_SUPPORTED)
-			ret = acpi_dma_configure(dev, attr);
+		ret = acpi_dma_configure(dev, attr);
  	}

  	return ret;
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index bef17c3fca67..f899a28b90f8 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1602,8 +1602,7 @@ static int pci_dma_configure(struct device *dev)
  		struct acpi_device *adev = to_acpi_device_node(bridge->fwnode);
  		enum dev_dma_attr attr = acpi_get_dma_attr(adev);

-		if (attr != DEV_DMA_NOT_SUPPORTED)
-			ret = acpi_dma_configure(dev, attr);
+		ret = acpi_dma_configure(dev, attr);
  	}

  	pci_put_host_bridge_device(bridge);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 15bd41447025..3a1928ea23c9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -134,6 +134,7 @@ struct dma_map_ops {
  };

  extern const struct dma_map_ops dma_direct_ops;
+extern const struct dma_map_ops dma_dummy_ops;
  extern const struct dma_map_ops dma_virt_ops;

  #define DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58dec7a92b7b..957a42ebabcf 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -346,3 +346,57 @@ void dma_common_free_remap(void *cpu_addr, size_t 
size, unsigned long vm_flags)
  	vunmap(cpu_addr);
  }
  #endif
+
+/*
+ * Dummy DMA ops always fail
+ */
+
+static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma,
+			  void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			  unsigned long attrs)
+{
+	return -ENXIO;
+}
+
+static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page,
+				     unsigned long offset, size_t size,
+				     enum dma_data_direction dir,
+				     unsigned long attrs)
+{
+	return 0;
+}
+
+static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+			    int nelems, enum dma_data_direction dir,
+			    unsigned long attrs)
+{
+	return 0;
+}
+
+static dma_addr_t dma_dummy_map_resource(struct device *dev,
+					 phys_addr_t phys_addr, size_t size,
+					 enum dma_data_direction dir,
+					 unsigned long attrs)
+{
+	return 0;
+}
+
+static int dma_dummy_mapping_error(struct device *hwdev, dma_addr_t 
dma_addr)
+{
+	return 1;
+}
+
+static int dma_dummy_supported(struct device *hwdev, u64 mask)
+{
+	return 0;
+}
+
+const struct dma_map_ops dma_dummy_ops = {
+	.mmap		= dma_dummy_mmap,
+	.map_page	= dma_dummy_map_page,
+	.map_sg		= dma_dummy_map_sg,
+	.map_resource	= dma_dummy_map_resource,
+	.mapping_error	= dma_dummy_mapping_error,
+	.dma_supported	= dma_dummy_supported,
+};
+EXPORT_SYMBOL(dma_dummy_ops);

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-06 20:24         ` Robin Murphy
@ 2018-12-07  1:21           ` Christoph Hellwig
  2018-12-07 15:44             ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2018-12-07  1:21 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Christoph Hellwig, Linus Torvalds, iommu, brouer, tariqt,
	ilias.apalodimas, toke, Linux List Kernel Mailing

On Thu, Dec 06, 2018 at 08:24:38PM +0000, Robin Murphy wrote:
> On 06/12/2018 20:00, Christoph Hellwig wrote:
>> On Thu, Dec 06, 2018 at 06:54:17PM +0000, Robin Murphy wrote:
>>> I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at
>>> the point we detected the ACPI properties are wrong - that shouldn't be too
>>> much of a headache to go back to.
>>
>> Ok.  I've cooked up a patch to use NULL as the go direct marker.
>> This cleans up a few things nicely, but also means we now need to
>> do the bypass scheme for all ops, not just the fast path.  But we
>> probably should just move the slow path ops out of line anyway,
>> so I'm not worried about it.  This has survived some very basic
>> testing on x86, and really needs to be cleaned up and split into
>> multiple patches..
>
> I've also just finished hacking something up to keep the arm64 status quo - 
> I'll need to actually test it tomorrow, but the overall diff looks like the 
> below.

Nice.  I created a branch that picked up your bits and also the ideas
from Linus, and the result looks really nice.  I'll still need a signoff
for your bits, though.

Jesper, can you give this a spin to see if it changes the number even further?

  git://git.infradead.org/users/hch/misc.git dma-direct-calls.2

  http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls.2

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-07  1:21           ` Christoph Hellwig
@ 2018-12-07 15:44             ` Jesper Dangaard Brouer
  2018-12-07 16:05               ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2018-12-07 15:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Robin Murphy, Linus Torvalds, iommu, tariqt, ilias.apalodimas,
	toke, Linux List Kernel Mailing, brouer

On Fri, 7 Dec 2018 02:21:42 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Dec 06, 2018 at 08:24:38PM +0000, Robin Murphy wrote:
> > On 06/12/2018 20:00, Christoph Hellwig wrote:  
> >> On Thu, Dec 06, 2018 at 06:54:17PM +0000, Robin Murphy wrote:  
> >>> I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at
> >>> the point we detected the ACPI properties are wrong - that shouldn't be too
> >>> much of a headache to go back to.  
> >>
> >> Ok.  I've cooked up a patch to use NULL as the go direct marker.
> >> This cleans up a few things nicely, but also means we now need to
> >> do the bypass scheme for all ops, not just the fast path.  But we
> >> probably should just move the slow path ops out of line anyway,
> >> so I'm not worried about it.  This has survived some very basic
> >> testing on x86, and really needs to be cleaned up and split into
> >> multiple patches..  
> >
> > I've also just finished hacking something up to keep the arm64 status quo - 
> > I'll need to actually test it tomorrow, but the overall diff looks like the 
> > below.  
> 
> Nice.  I created a branch that picked up your bits and also the ideas
> from Linus, and the result looks really nice.  I'll still need a signoff
> for your bits, though.
> 
> Jesper, can you give this a spin if it changes the number even further?
> 
>   git://git.infradead.org/users/hch/misc.git dma-direct-calls.2
> 
>   http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls.2

I'll test it soon...

I looked at my perf stat recording on my existing tests[1] and there
seems to be significantly more I-cache usage.

Copy-paste from my summary[1]:
 [1] https://github.com/xdp-project/xdp-project/blob/master/areas/dma/dma01_test_hellwig_direct_dma.org#summary-of-results

* Summary of results

Using XDP_REDIRECT between drivers RX ixgbe(10G) redirect TX i40e(40G),
via BPF devmap (used samples/bpf/xdp_redirect_map). (Note: choose a
higher TX link-speed to ensure that we don't have a TX bottleneck).
The baseline-kernel is at commit https://git.kernel.org/torvalds/c/ef78e5ec9214,
which is the commit just before Hellwig's changes in this tree.

Performance numbers in packets/sec (XDP_REDIRECT ixgbe -> i40e):
 - 11913154 (11,913,154) pps - baseline compiled without retpoline
 -  7438283  (7,438,283) pps - regression due to CONFIG_RETPOLINE
 -  9610088  (9,610,088) pps - mitigation via Hellwig dma-direct-calls

From the inst per cycle, it is clear that retpolines are stalling the CPU
pipeline:

| pps        | insn per cycle |
|------------+----------------|
| 11,913,154 |           2.39 |
| 7,438,283  |           1.54 |
| 9,610,088  |           2.04 |


Strangely the Instruction-Cache is also under heavier pressure:

| pps        | l2_rqsts.all_code_rd | l2_rqsts.code_rd_hit | l2_rqsts.code_rd_miss |
|------------+----------------------+----------------------+-----------------------|
| 11,913,154 | 874,547              | 742,335              | 132,198               |
| 7,438,283  | 649,513              | 547,581              | 101,945               |
| 9,610,088  | 2,568,064            | 2,001,369            | 566,683               |
|            |                      |                      |                       |


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] avoid indirect calls for DMA direct mappings
  2018-12-07 15:44             ` Jesper Dangaard Brouer
@ 2018-12-07 16:05               ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2018-12-07 16:05 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Robin Murphy, Linus Torvalds, iommu, tariqt, ilias.apalodimas,
	toke, Linux List Kernel Mailing, brouer

On Fri, 7 Dec 2018 16:44:35 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Fri, 7 Dec 2018 02:21:42 +0100
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Thu, Dec 06, 2018 at 08:24:38PM +0000, Robin Murphy wrote:  
> > > On 06/12/2018 20:00, Christoph Hellwig wrote:    
> > >> On Thu, Dec 06, 2018 at 06:54:17PM +0000, Robin Murphy wrote:    
> > >>> I'm pretty sure we used to assign dummy_dma_ops explicitly to devices at
> > >>> the point we detected the ACPI properties are wrong - that shouldn't be too
> > >>> much of a headache to go back to.    
> > >>
> > >> Ok.  I've cooked up a patch to use NULL as the go direct marker.
> > >> This cleans up a few things nicely, but also means we now need to
> > >> do the bypass scheme for all ops, not just the fast path.  But we
> > >> probably should just move the slow path ops out of line anyway,
> > >> so I'm not worried about it.  This has survived some very basic
> > >> testing on x86, and really needs to be cleaned up and split into
> > >> multiple patches..    
> > >
> > > I've also just finished hacking something up to keep the arm64 status quo - 
> > > I'll need to actually test it tomorrow, but the overall diff looks like the 
> > > below.    
> > 
> > Nice.  I created a branch that picked up your bits and also the ideas
> > from Linus, and the result looks really nice.  I'll still need a signoff
> > for your bits, though.
> > 
> > Jesper, can you give this a spin if it changes the number even further?
> > 
> >   git://git.infradead.org/users/hch/misc.git dma-direct-calls.2
> > 
> >   http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-direct-calls.2  
> 
> I'll test it soon...
> 
> I looked at my perf stat recording on my existing tests[1] and there
> seems to be significantly more I-cache usage.

The I-cache pressure seems to be lower with the new branch, and
performance improved by 4.5 nanosec.

 
> Copy-paste from my summary[1]:
> [1] https://github.com/xdp-project/xdp-project/blob/master/areas/dma/dma01_test_hellwig_direct_dma.org#summary-of-results

Updated:

* Summary of results

Using XDP_REDIRECT between drivers RX ixgbe(10G) redirect TX i40e(40G),
via BPF devmap (used samples/bpf/xdp_redirect_map). (Note: choose a
higher TX link-speed to ensure that we don't have a TX bottleneck).
The baseline-kernel is at commit [[https://git.kernel.org/torvalds/c/ef78e5ec9214][ef78e5ec9214]], which is the commit just
before Hellwig's changes in this tree.

Performance numbers in packets/sec (XDP_REDIRECT ixgbe -> i40e):
 - 11913154 (11,913,154) pps - baseline compiled without retpoline
 -  7438283  (7,438,283) pps - regression due to CONFIG_RETPOLINE
 -  9610088  (9,610,088) pps - mitigation via Hellwig dma-direct-calls
 - 10049223 (10,049,223) pps - Hellwig branch dma-direct-calls.2

Do notice that at these extreme speeds the pps number increases rapidly
with small changes, e.g. the difference to the new branch is:
 - (1/9610088-1/10049223)*10^9 = 4.54 nanosec faster

From the inst per cycle, it is clear that retpolines are stalling the CPU
pipeline:

| pps        | insn per cycle |
|------------+----------------|
| 11,913,154 |           2.39 |
| 7,438,283  |           1.54 |
| 9,610,088  |           2.04 |
| 10,049,223 |           1.99 |
|            |                |


Strangely the Instruction-Cache is also under heavier pressure:

| pps        | l2_rqsts.all_code_rd | l2_rqsts.code_rd_hit | l2_rqsts.code_rd_miss |
|------------+----------------------+----------------------+-----------------------|
| 11,913,154 | 874,547              | 742,335              | 132,198               |
| 7,438,283  | 649,513              | 547,581              | 101,945               |
| 9,610,088  | 2,568,064            | 2,001,369            | 566,683               |
| 10,049,223 | 1,232,818            | 1,152,514            | 80,299                |
|            |                      |                      |                       |

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, back to index

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-06 15:37 [RFC] avoid indirect calls for DMA direct mappings Christoph Hellwig
2018-12-06 15:37 ` [PATCH] dma-mapping: bypass indirect calls for dma-direct Christoph Hellwig
2018-12-06 17:40   ` Jesper Dangaard Brouer
2018-12-06 18:35     ` Christoph Hellwig
2018-12-06 17:43 ` [RFC] avoid indirect calls for DMA direct mappings Jesper Dangaard Brouer
2018-12-06 18:29   ` Nadav Amit
2018-12-06 18:45     ` Christoph Hellwig
2018-12-06 18:28 ` Linus Torvalds
2018-12-06 18:30   ` Linus Torvalds
2018-12-06 18:43   ` Christoph Hellwig
2018-12-06 18:51     ` Linus Torvalds
2018-12-06 18:54     ` Robin Murphy
2018-12-06 20:00       ` Christoph Hellwig
2018-12-06 20:24         ` Robin Murphy
2018-12-07  1:21           ` Christoph Hellwig
2018-12-07 15:44             ` Jesper Dangaard Brouer
2018-12-07 16:05               ` Jesper Dangaard Brouer

LKML Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/lkml/0 lkml/git/0.git
	git clone --mirror https://lore.kernel.org/lkml/1 lkml/git/1.git
	git clone --mirror https://lore.kernel.org/lkml/2 lkml/git/2.git
	git clone --mirror https://lore.kernel.org/lkml/3 lkml/git/3.git
	git clone --mirror https://lore.kernel.org/lkml/4 lkml/git/4.git
	git clone --mirror https://lore.kernel.org/lkml/5 lkml/git/5.git
	git clone --mirror https://lore.kernel.org/lkml/6 lkml/git/6.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 lkml lkml/ https://lore.kernel.org/lkml \
		linux-kernel@vger.kernel.org linux-kernel@archiver.kernel.org
	public-inbox-index lkml


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-kernel


AGPL code for this site: git clone https://public-inbox.org/ public-inbox