iommu.lists.linux-foundation.org archive mirror
* [PATCH net] xsk: remove cheap_dma optimization
@ 2020-06-26 13:43 Björn Töpel
  2020-06-26 20:44 ` Jonathan Lemon
  2020-06-26 23:00 ` Daniel Borkmann
  0 siblings, 2 replies; 17+ messages in thread
From: Björn Töpel @ 2020-06-26 13:43 UTC (permalink / raw)
  To: netdev
  Cc: maximmi, konrad.wilk, linux-kernel, davem, iommu, jonathan.lemon,
	bpf, Björn Töpel, hch, magnus.karlsson

From: Björn Töpel <bjorn.topel@intel.com>

When the AF_XDP buffer allocation API was introduced it had an
optimization, "cheap_dma". The idea was that when the umem was DMA
mapped, the pool also checked whether the mapping required a
synchronization (CPU to device, and vice versa). If not, it would be
marked as "cheap_dma" and the synchronization would be elided.

In [1] Christoph points out that the optimization above breaks the DMA
API abstraction, and should be removed. Further, Christoph points out
that optimizations like this should be done within the DMA mapping
core, and not elsewhere.

Unfortunately this has implications for the packet rate
performance. The AF_XDP rxdrop scenario shows a 9% decrease in packets
per second.

[1] https://lore.kernel.org/netdev/20200626074725.GA21790@lst.de/

Cc: Christoph Hellwig <hch@lst.de>
Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/net/xsk_buff_pool.h | 16 +++------
 net/xdp/xsk_buff_pool.c     | 69 ++-----------------------------------
 2 files changed, 6 insertions(+), 79 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index a4ff226505c9..3ea9b9654632 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -40,7 +40,6 @@ struct xsk_buff_pool {
 	u32 headroom;
 	u32 chunk_size;
 	u32 frame_len;
-	bool cheap_dma;
 	bool unaligned;
 	void *addrs;
 	struct device *dev;
@@ -77,24 +76,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
 	return xskb->frame_dma;
 }
 
-void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
 static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
 {
-	if (xskb->pool->cheap_dma)
-		return;
-
-	xp_dma_sync_for_cpu_slow(xskb);
+	dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
+				      xskb->pool->frame_len, DMA_BIDIRECTIONAL);
 }
 
-void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
-				 size_t size);
 static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
 					  dma_addr_t dma, size_t size)
 {
-	if (pool->cheap_dma)
-		return;
-
-	xp_dma_sync_for_device_slow(pool, dma, size);
+	dma_sync_single_range_for_device(pool->dev, dma, 0,
+					 size, DMA_BIDIRECTIONAL);
 }
 
 /* Masks for xdp_umem_page flags.
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 540ed75e4482..c330e5f3aadf 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -2,9 +2,6 @@
 
 #include <net/xsk_buff_pool.h>
 #include <net/xdp_sock.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/swiotlb.h>
 
 #include "xsk_queue.h"
 
@@ -55,7 +52,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
 	pool->free_heads_cnt = chunks;
 	pool->headroom = headroom;
 	pool->chunk_size = chunk_size;
-	pool->cheap_dma = true;
 	pool->unaligned = unaligned;
 	pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
 	INIT_LIST_HEAD(&pool->free_list);
@@ -125,48 +121,6 @@ static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
 	}
 }
 
-static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
-{
-#if defined(CONFIG_SWIOTLB)
-	phys_addr_t paddr;
-	u32 i;
-
-	for (i = 0; i < pool->dma_pages_cnt; i++) {
-		paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
-		if (is_swiotlb_buffer(paddr))
-			return false;
-	}
-#endif
-	return true;
-}
-
-static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
-{
-#if defined(CONFIG_HAS_DMA)
-	const struct dma_map_ops *ops = get_dma_ops(pool->dev);
-
-	if (ops) {
-		return !ops->sync_single_for_cpu &&
-			!ops->sync_single_for_device;
-	}
-
-	if (!dma_is_direct(ops))
-		return false;
-
-	if (!xp_check_swiotlb_dma(pool))
-		return false;
-
-	if (!dev_is_dma_coherent(pool->dev)) {
-#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) ||		\
-	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) ||	\
-	defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
-		return false;
-#endif
-	}
-#endif
-	return true;
-}
-
 int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 	       unsigned long attrs, struct page **pages, u32 nr_pages)
 {
@@ -195,7 +149,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 		xp_check_dma_contiguity(pool);
 
 	pool->dev = dev;
-	pool->cheap_dma = xp_check_cheap_dma(pool);
 	return 0;
 }
 EXPORT_SYMBOL(xp_dma_map);
@@ -280,11 +233,8 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
 	xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
 	xskb->xdp.data_meta = xskb->xdp.data;
 
-	if (!pool->cheap_dma) {
-		dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
-						 pool->frame_len,
-						 DMA_BIDIRECTIONAL);
-	}
+	dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+					 pool->frame_len, DMA_BIDIRECTIONAL);
 	return &xskb->xdp;
 }
 EXPORT_SYMBOL(xp_alloc);
@@ -319,18 +269,3 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
 		(addr & ~PAGE_MASK);
 }
 EXPORT_SYMBOL(xp_raw_get_dma);
-
-void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
-{
-	dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
-				      xskb->pool->frame_len, DMA_BIDIRECTIONAL);
-}
-EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
-
-void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
-				 size_t size)
-{
-	dma_sync_single_range_for_device(pool->dev, dma, 0,
-					 size, DMA_BIDIRECTIONAL);
-}
-EXPORT_SYMBOL(xp_dma_sync_for_device_slow);

base-commit: 4a21185cda0fbb860580eeeb4f1a70a9cda332a4
-- 
2.25.1


* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-26 13:43 [PATCH net] xsk: remove cheap_dma optimization Björn Töpel
@ 2020-06-26 20:44 ` Jonathan Lemon
  2020-06-26 23:00 ` Daniel Borkmann
  1 sibling, 0 replies; 17+ messages in thread
From: Jonathan Lemon @ 2020-06-26 20:44 UTC (permalink / raw)
  To: Björn Töpel
  Cc: maximmi, konrad.wilk, netdev, linux-kernel, davem, iommu, bpf,
	Björn Töpel, hch, magnus.karlsson

On Fri, Jun 26, 2020 at 03:43:58PM +0200, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> When the AF_XDP buffer allocation API was introduced it had an
> optimization, "cheap_dma". The idea was that when the umem was DMA
> mapped, the pool also checked whether the mapping required a
> synchronization (CPU to device, and vice versa). If not, it would be
> marked as "cheap_dma" and the synchronization would be elided.
> 
> In [1] Christoph points out that the optimization above breaks the DMA
> API abstraction, and should be removed. Further, Christoph points out
> that optimizations like this should be done within the DMA mapping
> core, and not elsewhere.
> 
> Unfortunately this has implications for the packet rate
> performance. The AF_XDP rxdrop scenario shows a 9% decrease in packets
> per second.
> 
> [1] https://lore.kernel.org/netdev/20200626074725.GA21790@lst.de/
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-26 13:43 [PATCH net] xsk: remove cheap_dma optimization Björn Töpel
  2020-06-26 20:44 ` Jonathan Lemon
@ 2020-06-26 23:00 ` Daniel Borkmann
  2020-06-27  7:04   ` Christoph Hellwig
  1 sibling, 1 reply; 17+ messages in thread
From: Daniel Borkmann @ 2020-06-26 23:00 UTC (permalink / raw)
  To: Björn Töpel, netdev
  Cc: maximmi, konrad.wilk, linux-kernel, davem, iommu, jonathan.lemon,
	bpf, Björn Töpel, hch, magnus.karlsson

On 6/26/20 3:43 PM, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> When the AF_XDP buffer allocation API was introduced it had an
> optimization, "cheap_dma". The idea was that when the umem was DMA
> mapped, the pool also checked whether the mapping required a
> synchronization (CPU to device, and vice versa). If not, it would be
> marked as "cheap_dma" and the synchronization would be elided.
> 
> In [1] Christoph points out that the optimization above breaks the DMA
> API abstraction, and should be removed. Further, Christoph points out
> that optimizations like this should be done within the DMA mapping
> core, and not elsewhere.
> 
> Unfortunately this has implications for the packet rate
> performance. The AF_XDP rxdrop scenario shows a 9% decrease in packets
> per second.
> 
> [1] https://lore.kernel.org/netdev/20200626074725.GA21790@lst.de/
> 
> Cc: Christoph Hellwig <hch@lst.de>
> Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Given there is roughly a ~5 weeks window at max where this removal could
still be applied in the worst case, could we come up with a fix / proposal
first that moves this into the DMA mapping core? If there is something that
can be agreed upon by all parties, then we could avoid re-adding the 9%
slowdown. :/

Thanks,
Daniel

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-26 23:00 ` Daniel Borkmann
@ 2020-06-27  7:04   ` Christoph Hellwig
  2020-06-28 17:16     ` Björn Töpel
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2020-06-27  7:04 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, hch, iommu,
	netdev, bpf, Björn Töpel, davem, magnus.karlsson

On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
> Given there is roughly a ~5 weeks window at max where this removal could
> still be applied in the worst case, could we come up with a fix / proposal
> first that moves this into the DMA mapping core? If there is something that
> can be agreed upon by all parties, then we could avoid re-adding the 9%
> slowdown. :/

I'd rather turn it upside down - this abuse of the internals blocks work
that has basically just missed the previous window and I'm not going
to wait weeks to sort out the API misuse.  But we can add optimizations
back later if we find a sane way.

That being said I really can't see how this would make so much of a
difference.  What architecture and what dma_ops are you using for
those measurements?  What is the workload?

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-27  7:04   ` Christoph Hellwig
@ 2020-06-28 17:16     ` Björn Töpel
  2020-06-29 13:52       ` Daniel Borkmann
  2020-06-29 15:41       ` Robin Murphy
  0 siblings, 2 replies; 17+ messages in thread
From: Björn Töpel @ 2020-06-28 17:16 UTC (permalink / raw)
  To: Christoph Hellwig, Daniel Borkmann
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, davem, magnus.karlsson


On 2020-06-27 09:04, Christoph Hellwig wrote:
> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
>> Given there is roughly a ~5 weeks window at max where this removal could
>> still be applied in the worst case, could we come up with a fix / proposal
>> first that moves this into the DMA mapping core? If there is something that
>> can be agreed upon by all parties, then we could avoid re-adding the 9%
>> slowdown. :/
> 
> I'd rather turn it upside down - this abuse of the internals blocks work
> that has basically just missed the previous window and I'm not going
> to wait weeks to sort out the API misuse.  But we can add optimizations
> back later if we find a sane way.
>

I'm not super excited about the performance loss, but I do get
Christoph's frustration about gutting the DMA API making it harder for
DMA people to get work done. Let's try to solve this properly using
proper DMA APIs.


> That being said I really can't see how this would make so much of a
> difference.  What architecture and what dma_ops are you using for
> those measurements?  What is the workload?
> 

The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but 
faster.) benchmark: receive the packet from the NIC, and drop it. The 
DMA syncs stand out in the perf top:

   28.63%  [kernel]                   [k] i40e_clean_rx_irq_zc
   17.12%  [kernel]                   [k] xp_alloc
    8.80%  [kernel]                   [k] __xsk_rcv_zc
    7.69%  [kernel]                   [k] xdp_do_redirect
    5.35%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629
    4.77%  [kernel]                   [k] xsk_rcv.part.0
    4.07%  [kernel]                   [k] __xsk_map_redirect
    3.80%  [kernel]                   [k] dma_direct_sync_single_for_cpu
    3.03%  [kernel]                   [k] dma_direct_sync_single_for_device
    2.76%  [kernel]                   [k] i40e_alloc_rx_buffers_zc
    1.83%  [kernel]                   [k] xsk_flush
...

For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
the main issue is that SWIOTLB is now unconditionally enabled [1] for
x86, so for each sync we have to check is_swiotlb_buffer(), which
involves some costly indirection.

That was pretty much what my hack avoided. Instead we did all the checks
upfront, since AF_XDP has long-term DMA mappings, and just set a flag
for that.

Avoiding the whole "is this address swiotlb" check in
dma_direct_sync_single_for_{cpu,device}() per packet
would help a lot.
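
For reference, the per-packet cost boils down to something like the
following. This is only a simplified, illustrative sketch (not the
actual kernel/dma/direct.c code); the helper names mirror the ones
visible in the removed xp_check_cheap_dma() above and in the profile:

static void sync_single_for_cpu_direct(struct device *dev, dma_addr_t addr,
				       size_t size, enum dma_data_direction dir)
{
	phys_addr_t paddr = dma_to_phys(dev, addr);

	/* Non-coherent platforms need the architectural cache maintenance. */
	if (!dev_is_dma_coherent(dev))
		arch_sync_dma_for_cpu(paddr, size, dir);

	/* This is the per-packet check discussed above: with SWIOTLB always
	 * built in on x86, every sync pays for the "was this buffer bounced?"
	 * lookup, even for long-term mappings that can never be bounced. */
	if (unlikely(is_swiotlb_buffer(paddr)))
		swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU);
}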

Somewhat related to the DMA API: it would have performance benefits for
AF_XDP if the DMA range of the mapped memory were linear, i.e. made
contiguous via the IOMMU. I've started hacking on something, but it
would be nice if such an API were part of the mapping core.

Input: array of pages
Output: array of dma addrs (and obviously dev, flags and such)

For non-IOMMU len(array of pages) == len(array of dma addrs)
For best-case IOMMU len(array of dma addrs) == 1 (large linear space)

But that's for later. :-)
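
To make the idea concrete, a hypothetical signature could look roughly
like this (purely illustrative, nothing like it exists today):

/*
 * Hypothetical: map nr_pages pages for long-term DMA in one call.
 * Fills dma_addrs[] and returns the number of distinct DMA ranges
 * produced (1 in the best IOMMU case, nr_pages without an IOMMU),
 * or a negative errno on failure.
 */
int dma_map_page_array(struct device *dev, struct page **pages,
		       u32 nr_pages, enum dma_data_direction dir,
		       unsigned long attrs, dma_addr_t *dma_addrs);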


Björn


[1] commit: 09230cbc1bab ("swiotlb: move the SWIOTLB config symbol to 
lib/Kconfig")


* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-28 17:16     ` Björn Töpel
@ 2020-06-29 13:52       ` Daniel Borkmann
  2020-06-29 15:10         ` Björn Töpel
  2020-06-29 15:41       ` Robin Murphy
  1 sibling, 1 reply; 17+ messages in thread
From: Daniel Borkmann @ 2020-06-29 13:52 UTC (permalink / raw)
  To: Björn Töpel, Christoph Hellwig
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, davem, magnus.karlsson

On 6/28/20 7:16 PM, Björn Töpel wrote:
> On 2020-06-27 09:04, Christoph Hellwig wrote:
>> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
>>> Given there is roughly a ~5 weeks window at max where this removal could
>>> still be applied in the worst case, could we come up with a fix / proposal
>>> first that moves this into the DMA mapping core? If there is something that
>>> can be agreed upon by all parties, then we could avoid re-adding the 9%
>>> slowdown. :/
>>
>> I'd rather turn it upside down - this abuse of the internals blocks work
>> that has basically just missed the previous window and I'm not going
>> to wait weeks to sort out the API misuse.  But we can add optimizations
>> back later if we find a sane way.
> 
> I'm not super excited about the performance loss, but I do get
> Christoph's frustration about gutting the DMA API making it harder for
> DMA people to get work done. Lets try to solve this properly using
> proper DMA APIs.

Ok, fair enough, please work with DMA folks to get this properly integrated and
restored then. Applied, thanks!

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 13:52       ` Daniel Borkmann
@ 2020-06-29 15:10         ` Björn Töpel
  2020-06-29 15:18           ` Daniel Borkmann
  0 siblings, 1 reply; 17+ messages in thread
From: Björn Töpel @ 2020-06-29 15:10 UTC (permalink / raw)
  To: Daniel Borkmann, Christoph Hellwig
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, davem, magnus.karlsson

On 2020-06-29 15:52, Daniel Borkmann wrote:
> 
> Ok, fair enough, please work with DMA folks to get this properly 
> integrated and
> restored then. Applied, thanks!

Daniel, you were too quick! Please revert this one; Christoph just
submitted a 4-patch series that addresses both the DMA API concerns and
the perf regression!


Björn

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 15:10         ` Björn Töpel
@ 2020-06-29 15:18           ` Daniel Borkmann
  2020-06-29 16:23             ` Björn Töpel
  2020-06-30  5:07             ` Christoph Hellwig
  0 siblings, 2 replies; 17+ messages in thread
From: Daniel Borkmann @ 2020-06-29 15:18 UTC (permalink / raw)
  To: Björn Töpel, Christoph Hellwig
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, davem, magnus.karlsson

On 6/29/20 5:10 PM, Björn Töpel wrote:
> On 2020-06-29 15:52, Daniel Borkmann wrote:
>>
>> Ok, fair enough, please work with DMA folks to get this properly integrated and
>> restored then. Applied, thanks!
> 
> Daniel, you were too quick! Please revert this one; Christoph just submitted a 4-patch-series that addresses both the DMA API, and the perf regression!

Nice, tossed from the bpf tree then! (Looks like it didn't land on the bpf list
yet, but it seems other mails are currently stuck on vger as well. I presume it
will be routed to Linus via Christoph?)

Thanks,
Daniel

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-28 17:16     ` Björn Töpel
  2020-06-29 13:52       ` Daniel Borkmann
@ 2020-06-29 15:41       ` Robin Murphy
  2020-07-01 10:17         ` Björn Töpel
  2020-07-08  6:50         ` Christoph Hellwig
  1 sibling, 2 replies; 17+ messages in thread
From: Robin Murphy @ 2020-06-29 15:41 UTC (permalink / raw)
  To: Björn Töpel, Christoph Hellwig, Daniel Borkmann
  Cc: maximmi, konrad.wilk, netdev, linux-kernel, iommu,
	jonathan.lemon, bpf, davem, magnus.karlsson

On 2020-06-28 18:16, Björn Töpel wrote:
> 
> On 2020-06-27 09:04, Christoph Hellwig wrote:
>> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
>>> Given there is roughly a ~5 weeks window at max where this removal could
>>> still be applied in the worst case, could we come up with a fix / 
>>> proposal
>>> first that moves this into the DMA mapping core? If there is 
>>> something that
>>> can be agreed upon by all parties, then we could avoid re-adding the 9%
>>> slowdown. :/
>>
>> I'd rather turn it upside down - this abuse of the internals blocks work
>> that has basically just missed the previous window and I'm not going
>> to wait weeks to sort out the API misuse.  But we can add optimizations
>> back later if we find a sane way.
>>
> 
> I'm not super excited about the performance loss, but I do get
> Christoph's frustration about gutting the DMA API making it harder for
> DMA people to get work done. Lets try to solve this properly using
> proper DMA APIs.
> 
> 
>> That being said I really can't see how this would make so much of a
>> difference.  What architecture and what dma_ops are you using for
>> those measurements?  What is the workload?
>>
> 
> The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but 
> faster.) benchmark: receive the packet from the NIC, and drop it. The 
> DMA syncs stand out in the perf top:
> 
>    28.63%  [kernel]                   [k] i40e_clean_rx_irq_zc
>    17.12%  [kernel]                   [k] xp_alloc
>     8.80%  [kernel]                   [k] __xsk_rcv_zc
>     7.69%  [kernel]                   [k] xdp_do_redirect
>     5.35%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629
>     4.77%  [kernel]                   [k] xsk_rcv.part.0
>     4.07%  [kernel]                   [k] __xsk_map_redirect
>     3.80%  [kernel]                   [k] dma_direct_sync_single_for_cpu
>     3.03%  [kernel]                   [k] dma_direct_sync_single_for_device
>     2.76%  [kernel]                   [k] i40e_alloc_rx_buffers_zc
>     1.83%  [kernel]                   [k] xsk_flush
> ...
> 
> For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
> the main issue is that SWIOTLB is now unconditionally enabled [1] for
> x86, and for each sync we have to check that if is_swiotlb_buffer()
> which involves a some costly indirection.
> 
> That was pretty much what my hack avoided. Instead we did all the checks
> upfront, since AF_XDP has long-term DMA mappings, and just set a flag
> for that.
> 
> Avoiding the whole "is this address swiotlb" in
> dma_direct_sync_single_for_{cpu, device]() per-packet
> would help a lot.

I'm pretty sure that's one of the things we hope to achieve with the 
generic bypass flag :)

> Somewhat related to the DMA API; It would have performance benefits for
> AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU
> utilization. I've started hacking a thing a little bit, but it would be
> nice if such API was part of the mapping core.
> 
> Input: array of pages Output: array of dma addrs (and obviously dev,
> flags and such)
> 
> For non-IOMMU len(array of pages) == len(array of dma addrs)
> For best-case IOMMU len(array of dma addrs) == 1 (large linear space)
> 
> But that's for later. :-)

FWIW you will typically get that behaviour from IOMMU-based 
implementations of dma_map_sg() right now, although it's not strictly 
guaranteed. If you can weather some additional setup cost of calling 
sg_alloc_table_from_pages() plus walking the list after mapping to test 
whether you did get a contiguous result, you could start taking 
advantage of it as some of the dma-buf code in DRM and v4l2 does already 
(although those cases actually treat it as a strict dependency rather 
than an optimisation).
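
A minimal sketch of that approach, assuming a long-lived umem made up
of nr_pages full pages as in xp_dma_map() (the function name and the
error handling are made up for illustration):

/* Map all pages and report whether the result is one linear DMA range.
 * Returns the start address of that range, or DMA_MAPPING_ERROR if the
 * mapping failed or came back non-contiguous. */
static dma_addr_t map_umem_contig(struct device *dev, struct page **pages,
				  u32 nr_pages, struct sg_table *st)
{
	struct scatterlist *sg;
	dma_addr_t expected;
	int i, nents;

	if (sg_alloc_table_from_pages(st, pages, nr_pages, 0,
				      (size_t)nr_pages << PAGE_SHIFT,
				      GFP_KERNEL))
		return DMA_MAPPING_ERROR;

	nents = dma_map_sg(dev, st->sgl, st->orig_nents, DMA_BIDIRECTIONAL);
	if (!nents)
		goto err_free;

	/* If every segment starts where the previous one ended, the
	 * whole umem is a single linear DMA range. */
	expected = sg_dma_address(st->sgl);
	for_each_sg(st->sgl, sg, nents, i) {
		if (sg_dma_address(sg) != expected)
			goto err_unmap;
		expected += sg_dma_len(sg);
	}
	return sg_dma_address(st->sgl);

err_unmap:
	dma_unmap_sg(dev, st->sgl, st->orig_nents, DMA_BIDIRECTIONAL);
err_free:
	sg_free_table(st);
	return DMA_MAPPING_ERROR;
}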

I'm inclined to agree that if we're going to see more of these cases, a 
new API call that did formally guarantee a DMA-contiguous mapping 
(either via IOMMU or bounce buffering) or failure might indeed be handy.

Robin.

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 15:18           ` Daniel Borkmann
@ 2020-06-29 16:23             ` Björn Töpel
  2020-06-30  5:07             ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Björn Töpel @ 2020-06-29 16:23 UTC (permalink / raw)
  To: Daniel Borkmann, Christoph Hellwig
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, davem, magnus.karlsson


On 2020-06-29 17:18, Daniel Borkmann wrote:
> Nice, tossed from bpf tree then! (Looks like it didn't land on the bpf 
> list yet,
> but seems other mails are currently stuck as well on vger. I presume it 
> will be
> routed to Linus via Christoph?)

Thanks!

Christoph (according to the other mail) was OK with taking the series
via your bpf tree, Dave's net tree, or his own dma-mapping tree.


Björn

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 15:18           ` Daniel Borkmann
  2020-06-29 16:23             ` Björn Töpel
@ 2020-06-30  5:07             ` Christoph Hellwig
  2020-06-30 13:47               ` Daniel Borkmann
  1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2020-06-30  5:07 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel,
	Christoph Hellwig, iommu, netdev, bpf, Björn Töpel,
	davem, magnus.karlsson

On Mon, Jun 29, 2020 at 05:18:38PM +0200, Daniel Borkmann wrote:
> On 6/29/20 5:10 PM, Björn Töpel wrote:
>> On 2020-06-29 15:52, Daniel Borkmann wrote:
>>>
>>> Ok, fair enough, please work with DMA folks to get this properly integrated and
>>> restored then. Applied, thanks!
>>
>> Daniel, you were too quick! Please revert this one; Christoph just submitted a 4-patch-series that addresses both the DMA API, and the perf regression!
>
> Nice, tossed from bpf tree then! (Looks like it didn't land on the bpf list yet,
> but seems other mails are currently stuck as well on vger. I presume it will be
> routed to Linus via Christoph?)

I sent the patches to the bpf list; did you get them now that vger
is unclogged?  Thinking about it, the best route might be through
bpf/net, so if that works for you please pick it up.

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-30  5:07             ` Christoph Hellwig
@ 2020-06-30 13:47               ` Daniel Borkmann
  0 siblings, 0 replies; 17+ messages in thread
From: Daniel Borkmann @ 2020-06-30 13:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: maximmi, konrad.wilk, jonathan.lemon, linux-kernel, iommu,
	netdev, bpf, Björn Töpel, davem, magnus.karlsson

On 6/30/20 7:07 AM, Christoph Hellwig wrote:
> On Mon, Jun 29, 2020 at 05:18:38PM +0200, Daniel Borkmann wrote:
>> On 6/29/20 5:10 PM, Björn Töpel wrote:
>>> On 2020-06-29 15:52, Daniel Borkmann wrote:
>>>>
>>>> Ok, fair enough, please work with DMA folks to get this properly integrated and
>>>> restored then. Applied, thanks!
>>>
>>> Daniel, you were too quick! Please revert this one; Christoph just submitted a 4-patch-series that addresses both the DMA API, and the perf regression!
>>
>> Nice, tossed from bpf tree then! (Looks like it didn't land on the bpf list yet,
>> but seems other mails are currently stuck as well on vger. I presume it will be
>> routed to Linus via Christoph?)
> 
> I send the patches to the bpf list, did you get them now that vger
> is unclogged?  Thinking about it the best route might be through
> bpf/net, so if that works for you please pick it up.

Yeah, that's fine, I just applied your series to the bpf tree. Thanks!

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 15:41       ` Robin Murphy
@ 2020-07-01 10:17         ` Björn Töpel
  2020-07-08  6:50         ` Christoph Hellwig
  1 sibling, 0 replies; 17+ messages in thread
From: Björn Töpel @ 2020-07-01 10:17 UTC (permalink / raw)
  To: Robin Murphy, Christoph Hellwig, Daniel Borkmann
  Cc: maximmi, konrad.wilk, netdev, linux-kernel, iommu,
	jonathan.lemon, bpf, davem, magnus.karlsson

On 2020-06-29 17:41, Robin Murphy wrote:
> On 2020-06-28 18:16, Björn Töpel wrote:
[...]>
>> Somewhat related to the DMA API; It would have performance benefits for
>> AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU
>> utilization. I've started hacking a thing a little bit, but it would be
>> nice if such API was part of the mapping core.
>>
>> Input: array of pages Output: array of dma addrs (and obviously dev,
>> flags and such)
>>
>> For non-IOMMU len(array of pages) == len(array of dma addrs)
>> For best-case IOMMU len(array of dma addrs) == 1 (large linear space)
>>
>> But that's for later. :-)
> 
> FWIW you will typically get that behaviour from IOMMU-based 
> implementations of dma_map_sg() right now, although it's not strictly 
> guaranteed. If you can weather some additional setup cost of calling 
> sg_alloc_table_from_pages() plus walking the list after mapping to test 
> whether you did get a contiguous result, you could start taking 
> advantage of it as some of the dma-buf code in DRM and v4l2 does already 
> (although those cases actually treat it as a strict dependency rather 
> than an optimisation).
> 
> I'm inclined to agree that if we're going to see more of these cases, a 
> new API call that did formally guarantee a DMA-contiguous mapping 
> (either via IOMMU or bounce buffering) or failure might indeed be handy.
>

I forgot to reply to this one! My current hack is using the iommu code 
directly, similar to what vfio-pci does (hopefully not gutting the API 
this time ;-)).

Your approach sounds much nicer, and easier. I'll try that out! Thanks a
lot for the pointers, and I might be back with more questions.


Cheers,
Björn

> Robin.

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-06-29 15:41       ` Robin Murphy
  2020-07-01 10:17         ` Björn Töpel
@ 2020-07-08  6:50         ` Christoph Hellwig
  2020-07-08  7:57           ` Song Bao Hua (Barry Song)
  2020-07-08 13:18           ` Robin Murphy
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2020-07-08  6:50 UTC (permalink / raw)
  To: Robin Murphy
  Cc: maximmi, Daniel Borkmann, konrad.wilk, netdev, linux-kernel,
	davem, iommu, jonathan.lemon, bpf, Björn Töpel,
	Christoph Hellwig, magnus.karlsson

On Mon, Jun 29, 2020 at 04:41:16PM +0100, Robin Murphy wrote:
> On 2020-06-28 18:16, Björn Töpel wrote:
>>
>> On 2020-06-27 09:04, Christoph Hellwig wrote:
>>> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
>>>> Given there is roughly a ~5 weeks window at max where this removal could
>>>> still be applied in the worst case, could we come up with a fix / 
>>>> proposal
>>>> first that moves this into the DMA mapping core? If there is something 
>>>> that
>>>> can be agreed upon by all parties, then we could avoid re-adding the 9%
>>>> slowdown. :/
>>>
>>> I'd rather turn it upside down - this abuse of the internals blocks work
>>> that has basically just missed the previous window and I'm not going
>>> to wait weeks to sort out the API misuse.  But we can add optimizations
>>> back later if we find a sane way.
>>>
>>
>> I'm not super excited about the performance loss, but I do get
>> Christoph's frustration about gutting the DMA API making it harder for
>> DMA people to get work done. Lets try to solve this properly using
>> proper DMA APIs.
>>
>>
>>> That being said I really can't see how this would make so much of a
>>> difference.  What architecture and what dma_ops are you using for
>>> those measurements?  What is the workload?
>>>
>>
>> The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but 
>> faster.) benchmark: receive the packet from the NIC, and drop it. The DMA 
>> syncs stand out in the perf top:
>>
>>    28.63%  [kernel]                   [k] i40e_clean_rx_irq_zc
>>    17.12%  [kernel]                   [k] xp_alloc
>>     8.80%  [kernel]                   [k] __xsk_rcv_zc
>>     7.69%  [kernel]                   [k] xdp_do_redirect
>>     5.35%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629
>>     4.77%  [kernel]                   [k] xsk_rcv.part.0
>>     4.07%  [kernel]                   [k] __xsk_map_redirect
>>     3.80%  [kernel]                   [k] dma_direct_sync_single_for_cpu
>>     3.03%  [kernel]                   [k] dma_direct_sync_single_for_device
>>     2.76%  [kernel]                   [k] i40e_alloc_rx_buffers_zc
>>     1.83%  [kernel]                   [k] xsk_flush
>> ...
>>
>> For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
>> the main issue is that SWIOTLB is now unconditionally enabled [1] for
>> x86, and for each sync we have to check that if is_swiotlb_buffer()
>> which involves a some costly indirection.
>>
>> That was pretty much what my hack avoided. Instead we did all the checks
>> upfront, since AF_XDP has long-term DMA mappings, and just set a flag
>> for that.
>>
>> Avoiding the whole "is this address swiotlb" in
>> dma_direct_sync_single_for_{cpu, device]() per-packet
>> would help a lot.
>
> I'm pretty sure that's one of the things we hope to achieve with the 
> generic bypass flag :)
>
>> Somewhat related to the DMA API; It would have performance benefits for
>> AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU
>> utilization. I've started hacking a thing a little bit, but it would be
>> nice if such API was part of the mapping core.
>>
>> Input: array of pages Output: array of dma addrs (and obviously dev,
>> flags and such)
>>
>> For non-IOMMU len(array of pages) == len(array of dma addrs)
>> For best-case IOMMU len(array of dma addrs) == 1 (large linear space)
>>
>> But that's for later. :-)
>
> FWIW you will typically get that behaviour from IOMMU-based implementations 
> of dma_map_sg() right now, although it's not strictly guaranteed. If you 
> can weather some additional setup cost of calling 
> sg_alloc_table_from_pages() plus walking the list after mapping to test 
> whether you did get a contiguous result, you could start taking advantage 
> of it as some of the dma-buf code in DRM and v4l2 does already (although 
> those cases actually treat it as a strict dependency rather than an 
> optimisation).

Yikes.

> I'm inclined to agree that if we're going to see more of these cases, a new 
> API call that did formally guarantee a DMA-contiguous mapping (either via 
> IOMMU or bounce buffering) or failure might indeed be handy.

I was planning on adding a dma-level API to add more pages to an
IOMMU batch, but was waiting for at least the intel IOMMU driver to be
converted to the dma-iommu code (and preferably arm32 and s390 as well).

Here is my old pseudo-code sketch for what I was aiming for from the
block/nvme perspective.  I haven't even implemented it yet, so there might
be some holes in the design:


/*
 * Returns 0 if batching is possible, positive number of segments required
 * if batching is not possible, or negative values on error.
 */
int dma_map_batch_start(struct device *dev, size_t rounded_len,
	enum dma_data_direction dir, unsigned long attrs, dma_addr_t *addr);
int dma_map_batch_add(struct device *dev, dma_addr_t *addr, struct page *page,
		unsigned long offset, size_t size);
int dma_map_batch_end(struct device *dev, int ret, dma_addr_t start_addr);

int blk_dma_map_rq(struct device *dev, struct request *rq, 
		enum dma_data_direction dir, unsigned long attrs,
		dma_addr_t *start_addr, size_t *len)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	dma_addr_t next_addr;
	int ret;

	if (number_of_segments(req) == 1) {
		// plain old dma_map_page();
		return 0;
	}

	// XXX: block helper for rounded_len?
	*len = length_of_request(req);
	ret = dma_map_batch_start(dev, *len, dir, attrs, start_addr);
	if (ret)
		return ret;

	next_addr = *start_addr;
	rq_for_each_segment(bvec, rq, iter) {
		ret = dma_map_batch_add(dev, &next_addr, bvec.bv_page,
				bvec.bv_offset, bvec.bv_len);
		if (ret)
			break;
	}

	return dma_map_batch_end(dev, ret, *start_addr);
}

dma_addr_t blk_dma_map_bvec(struct device *dev, struct bio_vec *bvec,
		enum dma_data_direction dir, unsigned long attrs)
{
	return dma_map_page_attrs(dev, bvec->bv_page, bvec->bv_offset, bvec->bv_len,
			dir, attrs);
}

int queue_rq()
{
	dma_addr_t addr;
	int ret;

	ret = blk_dma_map_rq(dev, rq, dir, attrs, &addr, &len);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		if (use_sgl()) {
			nvme_pci_sgl_set_data(&cmd->dptr.sgl, addr, len);
		} else {
			set_prps();
		}
		return;
	}

	if (use_sgl()) {
		alloc_one_sgl_per_segment();

		rq_for_each_segment(bvec, rq, iter) {
			addr = blk_dma_map_bvec(dev, &bvec, dir, 0);
			set_one_sgl();
		}
	} else {
		alloc_one_prp_per_page();

		rq_for_each_segment(bvec, rq, iter) {
			ret = blk_dma_map_bvec(dev, &bvec, dir, 0);
			if (ret)
				break;
			set_prps();
		}
	}
}

* RE: [PATCH net] xsk: remove cheap_dma optimization
  2020-07-08  6:50         ` Christoph Hellwig
@ 2020-07-08  7:57           ` Song Bao Hua (Barry Song)
  2020-07-08 12:19             ` Christoph Hellwig
  2020-07-08 13:18           ` Robin Murphy
  1 sibling, 1 reply; 17+ messages in thread
From: Song Bao Hua (Barry Song) @ 2020-07-08  7:57 UTC (permalink / raw)
  To: Christoph Hellwig, Robin Murphy
  Cc: maximmi, Daniel Borkmann, konrad.wilk, netdev, linux-kernel,
	iommu, jonathan.lemon, bpf, Björn Töpel, davem,
	magnus.karlsson



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
> On Behalf Of Christoph Hellwig
> Sent: Wednesday, July 8, 2020 6:50 PM
> To: Robin Murphy <robin.murphy@arm.com>
> Cc: Björn Töpel <bjorn.topel@intel.com>; Christoph Hellwig <hch@lst.de>;
> Daniel Borkmann <daniel@iogearbox.net>; maximmi@mellanox.com;
> konrad.wilk@oracle.com; jonathan.lemon@gmail.com;
> linux-kernel@vger.kernel.org; iommu@lists.linux-foundation.org;
> netdev@vger.kernel.org; bpf@vger.kernel.org; davem@davemloft.net;
> magnus.karlsson@intel.com
> Subject: Re: [PATCH net] xsk: remove cheap_dma optimization
> 
> On Mon, Jun 29, 2020 at 04:41:16PM +0100, Robin Murphy wrote:
> > On 2020-06-28 18:16, Björn Töpel wrote:
> >>
> >> On 2020-06-27 09:04, Christoph Hellwig wrote:
> >>> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
> >>>> Given there is roughly a ~5 weeks window at max where this removal
> could
> >>>> still be applied in the worst case, could we come up with a fix /
> >>>> proposal
> >>>> first that moves this into the DMA mapping core? If there is something
> >>>> that
> >>>> can be agreed upon by all parties, then we could avoid re-adding the 9%
> >>>> slowdown. :/
> >>>
> >>> I'd rather turn it upside down - this abuse of the internals blocks work
> >>> that has basically just missed the previous window and I'm not going
> >>> to wait weeks to sort out the API misuse.  But we can add optimizations
> >>> back later if we find a sane way.
> >>>
> >>
> >> I'm not super excited about the performance loss, but I do get
> >> Christoph's frustration about gutting the DMA API making it harder for
> >> DMA people to get work done. Lets try to solve this properly using
> >> proper DMA APIs.
> >>
> >>
> >>> That being said I really can't see how this would make so much of a
> >>> difference.  What architecture and what dma_ops are you using for
> >>> those measurements?  What is the workload?
> >>>
> >>
> >> The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but
> >> faster.) benchmark: receive the packet from the NIC, and drop it. The DMA
> >> syncs stand out in the perf top:
> >>
> >>    28.63%  [kernel]                   [k] i40e_clean_rx_irq_zc
> >>    17.12%  [kernel]                   [k] xp_alloc
> >>     8.80%  [kernel]                   [k] __xsk_rcv_zc
> >>     7.69%  [kernel]                   [k] xdp_do_redirect
> >>     5.35%  bpf_prog_992d9ddc835e5629  [k]
> bpf_prog_992d9ddc835e5629
> >>     4.77%  [kernel]                   [k] xsk_rcv.part.0
> >>     4.07%  [kernel]                   [k] __xsk_map_redirect
> >>     3.80%  [kernel]                   [k]
> dma_direct_sync_single_for_cpu
> >>     3.03%  [kernel]                   [k]
> dma_direct_sync_single_for_device
> >>     2.76%  [kernel]                   [k]
> i40e_alloc_rx_buffers_zc
> >>     1.83%  [kernel]                   [k] xsk_flush
> >> ...
> >>
> >> For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
> >> the main issue is that SWIOTLB is now unconditionally enabled [1] for
> >> x86, and for each sync we have to check that if is_swiotlb_buffer()
> >> which involves a some costly indirection.
> >>
> >> That was pretty much what my hack avoided. Instead we did all the checks
> >> upfront, since AF_XDP has long-term DMA mappings, and just set a flag
> >> for that.
> >>
> >> Avoiding the whole "is this address swiotlb" in
> >> dma_direct_sync_single_for_{cpu, device]() per-packet
> >> would help a lot.
> >
> > I'm pretty sure that's one of the things we hope to achieve with the
> > generic bypass flag :)
> >
> >> Somewhat related to the DMA API; It would have performance benefits for
> >> AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU
> >> utilization. I've started hacking a thing a little bit, but it would be
> >> nice if such API was part of the mapping core.
> >>
> >> Input: array of pages Output: array of dma addrs (and obviously dev,
> >> flags and such)
> >>
> >> For non-IOMMU len(array of pages) == len(array of dma addrs)
> >> For best-case IOMMU len(array of dma addrs) == 1 (large linear space)
> >>
> >> But that's for later. :-)
> >
> > FWIW you will typically get that behaviour from IOMMU-based
> implementations
> > of dma_map_sg() right now, although it's not strictly guaranteed. If you
> > can weather some additional setup cost of calling
> > sg_alloc_table_from_pages() plus walking the list after mapping to test
> > whether you did get a contiguous result, you could start taking advantage
> > of it as some of the dma-buf code in DRM and v4l2 does already (although
> > those cases actually treat it as a strict dependency rather than an
> > optimisation).
> 
> Yikes.
> 
> > I'm inclined to agree that if we're going to see more of these cases, a new
> > API call that did formally guarantee a DMA-contiguous mapping (either via
> > IOMMU or bounce buffering) or failure might indeed be handy.
> 
> I was planning on adding a dma-level API to add more pages to an
> IOMMU batch, but was waiting for at least the intel IOMMU driver to be
> converted to the dma-iommu code (and preferably arm32 and s390 as well).
> 
> Here is my old pseudo-code sketch for what I was aiming for from the
> block/nvme perspective.  I haven't even implemented it yet, so there might
> be some holes in the design:
> 
> 
> /*
>  * Returns 0 if batching is possible, postitive number of segments required
>  * if batching is not possible, or negatie values on error.
>  */
> int dma_map_batch_start(struct device *dev, size_t rounded_len,
> 	enum dma_data_direction dir, unsigned long attrs, dma_addr_t *addr);
> int dma_map_batch_add(struct device *dev, dma_addr_t *addr, struct page
> *page,
> 		unsigned long offset, size_t size);
> int dma_map_batch_end(struct device *dev, int ret, dma_addr_t start_addr);
> 

Hello Christoph,

What is the difference between dma_map_batch_add() and adding the buffer to the sg list passed to dma_map_sg()?

> int blk_dma_map_rq(struct device *dev, struct request *rq,
> 		enum dma_data_direction dir, unsigned long attrs,
> 		dma_addr_t *start_addr, size_t *len)
> {
> 	struct req_iterator iter;
> 	struct bio_vec bvec;
> 	dma_addr_t next_addr;
> 	int ret;
> 
> 	if (number_of_segments(req) == 1) {
> 		// plain old dma_map_page();
> 		return 0;
> 	}
> 
> 	// XXX: block helper for rounded_len?
> 	*len = length_of_request(req);
> 	ret = dma_map_batch_start(dev, *len, dir, attrs, start_addr);
> 	if (ret)
> 		return ret;
> 
> 	next_addr = *start_addr;
> 	rq_for_each_segment(bvec, rq, iter) {
> 		ret = dma_map_batch_add(dev, &next_addr, bvec.bv_page,
> 				bvec.bv_offset, bvev.bv_len);
> 		if (ret)
> 			break;
> 	}
> 
> 	return dma_map_batch_end(dev, ret, *start_addr);
> }
> 
> dma_addr_t blk_dma_map_bvec(struct device *dev, struct bio_vec *bvec,
> 		enum dma_data_direction dir, unsigned long attrs)
> {
> 	return dma_map_page_attrs(dev, bv_page, bvec.bv_offset, bvev.bv_len,
> 			dir, attrs);
> }
> 
> int queue_rq()
> {
> 	dma_addr_t addr;
> 	int ret;
> 
> 	ret = blk_dma_map_rq(dev, rq, dir, attrs. &addr, &len);
> 	if (ret < 0)
> 		return ret;
> 
> 	if (ret == 0) {
> 		if (use_sgl()) {
> 			nvme_pci_sgl_set_data(&cmd->dptr.sgl, addr, len);
> 		} else {
> 			set_prps();
> 		}
> 		return;
> 	}
> 
> 	if (use_sgl()) {
> 		alloc_one_sgl_per_segment();
> 
> 		rq_for_each_segment(bvec, rq, iter) {
> 			addr = blk_dma_map_bvec(dev, &bdev, dir, 0);
> 			set_one_sgl();
> 		}
> 	} else {
> 		alloc_one_prp_per_page();
> 
> 		rq_for_each_segment(bvec, rq, iter) {
> 			ret = blk_dma_map_bvec(dev, &bdev, dir, 0);
> 			if (ret)
> 				break;
> 			set_prps();
> 	}
> }

Thanks
Barry


* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-07-08  7:57           ` Song Bao Hua (Barry Song)
@ 2020-07-08 12:19             ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2020-07-08 12:19 UTC (permalink / raw)
  To: Song Bao Hua (Barry Song)
  Cc: maximmi, Daniel Borkmann, konrad.wilk, netdev, linux-kernel,
	davem, iommu, jonathan.lemon, bpf, Björn Töpel,
	Robin Murphy, Christoph Hellwig, magnus.karlsson

On Wed, Jul 08, 2020 at 07:57:23AM +0000, Song Bao Hua (Barry Song) wrote:
> > int dma_map_batch_start(struct device *dev, size_t rounded_len,
> > 	enum dma_data_direction dir, unsigned long attrs, dma_addr_t *addr);
> > int dma_map_batch_add(struct device *dev, dma_addr_t *addr, struct page
> > *page,
> > 		unsigned long offset, size_t size);
> > int dma_map_batch_end(struct device *dev, int ret, dma_addr_t start_addr);
> > 
> 
> Hello Christoph,
> 
> What is the different between dma_map_batch_add() and adding the buffer to sg of dma_map_sg()?

There is no struct scatterlist involved in this API, which avoids the
overhead of allocating it (which is kinda the point).

* Re: [PATCH net] xsk: remove cheap_dma optimization
  2020-07-08  6:50         ` Christoph Hellwig
  2020-07-08  7:57           ` Song Bao Hua (Barry Song)
@ 2020-07-08 13:18           ` Robin Murphy
  1 sibling, 0 replies; 17+ messages in thread
From: Robin Murphy @ 2020-07-08 13:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: maximmi, Daniel Borkmann, konrad.wilk, netdev, linux-kernel,
	iommu, jonathan.lemon, bpf, Björn Töpel, davem,
	magnus.karlsson

On 2020-07-08 07:50, Christoph Hellwig wrote:
> On Mon, Jun 29, 2020 at 04:41:16PM +0100, Robin Murphy wrote:
>> On 2020-06-28 18:16, Björn Töpel wrote:
>>>
>>> On 2020-06-27 09:04, Christoph Hellwig wrote:
>>>> On Sat, Jun 27, 2020 at 01:00:19AM +0200, Daniel Borkmann wrote:
>>>>> Given there is roughly a ~5 weeks window at max where this removal could
>>>>> still be applied in the worst case, could we come up with a fix /
>>>>> proposal
>>>>> first that moves this into the DMA mapping core? If there is something
>>>>> that
>>>>> can be agreed upon by all parties, then we could avoid re-adding the 9%
>>>>> slowdown. :/
>>>>
>>>> I'd rather turn it upside down - this abuse of the internals blocks work
>>>> that has basically just missed the previous window and I'm not going
>>>> to wait weeks to sort out the API misuse.  But we can add optimizations
>>>> back later if we find a sane way.
>>>>
>>>
>>> I'm not super excited about the performance loss, but I do get
>>> Christoph's frustration about gutting the DMA API making it harder for
>>> DMA people to get work done. Lets try to solve this properly using
>>> proper DMA APIs.
>>>
>>>
>>>> That being said I really can't see how this would make so much of a
>>>> difference.  What architecture and what dma_ops are you using for
>>>> those measurements?  What is the workload?
>>>>
>>>
>>> The 9% is for an AF_XDP (Fast raw Ethernet socket. Think AF_PACKET, but
>>> faster.) benchmark: receive the packet from the NIC, and drop it. The DMA
>>> syncs stand out in the perf top:
>>>
>>>    28.63%  [kernel]                   [k] i40e_clean_rx_irq_zc
>>>    17.12%  [kernel]                   [k] xp_alloc
>>>     8.80%  [kernel]                   [k] __xsk_rcv_zc
>>>     7.69%  [kernel]                   [k] xdp_do_redirect
>>>     5.35%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629
>>>     4.77%  [kernel]                   [k] xsk_rcv.part.0
>>>     4.07%  [kernel]                   [k] __xsk_map_redirect
>>>     3.80%  [kernel]                   [k] dma_direct_sync_single_for_cpu
>>>     3.03%  [kernel]                   [k] dma_direct_sync_single_for_device
>>>     2.76%  [kernel]                   [k] i40e_alloc_rx_buffers_zc
>>>     1.83%  [kernel]                   [k] xsk_flush
>>> ...
>>>
>>> For this benchmark the dma_ops are NULL (dma_is_direct() == true), and
>>> the main issue is that SWIOTLB is now unconditionally enabled [1] for
>>> x86, and for each sync we have to check that if is_swiotlb_buffer()
>>> which involves a some costly indirection.
>>>
>>> That was pretty much what my hack avoided. Instead we did all the checks
>>> upfront, since AF_XDP has long-term DMA mappings, and just set a flag
>>> for that.
>>>
>>> Avoiding the whole "is this address swiotlb" in
>>> dma_direct_sync_single_for_{cpu, device]() per-packet
>>> would help a lot.
>>
>> I'm pretty sure that's one of the things we hope to achieve with the
>> generic bypass flag :)
>>
>>> Somewhat related to the DMA API; It would have performance benefits for
>>> AF_XDP if the DMA range of the mapped memory was linear, i.e. by IOMMU
>>> utilization. I've started hacking a thing a little bit, but it would be
>>> nice if such API was part of the mapping core.
>>>
>>> Input: array of pages Output: array of dma addrs (and obviously dev,
>>> flags and such)
>>>
>>> For non-IOMMU len(array of pages) == len(array of dma addrs)
>>> For best-case IOMMU len(array of dma addrs) == 1 (large linear space)
>>>
>>> But that's for later. :-)
>>
>> FWIW you will typically get that behaviour from IOMMU-based implementations
>> of dma_map_sg() right now, although it's not strictly guaranteed. If you
>> can weather some additional setup cost of calling
>> sg_alloc_table_from_pages() plus walking the list after mapping to test
>> whether you did get a contiguous result, you could start taking advantage
>> of it as some of the dma-buf code in DRM and v4l2 does already (although
>> those cases actually treat it as a strict dependency rather than an
>> optimisation).
> 
> Yikes.

Heh, consider it as iommu_dma_alloc_remap() and 
vb2_dc_get_contiguous_size() having a beautiful baby ;)

>> I'm inclined to agree that if we're going to see more of these cases, a new
>> API call that did formally guarantee a DMA-contiguous mapping (either via
>> IOMMU or bounce buffering) or failure might indeed be handy.
> 
> I was planning on adding a dma-level API to add more pages to an
> IOMMU batch, but was waiting for at least the intel IOMMU driver to be
> converted to the dma-iommu code (and preferably arm32 and s390 as well).

FWIW I did finally get round to having an initial crack at arm32 
recently[1] - of course it needs significant rework already for all the 
IOMMU API motion, and I still need to attempt to test any of it (at 
least I do have a couple of 32-bit boards here), but with any luck I 
hope I'll be able to pick it up again next cycle.

> Here is my old pseudo-code sketch for what I was aiming for from the
> block/nvme perspective.  I haven't even implemented it yet, so there might
> be some holes in the design:
> 
> 
> /*
>   * Returns 0 if batching is possible, postitive number of segments required
>   * if batching is not possible, or negatie values on error.
>   */
> int dma_map_batch_start(struct device *dev, size_t rounded_len,
> 	enum dma_data_direction dir, unsigned long attrs, dma_addr_t *addr);
> int dma_map_batch_add(struct device *dev, dma_addr_t *addr, struct page *page,
> 		unsigned long offset, size_t size);
> int dma_map_batch_end(struct device *dev, int ret, dma_addr_t start_addr);

Just as an initial thought, it's probably nicer to have some kind of 
encapsulated state structure to pass around between these calls rather 
than a menagerie of bare address pointers, similar to what we did with 
iommu_iotlb_gather. An IOMMU-based backend might not want to commit 
batch_add() calls immediately, but look for physically-sequential pages 
and merge them into larger mappings if it can, and keeping track of 
things based only on next_addr, when multiple batch requests could be 
happening in parallel for the same device, would get messy fast.
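
Purely as an illustration of that point (all names invented, loosely
modelled on struct iommu_iotlb_gather), the state could look something
like:

struct dma_map_batch {
	struct device		*dev;
	enum dma_data_direction	dir;
	unsigned long		attrs;
	dma_addr_t		dma_start;	/* range reserved by batch_start() */
	dma_addr_t		dma_next;	/* next unused position in it */
	phys_addr_t		phys_end;	/* end of the last page added, so the
						 * backend can merge physically
						 * sequential batch_add() calls */
	size_t			mapped_len;
};

with the three calls above then taking a struct dma_map_batch * instead
of bare dma_addr_t pointers.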

I also don't entirely see how the backend can be expected to determine 
the number of segments required in advance - e.g. bounce-buffering could 
join two half-page segments into one while an IOMMU typically couldn't, 
yet the opposite might also be true of larger multi-page segments.

Robin.

[1] 
http://www.linux-arm.org/git?p=linux-rm.git;a=shortlog;h=refs/heads/arm/dma

> int blk_dma_map_rq(struct device *dev, struct request *rq,
> 		enum dma_data_direction dir, unsigned long attrs,
> 		dma_addr_t *start_addr, size_t *len)
> {
> 	struct req_iterator iter;
> 	struct bio_vec bvec;
> 	dma_addr_t next_addr;
> 	int ret;
> 
> 	if (number_of_segments(req) == 1) {
> 		// plain old dma_map_page();
> 		return 0;
> 	}
> 
> 	// XXX: block helper for rounded_len?
> 	*len = length_of_request(req);
> 	ret = dma_map_batch_start(dev, *len, dir, attrs, start_addr);
> 	if (ret)
> 		return ret;
> 
> 	next_addr = *start_addr;
> 	rq_for_each_segment(bvec, rq, iter) {
> 		ret = dma_map_batch_add(dev, &next_addr, bvec.bv_page,
> 				bvec.bv_offset, bvev.bv_len);
> 		if (ret)
> 			break;
> 	}
> 
> 	return dma_map_batch_end(dev, ret, *start_addr);
> }
> 
> dma_addr_t blk_dma_map_bvec(struct device *dev, struct bio_vec *bvec,
> 		enum dma_data_direction dir, unsigned long attrs)
> {
> 	return dma_map_page_attrs(dev, bv_page, bvec.bv_offset, bvev.bv_len,
> 			dir, attrs);
> }
> 
> int queue_rq()
> {
> 	dma_addr_t addr;
> 	int ret;
> 
> 	ret = blk_dma_map_rq(dev, rq, dir, attrs. &addr, &len);
> 	if (ret < 0)
> 		return ret;
> 
> 	if (ret == 0) {
> 		if (use_sgl()) {
> 			nvme_pci_sgl_set_data(&cmd->dptr.sgl, addr, len);
> 		} else {
> 			set_prps();
> 		}
> 		return;
> 	}
> 
> 	if (use_sgl()) {
> 		alloc_one_sgl_per_segment();
> 
> 		rq_for_each_segment(bvec, rq, iter) {
> 			addr = blk_dma_map_bvec(dev, &bdev, dir, 0);
> 			set_one_sgl();
> 		}
> 	} else {
> 		alloc_one_prp_per_page();
> 
> 		rq_for_each_segment(bvec, rq, iter) {
> 			ret = blk_dma_map_bvec(dev, &bdev, dir, 0);
> 			if (ret)
> 				break;
> 			set_prps();
> 	}
> }
> 


Thread overview: 17+ messages
2020-06-26 13:43 [PATCH net] xsk: remove cheap_dma optimization Björn Töpel
2020-06-26 20:44 ` Jonathan Lemon
2020-06-26 23:00 ` Daniel Borkmann
2020-06-27  7:04   ` Christoph Hellwig
2020-06-28 17:16     ` Björn Töpel
2020-06-29 13:52       ` Daniel Borkmann
2020-06-29 15:10         ` Björn Töpel
2020-06-29 15:18           ` Daniel Borkmann
2020-06-29 16:23             ` Björn Töpel
2020-06-30  5:07             ` Christoph Hellwig
2020-06-30 13:47               ` Daniel Borkmann
2020-06-29 15:41       ` Robin Murphy
2020-07-01 10:17         ` Björn Töpel
2020-07-08  6:50         ` Christoph Hellwig
2020-07-08  7:57           ` Song Bao Hua (Barry Song)
2020-07-08 12:19             ` Christoph Hellwig
2020-07-08 13:18           ` Robin Murphy
