linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
@ 2022-11-12  4:04 Eric Dumazet
  2022-11-14 13:30 ` Robin Murphy
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Eric Dumazet @ 2022-11-12  4:04 UTC (permalink / raw)
  To: Joerg Roedel, Robin Murphy, Will Deacon
  Cc: linux-kernel, netdev, Eric Dumazet, Eric Dumazet, iommu

Quite often, NIC devices do not need dma_sync operations
on x86_64 at least.

Indeed, when dev_is_dma_coherent(dev) is true and
dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y
consumes about 10% of cycles on a cpu receiving packets
from softirq at ~100Gbit rate, as shown in [1]

Even if/when CONFIG_RETPOLINE is not set, there
is a cost of about 3%.

This patch adds a copy of iommu_dma_ops structure,
where sync_single_for_cpu, sync_single_for_device,
sync_sg_for_cpu and sync_sg_for_device are unset.

perf profile before the patch:

    18.53%  [kernel]       [k] gq_rx_skb
    14.77%  [kernel]       [k] napi_reuse_skb
     8.95%  [kernel]       [k] skb_release_data
     5.42%  [kernel]       [k] dev_gro_receive
     5.37%  [kernel]       [k] memcpy
<*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
     4.78%  [kernel]       [k] tcp_gro_receive
<*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
     4.12%  [kernel]       [k] ipv6_gro_receive
     3.65%  [kernel]       [k] gq_pool_get
     3.25%  [kernel]       [k] skb_gro_receive
     2.07%  [kernel]       [k] napi_gro_frags
     1.98%  [kernel]       [k] tcp6_gro_receive
     1.27%  [kernel]       [k] gq_rx_prep_buffers
     1.18%  [kernel]       [k] gq_rx_napi_handler
     0.99%  [kernel]       [k] csum_partial
     0.74%  [kernel]       [k] csum_ipv6_magic
     0.72%  [kernel]       [k] free_pcp_prepare
     0.60%  [kernel]       [k] __napi_poll
     0.58%  [kernel]       [k] net_rx_action
     0.56%  [kernel]       [k] read_tsc
<*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
     0.45%  [kernel]       [k] memset

After patch, lines with <*> no longer show up, and overall
cpu usage looks much better (~60% instead of ~72%)

    25.56%  [kernel]       [k] gq_rx_skb
     9.90%  [kernel]       [k] napi_reuse_skb
     7.39%  [kernel]       [k] dev_gro_receive
     6.78%  [kernel]       [k] memcpy
     6.53%  [kernel]       [k] skb_release_data
     6.39%  [kernel]       [k] tcp_gro_receive
     5.71%  [kernel]       [k] ipv6_gro_receive
     4.35%  [kernel]       [k] napi_gro_frags
     4.34%  [kernel]       [k] skb_gro_receive
     3.50%  [kernel]       [k] gq_pool_get
     3.08%  [kernel]       [k] gq_rx_napi_handler
     2.35%  [kernel]       [k] tcp6_gro_receive
     2.06%  [kernel]       [k] gq_rx_prep_buffers
     1.32%  [kernel]       [k] csum_partial
     0.93%  [kernel]       [k] csum_ipv6_magic
     0.65%  [kernel]       [k] net_rx_action

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Will Deacon <will@kernel.org>
Cc: iommu@lists.linux.dev
---
 drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
 	return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
 }
 
+static bool dev_is_dma_sync_needed(struct device *dev)
+{
+	return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
+}
+
 /**
  * iommu_dma_init_domain - Initialise a DMA mapping domain
  * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
@@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
 {
 	phys_addr_t phys;
 
-	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
+	if (!dev_is_dma_sync_needed(dev))
 		return;
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
@@ -930,7 +935,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
 {
 	phys_addr_t phys;
 
-	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
+	if (!dev_is_dma_sync_needed(dev))
 		return;
 
 	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
@@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
 	return iova_rcache_range();
 }
 
+#define iommu_dma_ops_common_fields \
+	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,		\
+	.alloc			= iommu_dma_alloc,			\
+	.free			= iommu_dma_free,			\
+	.alloc_pages		= dma_common_alloc_pages,		\
+	.free_pages		= dma_common_free_pages,		\
+	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,	\
+	.free_noncontiguous	= iommu_dma_free_noncontiguous,		\
+	.mmap			= iommu_dma_mmap,			\
+	.get_sgtable		= iommu_dma_get_sgtable,		\
+	.map_page		= iommu_dma_map_page,			\
+	.unmap_page		= iommu_dma_unmap_page,			\
+	.map_sg			= iommu_dma_map_sg,			\
+	.unmap_sg		= iommu_dma_unmap_sg,			\
+	.map_resource		= iommu_dma_map_resource,		\
+	.unmap_resource		= iommu_dma_unmap_resource,		\
+	.get_merge_boundary	= iommu_dma_get_merge_boundary,		\
+	.opt_mapping_size	= iommu_dma_opt_mapping_size,
+
 static const struct dma_map_ops iommu_dma_ops = {
-	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
-	.alloc			= iommu_dma_alloc,
-	.free			= iommu_dma_free,
-	.alloc_pages		= dma_common_alloc_pages,
-	.free_pages		= dma_common_free_pages,
-	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
-	.free_noncontiguous	= iommu_dma_free_noncontiguous,
-	.mmap			= iommu_dma_mmap,
-	.get_sgtable		= iommu_dma_get_sgtable,
-	.map_page		= iommu_dma_map_page,
-	.unmap_page		= iommu_dma_unmap_page,
-	.map_sg			= iommu_dma_map_sg,
-	.unmap_sg		= iommu_dma_unmap_sg,
+	iommu_dma_ops_common_fields
+
 	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
 	.sync_single_for_device	= iommu_dma_sync_single_for_device,
 	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
 	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
-	.map_resource		= iommu_dma_map_resource,
-	.unmap_resource		= iommu_dma_unmap_resource,
-	.get_merge_boundary	= iommu_dma_get_merge_boundary,
-	.opt_mapping_size	= iommu_dma_opt_mapping_size,
 };
 
+/* Special instance of iommu_dma_ops for devices satisfying this condition:
+ *   !dev_is_dma_sync_needed(dev)
+ *
+ * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
+ * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
+ * do nothing special and can be avoided, saving indirect calls.
+ */
+static const struct dma_map_ops iommu_nosync_dma_ops = {
+	iommu_dma_ops_common_fields
+
+	.sync_single_for_cpu	= NULL,
+	.sync_single_for_device	= NULL,
+	.sync_sg_for_cpu	= NULL,
+	.sync_sg_for_device	= NULL,
+};
+#undef iommu_dma_ops_common_fields
+
 /*
  * The IOMMU core code allocates the default DMA domain, which the underlying
  * IOMMU driver needs to support via the dma-iommu layer.
@@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
 	if (iommu_is_dma_domain(domain)) {
 		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
 			goto out_err;
-		dev->dma_ops = &iommu_dma_ops;
+		dev->dma_ops = dev_is_dma_sync_needed(dev) ?
+				&iommu_dma_ops : &iommu_nosync_dma_ops;
 	}
 
 	return;
-- 
2.38.1.431.g37b22c650d-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-12  4:04 [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations Eric Dumazet
@ 2022-11-14 13:30 ` Robin Murphy
  2022-11-14 13:52   ` Robin Murphy
  2022-11-22 19:17 ` Michal Kubiak
  2022-11-24  4:23 ` Ethan Zhao
  2 siblings, 1 reply; 7+ messages in thread
From: Robin Murphy @ 2022-11-14 13:30 UTC (permalink / raw)
  To: Eric Dumazet, Joerg Roedel, Will Deacon
  Cc: linux-kernel, netdev, Eric Dumazet, iommu

On 2022-11-12 04:04, Eric Dumazet wrote:
> Quite often, NIC devices do not need dma_sync operations
> on x86_64 at least.
> 
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
> 
> However, indirectly calling them when CONFIG_RETPOLINE=y
> consumes about 10% of cycles on a cpu receiving packets
> from softirq at ~100Gbit rate, as shown in [1]
> 
> Even if/when CONFIG_RETPOLINE is not set, there
> is a cost of about 3%.
> 
> This patch adds a copy of iommu_dma_ops structure,
> where sync_single_for_cpu, sync_single_for_device,
> sync_sg_for_cpu and sync_sg_for_device are unset.

TBH I reckon it might be worthwhile to add another top-level bitfield to 
struct device to indicate when syncs can be optimised out completely, so 
we can handle it at the DMA API dispatch level and short-circuit a bit 
more of the dma-direct path too.

> perf profile before the patch:
> 
>      18.53%  [kernel]       [k] gq_rx_skb
>      14.77%  [kernel]       [k] napi_reuse_skb
>       8.95%  [kernel]       [k] skb_release_data
>       5.42%  [kernel]       [k] dev_gro_receive
>       5.37%  [kernel]       [k] memcpy
> <*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
>       4.78%  [kernel]       [k] tcp_gro_receive
> <*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
>       4.12%  [kernel]       [k] ipv6_gro_receive
>       3.65%  [kernel]       [k] gq_pool_get
>       3.25%  [kernel]       [k] skb_gro_receive
>       2.07%  [kernel]       [k] napi_gro_frags
>       1.98%  [kernel]       [k] tcp6_gro_receive
>       1.27%  [kernel]       [k] gq_rx_prep_buffers
>       1.18%  [kernel]       [k] gq_rx_napi_handler
>       0.99%  [kernel]       [k] csum_partial
>       0.74%  [kernel]       [k] csum_ipv6_magic
>       0.72%  [kernel]       [k] free_pcp_prepare
>       0.60%  [kernel]       [k] __napi_poll
>       0.58%  [kernel]       [k] net_rx_action
>       0.56%  [kernel]       [k] read_tsc
> <*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
>       0.45%  [kernel]       [k] memset
> 
> After patch, lines with <*> no longer show up, and overall
> cpu usage looks much better (~60% instead of ~72%)
> 
>      25.56%  [kernel]       [k] gq_rx_skb
>       9.90%  [kernel]       [k] napi_reuse_skb
>       7.39%  [kernel]       [k] dev_gro_receive
>       6.78%  [kernel]       [k] memcpy
>       6.53%  [kernel]       [k] skb_release_data
>       6.39%  [kernel]       [k] tcp_gro_receive
>       5.71%  [kernel]       [k] ipv6_gro_receive
>       4.35%  [kernel]       [k] napi_gro_frags
>       4.34%  [kernel]       [k] skb_gro_receive
>       3.50%  [kernel]       [k] gq_pool_get
>       3.08%  [kernel]       [k] gq_rx_napi_handler
>       2.35%  [kernel]       [k] tcp6_gro_receive
>       2.06%  [kernel]       [k] gq_rx_prep_buffers
>       1.32%  [kernel]       [k] csum_partial
>       0.93%  [kernel]       [k] csum_ipv6_magic
>       0.65%  [kernel]       [k] net_rx_action
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Will Deacon <will@kernel.org>
> Cc: iommu@lists.linux.dev
> ---
>   drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
>   1 file changed, 47 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
>   	return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
>   }
>   
> +static bool dev_is_dma_sync_needed(struct device *dev)
> +{
> +	return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
> +}
> +
>   /**
>    * iommu_dma_init_domain - Initialise a DMA mapping domain
>    * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
> @@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))
>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -930,7 +935,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))
>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
>   	return iova_rcache_range();
>   }
>   
> +#define iommu_dma_ops_common_fields \
> +	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,		\
> +	.alloc			= iommu_dma_alloc,			\
> +	.free			= iommu_dma_free,			\
> +	.alloc_pages		= dma_common_alloc_pages,		\
> +	.free_pages		= dma_common_free_pages,		\
> +	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,	\
> +	.free_noncontiguous	= iommu_dma_free_noncontiguous,		\
> +	.mmap			= iommu_dma_mmap,			\
> +	.get_sgtable		= iommu_dma_get_sgtable,		\
> +	.map_page		= iommu_dma_map_page,			\
> +	.unmap_page		= iommu_dma_unmap_page,			\
> +	.map_sg			= iommu_dma_map_sg,			\
> +	.unmap_sg		= iommu_dma_unmap_sg,			\
> +	.map_resource		= iommu_dma_map_resource,		\
> +	.unmap_resource		= iommu_dma_unmap_resource,		\
> +	.get_merge_boundary	= iommu_dma_get_merge_boundary,		\
> +	.opt_mapping_size	= iommu_dma_opt_mapping_size,
> +
>   static const struct dma_map_ops iommu_dma_ops = {
> -	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
> -	.alloc			= iommu_dma_alloc,
> -	.free			= iommu_dma_free,
> -	.alloc_pages		= dma_common_alloc_pages,
> -	.free_pages		= dma_common_free_pages,
> -	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
> -	.free_noncontiguous	= iommu_dma_free_noncontiguous,
> -	.mmap			= iommu_dma_mmap,
> -	.get_sgtable		= iommu_dma_get_sgtable,
> -	.map_page		= iommu_dma_map_page,
> -	.unmap_page		= iommu_dma_unmap_page,
> -	.map_sg			= iommu_dma_map_sg,
> -	.unmap_sg		= iommu_dma_unmap_sg,
> +	iommu_dma_ops_common_fields
> +
>   	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
>   	.sync_single_for_device	= iommu_dma_sync_single_for_device,
>   	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
>   	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
> -	.map_resource		= iommu_dma_map_resource,
> -	.unmap_resource		= iommu_dma_unmap_resource,
> -	.get_merge_boundary	= iommu_dma_get_merge_boundary,
> -	.opt_mapping_size	= iommu_dma_opt_mapping_size,
>   };
>   
> +/* Special instance of iommu_dma_ops for devices satisfying this condition:
> + *   !dev_is_dma_sync_needed(dev)
> + *
> + * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
> + * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
> + * do nothing special and can be avoided, saving indirect calls.
> + */
> +static const struct dma_map_ops iommu_nosync_dma_ops = {
> +	iommu_dma_ops_common_fields
> +
> +	.sync_single_for_cpu	= NULL,
> +	.sync_single_for_device	= NULL,
> +	.sync_sg_for_cpu	= NULL,
> +	.sync_sg_for_device	= NULL,
> +};
> +#undef iommu_dma_ops_common_fields
> +
>   /*
>    * The IOMMU core code allocates the default DMA domain, which the underlying
>    * IOMMU driver needs to support via the dma-iommu layer.
> @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>   	if (iommu_is_dma_domain(domain)) {
>   		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>   			goto out_err;
> -		dev->dma_ops = &iommu_dma_ops;
> +		dev->dma_ops = dev_is_dma_sync_needed(dev) ?
> +				&iommu_dma_ops : &iommu_nosync_dma_ops;

This doesn't work, because at this point we don't know whether a 
coherent device is still going to need SWIOTLB for DMA mask reasons or not.

Thanks,
Robin.

>   	}
>   
>   	return;

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-14 13:30 ` Robin Murphy
@ 2022-11-14 13:52   ` Robin Murphy
  0 siblings, 0 replies; 7+ messages in thread
From: Robin Murphy @ 2022-11-14 13:52 UTC (permalink / raw)
  To: Eric Dumazet, Joerg Roedel, Will Deacon
  Cc: linux-kernel, netdev, Eric Dumazet, iommu

On 2022-11-14 13:30, Robin Murphy wrote:
> On 2022-11-12 04:04, Eric Dumazet wrote:
>> Quite often, NIC devices do not need dma_sync operations
>> on x86_64 at least.
>>
>> Indeed, when dev_is_dma_coherent(dev) is true and
>> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
>> and friends do nothing.
>>
>> However, indirectly calling them when CONFIG_RETPOLINE=y
>> consumes about 10% of cycles on a cpu receiving packets
>> from softirq at ~100Gbit rate, as shown in [1]
>>
>> Even if/when CONFIG_RETPOLINE is not set, there
>> is a cost of about 3%.
>>
>> This patch adds a copy of iommu_dma_ops structure,
>> where sync_single_for_cpu, sync_single_for_device,
>> sync_sg_for_cpu and sync_sg_for_device are unset.
> 
> TBH I reckon it might be worthwhile to add another top-level bitfield to 
> struct device to indicate when syncs can be optimised out completely, so 
> we can handle it at the DMA API dispatch level and short-circuit a bit 
> more of the dma-direct path too.
> 
>> perf profile before the patch:
>>
>>      18.53%  [kernel]       [k] gq_rx_skb
>>      14.77%  [kernel]       [k] napi_reuse_skb
>>       8.95%  [kernel]       [k] skb_release_data
>>       5.42%  [kernel]       [k] dev_gro_receive
>>       5.37%  [kernel]       [k] memcpy
>> <*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
>>       4.78%  [kernel]       [k] tcp_gro_receive
>> <*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
>>       4.12%  [kernel]       [k] ipv6_gro_receive
>>       3.65%  [kernel]       [k] gq_pool_get
>>       3.25%  [kernel]       [k] skb_gro_receive
>>       2.07%  [kernel]       [k] napi_gro_frags
>>       1.98%  [kernel]       [k] tcp6_gro_receive
>>       1.27%  [kernel]       [k] gq_rx_prep_buffers
>>       1.18%  [kernel]       [k] gq_rx_napi_handler
>>       0.99%  [kernel]       [k] csum_partial
>>       0.74%  [kernel]       [k] csum_ipv6_magic
>>       0.72%  [kernel]       [k] free_pcp_prepare
>>       0.60%  [kernel]       [k] __napi_poll
>>       0.58%  [kernel]       [k] net_rx_action
>>       0.56%  [kernel]       [k] read_tsc
>> <*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
>>       0.45%  [kernel]       [k] memset
>>
>> After patch, lines with <*> no longer show up, and overall
>> cpu usage looks much better (~60% instead of ~72%)
>>
>>      25.56%  [kernel]       [k] gq_rx_skb
>>       9.90%  [kernel]       [k] napi_reuse_skb
>>       7.39%  [kernel]       [k] dev_gro_receive
>>       6.78%  [kernel]       [k] memcpy
>>       6.53%  [kernel]       [k] skb_release_data
>>       6.39%  [kernel]       [k] tcp_gro_receive
>>       5.71%  [kernel]       [k] ipv6_gro_receive
>>       4.35%  [kernel]       [k] napi_gro_frags
>>       4.34%  [kernel]       [k] skb_gro_receive
>>       3.50%  [kernel]       [k] gq_pool_get
>>       3.08%  [kernel]       [k] gq_rx_napi_handler
>>       2.35%  [kernel]       [k] tcp6_gro_receive
>>       2.06%  [kernel]       [k] gq_rx_prep_buffers
>>       1.32%  [kernel]       [k] csum_partial
>>       0.93%  [kernel]       [k] csum_ipv6_magic
>>       0.65%  [kernel]       [k] net_rx_action
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> Cc: Robin Murphy <robin.murphy@arm.com>
>> Cc: Joerg Roedel <joro@8bytes.org>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: iommu@lists.linux.dev
>> ---
>>   drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
>>   1 file changed, 47 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
>> index 
>> 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
>> --- a/drivers/iommu/dma-iommu.c
>> +++ b/drivers/iommu/dma-iommu.c
>> @@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
>>       return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
>>   }
>> +static bool dev_is_dma_sync_needed(struct device *dev)
>> +{
>> +    return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
>> +}
>> +
>>   /**
>>    * iommu_dma_init_domain - Initialise a DMA mapping domain
>>    * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
>> @@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct 
>> device *dev,
>>   {
>>       phys_addr_t phys;
>> -    if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
>> +    if (!dev_is_dma_sync_needed(dev))
>>           return;
>>       phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
>> @@ -930,7 +935,7 @@ static void 
>> iommu_dma_sync_single_for_device(struct device *dev,
>>   {
>>       phys_addr_t phys;
>> -    if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
>> +    if (!dev_is_dma_sync_needed(dev))
>>           return;
>>       phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
>> @@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
>>       return iova_rcache_range();
>>   }
>> +#define iommu_dma_ops_common_fields \
>> +    .flags            = DMA_F_PCI_P2PDMA_SUPPORTED,        \
>> +    .alloc            = iommu_dma_alloc,            \
>> +    .free            = iommu_dma_free,            \
>> +    .alloc_pages        = dma_common_alloc_pages,        \
>> +    .free_pages        = dma_common_free_pages,        \
>> +    .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,    \
>> +    .free_noncontiguous    = iommu_dma_free_noncontiguous,        \
>> +    .mmap            = iommu_dma_mmap,            \
>> +    .get_sgtable        = iommu_dma_get_sgtable,        \
>> +    .map_page        = iommu_dma_map_page,            \
>> +    .unmap_page        = iommu_dma_unmap_page,            \
>> +    .map_sg            = iommu_dma_map_sg,            \
>> +    .unmap_sg        = iommu_dma_unmap_sg,            \
>> +    .map_resource        = iommu_dma_map_resource,        \
>> +    .unmap_resource        = iommu_dma_unmap_resource,        \
>> +    .get_merge_boundary    = iommu_dma_get_merge_boundary,        \
>> +    .opt_mapping_size    = iommu_dma_opt_mapping_size,
>> +
>>   static const struct dma_map_ops iommu_dma_ops = {
>> -    .flags            = DMA_F_PCI_P2PDMA_SUPPORTED,
>> -    .alloc            = iommu_dma_alloc,
>> -    .free            = iommu_dma_free,
>> -    .alloc_pages        = dma_common_alloc_pages,
>> -    .free_pages        = dma_common_free_pages,
>> -    .alloc_noncontiguous    = iommu_dma_alloc_noncontiguous,
>> -    .free_noncontiguous    = iommu_dma_free_noncontiguous,
>> -    .mmap            = iommu_dma_mmap,
>> -    .get_sgtable        = iommu_dma_get_sgtable,
>> -    .map_page        = iommu_dma_map_page,
>> -    .unmap_page        = iommu_dma_unmap_page,
>> -    .map_sg            = iommu_dma_map_sg,
>> -    .unmap_sg        = iommu_dma_unmap_sg,
>> +    iommu_dma_ops_common_fields
>> +
>>       .sync_single_for_cpu    = iommu_dma_sync_single_for_cpu,
>>       .sync_single_for_device    = iommu_dma_sync_single_for_device,
>>       .sync_sg_for_cpu    = iommu_dma_sync_sg_for_cpu,
>>       .sync_sg_for_device    = iommu_dma_sync_sg_for_device,
>> -    .map_resource        = iommu_dma_map_resource,
>> -    .unmap_resource        = iommu_dma_unmap_resource,
>> -    .get_merge_boundary    = iommu_dma_get_merge_boundary,
>> -    .opt_mapping_size    = iommu_dma_opt_mapping_size,
>>   };
>> +/* Special instance of iommu_dma_ops for devices satisfying this 
>> condition:
>> + *   !dev_is_dma_sync_needed(dev)
>> + *
>> + * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
>> + * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
>> + * do nothing special and can be avoided, saving indirect calls.
>> + */
>> +static const struct dma_map_ops iommu_nosync_dma_ops = {
>> +    iommu_dma_ops_common_fields
>> +
>> +    .sync_single_for_cpu    = NULL,
>> +    .sync_single_for_device    = NULL,
>> +    .sync_sg_for_cpu    = NULL,
>> +    .sync_sg_for_device    = NULL,
>> +};
>> +#undef iommu_dma_ops_common_fields
>> +
>>   /*
>>    * The IOMMU core code allocates the default DMA domain, which the 
>> underlying
>>    * IOMMU driver needs to support via the dma-iommu layer.
>> @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 
>> dma_base, u64 dma_limit)
>>       if (iommu_is_dma_domain(domain)) {
>>           if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>>               goto out_err;
>> -        dev->dma_ops = &iommu_dma_ops;
>> +        dev->dma_ops = dev_is_dma_sync_needed(dev) ?
>> +                &iommu_dma_ops : &iommu_nosync_dma_ops;
> 
> This doesn't work, because at this point we don't know whether a 
> coherent device is still going to need SWIOTLB for DMA mask reasons or not.

Wait, no, now I've completely confused myself... :(

This probably *is* OK since it's specifically iommu_dma_ops, not DMA ops 
in general, and we don't support IOMMUs with addressing limitations of 
their own. Plus the other reasons for hooking into SWIOTLB here that 
have also muddled my brain have been for non-coherent stuff, so still 
probably shouldn't make a difference.

Either way I do think it would be neatest to handle this higher up in 
the API (not to mention apparently easier to reason about...)

Thanks,
Robin.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-12  4:04 [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations Eric Dumazet
  2022-11-14 13:30 ` Robin Murphy
@ 2022-11-22 19:17 ` Michal Kubiak
  2022-11-22 22:54   ` Eric Dumazet
  2022-11-23 10:15   ` Michal Kubiak
  2022-11-24  4:23 ` Ethan Zhao
  2 siblings, 2 replies; 7+ messages in thread
From: Michal Kubiak @ 2022-11-22 19:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Joerg Roedel, Robin Murphy, Will Deacon, linux-kernel, netdev,
	Eric Dumazet, iommu, maciej.fijalkowski, magnus.karlsson

On Sat, Nov 12, 2022 at 04:04:52AM +0000, Eric Dumazet wrote:
> Quite often, NIC devices do not need dma_sync operations
> on x86_64 at least.
> 
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
> 
> However, indirectly calling them when CONFIG_RETPOLINE=y
> consumes about 10% of cycles on a cpu receiving packets
> from softirq at ~100Gbit rate, as shown in [1]
> 
> Even if/when CONFIG_RETPOLINE is not set, there
> is a cost of about 3%.
> 
> This patch adds a copy of iommu_dma_ops structure,
> where sync_single_for_cpu, sync_single_for_device,
> sync_sg_for_cpu and sync_sg_for_device are unset.


Larysa from our team has found out this patch introduces also a
functional improvement for batch allocation in AF_XDP while iommmu is
turned on.
In 'xp_alloc_batch()' function there is a check if DMA needs a
synchronization. If so, batch allocation is not supported and we can
allocate only one buffer at a time.

The flag 'dma_need_sync' is being set according to the value returned by
the function 'dma_need_sync()' (from '/kernel/dma/mapping.c').
That function only checks if at least one of two DMA ops is defined:
'ops->sync_single_for_cpu' or 'ops->sync_single_for_device'.

> +static const struct dma_map_ops iommu_nosync_dma_ops = {
> +	iommu_dma_ops_common_fields
> +
> +	.sync_single_for_cpu	= NULL,
> +	.sync_single_for_device	= NULL,
> +	.sync_sg_for_cpu	= NULL,
> +	.sync_sg_for_device	= NULL,
> +};
> +#undef iommu_dma_ops_common_fields
> +
>  /*
>   * The IOMMU core code allocates the default DMA domain, which the underlying
>   * IOMMU driver needs to support via the dma-iommu layer.
> @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>  	if (iommu_is_dma_domain(domain)) {
>  		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>  			goto out_err;
> -		dev->dma_ops = &iommu_dma_ops;
> +		dev->dma_ops = dev_is_dma_sync_needed(dev) ?
> +				&iommu_dma_ops : &iommu_nosync_dma_ops;
>  	}
>  
>  	return;

 This code removes defining 'sync_*' DMA ops if they are not actually
 used. Thanks to that improvement the function 'dma_need_sync()' will
 always return more meaningful information if any DMA synchronization is
 actually needed for iommu.

 Together with Larysa we have applied that patch and we can confirm it
 helps for batch buffer allocation in AF_XDP ('xsk_buff_alloc_batch()'
 call) when iommu is enabled.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-22 19:17 ` Michal Kubiak
@ 2022-11-22 22:54   ` Eric Dumazet
  2022-11-23 10:15   ` Michal Kubiak
  1 sibling, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2022-11-22 22:54 UTC (permalink / raw)
  To: Michal Kubiak
  Cc: Joerg Roedel, Robin Murphy, Will Deacon, linux-kernel, netdev,
	Eric Dumazet, iommu, maciej.fijalkowski, magnus.karlsson

On Tue, Nov 22, 2022 at 11:18 AM Michal Kubiak <michal.kubiak@intel.com> wrote:
>
> On Sat, Nov 12, 2022 at 04:04:52AM +0000, Eric Dumazet wrote:
> > Quite often, NIC devices do not need dma_sync operations
> > on x86_64 at least.
> >
> > Indeed, when dev_is_dma_coherent(dev) is true and
> > dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> > and friends do nothing.
> >
> > However, indirectly calling them when CONFIG_RETPOLINE=y
> > consumes about 10% of cycles on a cpu receiving packets
> > from softirq at ~100Gbit rate, as shown in [1]
> >
> > Even if/when CONFIG_RETPOLINE is not set, there
> > is a cost of about 3%.
> >
> > This patch adds a copy of iommu_dma_ops structure,
> > where sync_single_for_cpu, sync_single_for_device,
> > sync_sg_for_cpu and sync_sg_for_device are unset.
>
>
> Larysa from our team has found out this patch introduces also a
> functional improvement for batch allocation in AF_XDP while iommmu is
> turned on.
> In 'xp_alloc_batch()' function there is a check if DMA needs a
> synchronization. If so, batch allocation is not supported and we can
> allocate only one buffer at a time.
>
> The flag 'dma_need_sync' is being set according to the value returned by
> the function 'dma_need_sync()' (from '/kernel/dma/mapping.c').
> That function only checks if at least one of two DMA ops is defined:
> 'ops->sync_single_for_cpu' or 'ops->sync_single_for_device'.
>
> > +static const struct dma_map_ops iommu_nosync_dma_ops = {
> > +     iommu_dma_ops_common_fields
> > +
> > +     .sync_single_for_cpu    = NULL,
> > +     .sync_single_for_device = NULL,
> > +     .sync_sg_for_cpu        = NULL,
> > +     .sync_sg_for_device     = NULL,
> > +};
> > +#undef iommu_dma_ops_common_fields
> > +
> >  /*
> >   * The IOMMU core code allocates the default DMA domain, which the underlying
> >   * IOMMU driver needs to support via the dma-iommu layer.
> > @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
> >       if (iommu_is_dma_domain(domain)) {
> >               if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
> >                       goto out_err;
> > -             dev->dma_ops = &iommu_dma_ops;
> > +             dev->dma_ops = dev_is_dma_sync_needed(dev) ?
> > +                             &iommu_dma_ops : &iommu_nosync_dma_ops;
> >       }
> >
> >       return;
>
>  This code removes defining 'sync_*' DMA ops if they are not actually
>  used. Thanks to that improvement the function 'dma_need_sync()' will
>  always return more meaningful information if any DMA synchronization is
>  actually needed for iommu.
>
>  Together with Larysa we have applied that patch and we can confirm it
>  helps for batch buffer allocation in AF_XDP ('xsk_buff_alloc_batch()'
>  call) when iommu is enabled.

Thanks for testing !

I am quite busy relocating, I will address Christoph feedback next week.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-22 19:17 ` Michal Kubiak
  2022-11-22 22:54   ` Eric Dumazet
@ 2022-11-23 10:15   ` Michal Kubiak
  1 sibling, 0 replies; 7+ messages in thread
From: Michal Kubiak @ 2022-11-23 10:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Joerg Roedel, Robin Murphy, Will Deacon, linux-kernel, netdev,
	Eric Dumazet, iommu, maciej.fijalkowski, magnus.karlsson,
	larysa.zaremba

On Tue, Nov 22, 2022 at 02:17:58PM -0500, Michal Kubiak wrote:
> 
>  Together with Larysa we have applied that patch and we can confirm it
>  helps for batch buffer allocation in AF_XDP ('xsk_buff_alloc_batch()'
>  call) when iommu is enabled.


I am sorry I have forgotten to add Larysa to this thread, adding.

Michal

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations
  2022-11-12  4:04 [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations Eric Dumazet
  2022-11-14 13:30 ` Robin Murphy
  2022-11-22 19:17 ` Michal Kubiak
@ 2022-11-24  4:23 ` Ethan Zhao
  2 siblings, 0 replies; 7+ messages in thread
From: Ethan Zhao @ 2022-11-24  4:23 UTC (permalink / raw)
  To: Eric Dumazet, Joerg Roedel, Robin Murphy, Will Deacon
  Cc: linux-kernel, netdev, Eric Dumazet, iommu

Hi,

On 2022/11/12 12:04, Eric Dumazet wrote:
> Quite often, NIC devices do not need dma_sync operations
> on x86_64 at least.
>
> Indeed, when dev_is_dma_coherent(dev) is true and
> dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
> and friends do nothing.
>
> However, indirectly calling them when CONFIG_RETPOLINE=y
> consumes about 10% of cycles on a cpu receiving packets
> from softirq at ~100Gbit rate, as shown in [1]
>
> Even if/when CONFIG_RETPOLINE is not set, there
> is a cost of about 3%.
>
> This patch adds a copy of iommu_dma_ops structure,
> where sync_single_for_cpu, sync_single_for_device,
> sync_sg_for_cpu and sync_sg_for_device are unset.
>
> perf profile before the patch:
>
>      18.53%  [kernel]       [k] gq_rx_skb
>      14.77%  [kernel]       [k] napi_reuse_skb
>       8.95%  [kernel]       [k] skb_release_data
>       5.42%  [kernel]       [k] dev_gro_receive
>       5.37%  [kernel]       [k] memcpy
> <*>  5.26%  [kernel]       [k] iommu_dma_sync_sg_for_cpu
>       4.78%  [kernel]       [k] tcp_gro_receive
> <*>  4.42%  [kernel]       [k] iommu_dma_sync_sg_for_device
>       4.12%  [kernel]       [k] ipv6_gro_receive
>       3.65%  [kernel]       [k] gq_pool_get
>       3.25%  [kernel]       [k] skb_gro_receive
>       2.07%  [kernel]       [k] napi_gro_frags
>       1.98%  [kernel]       [k] tcp6_gro_receive
>       1.27%  [kernel]       [k] gq_rx_prep_buffers
>       1.18%  [kernel]       [k] gq_rx_napi_handler
>       0.99%  [kernel]       [k] csum_partial
>       0.74%  [kernel]       [k] csum_ipv6_magic
>       0.72%  [kernel]       [k] free_pcp_prepare
>       0.60%  [kernel]       [k] __napi_poll
>       0.58%  [kernel]       [k] net_rx_action
>       0.56%  [kernel]       [k] read_tsc
> <*>  0.50%  [kernel]       [k] __x86_indirect_thunk_r11
>       0.45%  [kernel]       [k] memset
>
> After patch, lines with <*> no longer show up, and overall
> cpu usage looks much better (~60% instead of ~72%)
>
>      25.56%  [kernel]       [k] gq_rx_skb
>       9.90%  [kernel]       [k] napi_reuse_skb
>       7.39%  [kernel]       [k] dev_gro_receive
>       6.78%  [kernel]       [k] memcpy
>       6.53%  [kernel]       [k] skb_release_data
>       6.39%  [kernel]       [k] tcp_gro_receive
>       5.71%  [kernel]       [k] ipv6_gro_receive
>       4.35%  [kernel]       [k] napi_gro_frags
>       4.34%  [kernel]       [k] skb_gro_receive
>       3.50%  [kernel]       [k] gq_pool_get
>       3.08%  [kernel]       [k] gq_rx_napi_handler
>       2.35%  [kernel]       [k] tcp6_gro_receive
>       2.06%  [kernel]       [k] gq_rx_prep_buffers
>       1.32%  [kernel]       [k] csum_partial
>       0.93%  [kernel]       [k] csum_ipv6_magic
>       0.65%  [kernel]       [k] net_rx_action
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Will Deacon <will@kernel.org>
> Cc: iommu@lists.linux.dev
> ---
>   drivers/iommu/dma-iommu.c | 67 +++++++++++++++++++++++++++------------
>   1 file changed, 47 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 9297b741f5e80e2408e864fc3f779410d6b04d49..976ba20a55eab5fd94e9bec2d38a2a60e0690444 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -522,6 +522,11 @@ static bool dev_use_swiotlb(struct device *dev)
>   	return IS_ENABLED(CONFIG_SWIOTLB) && dev_is_untrusted(dev);
>   }
>   
> +static bool dev_is_dma_sync_needed(struct device *dev)
> +{
> +	return !dev_is_dma_coherent(dev) || dev_use_swiotlb(dev);
> +}
> +
>   /**
>    * iommu_dma_init_domain - Initialise a DMA mapping domain
>    * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
> @@ -914,7 +919,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))

Seems this function is also called by iommu_dma_map_page()  pair and it 
already checked

if the device is coherent,  so do we need this duplicate 
dev_is_dma_sync_needed(dev) ?

How about we move this checking to iommu_dma_map_page() 
/iommu_dma_unmap_page()

then no need checking here anymore ?


Thanks,

Ethan


>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -930,7 +935,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,
>   {
>   	phys_addr_t phys;
>   
> -	if (dev_is_dma_coherent(dev) && !dev_use_swiotlb(dev))
> +	if (!dev_is_dma_sync_needed(dev))
>   		return;
>   
>   	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
> @@ -1544,30 +1549,51 @@ static size_t iommu_dma_opt_mapping_size(void)
>   	return iova_rcache_range();
>   }
>   
> +#define iommu_dma_ops_common_fields \
> +	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,		\
> +	.alloc			= iommu_dma_alloc,			\
> +	.free			= iommu_dma_free,			\
> +	.alloc_pages		= dma_common_alloc_pages,		\
> +	.free_pages		= dma_common_free_pages,		\
> +	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,	\
> +	.free_noncontiguous	= iommu_dma_free_noncontiguous,		\
> +	.mmap			= iommu_dma_mmap,			\
> +	.get_sgtable		= iommu_dma_get_sgtable,		\
> +	.map_page		= iommu_dma_map_page,			\
> +	.unmap_page		= iommu_dma_unmap_page,			\
> +	.map_sg			= iommu_dma_map_sg,			\
> +	.unmap_sg		= iommu_dma_unmap_sg,			\
> +	.map_resource		= iommu_dma_map_resource,		\
> +	.unmap_resource		= iommu_dma_unmap_resource,		\
> +	.get_merge_boundary	= iommu_dma_get_merge_boundary,		\
> +	.opt_mapping_size	= iommu_dma_opt_mapping_size,
> +
>   static const struct dma_map_ops iommu_dma_ops = {
> -	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
> -	.alloc			= iommu_dma_alloc,
> -	.free			= iommu_dma_free,
> -	.alloc_pages		= dma_common_alloc_pages,
> -	.free_pages		= dma_common_free_pages,
> -	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
> -	.free_noncontiguous	= iommu_dma_free_noncontiguous,
> -	.mmap			= iommu_dma_mmap,
> -	.get_sgtable		= iommu_dma_get_sgtable,
> -	.map_page		= iommu_dma_map_page,
> -	.unmap_page		= iommu_dma_unmap_page,
> -	.map_sg			= iommu_dma_map_sg,
> -	.unmap_sg		= iommu_dma_unmap_sg,
> +	iommu_dma_ops_common_fields
> +
>   	.sync_single_for_cpu	= iommu_dma_sync_single_for_cpu,
>   	.sync_single_for_device	= iommu_dma_sync_single_for_device,
>   	.sync_sg_for_cpu	= iommu_dma_sync_sg_for_cpu,
>   	.sync_sg_for_device	= iommu_dma_sync_sg_for_device,
> -	.map_resource		= iommu_dma_map_resource,
> -	.unmap_resource		= iommu_dma_unmap_resource,
> -	.get_merge_boundary	= iommu_dma_get_merge_boundary,
> -	.opt_mapping_size	= iommu_dma_opt_mapping_size,
>   };
>   
> +/* Special instance of iommu_dma_ops for devices satisfying this condition:
> + *   !dev_is_dma_sync_needed(dev)
> + *
> + * iommu_dma_sync_single_for_cpu(), iommu_dma_sync_single_for_device(),
> + * iommu_dma_sync_sg_for_cpu(), iommu_dma_sync_sg_for_device()
> + * do nothing special and can be avoided, saving indirect calls.
> + */
> +static const struct dma_map_ops iommu_nosync_dma_ops = {
> +	iommu_dma_ops_common_fields
> +
> +	.sync_single_for_cpu	= NULL,
> +	.sync_single_for_device	= NULL,
> +	.sync_sg_for_cpu	= NULL,
> +	.sync_sg_for_device	= NULL,
> +};
> +#undef iommu_dma_ops_common_fields
> +
>   /*
>    * The IOMMU core code allocates the default DMA domain, which the underlying
>    * IOMMU driver needs to support via the dma-iommu layer.
> @@ -1586,7 +1612,8 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
>   	if (iommu_is_dma_domain(domain)) {
>   		if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
>   			goto out_err;
> -		dev->dma_ops = &iommu_dma_ops;
> +		dev->dma_ops = dev_is_dma_sync_needed(dev) ?
> +				&iommu_dma_ops : &iommu_nosync_dma_ops;
>   	}
>   
>   	return;

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-11-24  4:23 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-12  4:04 [PATCH -next] iommu/dma: avoid expensive indirect calls for sync operations Eric Dumazet
2022-11-14 13:30 ` Robin Murphy
2022-11-14 13:52   ` Robin Murphy
2022-11-22 19:17 ` Michal Kubiak
2022-11-22 22:54   ` Eric Dumazet
2022-11-23 10:15   ` Michal Kubiak
2022-11-24  4:23 ` Ethan Zhao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).