* [RFC PATCH] iommu/iova: Add a best-fit algorithm
@ 2020-02-14 23:06 Isaac J. Manjarres
  2020-02-17  8:03 ` Christoph Hellwig
  2020-02-17 16:03 ` Robin Murphy
  0 siblings, 2 replies; 6+ messages in thread
From: Isaac J. Manjarres @ 2020-02-14 23:06 UTC (permalink / raw)
  To: iommu, linux-kernel
  Cc: Liam Mark, joro, pratikp, kernel-team, Isaac J. Manjarres

From: Liam Mark <lmark@codeaurora.org>

Using the best-fit algorithm, instead of the first-fit
algorithm, may reduce fragmentation when allocating
IOVAs.

Signed-off-by: Isaac J. Manjarres <isaacm@codeaurora.org>
---
 drivers/iommu/dma-iommu.c | 17 +++++++++++
 drivers/iommu/iova.c      | 73 +++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dma-iommu.h |  7 +++++
 include/linux/iova.h      |  1 +
 4 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index a2e96a5..af08770 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -364,9 +364,26 @@ static int iommu_dma_deferred_attach(struct device *dev,
 	if (unlikely(ops->is_attach_deferred &&
 			ops->is_attach_deferred(domain, dev)))
 		return iommu_attach_device(domain, dev);
+	return 0;
+}
+
+/*
+ * Should be called prior to using dma-apis.
+ */
+int iommu_dma_enable_best_fit_algo(struct device *dev)
+{
+	struct iommu_domain *domain;
+	struct iova_domain *iovad;
+
+	domain = iommu_get_domain_for_dev(dev);
+	if (!domain || !domain->iova_cookie)
+		return -EINVAL;
 
+	iovad = &((struct iommu_dma_cookie *)domain->iova_cookie)->iovad;
+	iovad->best_fit = true;
 	return 0;
 }
+EXPORT_SYMBOL(iommu_dma_enable_best_fit_algo);
 
 /**
  * dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0e6a953..716b05f 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -50,6 +50,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 	iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR;
 	rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node);
 	rb_insert_color(&iovad->anchor.node, &iovad->rbroot);
+	iovad->best_fit = false;
 	init_iova_rcaches(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
@@ -227,6 +228,69 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
 	return -ENOMEM;
 }
 
+static int __alloc_and_insert_iova_best_fit(struct iova_domain *iovad,
+		unsigned long size, unsigned long limit_pfn,
+			struct iova *new, bool size_aligned)
+{
+	struct rb_node *curr, *prev;
+	struct iova *curr_iova, *prev_iova;
+	unsigned long flags;
+	unsigned long align_mask = ~0UL;
+	struct rb_node *candidate_rb_parent;
+	unsigned long new_pfn, candidate_pfn = ~0UL;
+	unsigned long gap, candidate_gap = ~0UL;
+
+	if (size_aligned)
+		align_mask <<= limit_align(iovad, fls_long(size - 1));
+
+	/* Walk the tree backwards */
+	spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+	curr = &iovad->anchor.node;
+	prev = rb_prev(curr);
+	for (; prev; curr = prev, prev = rb_prev(curr)) {
+		curr_iova = rb_entry(curr, struct iova, node);
+		prev_iova = rb_entry(prev, struct iova, node);
+
+		limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+		new_pfn = (limit_pfn - size) & align_mask;
+		gap = curr_iova->pfn_lo - prev_iova->pfn_hi - 1;
+		if ((limit_pfn >= size) && (new_pfn > prev_iova->pfn_hi)
+				&& (gap < candidate_gap)) {
+			candidate_gap = gap;
+			candidate_pfn = new_pfn;
+			candidate_rb_parent = curr;
+			if (gap == size)
+				goto insert;
+		}
+	}
+
+	curr_iova = rb_entry(curr, struct iova, node);
+	limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+	new_pfn = (limit_pfn - size) & align_mask;
+	gap = curr_iova->pfn_lo - iovad->start_pfn;
+	if (limit_pfn >= size && new_pfn >= iovad->start_pfn &&
+			gap < candidate_gap) {
+		candidate_gap = gap;
+		candidate_pfn = new_pfn;
+		candidate_rb_parent = curr;
+	}
+
+insert:
+	if (candidate_pfn == ~0UL) {
+		spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* pfn_lo will point to size aligned address if size_aligned is set */
+	new->pfn_lo = candidate_pfn;
+	new->pfn_hi = new->pfn_lo + size - 1;
+
+	/* If we have 'prev', it's a valid place to start the insertion. */
+	iova_insert_rbtree(&iovad->rbroot, new, candidate_rb_parent);
+	spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+	return 0;
+}
+
 static struct kmem_cache *iova_cache;
 static unsigned int iova_cache_users;
 static DEFINE_MUTEX(iova_cache_mutex);
@@ -302,8 +366,13 @@ struct iova *
 	if (!new_iova)
 		return NULL;
 
-	ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
-			new_iova, size_aligned);
+	if (iovad->best_fit) {
+		ret = __alloc_and_insert_iova_best_fit(iovad, size,
+				limit_pfn + 1, new_iova, size_aligned);
+	} else {
+		ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
+				new_iova, size_aligned);
+	}
 
 	if (ret) {
 		free_iova_mem(new_iova);
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 2112f21..b01a31a 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -37,6 +37,8 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 
+int iommu_dma_enable_best_fit_algo(struct device *dev);
+
 #else /* CONFIG_IOMMU_DMA */
 
 struct iommu_domain;
@@ -78,5 +80,10 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he
 {
 }
 
+static inline int iommu_dma_enable_best_fit_algo(struct device *dev)
+{
+	return -ENODEV;
+}
+
 #endif	/* CONFIG_IOMMU_DMA */
 #endif	/* __DMA_IOMMU_H */
diff --git a/include/linux/iova.h b/include/linux/iova.h
index a0637ab..58713bb 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -95,6 +95,7 @@ struct iova_domain {
 						   flush-queues */
 	atomic_t fq_timer_on;			/* 1 when timer is active, 0
 						   when not */
+	bool best_fit;
 };
 
 static inline unsigned long iova_size(struct iova *iova)
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


* Re: [RFC PATCH] iommu/iova: Add a best-fit algorithm
  2020-02-14 23:06 [RFC PATCH] iommu/iova: Add a best-fit algorithm Isaac J. Manjarres
@ 2020-02-17  8:03 ` Christoph Hellwig
  2020-02-17 16:03 ` Robin Murphy
  1 sibling, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2020-02-17  8:03 UTC (permalink / raw)
  To: Isaac J. Manjarres
  Cc: iommu, linux-kernel, Liam Mark, joro, pratikp, kernel-team

On Fri, Feb 14, 2020 at 03:06:42PM -0800, Isaac J. Manjarres wrote:
> From: Liam Mark <lmark@codeaurora.org>
> 
> Using the best-fit algorithm, instead of the first-fit
> algorithm, may reduce fragmentation when allocating
> IOVAs.

As we like to say in standards groups: "may" also implies "may not".
Please provide numbers showing that this helps, and preferably an
explanation of how it doesn't hurt as well.

> + * Should be called prior to using dma-apis.
> + */
> +int iommu_dma_enable_best_fit_algo(struct device *dev)
> +{
> +	struct iommu_domain *domain;
> +	struct iova_domain *iovad;
> +
> +	domain = iommu_get_domain_for_dev(dev);
> +	if (!domain || !domain->iova_cookie)
> +		return -EINVAL;
>  
> +	iovad = &((struct iommu_dma_cookie *)domain->iova_cookie)->iovad;
> +	iovad->best_fit = true;
>  	return 0;
>  }
> +EXPORT_SYMBOL(iommu_dma_enable_best_fit_algo);

Who is going to call this?  Patches adding exports always need a user
that goes along with the export.  Also drivers have no business calling
directly into dma-iommu.
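
(For reference, the sort of caller this export implies would presumably
be a driver flipping the allocator from its probe path, before it makes
any DMA API calls. A purely hypothetical sketch follows; the driver and
device names are made up, and nothing in the patch adds such a user:)

	#include <linux/platform_device.h>
	#include <linux/dma-iommu.h>

	static int foo_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;

		/*
		 * Hypothetical: opt this device's IOVA domain into the
		 * best-fit allocator before any dma_map_*() calls are made.
		 */
		if (iommu_dma_enable_best_fit_algo(dev))
			dev_warn(dev, "best-fit IOVA allocation not available\n");

		return 0;
	}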


* Re: [RFC PATCH] iommu/iova: Add a best-fit algorithm
  2020-02-14 23:06 [RFC PATCH] iommu/iova: Add a best-fit algorithm Isaac J. Manjarres
  2020-02-17  8:03 ` Christoph Hellwig
@ 2020-02-17 16:03 ` Robin Murphy
  2020-02-19 12:28   ` Will Deacon
  2020-02-20  6:38   ` isaacm
  1 sibling, 2 replies; 6+ messages in thread
From: Robin Murphy @ 2020-02-17 16:03 UTC (permalink / raw)
  To: Isaac J. Manjarres, iommu, linux-kernel; +Cc: kernel-team, pratikp, Liam Mark

On 14/02/2020 11:06 pm, Isaac J. Manjarres wrote:
> From: Liam Mark <lmark@codeaurora.org>
> 
> Using the best-fit algorithm, instead of the first-fit
> algorithm, may reduce fragmentation when allocating
> IOVAs.

What kind of pathological allocation patterns make that a serious 
problem? Is there any scope for simply changing the order of things in 
the callers? Do these drivers also run under other DMA API backends 
(e.g. 32-bit Arm)?

More generally, if a driver knows enough to want to second-guess a 
generic DMA API allocator, that's a reasonable argument that it should 
perhaps be properly IOMMU-aware and managing its own address space 
anyway. Perhaps this effort might be better spent finishing off the DMA 
ops bypass stuff to make that approach more robust and welcoming.

Robin.


* Re: [RFC PATCH] iommu/iova: Add a best-fit algorithm
  2020-02-17 16:03 ` Robin Murphy
@ 2020-02-19 12:28   ` Will Deacon
  2020-02-20  6:38   ` isaacm
  1 sibling, 0 replies; 6+ messages in thread
From: Will Deacon @ 2020-02-19 12:28 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Isaac J. Manjarres, iommu, linux-kernel, kernel-team, pratikp, Liam Mark

On Mon, Feb 17, 2020 at 04:03:54PM +0000, Robin Murphy wrote:
> On 14/02/2020 11:06 pm, Isaac J. Manjarres wrote:
> > From: Liam Mark <lmark@codeaurora.org>
> > 
> > Using the best-fit algorithm, instead of the first-fit
> > algorithm, may reduce fragmentation when allocating
> > IOVAs.
> 
> What kind of pathological allocation patterns make that a serious problem?
> Is there any scope for simply changing the order of things in the callers?
> Do these drivers also run under other DMA API backends (e.g. 32-bit Arm)?
> 
> More generally, if a driver knows enough to want to second-guess a generic
> DMA API allocator, that's a reasonable argument that it should perhaps be
> properly IOMMU-aware and managing its own address space anyway. Perhaps this
> effort might be better spent finishing off the DMA ops bypass stuff to make
> that approach more robust and welcoming.

Anecdotally, it appears to be fairly common for 32-bit capable DMA
masters to hit fragmentation problems with the current IOVA allocator,
but yes, some numbers showing how that is improved using best-fit (as
opposed to e.g. worst-fit) are definitely required here.

It might be that we can simply swizzle the algorithm to focus on reduced
fragmentation for smaller (i.e. 32-bit) address spaces, but leave larger
domains with the current approach to avoid increasing the allocation
overhead.
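
For illustration, one (untested) shape that switch could take, reusing
the helpers from this patch; the threshold and the wrapper name are
assumptions, not a worked-out proposal:

	/*
	 * Sketch: spend the extra search effort of best-fit only below
	 * the 32-bit boundary, where fragmentation hurts most, and keep
	 * first-fit for larger windows.
	 */
	static int __alloc_and_insert_iova(struct iova_domain *iovad,
			unsigned long size, unsigned long limit_pfn,
			struct iova *new, bool size_aligned)
	{
		if (limit_pfn <= iova_pfn(iovad, DMA_BIT_MASK(32)))
			return __alloc_and_insert_iova_best_fit(iovad, size,
					limit_pfn, new, size_aligned);

		return __alloc_and_insert_iova_range(iovad, size, limit_pfn,
				new, size_aligned);
	}

alloc_iova() would then call this unconditionally instead of keying off
a per-domain flag.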

Will


* Re: [RFC PATCH] iommu/iova: Add a best-fit algorithm
  2020-02-17 16:03 ` Robin Murphy
  2020-02-19 12:28   ` Will Deacon
@ 2020-02-20  6:38   ` isaacm
  2020-02-20 18:42     ` Robin Murphy
  1 sibling, 1 reply; 6+ messages in thread
From: isaacm @ 2020-02-20  6:38 UTC (permalink / raw)
  To: Robin Murphy; +Cc: iommu, linux-kernel, kernel-team, pratikp, Liam Mark

On 2020-02-17 08:03, Robin Murphy wrote:
> On 14/02/2020 11:06 pm, Isaac J. Manjarres wrote:
>> From: Liam Mark <lmark@codeaurora.org>
>> 
>> Using the best-fit algorithm, instead of the first-fit
>> algorithm, may reduce fragmentation when allocating
>> IOVAs.
> 
> What kind of pathological allocation patterns make that a serious
> problem? Is there any scope for simply changing the order of things in
> the callers? Do these drivers also run under other DMA API backends
> (e.g. 32-bit Arm)?
> 
The use cases where the IOVA space has been fragmented have
non-deterministic allocation patterns, and thus it is not feasible to
change the allocation order to avoid fragmenting the IOVA space.

From what we've observed, the use cases involve allocations of two types
of buffers: one type of buffer between 1 KB and 4 MB in size, and
another type of buffer between 1 KB and 400 MB in size.

The pathological scenarios seem to arise when there are many (100+)
randomly distributed non-power-of-two allocations, which in some cases
leaves behind holes of up to 100+ MB in the IOVA space.

Here are some examples showing the state of the IOVA space when an IOVA
allocation failed:

Instance 1:
	Currently mapped total size : ~1.3GB
	Free space available : ~2GB
	Map for ~162MB fails.
	Max contiguous space available : < 162MB

Instance 2:
	Currently mapped total size : ~950MB
	Free space available : ~2.3GB
	Map for ~320MB fails.
	Max contiguous space available : ~189MB

Instance 3:
	Currently mapped total size : ~1.2GB
	Free space available : ~2.7GB
	Map for ~162MB fails.
	Max contiguous space available : < 162MB

We are still in the process of collecting data with the best-fit
algorithm enabled, to provide some numbers showing that it results in
less IOVA space fragmentation.

To answer your question about whether this driver also runs under other
DMA API backends: yes, such as 32-bit ARM.
> More generally, if a driver knows enough to want to second-guess a
> generic DMA API allocator, that's a reasonable argument that it should
> perhaps be properly IOMMU-aware and managing its own address space
> anyway. Perhaps this effort might be better spent finishing off the
> DMA ops bypass stuff to make that approach more robust and welcoming.
> 
> Robin.
> 

* Re: [RFC PATCH] iommu/iova: Add a best-fit algorithm
  2020-02-20  6:38   ` isaacm
@ 2020-02-20 18:42     ` Robin Murphy
  0 siblings, 0 replies; 6+ messages in thread
From: Robin Murphy @ 2020-02-20 18:42 UTC (permalink / raw)
  To: isaacm; +Cc: iommu, linux-kernel, kernel-team, pratikp, Liam Mark

On 20/02/2020 6:38 am, isaacm@codeaurora.org wrote:
> On 2020-02-17 08:03, Robin Murphy wrote:
>> On 14/02/2020 11:06 pm, Isaac J. Manjarres wrote:
>>> From: Liam Mark <lmark@codeaurora.org>
>>>
>>> Using the best-fit algorithm, instead of the first-fit
>>> algorithm, may reduce fragmentation when allocating
>>> IOVAs.
>>
>> What kind of pathological allocation patterns make that a serious
>> problem? Is there any scope for simply changing the order of things in
>> the callers? Do these drivers also run under other DMA API backends
>> (e.g. 32-bit Arm)?
>>
> The usecases where the IOVA space has been fragmented have 
> non-deterministic allocation
> patterns, and thus, it's not feasible to change the allocation order to 
> avoid fragmenting
> the IOVA space.

What about combining smaller buffers into larger individual allocations; 
any scope for that sort of thing? Certainly if you're consistently 
allocating small things less than PAGE_SIZE then DMA pools would be 
useful to avoid wanton memory wastage in general.
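
(Purely for illustration: a driver that repeatedly allocates small,
sub-page descriptors could carve them out of a dma_pool (linux/dmapool.h)
instead of taking an IOVA and a whole page per buffer. The sizes and the
pool name below are made up:)

	struct dma_pool *pool;
	dma_addr_t dma;
	void *desc;

	/* 256-byte descriptors, 64-byte aligned, packed into shared pages */
	pool = dma_pool_create("foo-desc", dev, 256, 64, 0);
	if (!pool)
		return -ENOMEM;

	desc = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (!desc) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... hand 'dma' to the hardware, use 'desc' from the CPU ... */

	dma_pool_free(pool, desc, dma);
	dma_pool_destroy(pool);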

>  From what we've observed, the usecases involve allocations of two types of
> buffers: one type of buffer between 1 KB to 4 MB in size, and another 
> type of
> buffer between 1 KB to 400 MB in size.
> 
> The pathological scenarios seem to arise when there are
> many (100+) randomly distributed non-power of two allocations, which in 
> some cases leaves
> behind holes of up to 100+ MB in the IOVA space.
> 
> Here are some examples that show the state of the IOVA space under which 
> failure to
> allocate an IOVA was observed:
> 
> Instance 1:
>      Currently mapped total size : ~1.3GB
>      Free space available : ~2GB
>      Map for ~162MB fails.
>          Max contiguous space available : < 162MB
> 
> Instance 2:
>      Currently mapped total size : ~950MB
>      Free space available : ~2.3GB
>      Map for ~320MB fails.
>      Max contiguous space available : ~189MB
> 
> Instance 3:
>      Currently mapped total size : ~1.2GB
>      Free space available : ~2.7GB
>      Map for ~162MB fails.
>      Max contiguous space available : <162MB
> 
> We are still in the process of collecting data with the best-fit 
> algorithm enabled
> to provide some numbers to show that it results in less IOVA space
> fragmentation.

Thanks for those examples, and I'd definitely like to see the 
comparative figures. To dig a bit further, at the point where things 
start failing, where are the cached nodes pointing? IIRC there is still 
a pathological condition where empty space between limit_pfn and 
cached32_node gets 'lost' if nothing in between is freed, so the bigger 
the range of allocation sizes, the worse the effect, e.g.:

(considering an empty domain, pfn 0 *not* reserved, 32-bit limit_pfn)

	alloc 4K, succeeds, cached32_node now at 4G-4K
	alloc 2G, succeeds, cached32_node now at 0
	alloc 4K, fails despite almost 2G contiguous free space within limit_pfn
	(and max32_alloc_size==1 now fast-forwards *any* further allocation
	attempt to failure)

If you're falling foul of this case (I was never sure how realistic a 
problem it would be in practice), there are at least a couple of much 
less invasive tweaks I can think of that would be worth exploring.
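
(Spelled out as calls against an empty domain with start_pfn 0 and a 4K
granule, the sequence above would look roughly like the sketch below;
return values are ignored for brevity and the numbers are illustrative
only:)

	unsigned long lim = iova_pfn(iovad, DMA_BIT_MASK(32));

	alloc_iova(iovad, 1, lim, true);                   /* 4K, lands at 4G - 4K */
	alloc_iova(iovad, SZ_2G >> PAGE_SHIFT, lim, true); /* 2G, lands at pfn 0   */
	alloc_iova(iovad, 1, lim, true);                   /* 4K, now fails        */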

> To answer your question about whether this driver also runs under other
> DMA API backends: yes, such as 32-bit ARM.

OK, that's what I suspected :)

AFAICS arch/arm's __alloc_iova() is also a first-fit algorithm, so if 
you get better behaviour there then it would suggest that this aspect 
isn't really the most important issue. Certainly, the fact that the 
"best fit" logic here also happens to ignore the cached nodes does start 
drawing me back to the point above.

Robin.


Thread overview: 6+ messages
2020-02-14 23:06 [RFC PATCH] iommu/iova: Add a best-fit algorithm Isaac J. Manjarres
2020-02-17  8:03 ` Christoph Hellwig
2020-02-17 16:03 ` Robin Murphy
2020-02-19 12:28   ` Will Deacon
2020-02-20  6:38   ` isaacm
2020-02-20 18:42     ` Robin Murphy
