[1/3] x86,mm/pat: Use generic interval trees
diff mbox series

Message ID 20190813224620.31005-2-dave@stgolabs.net
State New, archived
Headers show
Series
  • x86,mm/pat: Move towards using generic interval trees
Related show

Commit Message

Davidlohr Bueso Aug. 13, 2019, 10:46 p.m. UTC
With some considerations, the custom pat_rbtree implementation can be
simplified to use most of the generic interval_tree machinery.

o The tree inorder traversal can slightly differ when there are key
('start') collisions in the tree due to one going left and another right.
This, however, only affects the output of debugfs' pat_memtype_list file.

o The border cases for overlapping differ -- interval trees are closed,
while memtype intervals are open. We need to maintain semantics such
that conflict detection and getting the lowest match does not change.

o Erasing logic must remain untouched as well.

In order for the types to remain u64, the 'memtype_interval' calls are
introduced, as opposed to simply using struct interval_tree.

Pat tree might potentially also benefit by the fast overlap detection
for the insertion case when looking up the first overlapping node in
the tree; albeit not as far as my testing has shown, due to the use case.

Finally, I've tested this on various servers, via sanity warnings, running
side by side with the current version and so far see no differences in the
returned pointer node when doing memtype_rb_lowest_match() lookups.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 arch/x86/mm/pat_rbtree.c | 167 +++++++++++++++--------------------------------
 1 file changed, 54 insertions(+), 113 deletions(-)

Comments

Peter Zijlstra Aug. 21, 2019, 4:03 p.m. UTC | #1
On Tue, Aug 13, 2019 at 03:46:18PM -0700, Davidlohr Bueso wrote:

> diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
> index fa16036fa592..1be4d1856a9b 100644
> --- a/arch/x86/mm/pat_rbtree.c
> +++ b/arch/x86/mm/pat_rbtree.c

> @@ -34,68 +34,41 @@
>   * memtype_lock protects the rbtree.
>   */
>  
> +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
> +
> +#define START(node) ((node)->start)
> +#define END(node)  ((node)->end)

Afaict these could actually be inlines, but I see other interval tree
users also use macros for them, albeit with END called LAST.

> +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
> +		     START, END, static, memtype_interval)
>  
>  static int is_node_overlap(struct memtype *node, u64 start, u64 end)
>  {
> +	/*
> +	 * Unlike generic interval trees, the memtype nodes are ]a, b[
> +	 * therefore we need to adjust the ranges accordingly. Missing
> +	 * an overlap can lead to incorrectly detecting conflicts,
> +	 * for example.
> +	 */
> +	if (node->start + 1 >= end || node->end - 1 <= start)
>  		return 0;
>  
>  	return 1;
>  }

I've not looked yet; but how horrible would it be to change that
weirdness?

> @@ -149,10 +116,8 @@ static int memtype_rb_check_conflict(struct rb_root *root,
>  	dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
>  	found_type = match->type;
>  
> +        match = memtype_interval_iter_next(match, start, end);
> +	while (match) {

whitespace fail

>  		if (match->start >= end) /* Checked all possible matches */
>  			goto success;
>  

> @@ -176,43 +141,21 @@ static int memtype_rb_check_conflict(struct rb_root *root,
>  	return -EBUSY;
>  }
>  
>  int rbt_memtype_check_insert(struct memtype *new,
>  			     enum page_cache_mode *ret_type)
>  {
> +	int err;
>  
> +	err =  memtype_rb_check_conflict(&memtype_rbroot,
> +					 new->start, new->end,
> +					 new->type, ret_type);

whitespace fail

> +	if (err)
> +		return err;
>  
> +	if (ret_type)
> +		new->type = *ret_type;
>  
> +	memtype_interval_insert(new, &memtype_rbroot);
>  	return err;
>  }
>
Michel Lespinasse Aug. 21, 2019, 9:57 p.m. UTC | #2
On Tue, Aug 13, 2019 at 03:46:18PM -0700, Davidlohr Bueso wrote:
> o The border cases for overlapping differ -- interval trees are closed,
> while memtype intervals are open. We need to maintain semantics such
> that conflict detection and getting the lowest match does not change.

Agree on the need to maintain semantics.

As I had commented some time ago, I wish the interval trees used [start,end)
intervals instead of [start,last] - it would be a better fit for basically
all of the current interval tree users.

I'm not sure where to go with this - would it make sense to add a new
interval tree header file that uses [start,end) intervals (with the
thought of eventually converting all current interval tree users to it)
instead of adding one more use of the less-natural [start,last]
interval trees ?

> diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
> index fa16036fa592..1be4d1856a9b 100644
> --- a/arch/x86/mm/pat_rbtree.c
> +++ b/arch/x86/mm/pat_rbtree.c
> @@ -12,7 +12,7 @@
>  #include <linux/seq_file.h>
>  #include <linux/debugfs.h>
>  #include <linux/kernel.h>
> -#include <linux/rbtree_augmented.h>
> +#include <linux/interval_tree_generic.h>
>  #include <linux/sched.h>
>  #include <linux/gfp.h>
>  
> @@ -34,68 +34,41 @@
>   * memtype_lock protects the rbtree.
>   */
>  
> -static struct rb_root memtype_rbroot = RB_ROOT;
> +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
> +
> +#define START(node) ((node)->start)
> +#define END(node)  ((node)->end)
> +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
> +		     START, END, static, memtype_interval)
>  
>  static int is_node_overlap(struct memtype *node, u64 start, u64 end)
>  {
> -	if (node->start >= end || node->end <= start)
> +	/*
> +	 * Unlike generic interval trees, the memtype nodes are ]a, b[

I think the memtype nodes are [a, b)  (which one could also write as [a, b[
depending on their local customs - but either way, closed on the start side
and open on the end side) ?

> +	 * therefore we need to adjust the ranges accordingly. Missing
> +	 * an overlap can lead to incorrectly detecting conflicts,
> +	 * for example.
> +	 */
> +	if (node->start + 1 >= end || node->end - 1 <= start)
>  		return 0;
>  
>  	return 1;
>  }

All right, now I am *really* confused.

My understanding is as follows:
* the PAT code wants to use [start, end( intervals
* interval trees are defined to use [start, last] intervals with last == end-1

At first, I thought that you were handling that by removing 1 from the
end of the interval, to adjust between the PAT and interval tree
definitions. But, I don't see you doing that anywhere.

Then, I thought that you were using [start, end( intervals everywhere,
and the interval tree functions memtype_interval_iter_first and
memtype_interval_iter_next would just return too many candidate
matches as you are passing "end" instead of "last" == end-1 as the
interval endpoint, but then you would filter out the extra intervals
using is_node_overlap(). But, if that is the case, then I don't
understand why you need to redefine is_node_overlap() here.

Could you help me out by defining if the intervals are open or closed,
both when stored in the node->start and node->end values, and when
passed as start and end arguments to the functions in this file ?



Generally, I think using the interval tree code in this file is a good idea,
but 1- I do not understand how you are handling the differences in interval
definitions in this change, and 2- I wonder if it'd be better to just have
a version of the interval tree code that handles [start,end( half-open
intervals like we do everywhere else in the kernel.
Davidlohr Bueso Aug. 22, 2019, 4:49 a.m. UTC | #3
On Wed, 21 Aug 2019, Michel Lespinasse wrote:

>On Tue, Aug 13, 2019 at 03:46:18PM -0700, Davidlohr Bueso wrote:
>> o The border cases for overlapping differ -- interval trees are closed,
>> while memtype intervals are open. We need to maintain semantics such
>> that conflict detection and getting the lowest match does not change.
>
>Agree on the need to maintain semantics.
>
>As I had commented some time ago, I wish the interval trees used [start,end)
>intervals instead of [start,last] - it would be a better fit for basically
>all of the current interval tree users.

Yes, after going through all the users of interval-tree, I agree that
they all want to use [start,end) intervals.

>
>I'm not sure where to go with this - would it make sense to add a new
>interval tree header file that uses [start,end) intervals (with the
>thought of eventually converting all current interval tree users to it)
>instead of adding one more use of the less-natural [start,last]
>interval trees ?

It might be the safest way, although I really hate having another
header file for interval_tree... The following is a diffstat of a
tentative conversion (I'll send the patch separately); I'm not sure
if a single shot conversion would be acceptable, albeit with relevant
maintainer acks.

 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c      |  8 +-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  5 +++--
 drivers/gpu/drm/drm_mm.c                    |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 13 +++++--------
 drivers/gpu/drm/radeon/radeon_mn.c          | 10 +++-------
 drivers/gpu/drm/radeon/radeon_vm.c          |  2 +-
 drivers/infiniband/hw/hfi1/mmu_rb.c         | 12 ++++++------
 drivers/iommu/virtio-iommu.c                |  4 ++--
 drivers/vhost/vhost.c                       |  6 +++---
 include/drm/drm_mm.h                        |  2 +-
 include/linux/interval_tree_generic.h       | 28 ++++++++++++++--------------
 mm/interval_tree.c                          |  2 +-
 mm/rmap.c                                   |  2 +-
 13 files changed, 42 insertions(+), 54 deletions(-)

This gets rid of 'end - 1' trick from the users and converts
cond1 and cond2 checks in interval_tree_generic.h

Note that I think amdgpu_vm.c actually uses fully open intervals.

>
>> diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
>> index fa16036fa592..1be4d1856a9b 100644
>> --- a/arch/x86/mm/pat_rbtree.c
>> +++ b/arch/x86/mm/pat_rbtree.c
>> @@ -12,7 +12,7 @@
>>  #include <linux/seq_file.h>
>>  #include <linux/debugfs.h>
>>  #include <linux/kernel.h>
>> -#include <linux/rbtree_augmented.h>
>> +#include <linux/interval_tree_generic.h>
>>  #include <linux/sched.h>
>>  #include <linux/gfp.h>
>>
>> @@ -34,68 +34,41 @@
>>   * memtype_lock protects the rbtree.
>>   */
>>
>> -static struct rb_root memtype_rbroot = RB_ROOT;
>> +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
>> +
>> +#define START(node) ((node)->start)
>> +#define END(node)  ((node)->end)
>> +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
>> +		     START, END, static, memtype_interval)
>>
>>  static int is_node_overlap(struct memtype *node, u64 start, u64 end)
>>  {
>> -	if (node->start >= end || node->end <= start)
>> +	/*
>> +	 * Unlike generic interval trees, the memtype nodes are ]a, b[
>
>I think the memtype nodes are [a, b)  (which one could also write as [a, b[
>depending on their local customs - but either way, closed on the start side
>and open on the end side) ?
>
>> +	 * therefore we need to adjust the ranges accordingly. Missing
>> +	 * an overlap can lead to incorrectly detecting conflicts,
>> +	 * for example.
>> +	 */
>> +	if (node->start + 1 >= end || node->end - 1 <= start)
>>  		return 0;
>>
>>  	return 1;
>>  }
>
>All right, now I am *really* confused.
>
>My understanding is as follows:
>* the PAT code wants to use [start, end( intervals
>* interval trees are defined to use [start, last] intervals with last == end-1

Yes, we're talking about the same thing, but I overcomplicated things by
considering memtype lookups to be different than the nodes in the tree;
which obviously doesn't make sense... it is actually [a,b[ as you mention.

>
>At first, I thought that you were handling that by removing 1 from the
>end of the interval, to adjust between the PAT and interval tree
>definitions. But, I don't see you doing that anywhere.

This should have been my first approach.

>
>Then, I thought that you were using [start, end( intervals everywhere,
>and the interval tree functions memtype_interval_iter_first and
>memtype_interval_iter_next would just return too many candidate
>matches as as you are passing "end" instead of "last" == end-1 as the
>interval endpoint, but then you would filter out the extra intervals
>using is_node_overlap(). But, if that is the case, then I don't
>understand why you need to redefine is_node_overlap() here.

My original expectation was to actually remove a lot more of pat_rbtree,
including the is_node_overlap() and the filtering. Yes, I think this can
be done if the interval-tree is converted to [a,b[ and we can thus
just iterate the tree seamlessly.

>
>Could you help me out by defining if the intervals are open or closed,
>both when stored in the node->start and node->end values, and when
>passed as start and end arguments to the functions in this file ?

No, no endpoints are modified in pat.

>
>Generally, I think using the interval tree code in this file is a good idea,
>but 1- I do not understand how you are handling the differences in interval
>definitions in this change, and 2- I wonder if it'd be better to just have
>a version of the interval tree code that handles [start,end( half-open
>intervals like we do everywhere else in the kernel.

I think doing the conversion you suggested to [a,b[ for all users, then
redoing this series on top of that would be the way to move forward.

Thanks,
Davidlohr
Davidlohr Bueso Aug. 22, 2019, 6:17 p.m. UTC | #4
>On Wed, 21 Aug 2019, Michel Lespinasse wrote:
>>As I had commented some time ago, I wish the interval trees used [start,end)
>>intervals instead of [start,last] - it would be a better fit for basically
>>all of the current interval tree users.

So the vma_interval_tree (which is a pretty important user) tends to break this
pattern, as most lookups are [a,a]. We would have to update most of the
vma_interval_tree_foreach calls, for example, to now do [a,a+1[ such that we
don't break things. Some cases for the anon_vma_tree as well (ie memory-failure).

I'm not sure anymore it's worth going down this path as we end up exchanging one
hack for another (and the vma_interval_tree is a pretty big user); but I'm sure
you're aware of this and thus disagree.

Thanks,
Davidlohr
Michel Lespinasse Aug. 22, 2019, 8:10 p.m. UTC | #5
I think vma_interval_tree is a bit of a mixed bag, but mostly leans
towards using half closed intervals.

Right now vma_last_pgoff() has to do -1 because of the interval tree
using closed intervals. Similarly, rmap_walk_file(), which I consider
to be the main user of the vma_interval_tree, also has to do -1 when
computing pgoff_end because of the interval tree closed intervals. So,
I think overall vma_interval_tree would also more naturally use
half-open intervals.

But, that's not a 100% thing for vma_interval_tree, as it also has
uses that do stabbing queries (in arch specific code, in hugetlb
cases, and in dax code).

On Thu, Aug 22, 2019 at 11:17 AM Davidlohr Bueso <dave@stgolabs.net> wrote:
>
> >On Wed, 21 Aug 2019, Michel Lespinasse wrote:
> >>As I had commented some time ago, I wish the interval trees used [start,end)
> >>intervals instead of [start,last] - it would be a better fit for basically
> >>all of the current interval tree users.
>
> So the vma_interval_tree (which is a pretty important user) tends to break this
> pattern, as most lookups are [a,a]. We would have to update most of the
> vma_interval_tree_foreach calls, for example, to now do [a,a+1[ such that we
> don't break things. Some cases for the anon_vma_tree as well (ie memory-failure).
>
> I'm not sure anymore it's worth going down this path as we end up exchanging one
> hack for another (and the vma_interval_tree is a pretty big user); but I'm sure
> you're aware of this and thus disagree.
>
> Thanks,
> Davidlohr
Michel Lespinasse Aug. 22, 2019, 8:24 p.m. UTC | #6
On Wed, Aug 21, 2019 at 9:49 PM Davidlohr Bueso <dave@stgolabs.net> wrote:
> On Wed, 21 Aug 2019, Michel Lespinasse wrote:
> >I'm not sure where to go with this - would it make sense to add a new
> >interval tree header file that uses [start,end) intervals (with the
> >thought of eventually converting all current interval tree users to it)
> >instead of adding one more use of the less-natural [start,last]
> >interval trees ?
>
> It might be the safest way, although I really hate having another
> header file for interval_tree... The following is a diffstat of a
> tentative conversion (I'll send the patch separately); I'm not sure
> if a single shot conversion would be acceptable, albeit with relevant
> maintainer acks.

That would make sense to me. Maybe doing it all in a single commit is
too hard to review or bisect, but having a small series that
duplicates the interval tree header at the start and removes the old
(closed intervals) version at the end looks like it would work IMO.

> >At first, I thought that you were handling that by removing 1 from the
> >end of the interval, to adjust between the PAT and interval tree
> >definitions. But, I don't see you doing that anywhere.
>
> This should have been my first approach.
>
> >Then, I thought that you were using [start, end( intervals everywhere,
> >and the interval tree functions memtype_interval_iter_first and
> >memtype_interval_iter_next would just return too many candidate
> >matches as you are passing "end" instead of "last" == end-1 as the
> >interval endpoint, but then you would filter out the extra intervals
> >using is_node_overlap(). But, if that is the case, then I don't
> >understand why you need to redefine is_node_overlap() here.
>
> My original expectation was to actually remove a lot more of pat_rbtree,
> including the is_node_overlap() and the filtering. Yes, I think this can
> be done if the interval-tree is converted to [a,b[ and we can thus
> just iterate the tree seamlessly.

All right. So, my preference would be if we can use a version of
interval trees that would work natively with half open intervals, as
we discussed above. But, if that seems like too much change, I would
also be ready to approve a version of pat_interval.c that would adjust
the interval endpoint to compensate for the interval tree's use of
closed intervals. I think either way, I can't approve the current
version as it's too much of an in-between which makes it hard to
follow IMO.

> I think doing the conversion you suggested to [a,b[ for all users, then
> redoing this series on top of that would be the way to move forward.

That would be ideal; hopefully you don't see the vma_interval_tree
thing as blocking this approach ?

Thanks,
Davidlohr Bueso Sept. 5, 2019, 12:52 a.m. UTC | #7
On Thu, 22 Aug 2019, Michel Lespinasse wrote:

>I think vma_interval_tree is a bit of a mixed bag, but mostly leans
>towards using half closed intervals.
>
>Right now vma_last_pgoff() has to do -1 because of the interval tree
>using closed intervals. Similarly, rmap_walk_file(), which I consider
>to be the main user of the vma_interval_tree, also has to do -1 when
>computing pgoff_end because of the interval tree closed intervals. So,
>I think overall vma_interval_tree would also more naturally use
>half-open intervals.
>
>But, that's not a 100% thing for vma_interval_tree, as it also has
>uses that do stabbing queries (in arch specific code, in hugetlb
>cases, and in dax code).

Ok, so for that I've added the following helper which will make the
conversion a bit more straightforward:

#define vma_interval_tree_foreach_stab(vma, root, start)	\
       vma_interval_tree_foreach(vma, root, start, start)

I think this also makes sense as it documents the nature of the lookup.

Now for the interval-tree conversion to [a,b[ , how does the following
look to you? Note that this should be the end result; we can discuss
how to get there later, but I wanna make sure that it was more or less
what you were picturing.

I also converted the pat tree and the cleanups are much nicer now; which
shows in the diffsat:

2 files changed, 38 insertions(+), 123 deletions(-)

I'll send this once the interval tree stuff is sorted out.

Thanks,
Davidlohr

-------8<-------------------------------------------------------------

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 31d4deb5d294..f2d98550bdbf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -205,9 +205,6 @@ amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
 	bool blockable = mmu_notifier_range_blockable(update);
 	struct interval_tree_node *it;
 
-	/* notification is exclusive, but interval is inclusive */
-	end -= 1;
-
 	/* TODO we should be able to split locking for interval tree and
 	 * amdgpu_mn_invalidate_node
 	 */
@@ -254,9 +251,6 @@ amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
 	bool blockable = mmu_notifier_range_blockable(update);
 	struct interval_tree_node *it;
 
-	/* notification is exclusive, but interval is inclusive */
-	end -= 1;
-
 	if (amdgpu_mn_read_lock(amn, blockable))
 		return -EAGAIN;
 
@@ -374,7 +368,7 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
  */
 int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
 {
-	unsigned long end = addr + amdgpu_bo_size(bo) - 1;
+	unsigned long end = addr + amdgpu_bo_size(bo);
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	enum amdgpu_mn_type type =
 		bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 74da35611d7c..36c3c510e01e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -99,9 +99,6 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
 	if (RB_EMPTY_ROOT(&mn->objects.rb_root))
 		return 0;
 
-	/* interval ranges are inclusive, but invalidate range is exclusive */
-	end = range->end - 1;
-
 	spin_lock(&mn->lock);
 	it = interval_tree_iter_first(&mn->objects, range->start, end);
 	while (it) {
@@ -275,7 +272,7 @@ i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
 	mo->mn = mn;
 	mo->obj = obj;
 	mo->it.start = obj->userptr.ptr;
-	mo->it.last = obj->userptr.ptr + obj->base.size - 1;
+	mo->it.last = obj->userptr.ptr + obj->base.size;
 	RB_CLEAR_NODE(&mo->it.rb);
 
 	obj->userptr.mmu_object = mo;
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index dbab9a3a969b..2beb22269793 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -66,12 +66,8 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
 	struct ttm_operation_ctx ctx = { false, false };
 	struct interval_tree_node *it;
-	unsigned long end;
 	int ret = 0;
 
-	/* notification is exclusive, but interval is inclusive */
-	end = range->end - 1;
-
 	/* TODO we should be able to split locking for interval tree and
 	 * the tear down.
 	 */
@@ -80,7 +76,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 	else if (!mutex_trylock(&rmn->lock))
 		return -EAGAIN;
 
-	it = interval_tree_iter_first(&rmn->objects, range->start, end);
+	it = interval_tree_iter_first(&rmn->objects, range->start, range->end);
 	while (it) {
 		struct radeon_mn_node *node;
 		struct radeon_bo *bo;
@@ -92,7 +88,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
 		}
 
 		node = container_of(it, struct radeon_mn_node, it);
-		it = interval_tree_iter_next(it, range->start, end);
+		it = interval_tree_iter_next(it, range->start, range->end);
 
 		list_for_each_entry(bo, &node->bos, mn_list) {
 
@@ -174,7 +170,7 @@ static const struct mmu_notifier_ops radeon_mn_ops = {
  */
 int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
 {
-	unsigned long end = addr + radeon_bo_size(bo) - 1;
+	unsigned long end = addr + radeon_bo_size(bo);
 	struct mmu_notifier *mn;
 	struct radeon_mn *rmn;
 	struct radeon_mn_node *node = NULL;
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
index e0ad547786e8..dc9fad8ea84a 100644
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -450,13 +450,14 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
 {
 	uint64_t size = radeon_bo_size(bo_va->bo);
 	struct radeon_vm *vm = bo_va->vm;
-	unsigned last_pfn, pt_idx;
+	unsigned pt_idx;
 	uint64_t eoffset;
 	int r;
 
 	if (soffset) {
+		unsigned last_pfn;
 		/* make sure object fit at this offset */
-		eoffset = soffset + size - 1;
+		eoffset = soffset + size;
 		if (soffset >= eoffset) {
 			r = -EINVAL;
 			goto error_unreserve;
@@ -471,7 +472,7 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
 		}
 
 	} else {
-		eoffset = last_pfn = 0;
+		eoffset = 1; /* interval trees are [a,b[ */
 	}
 
 	mutex_lock(&vm->mutex);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 9aebe9ce8b07..58c9269aa072 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -761,9 +761,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
 
-/* @last is not a part of the interval. See comment for function
- * node_last.
- */
 int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 				  u64 start, u64 last,
 				  umem_call_back cb,
@@ -777,12 +774,12 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
 	if (unlikely(start == last))
 		return ret_val;
 
-	for (node = interval_tree_iter_first(root, start, last - 1);
+	for (node = interval_tree_iter_first(root, start, last);
 			node; node = next) {
 		/* TODO move the blockable decision up to the callback */
 		if (!blockable)
 			return -EAGAIN;
-		next = interval_tree_iter_next(node, start, last - 1);
+		next = interval_tree_iter_next(node, start, last);
 		umem = container_of(node, struct ib_umem_odp, interval_tree);
 		ret_val = cb(umem, start, last, cookie) || ret_val;
 	}
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 14d2a90964c3..d708c45bfabf 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -89,7 +89,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node)
 
 static unsigned long mmu_node_last(struct mmu_rb_node *node)
 {
-	return PAGE_ALIGN(node->addr + node->len) - 1;
+	return PAGE_ALIGN(node->addr + node->len);
 }
 
 int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
@@ -195,13 +195,13 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
 	trace_hfi1_mmu_rb_search(addr, len);
 	if (!handler->ops->filter) {
 		node = __mmu_int_rb_iter_first(&handler->root, addr,
-					       (addr + len) - 1);
+					       (addr + len));
 	} else {
 		for (node = __mmu_int_rb_iter_first(&handler->root, addr,
-						    (addr + len) - 1);
+						    (addr + len));
 		     node;
 		     node = __mmu_int_rb_iter_next(node, addr,
-						   (addr + len) - 1)) {
+						   (addr + len))) {
 			if (handler->ops->filter(node, addr, len))
 				return node;
 		}
@@ -293,11 +293,10 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn,
 	bool added = false;
 
 	spin_lock_irqsave(&handler->lock, flags);
-	for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1);
+	for (node = __mmu_int_rb_iter_first(root, range->start, range->end);
 	     node; node = ptr) {
 		/* Guard against node removal. */
-		ptr = __mmu_int_rb_iter_next(node, range->start,
-					     range->end - 1);
+		ptr = __mmu_int_rb_iter_next(node, range->start, range->end);
 		trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
 		if (handler->ops->invalidate(handler->ops_arg, node)) {
 			__mmu_int_rb_remove(node, root);
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 62e6ffa9ad78..1d6661b1fe04 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -223,7 +223,7 @@ static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
 
 	npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
 	vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT;
-	vpn_last = vpn_start + npages - 1;
+	vpn_last = vpn_start + npages;
 
 	spin_lock(&pd->lock);
 	usnic_uiom_remove_interval(&pd->root, vpn_start,
@@ -354,7 +354,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
 	offset = addr & ~PAGE_MASK;
 	npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT;
 	vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT;
-	vpn_last = vpn_start + npages - 1;
+	vpn_last = vpn_start + npages;
 
 	uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL);
 	if (!uiomr)
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 3ea9d7682999..b25709d67d72 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -323,7 +323,7 @@ static int viommu_add_mapping(struct viommu_domain *vdomain, unsigned long iova,
 
 	mapping->paddr		= paddr;
 	mapping->iova.start	= iova;
-	mapping->iova.last	= iova + size - 1;
+	mapping->iova.last	= iova + size;
 	mapping->flags		= flags;
 
 	spin_lock_irqsave(&vdomain->mappings_lock, irqflags);
@@ -348,7 +348,7 @@ static size_t viommu_del_mappings(struct viommu_domain *vdomain,
 {
 	size_t unmapped = 0;
 	unsigned long flags;
-	unsigned long last = iova + size - 1;
+	unsigned long last = iova + size;
 	struct viommu_mapping *mapping = NULL;
 	struct interval_tree_node *node, *next;
 
@@ -367,7 +367,7 @@ static size_t viommu_del_mappings(struct viommu_domain *vdomain,
 		 * Virtio-iommu doesn't allow UNMAP to split a mapping created
 		 * with a single MAP request, so remove the full mapping.
 		 */
-		unmapped += mapping->iova.last - mapping->iova.start + 1;
+		unmapped += mapping->iova.last - mapping->iova.start;
 
 		interval_tree_remove(node, &vdomain->mappings);
 		kfree(mapping);
@@ -787,7 +787,7 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
 	struct viommu_domain *vdomain = to_viommu_domain(domain);
 
 	spin_lock_irqsave(&vdomain->mappings_lock, flags);
-	node = interval_tree_iter_first(&vdomain->mappings, iova, iova);
+	node = interval_tree_iter_first(&vdomain->mappings, iova, iova + 1);
 	if (node) {
 		mapping = container_of(node, struct viommu_mapping, iova);
 		paddr = mapping->paddr + (iova - mapping->iova.start);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5dc174ac8cac..98590a7d900c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1126,7 +1126,7 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		}
 		vhost_vq_meta_reset(dev);
 		vhost_del_umem_range(dev->iotlb, msg->iova,
-				     msg->iova + msg->size - 1);
+				     msg->iova + msg->size);
 		break;
 	default:
 		ret = -EINVAL;
@@ -1320,15 +1320,14 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 {
 	const struct vhost_umem_node *node;
 	struct vhost_umem *umem = vq->iotlb;
-	u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
+	u64 s = 0, size, orig_addr = addr, last = addr + len;
 
 	if (vhost_vq_meta_fetch(vq, addr, len, type))
 		return true;
 
 	while (len > s) {
 		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
-							   addr,
-							   last);
+							   addr, last);
 		if (node == NULL || node->start > addr) {
 			vhost_iotlb_miss(vq, addr, access);
 			return false;
@@ -2055,7 +2054,7 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 		}
 
 		node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
-							addr, addr + len - 1);
+							addr, addr + len);
 		if (node == NULL || node->start > addr) {
 			if (umem != dev->iotlb) {
 				ret = -EFAULT;
diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
index aaa8a0767aa3..e714e67ebdb5 100644
--- a/include/linux/interval_tree_generic.h
+++ b/include/linux/interval_tree_generic.h
@@ -69,12 +69,12 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node,			      \
 }									      \
 									      \
 /*									      \
- * Iterate over intervals intersecting [start;last]			      \
+ * Iterate over intervals intersecting [start;last[			      \
  *									      \
- * Note that a node's interval intersects [start;last] iff:		      \
- *   Cond1: ITSTART(node) <= last					      \
+ * Note that a node's interval intersects [start;last[ iff:		      \
+ *   Cond1: ITSTART(node) < last					      \
  * and									      \
- *   Cond2: start <= ITLAST(node)					      \
+ *   Cond2: start < ITLAST(node)					      \
  */									      \
 									      \
 static ITSTRUCT *							      \
@@ -88,7 +88,7 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 		if (node->ITRB.rb_left) {				      \
 			ITSTRUCT *left = rb_entry(node->ITRB.rb_left,	      \
 						  ITSTRUCT, ITRB);	      \
-			if (start <= left->ITSUBTREE) {			      \
+			if (start < left->ITSUBTREE) {			      \
 				/*					      \
 				 * Some nodes in left subtree satisfy Cond2.  \
 				 * Iterate to find the leftmost such node N.  \
@@ -101,8 +101,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 				continue;				      \
 			}						      \
 		}							      \
-		if (ITSTART(node) <= last) {		/* Cond1 */	      \
-			if (start <= ITLAST(node))	/* Cond2 */	      \
+		if (ITSTART(node) < last) {		/* Cond1 */	      \
+			if (start < ITLAST(node))	/* Cond2 */	      \
 				return node;	/* node is leftmost match */  \
 			if (node->ITRB.rb_right) {			      \
 				node = rb_entry(node->ITRB.rb_right,	      \
@@ -125,24 +125,19 @@ ITPREFIX ## _iter_first(struct rb_root_cached *root,			      \
 		return NULL;						      \
 									      \
 	/*								      \
-	 * Fastpath range intersection/overlap between A: [a0, a1] and	      \
-	 * B: [b0, b1] is given by:					      \
-	 *								      \
-	 *         a0 <= b1 && b0 <= a1					      \
-	 *								      \
-	 *  ... where A holds the lock range and B holds the smallest	      \
-	 * 'start' and largest 'last' in the tree. For the later, we	      \
-	 * rely on the root node, which by augmented interval tree	      \
+	 * Fastpath range intersection/overlap where we compare the	      \
+	 * smallest 'start' and largest 'last' in the tree. For the latter,   \
+	 * we rely on the root node, which by augmented interval tree	      \
 	 * property, holds the largest value in its last-in-subtree.	      \
 	 * This allows mitigating some of the tree walk overhead for	      \
 	 * for non-intersecting ranges, maintained and consulted in O(1).     \
 	 */								      \
 	node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB);		      \
-	if (node->ITSUBTREE < start)					      \
+	if (node->ITSUBTREE <= start)	/* !Cond2 */			      \
 		return NULL;						      \
 									      \
 	leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB);		      \
-	if (ITSTART(leftmost) > last)					      \
+	if (ITSTART(leftmost) >= last)	/* !Cond1 */			      \
 		return NULL;						      \
 									      \
 	return ITPREFIX ## _subtree_search(node, start, last);		      \
@@ -156,14 +151,14 @@ ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 	while (true) {							      \
 		/*							      \
 		 * Loop invariants:					      \
-		 *   Cond1: ITSTART(node) <= last			      \
+		 *   Cond1: ITSTART(node) < last			      \
 		 *   rb == node->ITRB.rb_right				      \
 		 *							      \
 		 * First, search right subtree if suitable		      \
 		 */							      \
 		if (rb) {						      \
 			ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB);	      \
-			if (start <= right->ITSUBTREE)			      \
+			if (start < right->ITSUBTREE)			      \
 				return ITPREFIX ## _subtree_search(right,     \
 								start, last); \
 		}							      \
@@ -178,10 +173,10 @@ ITPREFIX ## _iter_next(ITSTRUCT *node, ITTYPE start, ITTYPE last)	      \
 			rb = node->ITRB.rb_right;			      \
 		} while (prev == rb);					      \
 									      \
-		/* Check if the node intersects [start;last] */		      \
-		if (last < ITSTART(node))		/* !Cond1 */	      \
+		/* Check if the node intersects [start;last[ */		      \
+		if (last <= ITSTART(node))		/* !Cond1 */	      \
 			return NULL;					      \
-		else if (start <= ITLAST(node))		/* Cond2 */	      \
+		else if (start < ITLAST(node))		/* Cond2 */	      \
 			return node;					      \
 	}								      \
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 14c5eb514bfe..ed5dad316aa1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2245,7 +2245,7 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
 #define vma_interval_tree_foreach_stab(vma, root, start)		\
-	vma_interval_tree_foreach(vma, root, start, start)
+	vma_interval_tree_foreach(vma, root, start, start + 1)
 
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root_cached *root);
@@ -2265,7 +2265,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 	     avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
 
 #define anon_vma_interval_tree_foreach_stab(vma, root, start)		\
-	anon_vma_interval_tree_foreach(vma, root, start, start)
+	anon_vma_interval_tree_foreach(vma, root, start, start + 1)
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 253df1a1fa54..6ef5d62fc715 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -165,11 +165,10 @@ rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length)
 {
 	struct interval_tree_node *node;
 
-	node = interval_tree_iter_first(root, addr, addr + length - 1);
+	node = interval_tree_iter_first(root, addr, addr + length);
 	if (!node)
 		return NULL;
 	return container_of(node, struct ib_umem_odp, interval_tree);
-
 }
 
 static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 11c75fb07584..99d1a260043c 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -17,7 +17,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
 
 static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 {
-	return v->vm_pgoff + vma_pages(v) - 1;
+	return v->vm_pgoff + vma_pages(v);
 }
 
 INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
Michel Lespinasse Sept. 5, 2019, 2 a.m. UTC | #8
Hi Davidlohr,

On Wed, Sep 4, 2019 at 5:52 PM Davidlohr Bueso <dave@stgolabs.net> wrote:
> Ok, so for that I've added the following helper which will make the
> conversion a bit more straightforward:
>
> #define vma_interval_tree_foreach_stab(vma, root, start)
>        vma_interval_tree_foreach(vma, root, start, start)

Yes, that should help with the conversion :)

> I think this also makes sense as it documents the nature of the lookup.
>
> Now for the interval-tree conversion to [a,b[ , how does the following
> look to you? Note that this should be the end result; we can discuss
> how to get there later, but I wanna make sure that it was more or less
> what you were picturing.

I do not have time for a full review right now, but I did have a quick
pass at it and it does seem to match the direction I'd like this to
take.

Please let me know if you'd like me to do a full review of this, or if
it'll be easier to do it on the individual steps once this gets broken
up.

BTW are you going to Plumbers? I am and I would love to talk to you
there, about this and about MM range locking, too.

Things I noticed in my quick pass:

> diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
> index e0ad547786e8..dc9fad8ea84a 100644
> --- a/drivers/gpu/drm/radeon/radeon_vm.c
> +++ b/drivers/gpu/drm/radeon/radeon_vm.c
> @@ -450,13 +450,14 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
>  {
>         uint64_t size = radeon_bo_size(bo_va->bo);
>         struct radeon_vm *vm = bo_va->vm;
> -       unsigned last_pfn, pt_idx;
> +       unsigned pt_idx;
>         uint64_t eoffset;
>         int r;
>
>         if (soffset) {
> +               unsigned last_pfn;
>                 /* make sure object fit at this offset */
> -               eoffset = soffset + size - 1;
> +               eoffset = soffset + size;
>                 if (soffset >= eoffset) {
Should probably be if (soffset > eoffset) now (just checking for
arithmetic overflow).
>                         r = -EINVAL;
>                         goto error_unreserve;
> @@ -471,7 +472,7 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
>                 }
>
>         } else {
> -               eoffset = last_pfn = 0;
> +               eoffset = 1; /* interval trees are [a,b[ */
Looks highly suspicious to me, and doesn't jibe with the eoffset /=
RADEON_GPU_PAGE_SIZE below.
I did not look deep enough into this to figure out what would be correct though.
>         }
>
>         mutex_lock(&vm->mutex);
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> index 14d2a90964c3..d708c45bfabf 100644
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -195,13 +195,13 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
>         trace_hfi1_mmu_rb_search(addr, len);
>         if (!handler->ops->filter) {
>                 node = __mmu_int_rb_iter_first(&handler->root, addr,
> -                                              (addr + len) - 1);
> +                                              (addr + len));
>         } else {
>                 for (node = __mmu_int_rb_iter_first(&handler->root, addr,
> -                                                   (addr + len) - 1);
> +                                                   (addr + len));
>                      node;
>                      node = __mmu_int_rb_iter_next(node, addr,
> -                                                  (addr + len) - 1)) {
> +                                                  (addr + len))) {
>                         if (handler->ops->filter(node, addr, len))
>                                 return node;
>                 }

Weird unnecessary parentheses through this block.

> @@ -787,7 +787,7 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
>         struct viommu_domain *vdomain = to_viommu_domain(domain);
>
>         spin_lock_irqsave(&vdomain->mappings_lock, flags);
> -       node = interval_tree_iter_first(&vdomain->mappings, iova, iova);
> +       node = interval_tree_iter_first(&vdomain->mappings, iova, iova + 1);

I was gonna suggest a stab iterator for the generic interval tree, but
maybe not if it's only used here.

> diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
> index aaa8a0767aa3..e714e67ebdb5 100644
> --- a/include/linux/interval_tree_generic.h
> +++ b/include/linux/interval_tree_generic.h
> @@ -69,12 +69,12 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node,                         \
>  }                                                                            \
>                                                                               \
>  /*                                                                           \
> - * Iterate over intervals intersecting [start;last]                          \
> + * Iterate over intervals intersecting [start;last[                          \

Maybe I'm extra nitpicky, but I would suggest using 'end' instead of
'last' when the intervals are half open: [start;end[
To me 'last' implies that it's in the interval, while 'end' doesn't
imply anything (and thus the half open convention used through the
kernel applies).
similarly ITLAST should be changed to ITEND, etc and similarly in the
various call sites (drm and others).
Again, maybe it's nitpicky but I feel changing the name also helps
verify that we don't leave behind any off-by-one use.

That said, it's really only my preference - if you think it's too
painful to change, that's OK.
Davidlohr Bueso Sept. 5, 2019, 3:45 a.m. UTC | #9
On Wed, 04 Sep 2019, Michel Lespinasse wrote:

>I do not have time for a full review right now, but I did have a quick
>pass at it and it does seem to match the direction I'd like this to
>take.

Thanks, and no worries, I consider all this v5.5 material anyway.

>
>Please let me know if you'd like me to do a full review of this, or if
>it'll be easier to do it on the individual steps once this gets broken
>up.

If nothing else, I would appreciate you making sure I didn't do anything
stupid in the interval_tree_generic.h lookup changes.

>
>BTW are you going to plumbers ? I am and I would love to talk to you
>there, about this and about MM range locking, too.

No, not this year; perhaps lsfmm (although that's not for a while).

>
>Things I noticed in my quick pass:
>
>> diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c
>> index e0ad547786e8..dc9fad8ea84a 100644
>> --- a/drivers/gpu/drm/radeon/radeon_vm.c
>> +++ b/drivers/gpu/drm/radeon/radeon_vm.c
>> @@ -450,13 +450,14 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
>>  {
>>         uint64_t size = radeon_bo_size(bo_va->bo);
>>         struct radeon_vm *vm = bo_va->vm;
>> -       unsigned last_pfn, pt_idx;
>> +       unsigned pt_idx;
>>         uint64_t eoffset;
>>         int r;
>>
>>         if (soffset) {
>> +               unsigned last_pfn;
>>                 /* make sure object fit at this offset */
>> -               eoffset = soffset + size - 1;
>> +               eoffset = soffset + size;
>>                 if (soffset >= eoffset) {
>Should probably be if (soffset > eoffset) now (just checking for
>arithmetic overflow).
>>                         r = -EINVAL;
>>                         goto error_unreserve;
>> @@ -471,7 +472,7 @@ int radeon_vm_bo_set_addr(struct radeon_device *rdev,
>>                 }
>>
>>         } else {
>> -               eoffset = last_pfn = 0;
>> +               eoffset = 1; /* interval trees are [a,b[ */
>Looks highly suspicious to me, and doesn't jive with the eoffset /=
>RADEON_GPU_PAGE_SIZE below.
>I did not look deep enough into this to figure out what would be correct though.

I think you're right, I will double check this. I think we only wanna do
the RADEON_GPU_PAGE_SIZE division when soffset is non-zero.

>>         }
>>
>>         mutex_lock(&vm->mutex);
>> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
>> index 14d2a90964c3..d708c45bfabf 100644
>> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
>> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
>> @@ -195,13 +195,13 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
>>         trace_hfi1_mmu_rb_search(addr, len);
>>         if (!handler->ops->filter) {
>>                 node = __mmu_int_rb_iter_first(&handler->root, addr,
>> -                                              (addr + len) - 1);
>> +                                              (addr + len));
>>         } else {
>>                 for (node = __mmu_int_rb_iter_first(&handler->root, addr,
>> -                                                   (addr + len) - 1);
>> +                                                   (addr + len));
>>                      node;
>>                      node = __mmu_int_rb_iter_next(node, addr,
>> -                                                  (addr + len) - 1)) {
>> +                                                  (addr + len))) {
>>                         if (handler->ops->filter(node, addr, len))
>>                                 return node;
>>                 }
>
>Weird unnecessary parentheses through this block.

Yes, but I wanted to leave it with the least amount of changes. There are plenty
of places that could use interval_tree_foreach helpers, like the vma tree has.

>
>> @@ -787,7 +787,7 @@ static phys_addr_t viommu_iova_to_phys(struct iommu_domain *domain,
>>         struct viommu_domain *vdomain = to_viommu_domain(domain);
>>
>>         spin_lock_irqsave(&vdomain->mappings_lock, flags);
>> -       node = interval_tree_iter_first(&vdomain->mappings, iova, iova);
>> +       node = interval_tree_iter_first(&vdomain->mappings, iova, iova + 1);
>
>I was gonna suggest a stab iterator for the generic interval tree, but
>maybe not if it's only used here.

I considered it as well.

>
>> diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h
>> index aaa8a0767aa3..e714e67ebdb5 100644
>> --- a/include/linux/interval_tree_generic.h
>> +++ b/include/linux/interval_tree_generic.h
>> @@ -69,12 +69,12 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node,                         \
>>  }                                                                            \
>>                                                                               \
>>  /*                                                                           \
>> - * Iterate over intervals intersecting [start;last]                          \
>> + * Iterate over intervals intersecting [start;last[                          \
>
>Maybe I'm extra nitpicky, but I would suggest to use 'end' instead of
>'last' when the intervals are half open: [start;end[
>To me 'last' implies that it's in the interval, while 'end' doesn't
>imply anything (and thus the half open convention used through the
>kernel applies).

That's fair enough.

>similarly ITLAST should be changed to ITEND, etc and similarly in the
>various call sites (drm and others).
>Again, maybe it's nitpicky but I feel changing the name also helps
>verify that we don't leave behind any off-by-one use.
>
>That said, it's really only my preference - if you think it's too
>painful to change, that's OK.

I'll see what I can do, but yeah it might be too much of a pain for
the benefits.

Thanks,
Davidlohr
Davidlohr Bueso Sept. 5, 2019, 5:03 a.m. UTC | #10
On Wed, 04 Sep 2019, Michel Lespinasse wrote:

>Hi Davidlohr,
>
>On Wed, Sep 4, 2019 at 5:52 PM Davidlohr Bueso <dave@stgolabs.net> wrote:
>> Ok, so for that I've added the following helper which will make the
>> conversion a bit more straightforward:
>>
>> #define vma_interval_tree_foreach_stab(vma, root, start)
>>        vma_interval_tree_foreach(vma, root, start, start)
>
>Yes, that should help with the conversion :)
>
>> I think this also makes sense as it documents the nature of the lookup.

Because this is a nice helper regardless of the interval tree stuff, I went
ahead and sent a patch as well as the conversion, which is quite trivial,
and hopefully akpm can pick it up for -next; which makes one less patch
to carry when doing the interval tree conversion series.

Thanks,
Davidlohr

Patch
diff mbox series

diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index fa16036fa592..1be4d1856a9b 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -12,7 +12,7 @@ 
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
-#include <linux/rbtree_augmented.h>
+#include <linux/interval_tree_generic.h>
 #include <linux/sched.h>
 #include <linux/gfp.h>
 
@@ -34,68 +34,41 @@ 
  * memtype_lock protects the rbtree.
  */
 
-static struct rb_root memtype_rbroot = RB_ROOT;
+static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
+
+#define START(node) ((node)->start)
+#define END(node)  ((node)->end)
+INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
+		     START, END, static, memtype_interval)
 
 static int is_node_overlap(struct memtype *node, u64 start, u64 end)
 {
-	if (node->start >= end || node->end <= start)
+	/*
+	 * Unlike generic interval trees, the memtype nodes are ]a, b[
+	 * therefore we need to adjust the ranges accordingly. Missing
+	 * an overlap can lead to incorrectly detecting conflicts,
+	 * for example.
+	 */
+	if (node->start + 1 >= end || node->end - 1 <= start)
 		return 0;
 
 	return 1;
 }
 
-static u64 get_subtree_max_end(struct rb_node *node)
-{
-	u64 ret = 0;
-	if (node) {
-		struct memtype *data = rb_entry(node, struct memtype, rb);
-		ret = data->subtree_max_end;
-	}
-	return ret;
-}
-
-static u64 compute_subtree_max_end(struct memtype *data)
+static struct memtype *memtype_rb_lowest_match(struct rb_root_cached *root,
+						u64 start, u64 end)
 {
-	u64 max_end = data->end, child_max_end;
-
-	child_max_end = get_subtree_max_end(data->rb.rb_right);
-	if (child_max_end > max_end)
-		max_end = child_max_end;
-
-	child_max_end = get_subtree_max_end(data->rb.rb_left);
-	if (child_max_end > max_end)
-		max_end = child_max_end;
-
-	return max_end;
-}
-
-RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
-		     u64, subtree_max_end, compute_subtree_max_end)
-
-/* Find the first (lowest start addr) overlapping range from rb tree */
-static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
-				u64 start, u64 end)
-{
-	struct rb_node *node = root->rb_node;
-	struct memtype *last_lower = NULL;
+	struct memtype *node;
 
+	node = memtype_interval_iter_first(root, start, end);
 	while (node) {
-		struct memtype *data = rb_entry(node, struct memtype, rb);
-
-		if (get_subtree_max_end(node->rb_left) > start) {
-			/* Lowest overlap if any must be on left side */
-			node = node->rb_left;
-		} else if (is_node_overlap(data, start, end)) {
-			last_lower = data;
-			break;
-		} else if (start >= data->start) {
-			/* Lowest overlap if any must be on right side */
-			node = node->rb_right;
-		} else {
-			break;
-		}
+		if (is_node_overlap(node, start, end))
+			return node;
+
+		node = memtype_interval_iter_next(node, start, end);
 	}
-	return last_lower; /* Returns NULL if there is no overlap */
+
+	return NULL;
 }
 
 enum {
@@ -103,15 +76,14 @@  enum {
 	MEMTYPE_END_MATCH	= 1
 };
 
-static struct memtype *memtype_rb_match(struct rb_root *root,
-				u64 start, u64 end, int match_type)
+static struct memtype *memtype_rb_match(struct rb_root_cached *root,
+					u64 start, u64 end, int match_type)
 {
 	struct memtype *match;
 
 	match = memtype_rb_lowest_match(root, start, end);
-	while (match != NULL && match->start < end) {
-		struct rb_node *node;
 
+	while (match != NULL && match->start < end) {
 		if ((match_type == MEMTYPE_EXACT_MATCH) &&
 		    (match->start == start) && (match->end == end))
 			return match;
@@ -120,26 +92,21 @@  static struct memtype *memtype_rb_match(struct rb_root *root,
 		    (match->start < start) && (match->end == end))
 			return match;
 
-		node = rb_next(&match->rb);
-		if (node)
-			match = rb_entry(node, struct memtype, rb);
-		else
-			match = NULL;
+		match = memtype_interval_iter_next(match, start, end);
 	}
 
 	return NULL; /* Returns NULL if there is no match */
 }
 
-static int memtype_rb_check_conflict(struct rb_root *root,
-				u64 start, u64 end,
-				enum page_cache_mode reqtype,
-				enum page_cache_mode *newtype)
+static int memtype_rb_check_conflict(struct rb_root_cached *root,
+				      u64 start, u64 end,
+				      enum page_cache_mode reqtype,
+				      enum page_cache_mode *newtype)
 {
-	struct rb_node *node;
 	struct memtype *match;
 	enum page_cache_mode found_type = reqtype;
 
-	match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+	match = memtype_rb_lowest_match(root, start, end);
 	if (match == NULL)
 		goto success;
 
@@ -149,10 +116,8 @@  static int memtype_rb_check_conflict(struct rb_root *root,
 	dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
 	found_type = match->type;
 
-	node = rb_next(&match->rb);
-	while (node) {
-		match = rb_entry(node, struct memtype, rb);
-
+	match = memtype_interval_iter_next(match, start, end);
+	while (match) {
 		if (match->start >= end) /* Checked all possible matches */
 			goto success;
 
@@ -161,7 +126,7 @@  static int memtype_rb_check_conflict(struct rb_root *root,
 			goto failure;
 		}
 
-		node = rb_next(&match->rb);
+		match = memtype_interval_iter_next(match, start, end);
 	}
 success:
 	if (newtype)
@@ -176,43 +141,21 @@  static int memtype_rb_check_conflict(struct rb_root *root,
 	return -EBUSY;
 }
 
-static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
-{
-	struct rb_node **node = &(root->rb_node);
-	struct rb_node *parent = NULL;
-
-	while (*node) {
-		struct memtype *data = rb_entry(*node, struct memtype, rb);
-
-		parent = *node;
-		if (data->subtree_max_end < newdata->end)
-			data->subtree_max_end = newdata->end;
-		if (newdata->start <= data->start)
-			node = &((*node)->rb_left);
-		else if (newdata->start > data->start)
-			node = &((*node)->rb_right);
-	}
-
-	newdata->subtree_max_end = newdata->end;
-	rb_link_node(&newdata->rb, parent, node);
-	rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
-}
-
 int rbt_memtype_check_insert(struct memtype *new,
 			     enum page_cache_mode *ret_type)
 {
-	int err = 0;
+	int err;
 
-	err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
-						new->type, ret_type);
+	err = memtype_rb_check_conflict(&memtype_rbroot,
+					 new->start, new->end,
+					 new->type, ret_type);
+	if (err)
+		return err;
 
-	if (!err) {
-		if (ret_type)
-			new->type = *ret_type;
+	if (ret_type)
+		new->type = *ret_type;
 
-		new->subtree_max_end = new->end;
-		memtype_rb_insert(&memtype_rbroot, new);
-	}
+	memtype_interval_insert(new, &memtype_rbroot);
 	return err;
 }
 
@@ -238,15 +181,13 @@  struct memtype *rbt_memtype_erase(u64 start, u64 end)
 
 	if (data->start == start) {
 		/* munmap: erase this node */
-		rb_erase_augmented(&data->rb, &memtype_rbroot,
-					&memtype_rb_augment_cb);
+		memtype_interval_remove(data, &memtype_rbroot);
 	} else {
 		/* mremap: update the end value of this node */
-		rb_erase_augmented(&data->rb, &memtype_rbroot,
-					&memtype_rb_augment_cb);
+		memtype_interval_remove(data, &memtype_rbroot);
 		data->end = start;
-		data->subtree_max_end = data->end;
-		memtype_rb_insert(&memtype_rbroot, data);
+		memtype_interval_insert(data, &memtype_rbroot);
+
 		return NULL;
 	}
 
@@ -261,21 +202,21 @@  struct memtype *rbt_memtype_lookup(u64 addr)
 #if defined(CONFIG_DEBUG_FS)
 int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
 {
-	struct rb_node *node;
+	struct memtype *node;
 	int i = 1;
 
-	node = rb_first(&memtype_rbroot);
+	node = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
 	while (node && pos != i) {
-		node = rb_next(node);
+		node = memtype_interval_iter_next(node, 0, ULONG_MAX);
 		i++;
 	}
 
 	if (node) { /* pos == i */
-		struct memtype *this = rb_entry(node, struct memtype, rb);
-		*out = *this;
+		*out = *node;
 		return 0;
 	} else {
 		return 1;
 	}
 }
+
 #endif