linux-kernel.vger.kernel.org archive mirror
* [PATCH] make radix tree gang lookup faster by using a bitmap search
@ 2005-08-27 16:26 James Bottomley
  2005-08-27 17:53 ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-27 16:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel, linux-mm

The current gang lookup is rather naive and slow.  This patch replaces
the integer count with an unsigned long representing the bitmap of
occupied elements.  We then use that bitmap to find the first occupied
entry instead of looping over all the entries from the beginning of the
radix node.

The penalty of doing this is that on 32 bit machines, the size of the
radix tree array is reduced from 64 to 32 (so an unsigned long can
represent the bitmap).
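
For illustration, here is a minimal userspace sketch of the bitmap trick
(not part of the patch; next_occupied() and MAP_SIZE are made-up names, and
__builtin_ctzl(~x) stands in for the kernel's ffz()):

#include <stdio.h>

#define MAP_SIZE (8 * sizeof(unsigned long))

/* Treat every slot below 'start' as occupied, invert the rest: the first
 * zero bit of the resulting mask is the first occupied slot >= start. */
static unsigned long next_occupied(unsigned long occupied, unsigned long start)
{
	unsigned long mask = ((1UL << start) - 1) | ~occupied;

	if (mask == ~0UL)
		return MAP_SIZE;		/* nothing occupied from start on */
	return __builtin_ctzl(~mask);		/* i.e. ffz(mask) */
}

int main(void)
{
	unsigned long occupied = (1UL << 3) | (1UL << 17);

	printf("%lu\n", next_occupied(occupied, 0));	/* 3 */
	printf("%lu\n", next_occupied(occupied, 4));	/* 17 */
	printf("%lu\n", next_occupied(occupied, 18));	/* MAP_SIZE: none left */
	return 0;
}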

I also exported radix_tree_preload() so modules can make use of radix
trees.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>

---

James

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,9 +32,17 @@
 
 
 #ifdef __KERNEL__
+#if BITS_PER_LONG == 32
+#define RADIX_TREE_MAP_SHIFT	5
+#elif BITS_PER_LONG == 64
 #define RADIX_TREE_MAP_SHIFT	6
 #else
+#error BITS_PER_LONG neither 32 nor 64
+#endif
+#define RADIX_TREE_MAP_FULL	(~0UL)
+#else
 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#define RADIX_TREE_MAP_FULL	((1UL << (1UL << RADIX_TREE_MAP_SHIFT)) - 1UL)
 #endif
 #define RADIX_TREE_TAGS		2
 
@@ -45,7 +53,7 @@
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
 struct radix_tree_node {
-	unsigned int	count;
+	unsigned long	occupied;
 	void		*slots[RADIX_TREE_MAP_SIZE];
 	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
 };
@@ -133,6 +141,7 @@ int radix_tree_preload(int gfp_mask)
 out:
 	return ret;
 }
+EXPORT_SYMBOL(radix_tree_preload);
 
 static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
 {
@@ -208,7 +217,7 @@ static int radix_tree_extend(struct radi
 				tag_set(node, tag, 0);
 		}
 
-		node->count = 1;
+		node->occupied = 1;
 		root->rnode = node;
 		root->height++;
 	} while (height > root->height);
@@ -251,8 +260,10 @@ int radix_tree_insert(struct radix_tree_
 			if (!(tmp = radix_tree_node_alloc(root)))
 				return -ENOMEM;
 			*slot = tmp;
-			if (node)
-				node->count++;
+			if (node) {
+				BUG_ON(node->occupied & (1UL << offset));
+				node->occupied |= (1UL << offset);
+			}
 		}
 
 		/* Go a level down */
@@ -265,11 +276,11 @@ int radix_tree_insert(struct radix_tree_
 
 	if (*slot != NULL)
 		return -EEXIST;
-	if (node) {
-		node->count++;
-		BUG_ON(tag_get(node, 0, offset));
-		BUG_ON(tag_get(node, 1, offset));
-	}
+	BUG_ON(!node);
+	BUG_ON(node->occupied & (1UL << offset));
+	node->occupied |= (1UL << offset);
+	BUG_ON(tag_get(node, 0, offset));
+	BUG_ON(tag_get(node, 1, offset));
 
 	*slot = item;
 	return 0;
@@ -480,30 +491,48 @@ __lookup(struct radix_tree_root *root, v
 	slot = root->rnode;
 
 	while (height > 0) {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+		unsigned long j = (index >> shift) & RADIX_TREE_MAP_MASK, i;
+		unsigned long occupied_mask = 0;
 
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
+		/* mark all the slots up to but excluding the starting
+		 * index occupied */
+		occupied_mask = (1UL << j) - 1;
+		/* Now or in the remaining occupations (inverted so
+		 * we can use ffz to find the next occupied slot) */
+		occupied_mask |= ~slot->occupied;
+
+		/* If everything from this on up is empty, then there's
+		 * nothing more in the tree */
+		if (occupied_mask == RADIX_TREE_MAP_FULL) {
+			index = 0;
 			goto out;
+		}
+
+		i = ffz(occupied_mask);
+		if (i != j) {
+			index &= ~((1UL << (shift + RADIX_TREE_MAP_SHIFT)) - 1);
+			index |= i << shift;
+		}
+
 		height--;
 		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (slot->slots[j]) {
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
+			while (i < RADIX_TREE_MAP_SIZE) {
+				unsigned long occupied_mask;
+				BUG_ON(!slot->slots[i]);
+				results[nr_found++] = slot->slots[i];
+				if (nr_found == max_items) {
+					index++;
+					goto out;
 				}
+				occupied_mask = (1UL << i) - 1 + (1UL << i);
+				occupied_mask |= ~slot->occupied;
+				if (occupied_mask == RADIX_TREE_MAP_FULL)
+					break;
+				j = i;
+				i = ffz(occupied_mask);
+				index += i-j;
 			}
+			goto out;
 		}
 		shift -= RADIX_TREE_MAP_SHIFT;
 		slot = slot->slots[i];
@@ -719,7 +748,9 @@ void *radix_tree_delete(struct radix_tre
 
 	pathp = orig_pathp;
 	*pathp[0].slot = NULL;
-	while (pathp[0].node && --pathp[0].node->count == 0) {
+	BUG_ON(pathp[0].node && (pathp[0].node->occupied & (1UL << pathp[0].offset)) == 0);
+	while (pathp[0].node && 
+	       (pathp[0].node->occupied &= ~(1UL << pathp[0].offset)) == 0) {
 		pathp--;
 		BUG_ON(*pathp[0].slot == NULL);
 		*pathp[0].slot = NULL;




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-27 16:26 [PATCH] make radix tree gang lookup faster by using a bitmap search James Bottomley
@ 2005-08-27 17:53 ` Andrew Morton
  2005-08-28 19:43   ` James Bottomley
  2005-08-29  0:45   ` James Bottomley
  0 siblings, 2 replies; 19+ messages in thread
From: Andrew Morton @ 2005-08-27 17:53 UTC (permalink / raw)
  To: James Bottomley; +Cc: linux-kernel, linux-mm

James Bottomley <James.Bottomley@SteelEye.com> wrote:
>
> The current gang lookup is rather naive and slow.

I'd say the main naivety in gang lookup is the awkward top-level iteration
algorithm.  The way it bales out all the way to the top level of the tree
once __lookup() hits the end of the slots[] array, even though results[]
isn't full yet.  It's surely possible to go back up the tree just a
sufficient distance to resume the iteration, rather than all the way to the
top.  But it's hard, and it's all in CPU cache there anyway.

It would be much simpler if it were using recursion, of course.
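
Purely as an illustration (simplified stand-in types rather than the real
struct radix_tree_node, and ignoring the height-0 and next_index details), a
recursive version can resume inside each level because every frame remembers
its own position in slots[]:

#define MAP_SHIFT	6
#define MAP_SIZE	(1UL << MAP_SHIFT)
#define MAP_MASK	(MAP_SIZE - 1)

struct node {
	void *slots[MAP_SIZE];
};

/* Collect up to max_items non-NULL leaves with index >= 'index'. */
static unsigned int gang_lookup_rec(struct node *node, unsigned int height,
				    unsigned long index, void **results,
				    unsigned int max_items)
{
	unsigned int shift = (height - 1) * MAP_SHIFT;
	unsigned long first = (index >> shift) & MAP_MASK;
	unsigned long i;
	unsigned int nr = 0;

	for (i = first; i < MAP_SIZE && nr < max_items; i++) {
		void *slot = node->slots[i];

		if (!slot)
			continue;
		if (height == 1) {
			results[nr++] = slot;
		} else {
			/* Only the subtree containing 'index' keeps its low
			 * bits; every later subtree is scanned from slot 0. */
			nr += gang_lookup_rec(slot, height - 1,
					      i == first ? index : 0,
					      results + nr, max_items - nr);
		}
	}
	return nr;
}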

>  This patch replaces
> the integer count with an unsigned long representing the bitmap of
> occupied elements.  We then use that bitmap to find the first occupied
> entry instead of looping over all the entries from the beginning of the
> radix node.

But only in __lookup().  I think most gang lookups use
radix_tree_gang_lookup_tag() -> __lookup_tag().

And __lookup_tag() could use find_next_bit() on the tags anyway, as the
comment says.  I spent a bit of time doing that, had some bug, shelved it,
never got on to fixing it up.

There's a userspace test/devel setup at
http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz, btw.

> The penalty of doing this is that on 32 bit machines, the size of the
> radix tree array is reduced from 64 to 32 (so an unsigned long can
> represent the bitmap).

If we did the bitmap lookup in __lookup_tag() we wouldn't have this
restriction.

Maybe we can

a) fix radix_tree_gang_lookup() to use find_next_bit()

b) remove radix_tree_node.count

c) Add a new tag field which simply means "present"

d) remove radix_tree_gang_lookup() and __lookup() altogether

e) Implement radix_tree_gang_lookup() via radix_tree_gang_lookup_tag()

That would involve setting and clearing a bit in the "present" tag field when
adding and removing items.
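
Step (e) would then reduce to something like the following sketch
(illustrative only: RADIX_TREE_TAG_PRESENT is a hypothetical internal tag
number, and the real change would have to keep it out of the public tag API):

#include <linux/radix-tree.h>

unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
		       unsigned long first_index, unsigned int max_items)
{
	/* every occupied slot also has its "present" tag bit set, so the
	 * untagged lookup is just a tagged lookup on that private tag */
	return radix_tree_gang_lookup_tag(root, results, first_index,
					  max_items, RADIX_TREE_TAG_PRESENT);
}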

> I also exported radix_tree_preload() so modules can make use of radix
> trees.

uh, OK.  Note that radix_tree_preload() uses preempt_disable() protection.
So it has the limitation that the guarantee which it provides will become
unreliable, kernel-wide, if anyone anywhere tries to do a
radix_tree_insert() from interrupt context.



* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-27 17:53 ` Andrew Morton
@ 2005-08-28 19:43   ` James Bottomley
  2005-08-28 20:00     ` Linus Torvalds
  2005-08-29  0:45   ` James Bottomley
  1 sibling, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-28 19:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel

On Sat, 2005-08-27 at 10:53 -0700, Andrew Morton wrote:
> I'd say the main naivety in gang lookup is the awkward top-level iteration
> algorithm.  The way it bales out all the way to the top level of the tree
> once __lookup() hits the end of the slots[] array, even though results[]
> isn't full yet.  It's surely possible to go back up the tree just a
> sufficient distance to resume the iteration, rather than all the way to the
> top.  But it's hard, and it's all in CPU cache there anyway.
> 
> It would be much simpler if it was using recursion, of course.

I agree; I actually checked this point: most page gang lookups do have
to restart the search.  At least using a bitmap gets it back on point
much faster.  The page radix tree lookups are usually at most four
levels deep, anyway.

> >  This patch replaces
> > the integer count with an unsigned long representing the bitmap of
> > occupied elements.  We then use that bitmap to find the first occupied
> > entry instead of looping over all the entries from the beginning of the
> > radix node.
> 
> But only in __lookup().  I think most gang lookups use
> radix_tree_gang_lookup_tag() -> __lookup_tag().
> 
> And __lookup_tag() could use find_next_bit() on the tags anyway, as the
> comment says.  I spent a bit of time doing that, had some bug, shelved it,
> never got on to fixing it up.
> 
> There's a userspace test/devel setup at
> http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz, btw.

OK ... I'll take a look.  I didn't mean to do this, it's just that for
the idr replacement code I had to use bitmap lookup, so this seemed like
a natural precursor.

> > The penalty of doing this is that on 32 bit machines, the size of the
> > radix tree array is reduced from 64 to 32 (so an unsigned long can
> > represent the bitmap).
> 
> If we did the bitmap lookup in __lookup_tag() we wouldn't have this
> restriction.
> 
> Maybe we can
> 
> a) fix radix_tree_gang_lookup() to use find_next_bit()

Well, not quite; with the size changes, the tag map now never overflows
an unsigned long.

> b) remove radix_tree_node.count

yes, did that.

> c) Add a new tag field which simply means "present"

> d) remove radix_tree_gang_lookup() and __lookup() altogether

> e) Implement radix_tree_gang_lookup() via radix_tree_gang_lookup_tag()
> 
> That would involve setting and clearing a bit in the "present" tag field when
> adding and removing items.

OK, I see how to do all of this using the currently implemented logic
(the occupied word is what you would like to be the present tag).  I'll
see what I can do.

> > I also exported radix_tree_preload() so modules can make use of radix
> > trees.
> 
> uh, OK.  Note that radix_tree_preload() uses preempt_disable() protection.
> So it has the limitation that the guarantee which it provides will become
> unreliable, kernel-wide, if anyone anywhere tries to do a
> radix_tree_insert() from interrupt context.

radix_tree_insert() is reliable from IRQ provided you don't try to use
radix_tree_preload() and you define your radix tree gfp flag to be
GFP_ATOMIC.  Preloading is optional, and should really only be done if
you have process context to preload with GFP_KERNEL.  Preloading with
GFP_ATOMIC is pretty pointless since radix_tree_insert() will try to
allocate with the radix tree flags anyway.
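
To spell the two patterns out, a hedged sketch (the lock and tree here are
placeholders; whether the process-context side needs irqsave depends on
whoever else takes the lock):

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

/* Process context: preload with GFP_KERNEL, then insert under the lock.
 * radix_tree_preload() returns with preemption disabled on success, so
 * nothing between it and radix_tree_preload_end() may sleep. */
static int insert_may_sleep(struct radix_tree_root *tree, spinlock_t *lock,
			    unsigned long index, void *item)
{
	int error = radix_tree_preload(GFP_KERNEL);

	if (error)
		return error;
	spin_lock(lock);
	error = radix_tree_insert(tree, index, item);
	spin_unlock(lock);
	radix_tree_preload_end();
	return error;
}

/* Interrupt context: no preload; the tree was set up with
 * INIT_RADIX_TREE(&tree, GFP_ATOMIC), so the insert allocates atomically
 * and the caller must cope with -ENOMEM. */
static int insert_atomic(struct radix_tree_root *tree, spinlock_t *lock,
			 unsigned long index, void *item)
{
	int error;

	spin_lock(lock);
	error = radix_tree_insert(tree, index, item);
	spin_unlock(lock);
	return error;
}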

James




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-28 19:43   ` James Bottomley
@ 2005-08-28 20:00     ` Linus Torvalds
  2005-08-28 20:39       ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2005-08-28 20:00 UTC (permalink / raw)
  To: James Bottomley; +Cc: Andrew Morton, Linux Kernel



On Sun, 28 Aug 2005, James Bottomley wrote:
> 
> radix_tree_insert() is reliable from IRQ provided you don't try to use
> radix_tree_preload() and you define your radix tree gfp flag to be
> GFP_ATOMIC.

It would be better if it wasn't, though.

I really don't see why we made it irq-safe, and take the hit of disabling
interrupts in addition to the locking.  That's a quite noticeable loss,
and I don't think it's really a valid thing to insert (or look up) page
cache entries from interrupts.

What _is_ it that makes us do that, btw? Is it just because we clear the 
writeback tag bits or something? Sad. It makes page lookup noticeably more 
expensive.

		Linus


* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-28 20:00     ` Linus Torvalds
@ 2005-08-28 20:39       ` Andrew Morton
  0 siblings, 0 replies; 19+ messages in thread
From: Andrew Morton @ 2005-08-28 20:39 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: James.Bottomley, linux-kernel

Linus Torvalds <torvalds@osdl.org> wrote:
>
> 
> 
> On Sun, 28 Aug 2005, James Bottomley wrote:
> > 
> > radix_tree_insert() is reliable from IRQ provided you don't try to use
> > radix_tree_preload() and you define your radix tree gfp flag to be
> > GFP_ATOMIC.
> 
> It would be better if it wasn't, though.

There's nothing in radix-tree which forces this: it requires
caller-provided locking.

> I really don't see why we made it irq-safe, and take the hit of disabling
> interrupts in addition to the locking.  That's a quite noticeable loss,
> and I don't think it's really a valid thing to insert (or look up) page
> cache entries from interrupts.
> 
> What _is_ it that makes us do that, btw? Is it just because we clear the 
> writeback tag bits or something? Sad. It makes page lookup noticeably more 
> expensive.

Yes, address_space.tree_lock was made IRQ-safe so we could alter the tree's
tags from disk completions.  Presumably Nick's lockless pagecache stuff
removes that.
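
For reference, the completion-side pattern that forces it looks roughly like
this (a simplified sketch along the lines of test_clear_page_writeback(); the
real function also clears PG_writeback and handles pages without a mapping):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>

static void writeback_done(struct page *page)
{
	struct address_space *mapping = page->mapping;
	unsigned long flags;

	/* can be reached from the disk completion interrupt, so everyone
	 * else taking tree_lock has to disable interrupts as well */
	write_lock_irqsave(&mapping->tree_lock, flags);
	radix_tree_tag_clear(&mapping->page_tree, page->index,
			     PAGECACHE_TAG_WRITEBACK);
	write_unlock_irqrestore(&mapping->tree_lock, flags);
}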


* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-27 17:53 ` Andrew Morton
  2005-08-28 19:43   ` James Bottomley
@ 2005-08-29  0:45   ` James Bottomley
  2005-08-29  0:52     ` Andrew Morton
  1 sibling, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-29  0:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel

On Sat, 2005-08-27 at 10:53 -0700, Andrew Morton wrote:
> a) fix radix_tree_gang_lookup() to use find_next_bit()
> 
> b) remove radix_tree_node.count
> 
> c) Add a new tag field which simply means "present"
> 
> d) remove radix_tree_gang_lookup() and __lookup() altogether
> 
> e) Implement radix_tree_gang_lookup() via radix_tree_gang_lookup_tag()

OK, here it is: the combined version which treats the present bits as a
private tag, combines __lookup and __lookup_tag and does a fast bitmap
search for both.

James

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,22 +32,29 @@
 
 
 #ifdef __KERNEL__
+#if BITS_PER_LONG == 32
+#define RADIX_TREE_MAP_SHIFT	5
+#elif BITS_PER_LONG == 64
 #define RADIX_TREE_MAP_SHIFT	6
 #else
+#error BITS_PER_LONG neither 32 nor 64
+#endif
+#define RADIX_TREE_MAP_FULL	(~0UL)
+#else
 #define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#define RADIX_TREE_MAP_FULL	((1UL << (1UL << RADIX_TREE_MAP_SHIFT)) - 1UL)
 #endif
-#define RADIX_TREE_TAGS		2
+#define RADIX_TREE_USER_TAGS	2
+#define RADIX_TREE_TAG_PRESENT	(RADIX_TREE_USER_TAGS + 0)
+/* Set this to the last private tag plus one */
+#define RADIX_TREE_TAGS		(RADIX_TREE_TAG_PRESENT + 1)
 
 #define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
 #define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
 
-#define RADIX_TREE_TAG_LONGS	\
-	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-
 struct radix_tree_node {
-	unsigned int	count;
 	void		*slots[RADIX_TREE_MAP_SIZE];
-	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
+	unsigned long	tags[RADIX_TREE_TAGS];
 };
 
 struct radix_tree_path {
@@ -133,21 +140,22 @@ int radix_tree_preload(int gfp_mask)
 out:
 	return ret;
 }
+EXPORT_SYMBOL(radix_tree_preload);
 
 static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
 {
-	if (!test_bit(offset, &node->tags[tag][0]))
-		__set_bit(offset, &node->tags[tag][0]);
+	if ((node->tags[tag] & (1UL << offset)) == 0)
+		node->tags[tag] |= (1UL << offset);
 }
 
 static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
 {
-	__clear_bit(offset, &node->tags[tag][0]);
+	node->tags[tag] &= ~(1UL << offset);
 }
 
 static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
 {
-	return test_bit(offset, &node->tags[tag][0]);
+	return (node->tags[tag] & (1UL << offset)) ? 1 : 0;
 }
 
 /*
@@ -184,15 +192,9 @@ static int radix_tree_extend(struct radi
 	 * into the newly-pushed top-level node(s)
 	 */
 	for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
-		int idx;
-
 		tags[tag] = 0;
-		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-			if (root->rnode->tags[tag][idx]) {
-				tags[tag] = 1;
-				break;
-			}
-		}
+		if (root->rnode->tags[tag])
+			tags[tag] = 1;
 	}
 
 	do {
@@ -208,7 +210,7 @@ static int radix_tree_extend(struct radi
 				tag_set(node, tag, 0);
 		}
 
-		node->count = 1;
+		node->tags[RADIX_TREE_TAG_PRESENT] = 1;
 		root->rnode = node;
 		root->height++;
 	} while (height > root->height);
@@ -251,8 +253,11 @@ int radix_tree_insert(struct radix_tree_
 			if (!(tmp = radix_tree_node_alloc(root)))
 				return -ENOMEM;
 			*slot = tmp;
-			if (node)
-				node->count++;
+			if (node) {
+				BUG_ON(tag_get(node, RADIX_TREE_TAG_PRESENT,
+					       offset));
+				tag_set(node, RADIX_TREE_TAG_PRESENT, offset);
+			}
 		}
 
 		/* Go a level down */
@@ -265,11 +270,11 @@ int radix_tree_insert(struct radix_tree_
 
 	if (*slot != NULL)
 		return -EEXIST;
-	if (node) {
-		node->count++;
-		BUG_ON(tag_get(node, 0, offset));
-		BUG_ON(tag_get(node, 1, offset));
-	}
+	BUG_ON(!node);
+	BUG_ON(tag_get(node, RADIX_TREE_TAG_PRESENT, offset));
+	tag_set(node, RADIX_TREE_TAG_PRESENT, offset);
+	BUG_ON(tag_get(node, 0, offset));
+	BUG_ON(tag_get(node, 1, offset));
 
 	*slot = item;
 	return 0;
@@ -399,13 +404,10 @@ void *radix_tree_tag_clear(struct radix_
 		goto out;
 
 	do {
-		int idx;
-
 		tag_clear(pathp[0].node, tag, pathp[0].offset);
-		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-			if (pathp[0].node->tags[tag][idx])
-				goto out;
-		}
+		if (pathp[0].node->tags[tag])
+			goto out;
+
 		pathp--;
 	} while (pathp[0].node);
 out:
@@ -468,8 +470,8 @@ EXPORT_SYMBOL(radix_tree_tag_get);
 #endif
 
 static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index)
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+	     unsigned int max_items, unsigned long *next_index, int tag)
 {
 	unsigned int nr_found = 0;
 	unsigned int shift;
@@ -480,30 +482,48 @@ __lookup(struct radix_tree_root *root, v
 	slot = root->rnode;
 
 	while (height > 0) {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+		unsigned long j = (index >> shift) & RADIX_TREE_MAP_MASK, i;
+		unsigned long occupied_mask = 0;
 
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
+		/* mark all the slots up to but excluding the starting
+		 * index occupied */
+		occupied_mask = (1UL << j) - 1;
+		/* Now or in the remaining occupations (inverted so
+		 * we can use ffz to find the next occupied slot) */
+		occupied_mask |= ~slot->tags[tag];
+
+		/* If everything from this on up is empty, then there's
+		 * nothing more in the tree */
+		if (occupied_mask == RADIX_TREE_MAP_FULL) {
+			index = 0;
 			goto out;
+		}
+
+		i = ffz(occupied_mask);
+		if (i != j) {
+			index &= ~((1UL << (shift + RADIX_TREE_MAP_SHIFT)) - 1);
+			index |= i << shift;
+		}
+
 		height--;
 		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (slot->slots[j]) {
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
+			while (i < RADIX_TREE_MAP_SIZE) {
+				unsigned long occupied_mask;
+				BUG_ON(!slot->slots[i]);
+				results[nr_found++] = slot->slots[i];
+				if (nr_found == max_items) {
+					index++;
+					goto out;
 				}
+				occupied_mask = (1UL << i) - 1 + (1UL << i);
+				occupied_mask |= ~slot->tags[tag];
+				if (occupied_mask == RADIX_TREE_MAP_FULL)
+					break;
+				j = i;
+				i = ffz(occupied_mask);
+				index += i-j;
 			}
+			goto out;
 		}
 		shift -= RADIX_TREE_MAP_SHIFT;
 		slot = slot->slots[i];
@@ -540,8 +560,9 @@ radix_tree_gang_lookup(struct radix_tree
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
+		nr_found = __lookup_tag(root, results + ret, cur_index,
+					max_items - ret, &next_index,
+					RADIX_TREE_TAG_PRESENT);
 		ret += nr_found;
 		if (next_index == 0)
 			break;
@@ -551,59 +572,6 @@ radix_tree_gang_lookup(struct radix_tree
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
 
-/*
- * FIXME: the two tag_get()s here should use find_next_bit() instead of
- * open-coding the search.
- */
-static unsigned int
-__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, int tag)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift;
-	unsigned int height = root->height;
-	struct radix_tree_node *slot;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	while (height > 0) {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
-
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (tag_get(slot, tag, i)) {
-				BUG_ON(slot->slots[i] == NULL);
-				break;
-			}
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-		height--;
-		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (tag_get(slot, tag, j)) {
-					BUG_ON(slot->slots[j] == NULL);
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
-				}
-			}
-		}
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
 /**
  *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
  *	                             based on a tag
@@ -657,7 +625,7 @@ void *radix_tree_delete(struct radix_tre
 	struct radix_tree_path *orig_pathp;
 	unsigned int height, shift;
 	void *ret = NULL;
-	char tags[RADIX_TREE_TAGS];
+	char tags[RADIX_TREE_TAGS - 1];
 	int nr_cleared_tags;
 
 	height = root->height;
@@ -691,27 +659,25 @@ void *radix_tree_delete(struct radix_tre
 	orig_pathp = pathp;
 
 	/*
-	 * Clear all tags associated with the just-deleted item
+	 * Clear all user tags associated with the just-deleted item
 	 */
 	memset(tags, 0, sizeof(tags));
 	do {
 		int tag;
 
-		nr_cleared_tags = RADIX_TREE_TAGS;
-		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
-			int idx;
-
+		/* The private tag RADIX_TREE_TAG_PRESENT is used to
+		 * free the slots below, so don't clear it */
+		nr_cleared_tags = RADIX_TREE_TAGS - 1;
+		for (tag = 0; tag < RADIX_TREE_TAGS - 1;
+		     tag++) {
 			if (tags[tag])
 				continue;
 
 			tag_clear(pathp[0].node, tag, pathp[0].offset);
 
-			for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-				if (pathp[0].node->tags[tag][idx]) {
-					tags[tag] = 1;
-					nr_cleared_tags--;
-					break;
-				}
+			if (pathp[0].node->tags[tag]) {
+				tags[tag] = 1;
+				nr_cleared_tags--;
 			}
 		}
 		pathp--;
@@ -719,7 +685,12 @@ void *radix_tree_delete(struct radix_tre
 
 	pathp = orig_pathp;
 	*pathp[0].slot = NULL;
-	while (pathp[0].node && --pathp[0].node->count == 0) {
+	BUG_ON(pathp[0].node &&
+	       !tag_get(pathp[0].node, RADIX_TREE_TAG_PRESENT, 
+			pathp[0].offset));
+	while (pathp[0].node && 
+	       (pathp[0].node->tags[RADIX_TREE_TAG_PRESENT] &=
+		~(1UL << pathp[0].offset)) == 0) {
 		pathp--;
 		BUG_ON(*pathp[0].slot == NULL);
 		*pathp[0].slot = NULL;
@@ -739,14 +710,11 @@ EXPORT_SYMBOL(radix_tree_delete);
  */
 int radix_tree_tagged(struct radix_tree_root *root, int tag)
 {
-	int idx;
-
 	if (!root->rnode)
 		return 0;
-	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-		if (root->rnode->tags[tag][idx])
-			return 1;
-	}
+	if (root->rnode->tags[tag])
+		return 1;
+
 	return 0;
 }
 EXPORT_SYMBOL(radix_tree_tagged);




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  0:45   ` James Bottomley
@ 2005-08-29  0:52     ` Andrew Morton
  2005-08-29  1:19       ` James Bottomley
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-08-29  0:52 UTC (permalink / raw)
  To: James Bottomley; +Cc: linux-kernel

James Bottomley <James.Bottomley@SteelEye.com> wrote:
>
> On Sat, 2005-08-27 at 10:53 -0700, Andrew Morton wrote:
> > a) fix radix_tree_gang_lookup() to use find_next_bit()
> > 
> > b) remove radix_tree_node.count
> > 
> > c) Add a new tag field which simply means "present"
> > 
> > d) remove radix_tree_gang_lookup() and __lookup() altogether
> > 
> > e) Implement radix_tree_gang_lookup() via radix_tree_gang_lookup_tag()
> 
> OK, here it is: the combined version which treats the present bits as a
> private tag, combines __lookup and __lookup_tag and does a fast bitmap
> search for both.
>
> ...
>
> +#if BITS_PER_LONG == 32
> +#define RADIX_TREE_MAP_SHIFT	5
> +#elif BITS_PER_LONG == 64
> ...
>  struct radix_tree_node {
> -	unsigned int	count;
>  	void		*slots[RADIX_TREE_MAP_SIZE];
> -	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
> +	unsigned long	tags[RADIX_TREE_TAGS];
>  };

I don't see why the above change was necessary?  Why not stick with the
current more flexible sizing option?

Note that the RADIX_TREE_MAP_FULL trick can still be used.  It just has to
be put inside a `for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++)' loop,
which will vanish if RADIX_TREE_TAG_LONGS==1.
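
Something like the following sketch, essentially an open-coded
find_next_bit() over the tag words (illustrative only):

/* Find the first set bit in a multi-long tag array at index >= start, or
 * RADIX_TREE_MAP_SIZE if there is none.  With RADIX_TREE_TAG_LONGS == 1
 * this collapses back to the single-word ffz() search. */
static unsigned long next_tagged(const unsigned long *tags, unsigned long start)
{
	unsigned long idx = start / BITS_PER_LONG;
	unsigned long bit = start % BITS_PER_LONG;

	for (; idx < RADIX_TREE_TAG_LONGS; idx++, bit = 0) {
		/* bits below 'bit' count as already searched */
		unsigned long mask = ((1UL << bit) - 1) | ~tags[idx];

		if (mask == ~0UL)
			continue;	/* this word is exhausted */
		return idx * BITS_PER_LONG + ffz(mask);
	}
	return RADIX_TREE_MAP_SIZE;
}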



* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  0:52     ` Andrew Morton
@ 2005-08-29  1:19       ` James Bottomley
  2005-08-29  1:35         ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-29  1:19 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel

On Sun, 2005-08-28 at 17:52 -0700, Andrew Morton wrote:
> > +#if BITS_PER_LONG == 32
> > +#define RADIX_TREE_MAP_SHIFT	5
> > +#elif BITS_PER_LONG == 64
> > ...
> >  struct radix_tree_node {
> > -	unsigned int	count;
> >  	void		*slots[RADIX_TREE_MAP_SIZE];
> > -	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
> > +	unsigned long	tags[RADIX_TREE_TAGS];
> >  };
> 
> I don't see why the above change was necessary?  Why not stick with the
> current more flexible sizing option?
> 
> Note that the RADIX_TREE_MAP_FULL trick can still be used.  It just has to
> be put inside a `for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++)' loop,
> which will vanish if RADIX_TREE_TAG_LONGS==1.

Well ... It's my opinion (and purely unsubstantiated, I suppose) that
it's more efficient on 32 bit platforms to do bit operations on 32 bit
quantities, which is why I changed the radix tree map shift to 5 for
that case.

It also makes much cleaner code than having to open code checks on
variable sized bitmaps.

James




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  1:19       ` James Bottomley
@ 2005-08-29  1:35         ` Andrew Morton
  2005-08-29  3:26           ` James Bottomley
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-08-29  1:35 UTC (permalink / raw)
  To: James Bottomley; +Cc: linux-kernel

James Bottomley <James.Bottomley@SteelEye.com> wrote:
>
> On Sun, 2005-08-28 at 17:52 -0700, Andrew Morton wrote:
> > > +#if BITS_PER_LONG == 32
> > > +#define RADIX_TREE_MAP_SHIFT	5
> > > +#elif BITS_PER_LONG == 64
> > > ...
> > >  struct radix_tree_node {
> > > -	unsigned int	count;
> > >  	void		*slots[RADIX_TREE_MAP_SIZE];
> > > -	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
> > > +	unsigned long	tags[RADIX_TREE_TAGS];
> > >  };
> > 
> > I don't see why the above change was necessary?  Why not stick with the
> > current more flexible sizing option?
> > 
> > Note that the RADIX_TREE_MAP_FULL trick can still be used.  It just has to
> > be put inside a `for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++)' loop,
> > which will vanish if RADIX_TREE_TAG_LONGS==1.
> 
> Well ... It's my opinion (and purely unsubstantiated, I suppose) that
> it's more efficient on 32 bit platforms to do bit operations on 32 bit
> quantities, which is why I changed the radix tree map shift to 5 for
> that case.
> 
> It also makes much cleaner code than having to open code checks on
> variable sized bitmaps.
> 

It does make the tree higher and hence will incur some more cache missing
when descending the tree.

We changed the node size a few years back.  umm.... 
http://www.ussg.iu.edu/hypermail/linux/kernel/0206.2/0141.html

It would be a little bit sad to be unable to make such tuning adjustments
in the future.  Not a huge loss, but a loss.


* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  1:35         ` Andrew Morton
@ 2005-08-29  3:26           ` James Bottomley
  2005-08-29  3:37             ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-29  3:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linux Kernel

On Sun, 2005-08-28 at 18:35 -0700, Andrew Morton wrote:
> > Well ... It's my opinion (and purely unsubstantiated, I suppose) that
> > it's more efficient on 32 bit platforms to do bit operations on 32 bit
> > quantities, which is why I changed the radix tree map shift to 5 for
> > that case.
> > 
> > It also makes much cleaner code than having to open code checks on
> > variable sized bitmaps.

> It does make the tree higher and hence will incur some more cache missing
> when descending the tree.

Actually, I don't think it does:  the common user is the page tree.
Obviously, I've changed nothing on 64 bits, so we only need to consider
what I've done on 32 bits.  A page size is almost universally 4k on 32
bit, so we need 20 bits to store the page tree index.  Regardless of
whether the index size is 5 or 6, that gives a radix tree depth of 4.

> We changed the node size a few years back.  umm.... 
> http://www.ussg.iu.edu/hypermail/linux/kernel/0206.2/0141.html

Yes, but that was to reduce the index size from 7 to 6 for slab
allocation reasons.  I've just reduced it to 5 on 32 bit.

> It would be a little bit sad to be unable to make such tuning adjustments
> in the future.  Not a huge loss, but a loss.

Well .. OK .. If the benchmarks say I've slowed us down on 32 bits, I'll
put the variable sizing back in the tag array.

James



* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  3:26           ` James Bottomley
@ 2005-08-29  3:37             ` Nick Piggin
  2005-08-29  3:54               ` Trond Myklebust
                                 ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Nick Piggin @ 2005-08-29  3:37 UTC (permalink / raw)
  To: James Bottomley; +Cc: Andrew Morton, Linux Kernel

James Bottomley wrote:
> On Sun, 2005-08-28 at 18:35 -0700, Andrew Morton wrote:

>>It does make the tree higher and hence will incur some more cache missing
>>when descending the tree.
> 
> 
> Actually, I don't think it does:  the common user is the page tree.
> Obviously, I've changed nothing on 64 bits, so we only need to consider
> what I've done on 32 bits.  A page size is almost universally 4k on 32
> bit, so we need 20 bits to store the page tree index.  Regardless of
> whether the index size is 5 or 6, that gives a radix tree depth of 4.
> 

s/common/only ?

But the page tree is indexed by file offset rather than virtual
address, and we try to span the file's pagecache with the smallest
possible tree. So it will tend to make the trees taller.
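
The heights involved, as a hedged back-of-the-envelope sketch:

/* Illustrative arithmetic only.  For the full 20-bit index of a 32-bit/4k
 * pagecache both shifts give height 4 (ceil(20/5) == ceil(20/6) == 4), but a
 * 16MB file (indices up to 4095, i.e. 12 bits) needs height 2 with shift 6
 * and height 3 with shift 5. */
static unsigned int height_for(unsigned long max_index, unsigned int shift)
{
	unsigned int bits = 0;

	while (max_index) {			/* significant index bits */
		bits++;
		max_index >>= 1;
	}
	return (bits + shift - 1) / shift;	/* round up */
}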

> 
>>We changed the node size a few years back.  umm.... 
>>http://www.ussg.iu.edu/hypermail/linux/kernel/0206.2/0141.html
> 
> 
> Yes, but that was to reduce the index size from 7 to 6 for slab
> allocation reasons.  I've just reduced it to 5 on 32 bit.
> 
> 
>>It would be a little bit sad to be unable to make such tuning adjustments
>>in the future.  Not a huge loss, but a loss.
> 
> 
> Well .. OK .. If the benchmarks say I've slowed us down on 32 bits, I'll
> put the variable sizing back in the tag array.
> 

I'm curious: what do the benchmarks say about your gang lookup?

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.



* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  3:37             ` Nick Piggin
@ 2005-08-29  3:54               ` Trond Myklebust
  2005-08-29 13:16                 ` Luben Tuikov
  2005-08-29 15:01               ` James Bottomley
       [not found]               ` <20050829164144.GC9508@localhost.localdomain>
  2 siblings, 1 reply; 19+ messages in thread
From: Trond Myklebust @ 2005-08-29  3:54 UTC (permalink / raw)
  To: Nick Piggin; +Cc: James Bottomley, Andrew Morton, Linux Kernel

On Mon, 2005-08-29 at 13:37 +1000, Nick Piggin wrote:
> James Bottomley wrote:
> > On Sun, 2005-08-28 at 18:35 -0700, Andrew Morton wrote:
> 
> >>It does make the tree higher and hence will incur some more cache missing
> >>when descending the tree.
> > 
> > 
> > Actually, I don't think it does:  the common user is the page tree.
> > Obviously, I've changed nothing on 64 bits, so we only need to consider
> > what I've done on 32 bits.  A page size is almost universally 4k on 32
> > bit, so we need 20 bits to store the page tree index.  Regardless of
> > whether the index size is 5 or 6, that gives a radix tree depth of 4.
> > 
> 
> s/common/only ?

grep again... I use it in the NFS client for bookkeeping requests.

Cheers,
  Trond



* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  3:54               ` Trond Myklebust
@ 2005-08-29 13:16                 ` Luben Tuikov
  0 siblings, 0 replies; 19+ messages in thread
From: Luben Tuikov @ 2005-08-29 13:16 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: Nick Piggin, James Bottomley, Andrew Morton, Linux Kernel

On 08/28/05 23:54, Trond Myklebust wrote:
> On Mon, 2005-08-29 at 13:37 +1000, Nick Piggin wrote:
> 
>>James Bottomley wrote:
>>
>>>On Sun, 2005-08-28 at 18:35 -0700, Andrew Morton wrote:
>>
>>>>It does make the tree higher and hence will incur some more cache missing
>>>>when descending the tree.
>>>
>>>
>>>Actually, I don't think it does:  the common user is the page tree.
>>>Obviously, I've changed nothing on 64 bits, so we only need to consider
>>>what I've done on 32 bits.  A page size is almost universally 4k on 32
>>>bit, so we need 20 bits to store the page tree index.  Regardless of
>>>whether the index size is 5 or 6, that gives a radix tree depth of 4.
>>>
>>
>>s/common/only ?
> 
> 
> grep again... I use it in the NFS client for bookkeeping requests.

Hey guys,

Take the posix-timers for example:

idr_remove() is called with spin_lock_irqsave()/spin_unlock_irqrestore(),
in release_posix_timer().

_But_, idr_pre_get() is called without any locks (as it should) which
calls alloc_layer() in the IDR code, grabbing a spinlock _internally_ (without disabling IRQs).

If you're in the middle of idr_pre_get() inside the internally locked
section of alloc_layer(), _and_ you call idr_remove() at that same time
and (try to) enter free_layer(), you have a deadlock.
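
In outline (purely illustrative, with a made-up lock name rather than the
actual idr.c internals), it's the classic same-CPU deadlock:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(internal_lock);	/* stands in for the allocator lock */

void alloc_path(void)			/* e.g. reached via idr_pre_get() */
{
	spin_lock(&internal_lock);	/* taken with IRQs still enabled ... */
	/* ... an interrupt arrives on this CPU right here ... */
	spin_unlock(&internal_lock);
}

void free_path(void)			/* e.g. reached via idr_remove() */
{
	/* ... and if the interrupt path ends up here, it spins on a lock
	 * the interrupted context can never release: deadlock */
	spin_lock(&internal_lock);
	spin_unlock(&internal_lock);
}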

The reason no one has seen this is: _load_.  It wasn't until I started
doing mkfs simultaneously on 3 or more disks in a loop that I saw this
problem.  I've a stack trace to show if anyone cares.

I'm not sure why no one sees this.

I'm also not sure why people are working on IDR when there is no
_pending_ need to improve this code (other than the deadlock I pointed out),
while SCSI Core has been in dire need of improvement for the last 5 years.

If SCSI Core had developed under the same scheme as IDR is now:
"no need, but it's cool so I'll write it and my buddies will include the patch",
we would have a *T10 spec-ed out SCSI Core by now*.(1)

	Luben

(1) I can hear it now... "But do we need it?"
But do we need these IDR improvements when there is no bug report
or oops or architectural need to change it?  Other than the
deadlock?





* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-29  3:37             ` Nick Piggin
  2005-08-29  3:54               ` Trond Myklebust
@ 2005-08-29 15:01               ` James Bottomley
       [not found]               ` <20050829164144.GC9508@localhost.localdomain>
  2 siblings, 0 replies; 19+ messages in thread
From: James Bottomley @ 2005-08-29 15:01 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Linux Kernel

On Mon, 2005-08-29 at 13:37 +1000, Nick Piggin wrote:
> But the page tree is indexed by file offset rather than virtual
> address, and we try to span the file's pagecache with the smallest
> possible tree. So it will tend to make the trees taller.

Well, OK, I concede that one.

However, my contention that it's better to do it this way is rooted in
the simplicity argument:  a bitmap lookup obviously can move straight to
the slot you're looking for.  Yet we've always had that loop (which
wastes time) because no-one could get the bitmap code right.  The reason
for this is that variable length bitmaps are hard to manage and
introduce a lot of complexity into what should really be simple code.

I think simplicity is better (for ease of understanding and maintenance)
unless it has an unacceptable cost.  In this case, that cost would be
shown by the benchmarks.  If what I've done is slower on 32 bits than
what we had previously then I'll recode it all with variable length
bitmaps so we can increase the 32-bit index shift to 6 again.

> > Well .. OK .. If the benchmarks say I've slowed us down on 32 bits, I'll
> > put the variable sizing back in the tag array.

> I'm curious: what do the benchmarks say about your gang lookup?

That's why I tried to cc the mm list; I'm not sure what you use for
benchmarks of this type of thing.  My guess would be an mmap of a file,
read followed by some writes, then munmap again?

James




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
       [not found]               ` <20050829164144.GC9508@localhost.localdomain>
@ 2005-08-30  0:56                 ` Nick Piggin
  2005-08-30  1:54                   ` Andrew Morton
  2005-08-30  2:46                   ` James Bottomley
  0 siblings, 2 replies; 19+ messages in thread
From: Nick Piggin @ 2005-08-30  0:56 UTC (permalink / raw)
  To: Sonny Rao; +Cc: James Bottomley, Andrew Morton, Linux Kernel

Sonny Rao wrote:

>On Mon, Aug 29, 2005 at 01:37:48PM +1000, Nick Piggin wrote:
>
>>s/common/only ?
>>
>>But the page tree is indexed by file offset rather than virtual
>>address, and we try to span the file's pagecache with the smallest
>>possible tree. So it will tend to make the trees taller.
>>
>>
>
>I did some experiments with different map-shift values,
>interestingly overall read throughput didn't change much but the
>cpu-utilization of radix_tree_lookup (from oprofile) changed quite a
>bit, especially in the case of MAP_SHIFT == 4 :  
>
>http://www.linuxsymposium.org/2005/linuxsymposium_procv2.pdf 
>
>Look on page 80, where I have the table.
>
>

Nice. So we can see that 6 is a pretty good choice of shift,
4 is too low. That doesn't tell us much about 5, but if you
fit the curve, 5 should be between 14 and 15 ... so getting
expensive.

Of course, different systems and different workloads will
be different. But I'd be wary of going below 6 unless there
is a good reason.

>>I'm curious: what do the benchmarks say about your gang lookup?
>>
>
>Gang-lookup isn't used in the page-cache lookups presently, so I'm
>not sure why optimizing it is very important -- unless someone is
>planning on implementing gang-lookups for page-cache reads. This would
>also cut down on number times a lock is taken and released (expensive,
>in the case of rwlock).  Perhaps there is another reason?
>
>

Gang lookup is mainly used on IO paths but also on truncate,
which is a reasonably fast path on some workloads (James,
this is my suggestion for what you should test - truncate).
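
A minimal userspace sketch of such a test (file name and sizes are arbitrary;
the ftruncate() is what ends up doing the gang lookups, via
truncate_inode_pages()):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	const off_t size = 256UL << 20;		/* 256MB of pagecache */
	char buf[4096];
	struct timeval t0, t1;
	off_t off;
	int fd = open("trunc-test.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(buf, 0xaa, sizeof(buf));
	for (off = 0; off < size; off += sizeof(buf))	/* populate pagecache */
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
			perror("write");
			return 1;
		}

	gettimeofday(&t0, NULL);
	if (ftruncate(fd, 0) < 0) {			/* the path under test */
		perror("ftruncate");
		return 1;
	}
	gettimeofday(&t1, NULL);

	printf("truncating %ld MB took %ld us\n", (long)(size >> 20),
	       (long)((t1.tv_sec - t0.tv_sec) * 1000000L +
		      (t1.tv_usec - t0.tv_usec)));
	unlink("trunc-test.tmp");
	return 0;
}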

>I actually talk about this a little bit at the end of the paper as
>well. I think gang-lookup when readahead has been turned off is
>potentially a good way to go, since we are fairly confident that all
>the pages are in the cache, unfortunately I haven't had (and probably
>won't) have any time to implement it. 
>
>

It is fairly difficult to do gang lookups in the cached cases,
but probably not impossible. But the code around there is already
probably as complicated as we would want it to be, so it would be
nice if it could be cleaned up first (or maybe it is already as
simple as possible :( ).

>Of course, if we go the lockless path it's much less important.
>
>

Yep, that's what I'm thinking. The lockless patches I have can do
lockless gang lookup and use it for truncate. It should be usable
in the same way as the current locked gang lookup is.

But of course gang lookup is only useful if a single read() call
asks for more than 1 page - is that a performance critical path?

Nick




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-30  0:56                 ` Nick Piggin
@ 2005-08-30  1:54                   ` Andrew Morton
  2005-08-30  2:46                   ` James Bottomley
  1 sibling, 0 replies; 19+ messages in thread
From: Andrew Morton @ 2005-08-30  1:54 UTC (permalink / raw)
  To: Nick Piggin; +Cc: sonnyrao, James.Bottomley, linux-kernel

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> But of course gang lookup is only useful if a single read() call
>  asks for more than 1 page - is that a performance critical path?

readahead should do gang lookups (or, preferably, find-next, when it's
implemented).  But nobody got around to it.


* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-30  0:56                 ` Nick Piggin
  2005-08-30  1:54                   ` Andrew Morton
@ 2005-08-30  2:46                   ` James Bottomley
  2005-08-30  2:53                     ` Nick Piggin
  1 sibling, 1 reply; 19+ messages in thread
From: James Bottomley @ 2005-08-30  2:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Sonny Rao, Andrew Morton, Linux Kernel

On Tue, 2005-08-30 at 10:56 +1000, Nick Piggin wrote:
> Sonny Rao wrote:
> 
> >On Mon, Aug 29, 2005 at 01:37:48PM +1000, Nick Piggin wrote:
> >
> >>s/common/only ?
> >>
> >>But the page tree is indexed by file offset rather than virtual
> >>address, and we try to span the file's pagecache with the smallest
> >>possible tree. So it will tend to make the trees taller.
> >>
> >>
> >
> >I did some experiments with different map-shift values,
> >interestingly overall read throughput didn't change much but the
> >cpu-utilization of radix_tree_lookup (from oprofile) changed quite a
> >bit, especially in the case of MAP_SHIFT == 4 :  
> >
> >http://www.linuxsymposium.org/2005/linuxsymposium_procv2.pdf 
> >
> >Look on page 80, where I have the table.
> >
> Nice. So we can see that 6 is a pretty good choice of shift,
> 4 is too low. That doesn't tell us much about 5, but if you
> fit the curve, 5 should be between 14 and 15 ... so getting
> expensive.

Actually, several crucial pieces of data are missing.  It's not hard to
imagine that the results for 8, 10 and 12 are all equal to within the
error bars.  The missing piece of data, of course, is how big the file
was, which would tell us how deep the tree was.

> Of course, different systems and different workloads will
> be different. But I'd be wary of going below 6 unless there
> is a good reason.
> 
> >>I'm curious: what do the benchmarks say about your gang lookup?
> >>
> >
> >Gang-lookup isn't used in the page-cache lookups presently, so I'm
> >not sure why optimizing it is very important -- unless someone is
> >planning on implementing gang-lookups for page-cache reads. This would
> >also cut down on number times a lock is taken and released (expensive,
> >in the case of rwlock).  Perhaps there is another reason?
> >
> >
> 
> Gang lookup is mainly used on IO paths but also on truncate,
> which is a reasonably fast path on some workloads (James,
> this is my suggestion for what you should test - truncate).

Actually, I don't think I can test this.  In order to show a difference
between index 5 and index 6 on 32 bit, I'd have to deal with files > 4GB
in size.  My 32 bit machines are the voyagers and only have 4GB discs.

The machine with all the huge discs, is, naturally, ia64.

James




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
  2005-08-30  2:46                   ` James Bottomley
@ 2005-08-30  2:53                     ` Nick Piggin
       [not found]                       ` <20050830052405.GB20843@localhost.localdomain>
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-08-30  2:53 UTC (permalink / raw)
  To: James Bottomley; +Cc: Sonny Rao, Andrew Morton, Linux Kernel

James Bottomley wrote:

>On Tue, 2005-08-30 at 10:56 +1000, Nick Piggin wrote:
>
>
>>Gang lookup is mainly used on IO paths but also on truncate,
>>which is a reasonably fast path on some workloads (James,
>>this is my suggestion for what you should test - truncate).
>>
>
>Actually, I don't think I can test this.  In order to show a difference
>between index 5 and index 6 on 32 bit, I'd have to deal with files > 4GB
>in size.  My 32 bit machines are the voyagers and only have 4GB discs.
>
>The machine with all the huge discs, is, naturally, ia64.
>
>

Sorry, I meant for testing your gang lookup speedups.

For testing regular lookups, yeah that's more difficult. For a
microbenchmark you can use sparse files, which can be a good
trick for testing pagecache performance without the IO.
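
A rough sketch of that trick (sizes arbitrary; on 32-bit build with
-D_FILE_OFFSET_BITS=64): ftruncate() allocates no blocks, yet each read of a
hole still instantiates a pagecache page.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const off_t size = 8LL << 30;		/* 8GB logical, zero blocks used */
	char buf[4096];
	off_t off;
	int fd = open("sparse-test.tmp", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, size) < 0) {
		perror("sparse file");
		return 1;
	}
	/* Each read of a hole instantiates a zeroed pagecache page; spacing
	 * the offsets out builds a wide radix tree with no disk I/O. */
	for (off = 0; off < size; off += 64 << 20)
		if (pread(fd, buf, sizeof(buf), off) < 0) {
			perror("pread");
			return 1;
		}
	/* ... now time lookups / gang lookups against this file ... */
	unlink("sparse-test.tmp");
	return 0;
}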

Nick




* Re: [PATCH] make radix tree gang lookup faster by using a bitmap search
       [not found]                       ` <20050830052405.GB20843@localhost.localdomain>
@ 2005-08-30 13:06                         ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2005-08-30 13:06 UTC (permalink / raw)
  To: Sonny Rao; +Cc: James Bottomley, Andrew Morton, Linux Kernel, sonny

Sonny Rao wrote:
> On Tue, Aug 30, 2005 at 12:53:18PM +1000, Nick Piggin wrote:

>>For testing regular lookups, yeah that's more difficult. For a
>>microbenchmark you can use sparse files, which can be a good
>>trick for testing pagecache performance without the IO.
> 
> 
> I have a feeling that testing using sparse files should be done with
> great care to avoid exaggerating the effects of the CPU cache --
> i.e. you still need to have quite a number of pages to look up, but
> spread them apart by large offsets so that the top few levels don't
> become quickly cached for the whole test. 
> 

That's true depending on what kind of test you want to run.

Just to be clear: by 'sparse files', I mean create a huge,
sparse on-disk file from which pagecache can be arbitrarily
populated without taking up disk space by read(2)ing from it.

From the pagecache's point of view, there would be no
difference between using that or an on-disk file.

Nick

-- 
SUSE Labs, Novell Inc.



