All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] fix quadratic behavior of shrink_dcache_parent()
@ 2007-02-09 22:01 Miklos Szeredi
  2007-02-10  0:00 ` Andrew Morton
  0 siblings, 1 reply; 8+ messages in thread
From: Miklos Szeredi @ 2007-02-09 22:01 UTC (permalink / raw)
  To: akpm; +Cc: Russ Cox, linux-fsdevel, linux-kernel

From: Miklos Szeredi <mszeredi@suse.cz>

The time shrink_dcache_parent() takes, grows quadratically with the
depth of the tree under 'parent'.  This starts to get noticable at
about 10,000.

These kinds of depths don't occur normally, and filesystems which
invoke shrink_dcache_parent() via d_invalidate() seem to have other
depth dependent timings, so it's not even easy to expose this problem.

However with FUSE it's easy to create a deep tree and d_invalidate()
will also get called.  This can make a syscall hang for a very long
time.

This is the original discovery of the problem by Russ Cox:

  http://article.gmane.org/gmane.comp.file-systems.fuse.devel/3826

The following patch fixes the quadratic behavior, by optionally
allowing prune_dcache() to prune ancestors of a dentry in one go,
instead of doing it one at a time.

Common code in dput() and prune_one_dentry() is extracted into a new
helper function d_kill().

shrink_dcache_parent() as well as shrink_dcache_sb() are converted to
use the ancestry-pruner option.  Only for shrink_dcache_memory() is
this behavior not desirable, so it keeps using the old algorithm.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>

---
Index: linux/fs/dcache.c
===================================================================
--- linux.orig/fs/dcache.c	2007-02-09 22:11:02.000000000 +0100
+++ linux/fs/dcache.c	2007-02-09 22:51:02.000000000 +0100
@@ -121,6 +121,28 @@ static void dentry_iput(struct dentry * 
 	}
 }
 
+/**
+ * d_kill - kill dentry and return parent
+ * @dentry: dentry to kill
+ *
+ * Called with dcache_lock and d_lock, releases both.  The dentry must
+ * already be unhashed and removed from the LRU.
+ *
+ * If this is the root of the dentry tree, return NULL.
+ */
+static struct dentry *d_kill(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	list_del(&dentry->d_u.d_child);
+	dentry_stat.nr_dentry--;	/* For d_free, below */
+	/*drops the locks, at that point nobody can reach this dentry */
+	dentry_iput(dentry);
+	parent = dentry->d_parent;
+	d_free(dentry);
+	return dentry == parent ? NULL : parent;
+}
+
 /* 
  * This is dput
  *
@@ -189,28 +211,17 @@ repeat:
 
 unhash_it:
 	__d_drop(dentry);
-
-kill_it: {
-		struct dentry *parent;
-
-		/* If dentry was on d_lru list
-		 * delete it from there
-		 */
-  		if (!list_empty(&dentry->d_lru)) {
-  			list_del(&dentry->d_lru);
-  			dentry_stat.nr_unused--;
-  		}
-  		list_del(&dentry->d_u.d_child);
-		dentry_stat.nr_dentry--;	/* For d_free, below */
-		/*drops the locks, at that point nobody can reach this dentry */
-		dentry_iput(dentry);
-		parent = dentry->d_parent;
-		d_free(dentry);
-		if (dentry == parent)
-			return;
-		dentry = parent;
-		goto repeat;
+kill_it:
+	/* If dentry was on d_lru list
+	 * delete it from there
+	 */
+	if (!list_empty(&dentry->d_lru)) {
+		list_del(&dentry->d_lru);
+		dentry_stat.nr_unused--;
 	}
+	dentry = d_kill(dentry);
+	if (dentry)
+		goto repeat;
 }
 
 /**
@@ -371,22 +382,41 @@ restart:
  * Throw away a dentry - free the inode, dput the parent.  This requires that
  * the LRU list has already been removed.
  *
+ * If prune_parents is true, try to prune ancestors as well.
+ *
  * Called with dcache_lock, drops it and then regains.
  * Called with dentry->d_lock held, drops it.
  */
-static void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry, int prune_parents)
 {
-	struct dentry * parent;
-
 	__d_drop(dentry);
-	list_del(&dentry->d_u.d_child);
-	dentry_stat.nr_dentry--;	/* For d_free, below */
-	dentry_iput(dentry);
-	parent = dentry->d_parent;
-	d_free(dentry);
-	if (parent != dentry)
-		dput(parent);
+	dentry = d_kill(dentry);
+	if (!prune_parents) {
+		if (dentry)
+			dput(dentry);
+		spin_lock(&dcache_lock);
+		return;
+	}
+
+	/*
+	 * Prune ancestors.  Locking is simpler than in dput(),
+	 * because dcache_lock needs to be taken anyway.
+	 */
 	spin_lock(&dcache_lock);
+	while (dentry) {
+		if (!atomic_dec_and_lock(&dentry->d_count, &dentry->d_lock))
+			return;
+
+		if (dentry->d_op && dentry->d_op->d_delete)
+			dentry->d_op->d_delete(dentry);
+		if (!list_empty(&dentry->d_lru)) {
+			list_del(&dentry->d_lru);
+			dentry_stat.nr_unused--;
+		}
+		__d_drop(dentry);
+		dentry = d_kill(dentry);
+		spin_lock(&dcache_lock);
+	}
 }
 
 /**
@@ -394,6 +424,7 @@ static void prune_one_dentry(struct dent
  * @count: number of entries to try and free
  * @sb: if given, ignore dentries for other superblocks
  *         which are being unmounted.
+ * @prune_parents: if true, try to prune ancestors as well in one go
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -404,7 +435,7 @@ static void prune_one_dentry(struct dent
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count, struct super_block *sb)
+static void prune_dcache(int count, struct super_block *sb, int prune_parents)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
@@ -464,7 +495,7 @@ static void prune_dcache(int count, stru
 		 * without taking the s_umount lock (I already hold it).
 		 */
 		if (sb && dentry->d_sb == sb) {
-			prune_one_dentry(dentry);
+			prune_one_dentry(dentry, prune_parents);
 			continue;
 		}
 		/*
@@ -479,7 +510,7 @@ static void prune_dcache(int count, stru
 		s_umount = &dentry->d_sb->s_umount;
 		if (down_read_trylock(s_umount)) {
 			if (dentry->d_sb->s_root != NULL) {
-				prune_one_dentry(dentry);
+				prune_one_dentry(dentry, prune_parents);
 				up_read(s_umount);
 				continue;
 			}
@@ -550,7 +581,7 @@ repeat:
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		prune_one_dentry(dentry, 1);
 		cond_resched_lock(&dcache_lock);
 		goto repeat;
 	}
@@ -829,7 +860,7 @@ void shrink_dcache_parent(struct dentry 
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found, parent->d_sb);
+		prune_dcache(found, parent->d_sb, 1);
 }
 
 /*
@@ -849,7 +880,7 @@ static int shrink_dcache_memory(int nr, 
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr, NULL);
+		prune_dcache(nr, NULL, 0);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-09 22:01 [PATCH] fix quadratic behavior of shrink_dcache_parent() Miklos Szeredi
@ 2007-02-10  0:00 ` Andrew Morton
  2007-02-10  0:23   ` Russ Cox
  0 siblings, 1 reply; 8+ messages in thread
From: Andrew Morton @ 2007-02-10  0:00 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Russ Cox, linux-fsdevel, linux-kernel

On Fri, 09 Feb 2007 23:01:06 +0100
Miklos Szeredi <miklos@szeredi.hu> wrote:

> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> The time shrink_dcache_parent() takes, grows quadratically with the
> depth of the tree under 'parent'.  This starts to get noticable at
> about 10,000.
> 
> These kinds of depths don't occur normally, and filesystems which
> invoke shrink_dcache_parent() via d_invalidate() seem to have other
> depth dependent timings, so it's not even easy to expose this problem.
> 
> However with FUSE it's easy to create a deep tree and d_invalidate()
> will also get called.  This can make a syscall hang for a very long
> time.
> 
> This is the original discovery of the problem by Russ Cox:
> 
>   http://article.gmane.org/gmane.comp.file-systems.fuse.devel/3826

"The file system mounted on /tmp/z in the example contains 2^50
directories".   heh.

I do wonder how realistic this problem is in real life.

> The following patch fixes the quadratic behavior, by optionally
> allowing prune_dcache() to prune ancestors of a dentry in one go,
> instead of doing it one at a time.
> 
> Common code in dput() and prune_one_dentry() is extracted into a new
> helper function d_kill().
> 
> shrink_dcache_parent() as well as shrink_dcache_sb() are converted to
> use the ancestry-pruner option.  Only for shrink_dcache_memory() is
> this behavior not desirable, so it keeps using the old algorithm.
> 

I wonder if we should be setting shrink_parents=1 in
shrink_dcache_memory()?  Because we have this problem where the dentry
slabs suffer lots of internal fragmentation and we end up with whole slab
pages pinned by a single directory dentry.  I expect that if
shrink_dcache_memory() were aggressive about reaping newly-childless
directory dentries, some improvements might be realised there.

If so, we should change prune_dcache() to return the number pruned, so that
shrink_dcache_memory() can keep its arithmetic correct.  Would require some
careful testing and is out of scope for your work.



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10  0:00 ` Andrew Morton
@ 2007-02-10  0:23   ` Russ Cox
  2007-02-10  0:40     ` Andrew Morton
  2007-02-10  8:46     ` Miklos Szeredi
  0 siblings, 2 replies; 8+ messages in thread
From: Russ Cox @ 2007-02-10  0:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel

> "The file system mounted on /tmp/z in the example contains 2^50
> directories".   heh.
>
> I do wonder how realistic this problem is in real life.

That's a fair concern, although I was trying this as part
of evaluating how much someone could hose a system
if we let them mount arbitrary FUSE servers.  And the
answer is: they could make it completely unusable,
requiring reboot.

I ran a later test that printed how deep it got into
the file tree and it was only a few hundred thousand
if I recall correctly.  A determined attacker might even
manage to do this in a normal file system.

But sure, it's not a common case.  ;-)

Russ

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10  0:23   ` Russ Cox
@ 2007-02-10  0:40     ` Andrew Morton
  2007-02-10  8:46     ` Miklos Szeredi
  1 sibling, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2007-02-10  0:40 UTC (permalink / raw)
  To: Russ Cox; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel

On Fri, 9 Feb 2007 19:23:31 -0500
"Russ Cox" <rsc@swtch.com> wrote:

> > "The file system mounted on /tmp/z in the example contains 2^50
> > directories".   heh.
> >
> > I do wonder how realistic this problem is in real life.
> 
> That's a fair concern, although I was trying this as part
> of evaluating how much someone could hose a system
> if we let them mount arbitrary FUSE servers.  And the
> answer is: they could make it completely unusable,
> requiring reboot.
> 
> I ran a later test that printed how deep it got into
> the file tree and it was only a few hundred thousand
> if I recall correctly.  A determined attacker might even
> manage to do this in a normal file system.
> 
> But sure, it's not a common case.  ;-)

Well that's a good point - sometimes people do crazy things on purpose.  We
were all University students once ;) 

The patches look nice and as I said, potentially of some use for memory
reclaim.  But I hope that someone who has worked on dcache.c more recently
than I has time to apply a toothcomb to this work.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10  0:23   ` Russ Cox
  2007-02-10  0:40     ` Andrew Morton
@ 2007-02-10  8:46     ` Miklos Szeredi
  2007-02-10 10:51       ` Andi Kleen
  2007-02-10 12:57       ` Russ Cox
  1 sibling, 2 replies; 8+ messages in thread
From: Miklos Szeredi @ 2007-02-10  8:46 UTC (permalink / raw)
  To: rsc; +Cc: akpm, linux-fsdevel, linux-kernel

> > "The file system mounted on /tmp/z in the example contains 2^50
> > directories".   heh.
> >
> > I do wonder how realistic this problem is in real life.
> 
> That's a fair concern, although I was trying this as part
> of evaluating how much someone could hose a system
> if we let them mount arbitrary FUSE servers.  And the
> answer is: they could make it completely unusable,
> requiring reboot.
> 
> I ran a later test that printed how deep it got into
> the file tree and it was only a few hundred thousand
> if I recall correctly.  A determined attacker might even
> manage to do this in a normal file system.

Unfortunately this patch doesn't completely solve this problem, since
the system will still be hosed due to all memory being used up by
dentries.  And I bet the OOM killer won't find the real target (du)
but will kill anything before that.

So the second part of the problem is to somehow limit the number of
dentries used.  Not easy...

Miklos

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10  8:46     ` Miklos Szeredi
@ 2007-02-10 10:51       ` Andi Kleen
  2007-02-10 12:57       ` Russ Cox
  1 sibling, 0 replies; 8+ messages in thread
From: Andi Kleen @ 2007-02-10 10:51 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: rsc, akpm, linux-fsdevel, linux-kernel

Miklos Szeredi <miklos@szeredi.hu> writes:
 
> So the second part of the problem is to somehow limit the number of
> dentries used.  Not easy...

OpenVZ has some existing work in this area to separate their virtual machines. 
I assume they will eventually submit it.

-Andi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10  8:46     ` Miklos Szeredi
  2007-02-10 10:51       ` Andi Kleen
@ 2007-02-10 12:57       ` Russ Cox
  2007-02-11 12:13         ` Miklos Szeredi
  1 sibling, 1 reply; 8+ messages in thread
From: Russ Cox @ 2007-02-10 12:57 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: akpm, linux-fsdevel, linux-kernel

> Unfortunately this patch doesn't completely solve this problem, since
> the system will still be hosed due to all memory being used up by
> dentries.  And I bet the OOM killer won't find the real target (du)
> but will kill anything before that.
>
> So the second part of the problem is to somehow limit the number of
> dentries used.  Not easy...

At least with this patch (if I am reading it correctly), once the offending
culprit is identified and the programs are killed off, everything will go
back to normal without a reboot.

Russ

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] fix quadratic behavior of shrink_dcache_parent()
  2007-02-10 12:57       ` Russ Cox
@ 2007-02-11 12:13         ` Miklos Szeredi
  0 siblings, 0 replies; 8+ messages in thread
From: Miklos Szeredi @ 2007-02-11 12:13 UTC (permalink / raw)
  To: rsc; +Cc: akpm, linux-fsdevel, linux-kernel

> > Unfortunately this patch doesn't completely solve this problem, since
> > the system will still be hosed due to all memory being used up by
> > dentries.  And I bet the OOM killer won't find the real target (du)
> > but will kill anything before that.
> >
> > So the second part of the problem is to somehow limit the number of
> > dentries used.  Not easy...
> 
> At least with this patch (if I am reading it correctly), once the offending
> culprit is identified and the programs are killed off, everything will go
> back to normal without a reboot.

Yes, it's basically a "normal" OOM situation.  What makes it worse
than usual, is that it's _kernel_ memory being used up which is not
attributed to any process.

So from the OOM killer's point of view it looks like none of the
userspace programs are responsible for the OOM, and it will just
select targets randomly.

By the time it has come to the actual culprits (the filesystem daemon
and 'du') it will have killed almost everything useful.

So it's definitely better than without the patch, but still has room
for improvement.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2007-02-11 12:14 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-09 22:01 [PATCH] fix quadratic behavior of shrink_dcache_parent() Miklos Szeredi
2007-02-10  0:00 ` Andrew Morton
2007-02-10  0:23   ` Russ Cox
2007-02-10  0:40     ` Andrew Morton
2007-02-10  8:46     ` Miklos Szeredi
2007-02-10 10:51       ` Andi Kleen
2007-02-10 12:57       ` Russ Cox
2007-02-11 12:13         ` Miklos Szeredi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.