linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Fix dcache race during umount
       [not found] <20060403133804.27986.patches@notabene>
@ 2006-04-03  3:40 ` NeilBrown
  2006-04-03 18:12   ` Balbir Singh
  0 siblings, 1 reply; 5+ messages in thread
From: NeilBrown @ 2006-04-03  3:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Jan Blunck, Kirill Korotaev, olh, bsingharora

I think we finally have consensus on this patch.  However please speak
up if we don't :-)

Patch is against 2.6.16-mm2

NeilBrown


### Comments for Changeset

The race is that the shrink_dcache_memory shrinker could get called
while a filesystem is being unmounted, and could try to prune
a dentry belonging to that filesystem.

If it does, then it will call in to iput on the inode while the
dentry is no longer able to be found by the umounting process.  If
iput takes a while, generic_shutdown_super could get all the way
though shrink_dcache_parent and shrink_dcache_anon and
invalidate_inodes without ever waiting on this particular inode.

Eventually the superblock gets freed anyway and if the iput tried to
touch it (which some filesystems certainly do), it will lose.  The
promised "Self-destruct in 5 seconds" doesn't lead to a nice day.

The race is closed by holding s_umount while calling prune_one_dentry
on someone else's dentry.  As a down_read_trylock is used,
shrink_dcache_memory will no longer try to prune the dentry of a
filesystem that is being unmounted, and unmount will not be able to
start until any such active prune_one_dentry completes.

This requires that prune_dcache *knows* which filesystem (if any) it
is doing the prune on behalf of so that it can be careful of other
filesystems.  shrink_dcache_memory isn't called it on behalf of any
filesystem, and so is careful of everything.

shrink_dcache_anon is now passed a super_block rather than the s_anon
list out of the superblock, so it can get the s_anon list itself, and
can pass the superblock down to prune_dcache.

I believe this race was first found by Kirill Korotaev.

Cc: Jan Blunck <jblunck@suse.de>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: olh@suse.de
Cc: bsingharora@gmail.com

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./fs/dcache.c            |   41 ++++++++++++++++++++++++++++-------------
 ./fs/super.c             |    2 +-
 ./include/linux/dcache.h |    2 +-
 3 files changed, 30 insertions(+), 15 deletions(-)

diff ./fs/dcache.c~current~ ./fs/dcache.c
--- ./fs/dcache.c~current~	2006-04-03 12:21:49.000000000 +1000
+++ ./fs/dcache.c	2006-04-03 12:21:55.000000000 +1000
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(stru
 /**
  * prune_dcache - shrink the dcache
  * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -392,7 +394,7 @@ static inline void prune_one_dentry(stru
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
@@ -419,15 +421,27 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		/* If the dentry was recently referenced, don't free it. */
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
- 			list_add(&dentry->d_lru, &dentry_unused);
- 			dentry_stat.nr_unused++;
- 			spin_unlock(&dentry->d_lock);
-			continue;
+		if (!(dentry->d_flags & DCACHE_REFERENCED) &&
+		    (!sb || dentry->d_sb == sb)) {
+			if (sb) {
+				prune_one_dentry(dentry);
+				continue;
+			}
+			/* Need to avoid race with generic_shutdown_super */
+			if (down_read_trylock(&dentry->d_sb->s_umount) &&
+			    dentry->d_sb->s_root != NULL) {
+				prune_one_dentry(dentry);
+				up_read(&dentry->d_sb->s_umount);
+				continue;
+			}
 		}
-		prune_one_dentry(dentry);
+		/* The dentry was recently referenced, or the filesystem
+		 * is being unmounted, so don't free it. */
+
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		list_add(&dentry->d_lru, &dentry_unused);
+		dentry_stat.nr_unused++;
+		spin_unlock(&dentry->d_lock);
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -630,7 +644,7 @@ void shrink_dcache_parent(struct dentry 
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
+		prune_dcache(found, parent->d_sb);
 }
 
 /**
@@ -643,9 +657,10 @@ void shrink_dcache_parent(struct dentry 
  * done under dcache_lock.
  *
  */
-void shrink_dcache_anon(struct hlist_head *head)
+void shrink_dcache_anon(struct super_block *sb)
 {
 	struct hlist_node *lp;
+	struct hlist_head *head = &sb->s_anon;
 	int found;
 	do {
 		found = 0;
@@ -668,7 +683,7 @@ void shrink_dcache_anon(struct hlist_hea
 			}
 		}
 		spin_unlock(&dcache_lock);
-		prune_dcache(found);
+		prune_dcache(found, sb);
 	} while(found);
 }
 
@@ -689,7 +704,7 @@ static int shrink_dcache_memory(int nr, 
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr);
+		prune_dcache(nr, NULL);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }

diff ./fs/super.c~current~ ./fs/super.c
--- ./fs/super.c~current~	2006-04-03 12:21:49.000000000 +1000
+++ ./fs/super.c	2006-04-03 12:21:55.000000000 +1000
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(&sb->s_anon);
+		shrink_dcache_anon(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);

diff ./include/linux/dcache.h~current~ ./include/linux/dcache.h
--- ./include/linux/dcache.h~current~	2006-04-03 12:21:49.000000000 +1000
+++ ./include/linux/dcache.h	2006-04-03 12:21:55.000000000 +1000
@@ -217,7 +217,7 @@ extern struct dentry * d_alloc_anon(stru
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
-extern void shrink_dcache_anon(struct hlist_head *);
+extern void shrink_dcache_anon(struct super_block *);
 extern int d_invalidate(struct dentry *);
 
 /* only used at mount-time */

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix dcache race during umount
  2006-04-03  3:40 ` [PATCH] Fix dcache race during umount NeilBrown
@ 2006-04-03 18:12   ` Balbir Singh
  2006-04-04  0:59     ` Neil Brown
  2006-04-06 14:05     ` [PATCH 2.6.17-rc1-mm1] BUG due to freed dentry in dcache race fix Balbir Singh
  0 siblings, 2 replies; 5+ messages in thread
From: Balbir Singh @ 2006-04-03 18:12 UTC (permalink / raw)
  To: NeilBrown
  Cc: Andrew Morton, linux-kernel, Jan Blunck, Kirill Korotaev, olh, balbir

Hi, Neil,

>
> Cc: Jan Blunck <jblunck@suse.de>
> Cc: Kirill Korotaev <dev@openvz.org>
> Cc: olh@suse.de
> Cc: bsingharora@gmail.com

Could you please make this balbir@in.ibm.com

>
> Signed-off-by: Neil Brown <neilb@suse.de>
>
<snip>
-               /* If the dentry was recently referenced, don't free it. */
-               if (dentry->d_flags & DCACHE_REFERENCED) {
-                       dentry->d_flags &= ~DCACHE_REFERENCED;
-                       list_add(&dentry->d_lru, &dentry_unused);
-                       dentry_stat.nr_unused++;
-                       spin_unlock(&dentry->d_lock);
-                       continue;
+               if (!(dentry->d_flags & DCACHE_REFERENCED) &&
+                   (!sb || dentry->d_sb == sb)) {

Comments for the condition please. Something like

/*
 * If the dentry is not DCACHED_REFERENCED, it is time to move it to LRU list,
 * provided the super block is NULL (which means we are trying to reclaim memory
 * or this dentry belongs to the same super block that we want to shrink.
 */

One side-effect of this check I see is

Earlier, all prune_dcache() calls would prune the dentry cache. This
condition will cause dentries belonging only those super blocks being
shrink'ed to be freed up. shrink_dcache_memory() will have to do the
additional work of freeing dentries (especially for file systems like
sysfs, procfs, etc). But the good thing is it should make the per
super block operations faster (like unmount). IMO, this is the correct
behaviour, but I am not sure of the side-effects.

+                       if (sb) {
+                               prune_one_dentry(dentry);
+                               continue;
+                       }
> +                       /* Need to avoid race with generic_shutdown_super */
> +                       if (down_read_trylock(&dentry->d_sb->s_umount) &&
> +                           dentry->d_sb->s_root != NULL) {

There is a probable bug here. What if down_read_trylock() succeeds and
dentry->d_sb->s_root == NULL? We still need to do an up_read before we
move on.
The comment would be better put as

/*
 * If we are able to acquire the umount semaphore, then the super
block cannot be unmounted
 * while we are pruning this dentry. This helps avoid a race condition
that is caused due to
 * intermediate reference counts held by the children of the dentry in
prune_one_dentry().
 * This leads to select_dcache_parent() ignoring those dentries,
leaving behind non-dput
 * dentries. The unmount happens before prune_one_dentry() can dput
the dentries.
 */

> +                               prune_one_dentry(dentry);
> +                               up_read(&dentry->d_sb->s_umount);
> +                               continue;
> +                       }
>                 }
<snip>

Looks good to go, except for the comments (especially the up_read() bug)

Balbir

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix dcache race during umount
  2006-04-03 18:12   ` Balbir Singh
@ 2006-04-04  0:59     ` Neil Brown
  2006-04-04  5:02       ` Balbir Singh
  2006-04-06 14:05     ` [PATCH 2.6.17-rc1-mm1] BUG due to freed dentry in dcache race fix Balbir Singh
  1 sibling, 1 reply; 5+ messages in thread
From: Neil Brown @ 2006-04-04  0:59 UTC (permalink / raw)
  To: balbir; +Cc: Andrew Morton, linux-kernel, Jan Blunck, Kirill Korotaev, olh

On Monday April 3, balbir@in.ibm.com wrote:
> Hi, Neil,
> 
> >
> > Cc: Jan Blunck <jblunck@suse.de>
> > Cc: Kirill Korotaev <dev@openvz.org>
> > Cc: olh@suse.de
> > Cc: bsingharora@gmail.com
> 
> Could you please make this balbir@in.ibm.com

Sure.

> 
> >
> > Signed-off-by: Neil Brown <neilb@suse.de>
> >
> <snip>
> -               /* If the dentry was recently referenced, don't free it. */
> -               if (dentry->d_flags & DCACHE_REFERENCED) {
> -                       dentry->d_flags &= ~DCACHE_REFERENCED;
> -                       list_add(&dentry->d_lru, &dentry_unused);
> -                       dentry_stat.nr_unused++;
> -                       spin_unlock(&dentry->d_lock);
> -                       continue;
> +               if (!(dentry->d_flags & DCACHE_REFERENCED) &&
> +                   (!sb || dentry->d_sb == sb)) {
> 
> Comments for the condition please. Something like
> 
> /*
>  * If the dentry is not DCACHED_REFERENCED, it is time to move it to LRU list,
>  * provided the super block is NULL (which means we are trying to reclaim memory
>  * or this dentry belongs to the same super block that we want to shrink.
>  */

Ok, thanks.  However it isn't time to "move it to the LRU list" but
rather time to "move it from the LRU list, out of the cache all
together, and through it away".

> 
> One side-effect of this check I see is
> 
> Earlier, all prune_dcache() calls would prune the dentry cache. This
> condition will cause dentries belonging only those super blocks being
> shrink'ed to be freed up. shrink_dcache_memory() will have to do the
> additional work of freeing dentries (especially for file systems like
> sysfs, procfs, etc). But the good thing is it should make the per
> super block operations faster (like unmount). IMO, this is the correct
> behaviour, but I am not sure of the side-effects.
> 

Hmm... yes, but there is a worse side-effect I think. If
shrink_dcache_memory finds a dentry that it cannot free, it will move
it to the head of the LRU, so unmount will not be able to find it so
easily, and will end up moving it back down to the tail.  I don't
think this can livelock, but it is unpleasant.

Rather than move these entries to the head of the list, I'd like to
leave them at the tail, and try to skip over entries that we might not
be able to free.



> +                       if (sb) {
> +                               prune_one_dentry(dentry);
> +                               continue;
> +                       }
> > +                       /* Need to avoid race with generic_shutdown_super */
> > +                       if (down_read_trylock(&dentry->d_sb->s_umount) &&
> > +                           dentry->d_sb->s_root != NULL) {
> 
> There is a probable bug here. What if down_read_trylock() succeeds and
> dentry->d_sb->s_root == NULL? We still need to do an up_read before we
> move on.

Oops, yes.  Thanks for that!

> The comment would be better put as
> 
> /*
>  * If we are able to acquire the umount semaphore, then the super
> block cannot be unmounted
>  * while we are pruning this dentry. This helps avoid a race condition
> that is caused due to
>  * intermediate reference counts held by the children of the dentry in
> prune_one_dentry().
>  * This leads to select_dcache_parent() ignoring those dentries,
> leaving behind non-dput
>  * dentries. The unmount happens before prune_one_dentry() can dput
> the dentries.
>  */

Well, I've beefed up the comment, but I would rather focus on the
prune_one_dentry holding an inode rather than holding a dentry.  I
think that problem is clearer and it comes to the same thing.

Patch to follow.

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix dcache race during umount
  2006-04-04  0:59     ` Neil Brown
@ 2006-04-04  5:02       ` Balbir Singh
  0 siblings, 0 replies; 5+ messages in thread
From: Balbir Singh @ 2006-04-04  5:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: Andrew Morton, linux-kernel, Jan Blunck, Kirill Korotaev, olh

> >  * If the dentry is not DCACHED_REFERENCED, it is time to move it to LRU list,
> >  * provided the super block is NULL (which means we are trying to reclaim memory
> >  * or this dentry belongs to the same super block that we want to shrink.
> >  */
>
> Ok, thanks.  However it isn't time to "move it to the LRU list" but
> rather time to "move it from the LRU list, out of the cache all
> together, and through it away".

Oops, yes

>
> >
> > One side-effect of this check I see is
> >
> > Earlier, all prune_dcache() calls would prune the dentry cache. This
> > condition will cause dentries belonging only those super blocks being
> > shrink'ed to be freed up. shrink_dcache_memory() will have to do the
> > additional work of freeing dentries (especially for file systems like
> > sysfs, procfs, etc). But the good thing is it should make the per
> > super block operations faster (like unmount). IMO, this is the correct
> > behaviour, but I am not sure of the side-effects.
> >
>
> Hmm... yes, but there is a worse side-effect I think. If
> shrink_dcache_memory finds a dentry that it cannot free, it will move
> it to the head of the LRU, so unmount will not be able to find it so
> easily, and will end up moving it back down to the tail.  I don't
> think this can livelock, but it is unpleasant.
>
> Rather than move these entries to the head of the list, I'd like to
> leave them at the tail, and try to skip over entries that we might not
> be able to free.

Yes and you could use the first pass of dcache_shrink_sb() to do that for you.

Thanks,
Balbir

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 2.6.17-rc1-mm1] BUG due to freed dentry in dcache race fix
  2006-04-03 18:12   ` Balbir Singh
  2006-04-04  0:59     ` Neil Brown
@ 2006-04-06 14:05     ` Balbir Singh
  1 sibling, 0 replies; 5+ messages in thread
From: Balbir Singh @ 2006-04-06 14:05 UTC (permalink / raw)
  To: NeilBrown, Andrew Morton; +Cc: linux-kernel, Jan Blunck, Kirill Korotaev, olh

Hi, Andrew,

Please apply this patch on top of

fix-dcache-race-during-umount.patch

we need to save a reference to the s_umount read write semaphore. The dentry
can be freed by prune_one_dentry(). Dereferencing dentry->d_sb->s_umount is
not safe after that point.

I hit an Oops while running 2.6.17-rc1-mm1

DMA free:3584kB min:68kB low:84kB high:100kB active:10448kB inactive:0kB presentOops: 0002 [#1]
PREEMPT SMP
last sysfs file: /devices/pci0000:00/0000:00:0a.0/power/state
Modules linked in: loop dm_mod ide_cd cdrom ohci_hcd usbcore serverworks generii
CPU:    1
EIP:    0060:[<c10824f1>]    Not tainted VLI
EFLAGS: 00010212   (2.6.17-rc1-mm1cpum #2)
EIP is at prune_dcache+0x91/0x1d0
eax: 6b6b6ba7   ebx: e45918e0   ecx: 00000001   edx: ffffffff
esi: e45918e8   edi: 00000058   ebp: e4cfcbe0   esp: e4cfcbbc
ds: 007b   es: 007b   ss: 0068
Process hackbench (pid: 11183, threadinfo=e4cfc000 task=e4d076b0)
Stack: <0>c12fb400 e4cfcbd0 c122f5ed c2288504 00000000 00000000 0000283c 000a0f
       c2259404 e4cfcbe8 c108266e e4cfcc28 c104fe9b 00000080 000000d0 0000000b
       00000021 00000000 e4cfc000 00000000 0000008c e4cfc000 00000080 00004db7
Call Trace:
 <c1003f9d> show_stack_log_lvl+0xad/0xe0   <c10041e7> show_registers+0x1c7/0x250
 <c10043aa> die+0x13a/0x330   <c1230f50> do_page_fault+0x2d0/0x750
 <c1003987> error_code+0x4f/0x54   <c108266e> shrink_dcache_memory+0x3e/0x50
 <c104fe9b> shrink_slab+0x17b/0x240   <c105077f> try_to_free_pages+0x1bf/0x2b0
 <c104b466> __alloc_pages+0x136/0x310   <c10635fc> cache_alloc_refill+0x40c/0x70
 <c1063b86> __kmalloc_track_caller+0xc6/0xf0   <c11d922f> __alloc_skb+0x5f/0x110
 <c11d5247> sock_alloc_send_skb+0x1a7/0x200   <c1227a2d> unix_stream_sendmsg+0x0
 <c11d1bb4> do_sock_write+0xb4/0xc0   <c11d2367> sock_aio_write+0x67/0x70
 <c1067809> do_sync_write+0xb9/0xf0   <c10682f1> vfs_write+0x181/0x190
 <c1068a07> sys_write+0x47/0x70   <c122f93f> sysenter_past_esp+0x54/0x75
Code: 0a 75 f3 85 c0 0f 88 fe 00 00 00 8b 4b 60 8b 41 38 85 c0 0f 84 de 00 00 0

Thanks,
Balbir

Signed-off-by: Balbir Singh <balbir@in.ibm.com>
---

 fs/dcache.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletion(-)

diff -puN fs/dcache.c~dcache_race_umount_sem_fix fs/dcache.c
--- linux-2.6.17/fs/dcache.c~dcache_race_umount_sem_fix	2006-04-06 17:11:41.000000000 +0530
+++ linux-2.6.17-balbir/fs/dcache.c	2006-04-06 17:17:02.000000000 +0530
@@ -464,9 +464,14 @@ static void prune_dcache(int count, stru
 		 * So we try to get s_umount, and make sure s_root isn't NULL
 		 */
 		if (down_read_trylock(&dentry->d_sb->s_umount)) {
+			/*
+			 * Save the semaphore reference, prune_one_dentry() can
+			 * free the dentry
+			 */
+			struct rw_semaphore *umnt_sem = &dentry->d_sb->s_umount;
 			if (dentry->d_sb->s_root != NULL) {
 				prune_one_dentry(dentry);
-				up_read(&dentry->d_sb->s_umount);
+				up_read(umnt_sem);
 				continue;
 			}
 			up_read(&dentry->d_sb->s_umount);
_

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2006-04-06 14:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20060403133804.27986.patches@notabene>
2006-04-03  3:40 ` [PATCH] Fix dcache race during umount NeilBrown
2006-04-03 18:12   ` Balbir Singh
2006-04-04  0:59     ` Neil Brown
2006-04-04  5:02       ` Balbir Singh
2006-04-06 14:05     ` [PATCH 2.6.17-rc1-mm1] BUG due to freed dentry in dcache race fix Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).