* [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-20 20:36 UTC
  To: viro; +Cc: dev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1522 bytes --]

Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
and shrink_dcache_memory(). It is caused by dput() calling dentry_iput(), and
therefore giving up the dcache_lock, too early. This leads to a situation
where the parent dentry may still be referenced although all of its children
are already dead. Such a parent is ignored by a concurrent select_parent()
call, which might be the reason for "busy inodes after umount" failures.

This is from Kirill's original patch:

CPU 1                    CPU 2
~~~~~                    ~~~~~
umount /dev/sda1
generic_shutdown_super   shrink_dcache_memory
shrink_dcache_parent     prune_one_dentry
select_parent            dput     <<<< child is dead, locks are released,
                                       but parent is still referenced!!! >>>>
skip dentry->parent,
since it's d_count > 0

message: BUSY inodes after umount...
                                  <<< parent is left on dentry_unused list,
                                      referencing freed super block >>>
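
For reference, the window is visible in the current kill_it: tail of dput()
(reconstructed from the lines removed by the diff below). Between
dentry_iput(), which drops the dcache_lock, and the next loop iteration,
which finally decrements the parent, the parent's d_count stays elevated:

	kill_it: {
		...
		/* drops the locks, at that point nobody can reach this dentry */
		dentry_iput(dentry);	/* dcache_lock is released in here */
		parent = dentry->d_parent;
		d_free(dentry);
		if (dentry == parent)
			return;
		dentry = parent;
		goto repeat;	/* the parent's d_count is only dropped by the
				   atomic_dec_and_lock() on the next pass */
	}

A concurrent select_parent() only collects dentries whose d_count is zero,
so it skips the parent during exactly this window.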

This patch introduces dput_locked(), which does all the dput() work except
for freeing the dentry's inode and memory itself. Therefore, by the time the
dcache_lock is given up, all the reference counts of the parents are correct.
prune_one_dentry() must also use the dput_locked() version and free the
inodes and the memory of the parents later. Otherwise we would have an
incorrect reference count on the parents of the dentry being pruned.

Signed-off-by: Jan Blunck <jblunck@suse.de>
---

[-- Attachment #2: dput-late_iput.diff --]
[-- Type: text/plain, Size: 3111 bytes --]

 fs/dcache.c |   76 ++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 22 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c
+++ linux-2.6/fs/dcache.c
@@ -143,21 +143,18 @@ static void dentry_iput(struct dentry * 
  * no dcache lock, please.
  */
 
-void dput(struct dentry *dentry)
+static void dput_locked(struct dentry *dentry, struct list_head *list)
 {
 	if (!dentry)
 		return;
 
-repeat:
-	if (atomic_read(&dentry->d_count) == 1)
-		might_sleep();
-	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
+	if (!atomic_dec_and_test(&dentry->d_count))
 		return;
 
+repeat:
 	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&dcache_lock);
 		return;
 	}
 
@@ -177,32 +174,54 @@ repeat:
   		dentry_stat.nr_unused++;
   	}
  	spin_unlock(&dentry->d_lock);
-	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
 	__d_drop(dentry);
 
 kill_it: {
-		struct dentry *parent;
-
 		/* If dentry was on d_lru list
 		 * delete it from there
 		 */
   		if (!list_empty(&dentry->d_lru)) {
-  			list_del(&dentry->d_lru);
+  			list_del_init(&dentry->d_lru);
   			dentry_stat.nr_unused--;
   		}
   		list_del(&dentry->d_u.d_child);
 		dentry_stat.nr_dentry--;	/* For d_free, below */
-		/*drops the locks, at that point nobody can reach this dentry */
-		dentry_iput(dentry);
-		parent = dentry->d_parent;
-		d_free(dentry);
-		if (dentry == parent)
+		/* at this point nobody can reach this dentry */
+		list_add(&dentry->d_lru, list);
+		spin_unlock(&dentry->d_lock);
+		if (dentry == dentry->d_parent)
 			return;
-		dentry = parent;
-		goto repeat;
+		dentry = dentry->d_parent;
+		if (atomic_dec_and_test(&dentry->d_count))
+			goto repeat;
+		/* out */
+	}
+}
+
+void dput(struct dentry *dentry)
+{
+	LIST_HEAD(free_list);
+
+	if (!dentry)
+		return;
+
+	if (atomic_add_unless(&dentry->d_count, -1, 1))
+		return;
+
+	spin_lock(&dcache_lock);
+	dput_locked(dentry, &free_list);
+	spin_unlock(&dcache_lock);
+
+	if (!list_empty(&free_list)) {
+		struct dentry *dentry, *p;
+		list_for_each_entry_safe(dentry, p, &free_list, d_lru) {
+			list_del(&dentry->d_lru);
+			dentry_iput(dentry);
+			d_free(dentry);
+		}
 	}
 }
 
@@ -364,16 +383,29 @@ restart:
  */
 static inline void prune_one_dentry(struct dentry * dentry)
 {
-	struct dentry * parent;
+	LIST_HEAD(free_list);
 
 	__d_drop(dentry);
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
-	dentry_iput(dentry);
-	parent = dentry->d_parent;
+
+	/* dput the parent here before we release dcache_lock */
+	if (dentry != dentry->d_parent)
+		dput_locked(dentry->d_parent, &free_list);
+
+	dentry_iput(dentry);		/* drop locks */
 	d_free(dentry);
-	if (parent != dentry)
-		dput(parent);
+
+	if (!list_empty(&free_list)) {
+		struct dentry *tmp, *p;
+
+		list_for_each_entry_safe(tmp, p, &free_list, d_lru) {
+			list_del(&tmp->d_lru);
+			dentry_iput(tmp);
+			d_free(tmp);
+		}
+	}
+
 	spin_lock(&dcache_lock);
 }
 


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Andrew Morton @ 2006-01-23  5:22 UTC
  To: Jan Blunck; +Cc: viro, dev, linux-kernel

Jan Blunck <jblunck@suse.de> wrote:
>
> Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
> and shrink_dcache_memory(). It is caused by dput() calling dentry_iput(), and
> therefore giving up the dcache_lock, too early. This leads to a situation
> where the parent dentry may still be referenced although all of its children
> are already dead. Such a parent is ignored by a concurrent select_parent()
> call, which might be the reason for "busy inodes after umount" failures.
> 
> This is from Kirill's original patch:
> 
> CPU 1                    CPU 2
> ~~~~~                    ~~~~~
> umount /dev/sda1
> generic_shutdown_super   shrink_dcache_memory
> shrink_dcache_parent     prune_one_dentry
> select_parent            dput     <<<< child is dead, locks are released,
>                                        but parent is still referenced!!! >>>>
> skip dentry->parent,
> since it's d_count > 0
> 
> message: BUSY inodes after umount...
>                                   <<< parent is left on dentry_unused list,
>                                       referencing freed super block >>>
> 
> This patch introduces dput_locked(), which does all the dput() work except
> for freeing the dentry's inode and memory itself. Therefore, by the time the
> dcache_lock is given up, all the reference counts of the parents are correct.
> prune_one_dentry() must also use the dput_locked() version and free the
> inodes and the memory of the parents later. Otherwise we would have an
> incorrect reference count on the parents of the dentry being pruned.
> 
> ...

> -void dput(struct dentry *dentry)
> +static void dput_locked(struct dentry *dentry, struct list_head *list)
>  {
>  	if (!dentry)
>  		return;
>  
> -repeat:
> -	if (atomic_read(&dentry->d_count) == 1)
> -		might_sleep();
> -	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
> +	if (!atomic_dec_and_test(&dentry->d_count))
>  		return;
>  
> +
>
> ...
>
> +void dput(struct dentry *dentry)
> +{
> +	LIST_HEAD(free_list);
> +
> +	if (!dentry)
> +		return;
> +
> +	if (atomic_add_unless(&dentry->d_count, -1, 1))
> +		return;
> +
> +	spin_lock(&dcache_lock);
> +	dput_locked(dentry, &free_list);
> +	spin_unlock(&dcache_lock);

This seems to be an open-coded copy of atomic_dec_and_lock()?
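
(For reference, a minimal sketch of what atomic_dec_and_lock() does, modeled
on the generic lib/dec_and_lock.c implementation of this era; the open-coded
sequence above mirrors its fast path and its lock-and-recheck slow path:)

	int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
	{
		/* fast path: decrement unless that would drop the count to 0 */
		if (atomic_add_unless(atomic, -1, 1))
			return 0;

		/* slow path: take the lock and re-check under it */
		spin_lock(lock);
		if (atomic_dec_and_test(atomic))
			return 1;	/* count hit 0, caller holds the lock */
		spin_unlock(lock);
		return 0;
	}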



* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-23  8:07 UTC
  To: Jan Blunck; +Cc: viro, linux-kernel, Andrew Morton, olh

Jan,

1. this patch doesn't fix the whole problem. iput() after sb free is 
still possible. So busy inodes after umount too.
2. it has big problems with locking...

comments below inside.

Kirill

> Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
> and shrink_dcache_memory(). It is caused by dput() calling dentry_iput(), and
> therefore giving up the dcache_lock, too early. This leads to a situation
> where the parent dentry may still be referenced although all of its children
> are already dead. Such a parent is ignored by a concurrent select_parent()
> call, which might be the reason for "busy inodes after umount" failures.
> 
> This is from Kirill's original patch:
> 
> CPU 1                    CPU 2
> ~~~~~                    ~~~~~
> umount /dev/sda1
> generic_shutdown_super   shrink_dcache_memory
> shrink_dcache_parent     prune_one_dentry
> select_parent            dput     <<<< child is dead, locks are released,
>                                        but parent is still referenced!!! >>>>
> skip dentry->parent,
> since it's d_count > 0
> 
> message: BUSY inodes after umount...
>                                   <<< parent is left on dentry_unused list,
>                                       referencing freed super block >>>
> 
> This patch introduces dput_locked(), which does all the dput() work except
> for freeing the dentry's inode and memory itself. Therefore, by the time the
> dcache_lock is given up, all the reference counts of the parents are correct.
> prune_one_dentry() must also use the dput_locked() version and free the
> inodes and the memory of the parents later. Otherwise we would have an
> incorrect reference count on the parents of the dentry being pruned.
> 
> Signed-off-by: Jan Blunck <jblunck@suse.de>
> ---
> 
> 
> ------------------------------------------------------------------------
> 
>  fs/dcache.c |   76 ++++++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 54 insertions(+), 22 deletions(-)
> 
> Index: linux-2.6/fs/dcache.c
> ===================================================================
> --- linux-2.6.orig/fs/dcache.c
> +++ linux-2.6/fs/dcache.c
> @@ -143,21 +143,18 @@ static void dentry_iput(struct dentry * 
>   * no dcache lock, please.
>   */
>  
> -void dput(struct dentry *dentry)
> +static void dput_locked(struct dentry *dentry, struct list_head *list)
>  {
>  	if (!dentry)
>  		return;
>  
> -repeat:
> -	if (atomic_read(&dentry->d_count) == 1)
> -		might_sleep();
> -	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
> +	if (!atomic_dec_and_test(&dentry->d_count))
>  		return;
>  
> +repeat:
>  	spin_lock(&dentry->d_lock);
>  	if (atomic_read(&dentry->d_count)) {
>  		spin_unlock(&dentry->d_lock);
> -		spin_unlock(&dcache_lock);
>  		return;
>  	}
>  
> @@ -177,32 +174,54 @@ repeat:
>    		dentry_stat.nr_unused++;
>    	}
>   	spin_unlock(&dentry->d_lock);
> -	spin_unlock(&dcache_lock);
>  	return;
>  
>  unhash_it:
>  	__d_drop(dentry);
>  
>  kill_it: {
> -		struct dentry *parent;
> -
>  		/* If dentry was on d_lru list
>  		 * delete it from there
>  		 */
>    		if (!list_empty(&dentry->d_lru)) {
> -  			list_del(&dentry->d_lru);
> +  			list_del_init(&dentry->d_lru);
>    			dentry_stat.nr_unused--;
>    		}
>    		list_del(&dentry->d_u.d_child);
>  		dentry_stat.nr_dentry--;	/* For d_free, below */
> -		/*drops the locks, at that point nobody can reach this dentry */
> -		dentry_iput(dentry);
> -		parent = dentry->d_parent;
> -		d_free(dentry);
> -		if (dentry == parent)
> +		/* at this point nobody can reach this dentry */
> +		list_add(&dentry->d_lru, list);
> +		spin_unlock(&dentry->d_lock);
> +		if (dentry == dentry->d_parent)
>  			return;
> -		dentry = parent;
> -		goto repeat;
> +		dentry = dentry->d_parent;
> +		if (atomic_dec_and_test(&dentry->d_count))
> +			goto repeat;
<<<< I would prefer to have "goto repeat" as it was before...
> +		/* out */
> +	}
> +}
> +
> +void dput(struct dentry *dentry)
> +{
> +	LIST_HEAD(free_list);
> +
> +	if (!dentry)
> +		return;
> +
> +	if (atomic_add_unless(&dentry->d_count, -1, 1))
> +		return;
<<<< I would rather introduce __dput_locked() w/o atomic_dec_and_test() 
instead of using this atomic_add_unless()...
<<<< To me it looks like an obfuscation of code, which must be clean 
and tidy.
> +
> +	spin_lock(&dcache_lock);
> +	dput_locked(dentry, &free_list);
> +	spin_unlock(&dcache_lock);
> +
<<<< 1. locking here is totally broken... spin_unlock() in dentry_iput()
<<<< 2. it doesn't help the situation I wrote to you,
<<<<    since iput() can be done on inode _after_ sb freeing...
> +	if (!list_empty(&free_list)) {
> +		struct dentry *dentry, *p;
> +		list_for_each_entry_safe(dentry, p, &free_list, d_lru) {
> +			list_del(&dentry->d_lru);
> +			dentry_iput(dentry);
> +			d_free(dentry);
> +		}
>  	}
>  }
>  
> @@ -364,16 +383,29 @@ restart:
>   */
>  static inline void prune_one_dentry(struct dentry * dentry)
>  {
> -	struct dentry * parent;
> +	LIST_HEAD(free_list);
>  
>  	__d_drop(dentry);
>  	list_del(&dentry->d_u.d_child);
>  	dentry_stat.nr_dentry--;	/* For d_free, below */
> -	dentry_iput(dentry);
> -	parent = dentry->d_parent;
> +
> +	/* dput the parent here before we release dcache_lock */
> +	if (dentry != dentry->d_parent)
> +		dput_locked(dentry->d_parent, &free_list);
> +
> +	dentry_iput(dentry);		/* drop locks */
<<<< comment 2) from dput()
>  	d_free(dentry);
> -	if (parent != dentry)
> -		dput(parent);
> +
> +	if (!list_empty(&free_list)) {
> +		struct dentry *tmp, *p;
> +
> +		list_for_each_entry_safe(tmp, p, &free_list, d_lru) {
> +			list_del(&tmp->d_lru);
> +			dentry_iput(tmp);
<<<< comment 1) from dput()
> +			d_free(tmp);
> +		}
> +	}
> +
>  	spin_lock(&dcache_lock);
>  }
>  




* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-23  8:12 UTC
  To: Andrew Morton; +Cc: Jan Blunck, viro, linux-kernel

>>Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
>>and shrink_dcache_memory(). It is caused by dput() calling dentry_iput(), and
>>therefore giving up the dcache_lock, too early. This leads to a situation
>>where the parent dentry may still be referenced although all of its children
>>are already dead. Such a parent is ignored by a concurrent select_parent()
>>call, which might be the reason for "busy inodes after umount" failures.
>>
>>This is from Kirill's original patch:
>>
>>CPU 1                    CPU 2
>>~~~~~                    ~~~~~
>>umount /dev/sda1
>>generic_shutdown_super   shrink_dcache_memory
>>shrink_dcache_parent     prune_one_dentry
>>select_parent            dput     <<<< child is dead, locks are released,
>>                                       but parent is still referenced!!! >>>>
>>skip dentry->parent,
>>since it's d_count > 0
>>
>>message: BUSY inodes after umount...
>>                                  <<< parent is left on dentry_unused list,
>>                                      referencing freed super block >>>
>>
>>This patch introduces dput_locked(), which does all the dput() work except
>>for freeing the dentry's inode and memory itself. Therefore, by the time the
>>dcache_lock is given up, all the reference counts of the parents are correct.
>>prune_one_dentry() must also use the dput_locked() version and free the
>>inodes and the memory of the parents later. Otherwise we would have an
>>incorrect reference count on the parents of the dentry being pruned.
>>
>>...
> 
> 
>>-void dput(struct dentry *dentry)
>>+static void dput_locked(struct dentry *dentry, struct list_head *list)
>> {
>> 	if (!dentry)
>> 		return;
>> 
>>-repeat:
>>-	if (atomic_read(&dentry->d_count) == 1)
>>-		might_sleep();
>>-	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
>>+	if (!atomic_dec_and_test(&dentry->d_count))
>> 		return;
>> 
>>+
>>
>>...
>>
>>+void dput(struct dentry *dentry)
>>+{
>>+	LIST_HEAD(free_list);
>>+
>>+	if (!dentry)
>>+		return;
>>+
>>+	if (atomic_add_unless(&dentry->d_count, -1, 1))
>>+		return;
>>+
>>+	spin_lock(&dcache_lock);
>>+	dput_locked(dentry, &free_list);
>>+	spin_unlock(&dcache_lock);
> 
> 
> This seems to be an open-coded copy of atomic_dec_and_lock()?

Yeah, this is what I also didn't like...
Why do it this way, when it's _really_ _possible_ to use the good old 
atomic_dec_and_test(), keeping the logic clearer?

Kirill



* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-23 15:13 UTC
  To: Andrew Morton; +Cc: viro, dev, linux-kernel

On Sun, Jan 22, Andrew Morton wrote:

> > -void dput(struct dentry *dentry)
> > +static void dput_locked(struct dentry *dentry, struct list_head *list)
> >  {
> >  	if (!dentry)
> >  		return;
> >  
> > -repeat:
> > -	if (atomic_read(&dentry->d_count) == 1)
> > -		might_sleep();
> > -	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
> > +	if (!atomic_dec_and_test(&dentry->d_count))
> >  		return;
> >  
> > +
> >
> > ...
> >
> > +void dput(struct dentry *dentry)
> > +{
> > +	LIST_HEAD(free_list);
> > +
> > +	if (!dentry)
> > +		return;
> > +
> > +	if (atomic_add_unless(&dentry->d_count, -1, 1))
> > +		return;
> > +
> > +	spin_lock(&dcache_lock);
> > +	dput_locked(dentry, &free_list);
> > +	spin_unlock(&dcache_lock);
> 
> This seems to be an open-coded copy of atomic_dec_and_lock()?
> 

Yes, it is. Otherwise the reference counting would have to look like

	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
		return;
	atomic_inc(&dentry->d_count);
	dput_locked(dentry, &free_list);

or something similarly stupid/racy.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-23 15:57 UTC
  To: Kirill Korotaev; +Cc: viro, linux-kernel, Andrew Morton, olh

On Mon, Jan 23, Kirill Korotaev wrote:

> 
> 1. this patch doesn't fix the whole problem. iput() after sb free is 
> still possible. So busy inodes after umount too.
> 2. it has big problems with locking...
> 

Yes, you're right. I'll fix that and send an updated version.

> >+			goto repeat;
> <<<< I would prefer to have "goto repeat" as it was before...
> >+		/* out */
> >+	}

Fair.

> >+	if (atomic_add_unless(&dentry->d_count, -1, 1))
> >+		return;
> <<<< I would rather introduce __dput_locked() w/o atomic_dec_and_test() 
> instead of using this atomic_add_unless()...
> <<<< To me it looks like an obfuscation of code, which must be clean 
> and tidy.

Then it isn't dput_locked() anymore and you have to drop the reference
manually before you use __dput_locked(). That doesn't sound better. I'll give
it a try ...

> >+
> >+	spin_lock(&dcache_lock);
> >+	dput_locked(dentry, &free_list);
> >+	spin_unlock(&dcache_lock);
> >+
> <<<< 1. locking here is totally broken... spin_unlock() in dentry_iput()

Yes, I totally missed the locking issue here. I'll rework that one.

> <<<< 2. it doesn't help the situation I wrote to you,
> <<<<    since iput() can be done on inode _after_ sb freeing...

Hmm, will think about that one again. shrink_dcache_parent() and
shrink_dcache_memory()/dput() are not racing against each other now, since
the reference counting is done before the dcache_lock is given up and a
select_parent() could start.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Balbir Singh @ 2006-01-24  5:54 UTC
  To: Jan Blunck; +Cc: Kirill Korotaev, viro, linux-kernel, Andrew Morton, olh

On Mon, Jan 23, 2006 at 04:57:28PM +0100, Jan Blunck wrote:
> On Mon, Jan 23, Kirill Korotaev wrote:
>
> [snip] 
>
> Hmm, will think about that one again. shrink_dcache_parent() and
> shrink_dcache_memory()/dput() are not racing against each other now, since
> the reference counting is done before the dcache_lock is given up and a
> select_parent() could start.
> 
> Regards,
> 	Jan
>

I have been playing around with a possible solution to the problem.
I have not been able to reproduce this issue, hence I am unable to verify
if the patch below fixes the problem. I have run the system with this
patch and verified that no obvious badness is observed.

Kirill, Jan if you can easily reproduce the problem, could you
try this patch and review it as well for correctness of the solution?

All callers that try to free memory have the PF_MEMALLOC flag set. We check
if the super block is going away due to an unmount; if so, we ask the
allocator to return.
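
(For context, a rough sketch, not part of this patch: in mm/page_alloc.c of
this era, direct reclaim runs bracketed by PF_MEMALLOC, which is how the
check below can tell that it is being called on behalf of the allocator:)

	/* __alloc_pages(), heavily simplified */
	p->flags |= PF_MEMALLOC;
	did_some_progress = try_to_free_pages(zones, gfp_mask);
	p->flags &= ~PF_MEMALLOC;

	/*
	 * try_to_free_pages() -> shrink_slab() -> shrink_dcache_memory()
	 * -> prune_dcache(), so prune_dcache() runs with PF_MEMALLOC set.
	 */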

The patch adds the additional cost of holding the sb_lock for each dentry
being pruned. It holds sb_lock under dentry->d_lock and dcache_lock;
I am not sure about the locking order of these locks.

Signed-off-by: Balbir Singh <balbir@in.ibm.com>
---

 fs/dcache.c |   23 +++++++++++++++++++++++
 1 files changed, 23 insertions(+)

diff -puN fs/dcache.c~dcache_race_fix2 fs/dcache.c
--- linux-2.6/fs/dcache.c~dcache_race_fix2	2006-01-24 11:05:46.000000000 +0530
+++ linux-2.6-balbir/fs/dcache.c	2006-01-24 11:05:46.000000000 +0530
@@ -425,6 +425,29 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+
+		/*
+		 * Note to reviewers: our current lock order is dcache_lock,
+		 * dentry->d_lock & sb_lock. Could this create a deadlock?
+		 */
+		spin_lock(&sb_lock);
+		if (!atomic_read(&dentry->d_sb->s_active)) {
+			/*
+			 * Race condition, umount and other pruning is happening
+			 * in parallel.
+			 */
+			if (current->flags & PF_MEMALLOC) {
+				/*
+				 * let the allocator leave this dentry alone
+				 */
+				spin_unlock(&sb_lock);
+				spin_unlock(&dentry->d_lock);
+				spin_unlock(&dcache_lock);
+				return;
+			}
+		}
+		spin_unlock(&sb_lock);
+
 		prune_one_dentry(dentry);
 	}
 	spin_unlock(&dcache_lock);

Thanks,
Balbir
_


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-24  9:48 UTC
  To: balbir; +Cc: Jan Blunck, viro, linux-kernel, Andrew Morton, olh

I like your idea, but some comments below... I doubt it works.
I will think it over a bit later...

Kirill
P.S. it's not easily reproducible. Before my fix it took us 3-6 hours of 
automated stress testing to hit this bug. Right now I can't set it up for 
testing, maybe in a week or so.

>>On Mon, Jan 23, Kirill Korotaev wrote:
>>
>>[snip] 
>>
>>Hmm, will think about that one again. shrink_dcache_parent() and
>>shrink_dcache_memory()/dput() are not racing against each other now, since
>>the reference counting is done before the dcache_lock is given up and a
>>select_parent() could start.
>>
>>Regards,
>>	Jan
>>
> 
> 
> I have been playing around with a possible solution to the problem.
> I have not been able to reproduce this issue, hence I am unable to verify
> if the patch below fixes the problem. I have run the system with this
> patch and verified that no obvious badness is observed.
> 
> Kirill, Jan if you can easily reproduce the problem, could you
> try this patch and review it as well for correctness of the solution?
> 
> All callers that try to free memory have the PF_MEMALLOC flag set. We check
> if the super block is going away due to an unmount; if so, we ask the
> allocator to return.
> 
> The patch adds the additional cost of holding the sb_lock for each dentry
> being pruned. It holds sb_lock under dentry->d_lock and dcache_lock;
> I am not sure about the locking order of these locks.
> 
> Signed-off-by: Balbir Singh <balbir@in.ibm.com>
> ---
> 
>  fs/dcache.c |   23 +++++++++++++++++++++++
>  1 files changed, 23 insertions(+)
> 
> diff -puN fs/dcache.c~dcache_race_fix2 fs/dcache.c
> --- linux-2.6/fs/dcache.c~dcache_race_fix2	2006-01-24 11:05:46.000000000 +0530
> +++ linux-2.6-balbir/fs/dcache.c	2006-01-24 11:05:46.000000000 +0530
> @@ -425,6 +425,29 @@ static void prune_dcache(int count)
>   			spin_unlock(&dentry->d_lock);
>  			continue;
>  		}
> +
> +		/*
> +		 * Note to reviewers: our current lock order is dcache_lock,
> +		 * dentry->d_lock & sb_lock. Could this create a deadlock?
> +		 */
> +		spin_lock(&sb_lock);
<<<< 1. sb_lock doesn't protect atomic_read() anyhow...
<<<<    I mean, sb_lock is not required to read its value...
> +		if (!atomic_read(&dentry->d_sb->s_active)) {
> +			/*
> +			 * Race condition, umount and other pruning is happening
> +			 * in parallel.
> +			 */
> +			if (current->flags & PF_MEMALLOC) {
> +				/*
> +				 * let the allocator leave this dentry alone
> +				 */
> +				spin_unlock(&sb_lock);
> +				spin_unlock(&dentry->d_lock);
> +				spin_unlock(&dcache_lock);
> +				return;
<<<< you should not return, but rather 'continue'. otherwise you skip 
_all_ dentries, even from active super blocks.
> +			}
> +		}
> +		spin_unlock(&sb_lock);
> +
<<<< and here, when you drop sb_lock, and dentry->d_lock/dcache_lock in 
prune_dentry() it looks to me that we have exactly the same situation as 
it was without your patch:
<<<< another CPU can start umount in parallel.
<<<< maybe sb_lock barrier helps this somehow, but I can't see how yet...

<<<< another idea: down_read(&sb->s_umount) probably could help...
<<<< because it will block the whole umount operation...
<<<< but we can't take it under dcache_lock...
>  		prune_one_dentry(dentry);
>  	}
>  	spin_unlock(&dcache_lock);
> 
> Thanks,
> Balbir
> _
> 




* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Balbir Singh @ 2006-01-24 11:10 UTC
  To: Kirill Korotaev; +Cc: Jan Blunck, viro, linux-kernel, Andrew Morton, olh

Hi, Kirill,

On Tue, Jan 24, 2006 at 12:48:13PM +0300, Kirill Korotaev wrote:
> I like your idea, but some comments below... I doubt it works.
> I will think it over a bit later...
> 

Thanks. Please find my comments and updated patch below

> Kirill
> P.S. it's not easily reproducible. Before my fix it took us 3-6 hours of 
> automated stress testing to hit this bug. Right now I can't set it up for 
> testing, maybe in a week or so.

Sure, please test whenever you set it up.

[snip]

> >+		spin_lock(&sb_lock);
> <<<< 1. sb_lock doesn't protect atomic_read() anyhow...
> <<<<    I mean, sb_lock is not required to read its value...

Good point, the sb_lock is not required. I have removed it.

> >+		if (!atomic_read(&dentry->d_sb->s_active)) {
> >+			/*
> >+			 * Race condition, umount and other pruning is 
> >happening
> >+			 * in parallel.
> >+			 */
> >+			if (current->flags & PF_MEMALLOC) {
> >+				/*
> >+				 * let the allocator leave this dentry alone
> >+				 */
> >+				spin_unlock(&sb_lock);
> >+				spin_unlock(&dentry->d_lock);
> >+				spin_unlock(&dcache_lock);
> >+				return;
> <<<< you should not return, but rather 'continue'. otherwise you skip 
> _all_ dentries, even from active super blocks.

Good point.

> >+			}
> >+		}
> >+		spin_unlock(&sb_lock);
> >+
> <<<< and here, when you drop sb_lock, and dentry->d_lock/dcache_lock in 
> prune_dentry() it looks to me that we have exactly the same situation as 
> it was without your patch:
> <<<< another CPU can start umount in parallel.
> <<<< maybe sb_lock barrier helps this somehow, but I can't see how yet...

From the unmount path, __mntput() is called. It sets s_active to 0 in
deactivate_super(), hence our check would prevent us from pruning a dentry
that is a part of a super block that is going to go away soon. The idea
is to let the unmount do all the work here; the allocator can concentrate
on other dentries.
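
(For reference, a sketch of deactivate_super(), modeled on fs/super.c of
this era; note the atomic_dec_and_lock() that drops s_active and the
s_umount semaphore being taken for write before the super block is killed:)

	void deactivate_super(struct super_block *s)
	{
		struct file_system_type *fs = s->s_type;
		if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
			s->s_count -= S_BIAS - 1;
			spin_unlock(&sb_lock);
			down_write(&s->s_umount);
			fs->kill_sb(s);
			put_filesystem(fs);
			put_super(s);
		}
	}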

> 
> <<<< another idea: down_read(&sb->s_umount) probably could help...
> <<<< because it will block the whole umount operation...
> <<<< but we can't take it under dcache_lock...

Yes, we cannot do a down* under a spinlock

[snip]

How does the modified patch look? 

Regards,
Balbir


Signed-off-by: Balbir Singh <balbir@in.ibm.com>
---

 fs/dcache.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+)

diff -puN fs/dcache.c~dcache_race_fix2 fs/dcache.c
--- linux-2.6/fs/dcache.c~dcache_race_fix2	2006-01-24 11:05:46.000000000 +0530
+++ linux-2.6-balbir/fs/dcache.c	2006-01-24 15:49:30.000000000 +0530
@@ -425,6 +425,21 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+
+		if (!atomic_read(&dentry->d_sb->s_active)) {
+			/*
+			 * Race condition, umount and other pruning is happening
+			 * in parallel.
+			 */
+			if (current->flags & PF_MEMALLOC) {
+				/*
+				 * Ask the allocator leave this dentry alone
+				 */
+				spin_unlock(&dentry->d_lock);
+				continue;
+			}
+		}
+
 		prune_one_dentry(dentry);
 	}
 	spin_unlock(&dcache_lock);
_


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-24 17:18 UTC
  To: balbir; +Cc: Jan Blunck, viro, linux-kernel, Andrew Morton, olh

>><<<< and here, when you drop sb_lock, and dentry->d_lock/dcache_lock in 
>>prune_dentry() it looks to me that we have exactly the same situation as 
>>it was without your patch:
>><<<< another CPU can start umount in parallel.
>><<<< maybe sb_lock barrier helps this somehow, but I can't see how yet...
> 
> From the unmount path, __mntput() is called. It sets s_active to 0 in
> deactivate_super(), hence our check would prevent us from pruning a dentry
> that is a part of a super block that is going to go away soon. The idea
> is to let the unmount do all the work here; the allocator can concentrate
> on other dentries.
your check can happen 1 nanosecond before it sets s_active, after that 
the code goes into prune_dentry(), while deactivate_super() successfully 
sets s_active and starts the main umount job. Nothing prevents the race... :(

Kirill



* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Balbir Singh @ 2006-01-25  7:03 UTC
  To: Kirill Korotaev; +Cc: Jan Blunck, viro, linux-kernel, Andrew Morton, olh

[snip]
> your check can happen 1 nanosecond before it sets s_active, after that 
> the code goes into prune_dentry(), while deactivate_super() successfully 
> sets s_active and starts the main umount job. Nothing prevents the race... :(
> 
> 
Yes, true. Thanks for pointing this out.

Now I am thinking about the s_umount semaphore that you mentioned yesterday.
I thought we could always do a down_read_trylock() on it under the 
dcache_lock.

Here is one more attempt at fixing the race :-)

Assumptions: 

1. Super block s is still valid after prune_one_dentry (found this to be true).
   This is required to do the up_read().

Costs:

1. Holding s_umount for each dentry

Comments?

Thanks,
Balbir




Signed-off-by: Balbir Singh <balbir@in.ibm.com>
---

 fs/dcache.c |   20 ++++++++++++++++++++
 1 files changed, 20 insertions(+)

diff -puN fs/dcache.c~dcache_race_fix2 fs/dcache.c
--- linux-2.6/fs/dcache.c~dcache_race_fix2	2006-01-24 11:05:46.000000000 +0530
+++ linux-2.6-balbir/fs/dcache.c	2006-01-25 12:16:06.000000000 +0530
@@ -396,6 +396,8 @@ static void prune_dcache(int count)
 	for (; count ; count--) {
 		struct dentry *dentry;
 		struct list_head *tmp;
+		int ret = 0;
+		struct super_block *s;
 
 		cond_resched_lock(&dcache_lock);
 
@@ -425,7 +427,25 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
+
+		/*
+		 * Is someone is unmounting the filesystem associated with
+		 * this dentry? If we are the allocator, leave the dentry
+		 * alone.
+		 */
+		s = dentry->d_sb;
+		if ((current->flags & PF_MEMALLOC) &&
+			!(ret = down_read_trylock(&s->s_umount))) {
+ 			spin_unlock(&dentry->d_lock);
+			continue;
+		}
+
 		prune_one_dentry(dentry);
+
+		if (ret) {
+			up_read(&s->s_umount);
+			ret = 0; /* for the next iteration */
+		}
 	}
 	spin_unlock(&dcache_lock);
 }
_


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-30 12:03 UTC
  To: Kirill Korotaev; +Cc: viro, linux-kernel, Andrew Morton, olh, balbir

[-- Attachment #1: Type: text/plain, Size: 701 bytes --]

On Mon, Jan 23, Kirill Korotaev wrote:

> 1. this patch doesn't fix the whole problem. iput() after sb free is 
> still possible. So busy inodes after umount too.
> 2. it has big problems with locking...
> 

Uh yeah! I fixed the second issue, but since the patch didn't help and only
got the reference counting a little bit cleaner, I won't post it.

> comments below inside.
> 

New patch attached below. Comments are welcome.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de

[-- Attachment #2: umount-prune_one_dentry-fix.diff --]
[-- Type: text/plain, Size: 3770 bytes --]

From: Jan Blunck <jblunck@suse.de>
Subject: Fix shrink_dcache_parent() against shrink_dcache_memory() race
References: 136310

Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
and shrink_dcache_memory() which leads to "Busy inodes after unmount".
When unmounting a file system, shrink_dcache_parent() races against a
possible shrink_dcache_memory(). This might lead to a situation where
shrink_dcache_parent() returns too early and the super_block is destroyed
before shrink_dcache_memory() has put the inode.

This patch fixes the problem by introducing a prunes counter, which is
incremented when a dentry has been pruned but the corresponding inode hasn't
been put yet. While the prunes counter is non-zero, shrink_dcache_parent()
waits and restarts its work.

Signed-off-by: Jan Blunck <jblunck@suse.de>

---

 fs/dcache.c        |   36 ++++++++++++++++++++++++++++++++++++
 fs/super.c         |    4 +++-
 include/linux/fs.h |    3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c
+++ linux-2.6/fs/dcache.c
@@ -364,17 +364,21 @@ restart:
  */
 static inline void prune_one_dentry(struct dentry * dentry)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct dentry * parent;
 
 	__d_drop(dentry);
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
+	sb->s_prunes++;
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
 	d_free(dentry);
 	if (parent != dentry)
 		dput(parent);
 	spin_lock(&dcache_lock);
+	sb->s_prunes--;
+	wake_up(&sb->s_wait_prunes);
 }
 
 /**
@@ -623,6 +627,34 @@ out:
 	return found;
 }
 
+static int wait_on_prunes(struct super_block *sb)
+{
+	DEFINE_WAIT(wait);
+
+	spin_lock(&dcache_lock);
+	if (!sb->s_prunes) {
+		spin_unlock(&dcache_lock);
+		return 0;
+	}
+
+	printk(KERN_DEBUG "%s: waiting for %d prunes\n", __FUNCTION__,
+	       sb->s_prunes);
+
+	while (1) {
+		prepare_to_wait(&sb->s_wait_prunes, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!sb->s_prunes)
+			break;
+		spin_unlock(&dcache_lock);
+		schedule();
+		spin_lock(&dcache_lock);
+	}
+
+	finish_wait(&sb->s_wait_prunes, &wait);
+	spin_unlock(&dcache_lock);
+	return 1;
+}
+
 /**
  * shrink_dcache_parent - prune dcache
  * @parent: parent of entries to prune
@@ -634,8 +666,12 @@ void shrink_dcache_parent(struct dentry 
 {
 	int found;
 
+ again:
 	while ((found = select_parent(parent)) != 0)
 		prune_dcache(found);
+
+	if (wait_on_prunes(parent->d_sb))
+		goto again;
 }
 
 /**
Index: linux-2.6/fs/super.c
===================================================================
--- linux-2.6.orig/fs/super.c
+++ linux-2.6/fs/super.c
@@ -80,6 +80,8 @@ static struct super_block *alloc_super(v
 		sema_init(&s->s_dquot.dqio_sem, 1);
 		sema_init(&s->s_dquot.dqonoff_sem, 1);
 		init_rwsem(&s->s_dquot.dqptr_sem);
+		s->s_prunes = 0;
+		init_waitqueue_head(&s->s_wait_prunes);
 		init_waitqueue_head(&s->s_wait_unfrozen);
 		s->s_maxbytes = MAX_NON_LFS;
 		s->dq_op = sb_dquot_ops;
@@ -230,8 +232,8 @@ void generic_shutdown_super(struct super
 
 	if (root) {
 		sb->s_root = NULL;
-		shrink_dcache_parent(root);
 		shrink_dcache_anon(&sb->s_anon);
+		shrink_dcache_parent(root);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -833,6 +833,9 @@ struct super_block {
 	struct list_head	s_instances;
 	struct quota_info	s_dquot;	/* Diskquota specific options */
 
+	int			s_prunes;
+	wait_queue_head_t	s_wait_prunes;
+
 	int			s_frozen;
 	wait_queue_head_t	s_wait_unfrozen;
 


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Balbir Singh @ 2006-01-30 14:38 UTC
  To: Jan Blunck; +Cc: Kirill Korotaev, viro, linux-kernel, Andrew Morton, olh

> 
> New patch attached below. Comments are welcome.
> 
> Regards,
> 	Jan
>
[snip]
 
> From: Jan Blunck <jblunck@suse.de>
> Subject: Fix shrink_dcache_parent() against shrink_dcache_memory() race
> References: 136310
> 
> Kirill Korotaev <dev@sw.ru> discovered a race between shrink_dcache_parent()
> and shrink_dcache_memory() which leads to "Busy inodes after unmount".
> When unmounting a file system, shrink_dcache_parent() races against a
> possible shrink_dcache_memory(). This might lead to a situation where
> shrink_dcache_parent() returns too early and the super_block is destroyed
> before shrink_dcache_memory() has put the inode.
> 
> This patch fixes the problem by introducing a prunes counter, which is
> incremented when a dentry has been pruned but the corresponding inode hasn't
> been put yet. While the prunes counter is non-zero, shrink_dcache_parent()
> waits and restarts its work.
> 
> Signed-off-by: Jan Blunck <jblunck@suse.de>
> 
> ---
> 
>  fs/dcache.c        |   36 ++++++++++++++++++++++++++++++++++++
>  fs/super.c         |    4 +++-
>  include/linux/fs.h |    3 +++
>  3 files changed, 42 insertions(+), 1 deletion(-)
> 
> Index: linux-2.6/fs/dcache.c
> ===================================================================
> --- linux-2.6.orig/fs/dcache.c
> +++ linux-2.6/fs/dcache.c
> @@ -364,17 +364,21 @@ restart:
>   */
>  static inline void prune_one_dentry(struct dentry * dentry)
>  {
> +	struct super_block *sb = dentry->d_sb;
>  	struct dentry * parent;
>  
>  	__d_drop(dentry);
>  	list_del(&dentry->d_u.d_child);
>  	dentry_stat.nr_dentry--;	/* For d_free, below */
> +	sb->s_prunes++;
>  	dentry_iput(dentry);
>  	parent = dentry->d_parent;
>  	d_free(dentry);
>  	if (parent != dentry)
>  		dput(parent);
>  	spin_lock(&dcache_lock);
> +	sb->s_prunes--;
> +	wake_up(&sb->s_wait_prunes);
>  }
>
  
We can think about optimizing this to
   if (!sb->s_prunes)
	wake_up(&sb->s_wait_prunes);

>  /**
> @@ -623,6 +627,34 @@ out:
>  	return found;
>  }
>  
> +static int wait_on_prunes(struct super_block *sb)
> +{
> +	DEFINE_WAIT(wait);
> +
> +	spin_lock(&dcache_lock);
> +	if (!sb->s_prunes) {
> +		spin_unlock(&dcache_lock);
> +		return 0;
> +	}
> +
> +	printk(KERN_DEBUG "%s: waiting for %d prunes\n", __FUNCTION__,
> +	       sb->s_prunes);
> +
> +	while (1) {
> +		prepare_to_wait(&sb->s_wait_prunes, &wait,
> +				TASK_UNINTERRUPTIBLE);
> +		if (!sb->s_prunes)
> +			break;
> +		spin_unlock(&dcache_lock);
> +		schedule();
> +		spin_lock(&dcache_lock);
> +	}
> +
> +	finish_wait(&sb->s_wait_prunes, &wait);
> +	spin_unlock(&dcache_lock);
> +	return 1;
> +}
> +
>  /**
>   * shrink_dcache_parent - prune dcache
>   * @parent: parent of entries to prune
> @@ -634,8 +666,12 @@ void shrink_dcache_parent(struct dentry 
>  {
>  	int found;
>  
> + again:
>  	while ((found = select_parent(parent)) != 0)
>  		prune_dcache(found);
> +
> +	if (wait_on_prunes(parent->d_sb))
> +		goto again;
>  }

Is the goto again required? At this point select_parent() should have pruned
all entries, except those missed due to the race. These should be captured
by sb->s_prunes. Once the code comes out of wait_on_prunes() everything
should be ok since a dput has happened on the missed parent dentries.

>  
>  /**
> Index: linux-2.6/fs/super.c
> ===================================================================
> --- linux-2.6.orig/fs/super.c
> +++ linux-2.6/fs/super.c
> @@ -80,6 +80,8 @@ static struct super_block *alloc_super(v
>  		sema_init(&s->s_dquot.dqio_sem, 1);
>  		sema_init(&s->s_dquot.dqonoff_sem, 1);
>  		init_rwsem(&s->s_dquot.dqptr_sem);
> +		s->s_prunes = 0;
> +		init_waitqueue_head(&s->s_wait_prunes);
>  		init_waitqueue_head(&s->s_wait_unfrozen);
>  		s->s_maxbytes = MAX_NON_LFS;
>  		s->dq_op = sb_dquot_ops;
> @@ -230,8 +232,8 @@ void generic_shutdown_super(struct super
>  
>  	if (root) {
>  		sb->s_root = NULL;
> -		shrink_dcache_parent(root);
>  		shrink_dcache_anon(&sb->s_anon);
> +		shrink_dcache_parent(root);
>  		dput(root);
>  		fsync_super(sb);
>  		lock_super(sb);
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h
> +++ linux-2.6/include/linux/fs.h
> @@ -833,6 +833,9 @@ struct super_block {
>  	struct list_head	s_instances;
>  	struct quota_info	s_dquot;	/* Diskquota specific options */
>  
> +	int			s_prunes;

Can this be an unsigned int? Perhaps you might want to mention that it is
protected by the dcache_lock.

> +	wait_queue_head_t	s_wait_prunes;
> +
>  	int			s_frozen;
>  	wait_queue_head_t	s_wait_unfrozen;
>  


Your fix seems correct at first sight and good to be included. But could you 
please do a correctness/speed/cost analysis of your fix compared with the 
fix I previously sent out?

Regards,
Balbir


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-30 14:42 UTC
  To: Jan Blunck; +Cc: viro, linux-kernel, Andrew Morton, olh, balbir

Hello Jan,

this is much cleaner now and looks more like my original patch, and is 
smaller/more beautiful with the counter usage. Thanks.

However, with a counter instead of a list it is possible to create a live 
lock :( So I'm not sure it is really ok.
BTW, what kernel is it for? 2.6.15 or 2.6.16-X?

Kirill

>>1. this patch doesn't fix the whole problem. iput() after sb free is 
>>still possible. So busy inodes after umount too.
>>2. it has big problems with locking...
>>
> 
> 
> Uh yeah! I fixed the second issue but since the patch doesnt helped and only
> gots the reference counting a little bit cleaner I don't post it.
> 
> 
>>comments below inside.
>>
> 
> 
> New patch attached below. Comments are welcome.
> 
> Regards,
> 	Jan
> 
> [snip patch]




* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-30 14:54 UTC
  To: Balbir Singh; +Cc: Kirill Korotaev, viro, linux-kernel, Andrew Morton, olh

On Mon, Jan 30, Balbir Singh wrote:

> >  static inline void prune_one_dentry(struct dentry * dentry)
> >  {
> > +	struct super_block *sb = dentry->d_sb;
> >  	struct dentry * parent;
> >  
> >  	__d_drop(dentry);
> >  	list_del(&dentry->d_u.d_child);
> >  	dentry_stat.nr_dentry--;	/* For d_free, below */
> > +	sb->s_prunes++;
> >  	dentry_iput(dentry);
> >  	parent = dentry->d_parent;
> >  	d_free(dentry);
> >  	if (parent != dentry)
> >  		dput(parent);
> >  	spin_lock(&dcache_lock);
> > +	sb->s_prunes--;
> > +	wake_up(&sb->s_wait_prunes);
> >  }
> >
>   
> We can think about optimizing this to
>    if (!sb->s_prunes)
> 	wake_up(&sb->s_wait_prunes);
> 

Hardly. This is only the case when two or more shrinkers are active in
parallel. If that were the case often, we would have seen this much more
frequently, IMHO.

> > @@ -634,8 +666,12 @@ void shrink_dcache_parent(struct dentry 
> >  {
> >  	int found;
> >  
> > + again:
> >  	while ((found = select_parent(parent)) != 0)
> >  		prune_dcache(found);
> > +
> > +	if (wait_on_prunes(parent->d_sb))
> > +		goto again;
> >  }
> 
> Is the goto again required? At this point select_parent() should have pruned
> all entries, except those missed due to the race. These should be captured
> by sb->s_prunes. Once the code comes out of wait_on_prunes() everything
> should be ok since a dput has happened on the missed parent dentries.

Yes, because the last select_parent() might have returned zero while the
parent of the dentry that is just being pruned hasn't been dereferenced yet.
Although we could change it to something like

	do {
		while ((found = select_parent(parent)) != 0)
			prune_dcache(found);
	} while (wait_on_prunes(parent->d_sb));


> > +++ linux-2.6/include/linux/fs.h
> > @@ -833,6 +833,9 @@ struct super_block {
> >  	struct list_head	s_instances;
> >  	struct quota_info	s_dquot;	/* Diskquota specific options */
> >  
> > +	int			s_prunes;
> 
> Can this be an unsigned int? Perhaps you might to mention that is protected
> by the dcache_lock.
> 

Yes, will fix that.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-30 14:58 UTC
  To: Kirill Korotaev; +Cc: viro, linux-kernel, Andrew Morton, olh, balbir

On Mon, Jan 30, Kirill Korotaev wrote:

> Hello Jan,
> 
> this is much cleaner now and looks more like my original patch, and is 
> smaller/more beautiful with the counter usage. Thanks.

Yes, it is heavily inspired by your patch.

> However, with a counter instead of a list it is possible to create a live 
> lock :( So I'm not sure it is really ok.

Hmm, I don't really get what you mean by "live lock".

> BTW, what kernel is it for? 2.6.15 or 2.6.16-X?

http://www.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git from
today.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-30 15:02 UTC
  To: Jan Blunck; +Cc: Balbir Singh, viro, linux-kernel, Andrew Morton, olh

>>We can think about optimizing this to
>>   if (!sb->s_prunes)
>>	wake_up(&sb->s_wait_prunes);
>>
> 
> 
> Hardly. This is only the case when two or more shrinkers are active in
> parallel. If that were the case often, we would have seen this much more
> frequently, IMHO.
But this avoids taking a 2nd lock on the fast path.

Kirill




* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Jan Blunck @ 2006-01-30 15:25 UTC
  To: Kirill Korotaev; +Cc: Balbir Singh, viro, linux-kernel, Andrew Morton, olh

On Mon, Jan 30, Kirill Korotaev wrote:

> >>We can think about optimizing this to
> >>  if (!sb->s_prunes)
> >>	wake_up(&sb->s_wait_prunes);
> >>
> >
> >
> >Hardly. This is only the case when two or more shrinkers are active in
> >parallel. If that were the case often, we would have seen this much more
> >frequently, IMHO.
> But this avoids taking a 2nd lock on the fast path.
> 

No, the fast path (more frequent) is s_prunes == 0.

sb->s_prunes--;
if (likely(!sb->s_prunes))
   wake_up(&sb->s_wait_prunes);

This is only optimizing a rare case ... and unmounting isn't very time
critical.

Regards,
	Jan

-- 
Jan Blunck                                               jblunck@suse.de
SuSE LINUX AG - A Novell company
Maxfeldstr. 5                                          +49-911-74053-608
D-90409 Nürnberg                                      http://www.suse.de


* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-30 15:31 UTC
  To: Jan Blunck; +Cc: Balbir Singh, viro, linux-kernel, Andrew Morton, olh

> No, the fast path (more frequent) is s_prunes == 0.
> 
> sb->s_prunes--;
> if (likely(!sb->s_prunes))
>    wake_up(&sb->s_wait_prunes);
> 
> This is only optimizing a rare case ... and unmounting isn't very time
> critical.
Yeah, you are right. I was thinking about 2 things at the same time and 
was wrong :)

Kirill



* Re: [PATCH] shrink_dcache_parent() races against shrink_dcache_memory()
From: Kirill Korotaev @ 2006-01-30 15:59 UTC
  To: Jan Blunck; +Cc: viro, linux-kernel, Andrew Morton, olh, balbir

Hello Jan,

>>
>>this is much cleaner now and looks more like my original patch, and is 
>>smaller/more beautiful with the counter usage. Thanks.
> 
> 
> Yes, it is heavily inspired by your patch.
thanks again. BTW, out of curiosity, why are you working on this?

>>However, with a counter instead of a list it is possible to create a live 
>>lock :( So I'm not sure it is really ok.
> 
> 
> Hmm, I don't really get what you mean by "live lock".
By "live lock" I mean the situation when you are "locked" in 
shrink_dcache_parent() due to wait_on_prunes() always returns 1.
We used shrinker list with a reference to dentry specially to avoid this 
as much as possible. I'm not sure how real such live lock can be 
created, but I can think it over.
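
(A sketch of the concern, in the style of the race diagram in the original
mail; if prunes of the same sb keep overlapping, wait_on_prunes() never
observes s_prunes == 0:)

	shrink_dcache_parent()              concurrent shrinkers
	~~~~~~~~~~~~~~~~~~~~~~              ~~~~~~~~~~~~~~~~~~~~
	wait_on_prunes() sees               prune_one_dentry(): s_prunes++
	s_prunes > 0, sleeps
	woken up, re-checks:                another prune starts: s_prunes++
	s_prunes still > 0, sleeps          first prune finishes: s_prunes--
	... and so on, as long as prunes keep overlapping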

>>BTW, what kernel is it for? 2.6.15 or 2.6.16-X?
> 
> 
> http://www.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git from
> today.
thanks!

Kirill

