All of lore.kernel.org
 help / color / mirror / Atom feed
* PATCH - Don't remove inode from hash until filesystem has deleted it.
@ 2003-05-08  1:44 Neil Brown
  2003-05-08 20:43 ` Jan Harkes
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Neil Brown @ 2003-05-08  1:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel


Andrew,
 I wonder if you would consider including this patch in -mm so that it
 gets some testing and review.

Thanks,
NeilBrown


------------------------------------------------------
Don't remove inode from hash until filesystem has deleted it.

There is a small race with knfsd using iget to get an inode
that is currently being deleted.  This is because it is removed
from the hash table *before* the filesystem gets to delete it.
If nfsd does an iget in this window it will cause a read_inode which
will return an apparently valid inode.  However that inode will
shortly be deleted from disc without knfsd noticing... until
it is too late.

With this patch, the inode being deleted is left on the hash table,
and if a lookup find an inode being freed in the hashtable, it waits
in the inode waitqueue for the inode to be fully deleted.

 ----------- Diffstat output ------------
 ./fs/fs-writeback.c |    3 ++-
 ./fs/inode.c        |   31 ++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff ./fs/fs-writeback.c~current~ ./fs/fs-writeback.c
--- ./fs/fs-writeback.c~current~	2003-05-02 16:41:32.000000000 +1000
+++ ./fs/fs-writeback.c	2003-05-02 16:41:33.000000000 +1000
@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *in
 		 * Only add valid (hashed) inodes to the superblock's
 		 * dirty list.  Add blockdev inodes as well.
 		 */
-		if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+		if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR)))
+		    && !S_ISBLK(inode->i_mode))
 			goto out;
 
 		/*

diff ./fs/inode.c~current~ ./fs/inode.c
--- ./fs/inode.c~current~	2003-05-02 16:41:32.000000000 +1000
+++ ./fs/inode.c	2003-05-02 16:41:33.000000000 +1000
@@ -466,6 +466,7 @@ static int shrink_icache_memory(int nr, 
 	return inodes_stat.nr_unused;
 }
 
+void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
  * NOTE: we are not increasing the inode-refcount, you must call __iget()
@@ -484,6 +485,11 @@ static struct inode * find_inode(struct 
 			continue;
 		if (!test(inode, data))
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			node = head->first;
+			continue;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -505,6 +511,11 @@ static struct inode * find_inode_fast(st
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			node = head->first;
+			continue;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -937,7 +948,6 @@ void generic_delete_inode(struct inode *
 {
 	struct super_operations *op = inode->i_sb->s_op;
 
-	hlist_del_init(&inode->i_hash);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -956,6 +966,10 @@ void generic_delete_inode(struct inode *
 		delete(inode);
 	} else
 		clear_inode(inode);
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+	wake_up_inode(inode);
 	if (inode->i_state != I_CLEAR)
 		BUG();
 	destroy_inode(inode);
@@ -1229,6 +1243,21 @@ repeat:
 	__set_current_state(TASK_RUNNING);
 }
 
+void __wait_on_freeing_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode_lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+	current->state = TASK_RUNNING;
+	spin_lock(&inode_lock);
+}
+
+
 void wake_up_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq = i_waitq_head(inode);

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH - Don't remove inode from hash until filesystem has deleted it.
  2003-05-08  1:44 PATCH - Don't remove inode from hash until filesystem has deleted it Neil Brown
@ 2003-05-08 20:43 ` Jan Harkes
  2003-05-09  4:36   ` Neil Brown
  2003-05-09  8:06 ` David Woodhouse
  2003-05-09 22:24 ` Andrew Morton
  2 siblings, 1 reply; 6+ messages in thread
From: Jan Harkes @ 2003-05-08 20:43 UTC (permalink / raw)
  To: Neil Brown; +Cc: Andrew Morton, linux-kernel

On Thu, May 08, 2003 at 11:44:32AM +1000, Neil Brown wrote:
> ------------------------------------------------------
> Don't remove inode from hash until filesystem has deleted it.
>
> With this patch, the inode being deleted is left on the hash table,
> and if a lookup find an inode being freed in the hashtable, it waits
> in the inode waitqueue for the inode to be fully deleted.

I could be wrong, but won't that break the following sequence of
operations,

    mkdir("foo", 0755);
    fd = creat("foo/bar", 0644);
    unlink("foo/bar");
    rmdir("foo"); /* this should succeed, but if the file is hashed
		     we get EBUSY here */
    close(fd);

Or have potential deadlock effects when rmdir is replaced with some
operation that tries to perform a lookup for the inode, f.i. a
stat("foo/bar", &statbuf);

Jan


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH - Don't remove inode from hash until filesystem has deleted it.
  2003-05-08 20:43 ` Jan Harkes
@ 2003-05-09  4:36   ` Neil Brown
  0 siblings, 0 replies; 6+ messages in thread
From: Neil Brown @ 2003-05-09  4:36 UTC (permalink / raw)
  To: Jan Harkes; +Cc: Andrew Morton, linux-kernel

On Thursday May 8, jaharkes@cs.cmu.edu wrote:
> On Thu, May 08, 2003 at 11:44:32AM +1000, Neil Brown wrote:
> > ------------------------------------------------------
> > Don't remove inode from hash until filesystem has deleted it.
> >
> > With this patch, the inode being deleted is left on the hash table,
> > and if a lookup find an inode being freed in the hashtable, it waits
> > in the inode waitqueue for the inode to be fully deleted.
> 
> I could be wrong, but won't that break the following sequence of
> operations,
> 
>     mkdir("foo", 0755);
>     fd = creat("foo/bar", 0644);
>     unlink("foo/bar");
>     rmdir("foo"); /* this should succeed, but if the file is hashed
> 		     we get EBUSY here */
>     close(fd);
> 
> Or have potential deadlock effects when rmdir is replaced with some
> operation that tries to perform a lookup for the inode, f.i. a
> stat("foo/bar", &statbuf);
> 
> Jan

Thankyou for your feedback.

However I don't think this is a problem.

The patch keeps the *inode* in the *inode hash table* slightly longer
than normal, but it does nothing to any dentry that might be connected
to the inode and might be hashed in a dentry table.
So when you unlink("foo/bar") the dentry is treated just as it always
is and is unhashed.

It is just the inode that instead of being unhash before the
filesystem it told that it is to be delete (this is different from
telling it that a filename is to be deleted), it is now unhash after
the filesystem has completed it's deletion process.

NeilBrown

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH - Don't remove inode from hash until filesystem has deleted it.
  2003-05-08  1:44 PATCH - Don't remove inode from hash until filesystem has deleted it Neil Brown
  2003-05-08 20:43 ` Jan Harkes
@ 2003-05-09  8:06 ` David Woodhouse
  2003-05-09 22:24 ` Andrew Morton
  2 siblings, 0 replies; 6+ messages in thread
From: David Woodhouse @ 2003-05-09  8:06 UTC (permalink / raw)
  To: Neil Brown; +Cc: Andrew Morton, linux-kernel

On Thu, 2003-05-08 at 02:44, Neil Brown wrote:
> Don't remove inode from hash until filesystem has deleted it.
> 
> There is a small race with knfsd using iget to get an inode
> that is currently being deleted.  This is because it is removed
> from the hash table *before* the filesystem gets to delete it.
> If nfsd does an iget in this window it will cause a read_inode which
> will return an apparently valid inode.  However that inode will
> shortly be deleted from disc without knfsd noticing... until
> it is too late.

JFFS2 suffers similarly from the VFS simultaneously calling read_inode()
and clear_inode() for the same inode, since it uses iget() internally
for garbage collection. Since deletion is a slow operation which
involves marking nodes obsolete on the medium, it's nice and easy to hit
the race and have jffs2_read_inode() start walking the list of nodes
belonging to a certain inode at the same time as jffs2_clear_inode() is
walking the same list freeing them all :)

I've added locking to prevent this from happening in that particular
case, by suspending garbage collection during jffs2_clear_inode(), but
that's undesirable and anyway, the problem still exists when a JFFS2
file system is exported by nfsd, if nfsd attempts to open the inode
while it's being deleted.

Your patch should solve that too -- looks sane to me.

-- 
dwmw2


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH - Don't remove inode from hash until filesystem has deleted it.
  2003-05-08  1:44 PATCH - Don't remove inode from hash until filesystem has deleted it Neil Brown
  2003-05-08 20:43 ` Jan Harkes
  2003-05-09  8:06 ` David Woodhouse
@ 2003-05-09 22:24 ` Andrew Morton
  2003-05-12  0:41   ` Neil Brown
  2 siblings, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2003-05-09 22:24 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-kernel

Neil Brown <neilb@cse.unsw.edu.au> wrote:
>
> 
> There is a small race with knfsd using iget to get an inode
> that is currently being deleted.  This is because it is removed
> from the hash table *before* the filesystem gets to delete it.
> If nfsd does an iget in this window it will cause a read_inode which
> will return an apparently valid inode.  However that inode will
> shortly be deleted from disc without knfsd noticing... until
> it is too late.

Cannot nfsd use igrab()?

> With this patch, the inode being deleted is left on the hash table,
> and if a lookup find an inode being freed in the hashtable, it waits
> in the inode waitqueue for the inode to be fully deleted.

Few things:

- Why the tests for I_CLEAR as well?

- There are lots of paths which set I_FREEING, and lots of paths which
  unhash inodes.  But only one path in which a waiter on
  __wait_on_freeing_inode() gets woken up.

  Are you sure there is sufficient coverage here?  That there are no
  paths by which someone goes to sleep on a freeing inode but never gets
  woken up?

- wart: when a task gets woken in __wait_on_freeing_inode(), it doesn't
  know that it got woken on behalf of the correct inode (hash collision). 
  So the inode can still be in state I_FREEING when
  __wait_on_freeing_inode() returns.

  Yeah, it happens that this is OK because the caller will just repeat the
  search, but that sort of subtlety needs to be covered in commentary.

- Cleanups:

 25-akpm/fs/inode.c |   10 ++++------
 1 files changed, 4 insertions(+), 6 deletions(-)

diff -puN fs/inode.c~inode-unhashing-fix-cleanups fs/inode.c
--- 25/fs/inode.c~inode-unhashing-fix-cleanups	Fri May  9 15:19:22 2003
+++ 25-akpm/fs/inode.c	Fri May  9 15:20:20 2003
@@ -478,6 +478,7 @@ static struct inode * find_inode(struct 
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) { 
 		prefetch(node->next);
 		inode = hlist_entry(node, struct inode, i_hash);
@@ -487,8 +488,7 @@ static struct inode * find_inode(struct 
 			continue;
 		if (inode->i_state & (I_FREEING|I_CLEAR)) {
 			__wait_on_freeing_inode(inode);
-			node = head->first;
-			continue;
+			goto repeat;
 		}
 		break;
 	}
@@ -504,6 +504,7 @@ static struct inode * find_inode_fast(st
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) {
 		prefetch(node->next);
 		inode = list_entry(node, struct inode, i_hash);
@@ -513,8 +514,7 @@ static struct inode * find_inode_fast(st
 			continue;
 		if (inode->i_state & (I_FREEING|I_CLEAR)) {
 			__wait_on_freeing_inode(inode);
-			node = head->first;
-			continue;
+			goto repeat;
 		}
 		break;
 	}
@@ -1253,11 +1253,9 @@ void __wait_on_freeing_inode(struct inod
 	spin_unlock(&inode_lock);
 	schedule();
 	remove_wait_queue(wq, &wait);
-	current->state = TASK_RUNNING;
 	spin_lock(&inode_lock);
 }
 
-
 void wake_up_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq = i_waitq_head(inode);

_


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: PATCH - Don't remove inode from hash until filesystem has deleted it.
  2003-05-09 22:24 ` Andrew Morton
@ 2003-05-12  0:41   ` Neil Brown
  0 siblings, 0 replies; 6+ messages in thread
From: Neil Brown @ 2003-05-12  0:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On Friday May 9, akpm@digeo.com wrote:
> Neil Brown <neilb@cse.unsw.edu.au> wrote:
> >
> > 
> > There is a small race with knfsd using iget to get an inode
> > that is currently being deleted.  This is because it is removed
> > from the hash table *before* the filesystem gets to delete it.
> > If nfsd does an iget in this window it will cause a read_inode which
> > will return an apparently valid inode.  However that inode will
> > shortly be deleted from disc without knfsd noticing... until
> > it is too late.
> 
> Cannot nfsd use igrab()?

That wouldn't help.  By the time nfsd gets the inode, and so has a
chance to use igrab, it is too late.

> 
> > With this patch, the inode being deleted is left on the hash table,
> > and if a lookup find an inode being freed in the hashtable, it waits
> > in the inode waitqueue for the inode to be fully deleted.
> 
> Few things:
> 
> - Why the tests for I_CLEAR as well?

Because the inode is unhashed (very) shortly *after* clear_inode is
called, which clears I_FREEING.
So instead of :
   unhash
   set I_FREEING
   call into filesystem
   change I_FREEING to I_CLEAR
   destroy

we now
   set I_FREEING
   call into filesystem
   change I_FREEING to I_CLEAR
   unhash
   destroy

so the inode is now still in the hash table while I_FREEING is set,
and momentarily while I_CLEAR is set as well.  We need to guard
against these posibilities.

> 
> - There are lots of paths which set I_FREEING, and lots of paths which
>   unhash inodes.  But only one path in which a waiter on
>   __wait_on_freeing_inode() gets woken up.
> 
>   Are you sure there is sufficient coverage here?  That there are no
>   paths by which someone goes to sleep on a freeing inode but never gets
>   woken up?

Yes (but please check my logic).

Sometimes we are Freeing and Clearing an inode because we aren't
interested in it any more and want to get it out of the cache.
Sometimes we are Freeing and Clearing because the inode is dead and is
to be completely forgotten.

In the first case we set I_FREEING atomically (via inode_lock) with
removing from the hash table.  In this case the inode will have always
been un-dirtied first so the filesystem storage device contains
up-to-date information.  If we go an read from disk instantly after
the unhas, we will get a valid inode.

It is only in the second case that an I_FREEING inode will ever appear
in the hash table.  In this case we set I_FREEING while the inode is
still potentially dirty.  The inode has been 'deleted' (link count +
ref count == 0) but the data on disc doesn't not yet reflect this.
It is in this case that we don't want to load an inode from disk
before the filesystem has completely finished with it, and so only in
this case that we need to wait.

So, the only time anyone will find an I_FREEING or I_CLEAR inode in
the hash table and so wait for it, is when that inode is being
deleted.

This only happens in the generic_delete_inode path and that one does
call wake_up_inode after unhashing.

> 
> - wart: when a task gets woken in __wait_on_freeing_inode(), it doesn't
>   know that it got woken on behalf of the correct inode (hash collision). 
>   So the inode can still be in state I_FREEING when
>   __wait_on_freeing_inode() returns.
> 
>   Yeah, it happens that this is OK because the caller will just repeat the
>   search, but that sort of subtlety needs to be covered in
>   commentary.

Fair comment.  I assume the "goto repeat" is partly to provide such
commentary?

> 
> - Cleanups:
> 
They all look good.  This makes the resives patch as below.

Thanks,
NeilBrown


---------------------------------------
Don't remove inode from hash until filesystem has deleted it.

There is a small race with knfsd using iget to get and inode
that is currently being deleted.  This is because it is removed
from the hash table *before* the filesystem gets to delete it.
If nfsd does an iget in this window it will cause a readinode which
will return an apparently valid inode.  However that inode will
shortly be deleted from disc without knfsd noticing... until
it is too late.

With this patch, the inode being deleted is left on the hash table,
and if a lookup find an inode being freed in the hashtable, it waits
in the inode waitqueue for the inode to be fully deleted.

 ----------- Diffstat output ------------
 ./fs/fs-writeback.c |    3 ++-
 ./fs/inode.c        |   29 ++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff ./fs/fs-writeback.c~current~ ./fs/fs-writeback.c
--- ./fs/fs-writeback.c~current~	2003-05-09 15:52:31.000000000 +1000
+++ ./fs/fs-writeback.c	2003-05-12 10:36:27.000000000 +1000
@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *in
 		 * Only add valid (hashed) inodes to the superblock's
 		 * dirty list.  Add blockdev inodes as well.
 		 */
-		if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+		if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR)))
+		    && !S_ISBLK(inode->i_mode))
 			goto out;
 
 		/*

diff ./fs/inode.c~current~ ./fs/inode.c
--- ./fs/inode.c~current~	2003-05-09 15:52:31.000000000 +1000
+++ ./fs/inode.c	2003-05-12 10:37:06.000000000 +1000
@@ -466,6 +466,7 @@ static int shrink_icache_memory(int nr, 
 	return inodes_stat.nr_unused;
 }
 
+void __wait_on_freeing_inode(struct inode *inode);
 /*
  * Called with the inode lock held.
  * NOTE: we are not increasing the inode-refcount, you must call __iget()
@@ -477,6 +478,7 @@ static struct inode * find_inode(struct 
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) { 
 		prefetch(node->next);
 		inode = hlist_entry(node, struct inode, i_hash);
@@ -484,6 +486,10 @@ static struct inode * find_inode(struct 
 			continue;
 		if (!test(inode, data))
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -498,6 +504,7 @@ static struct inode * find_inode_fast(st
 	struct hlist_node *node;
 	struct inode * inode = NULL;
 
+repeat:
 	hlist_for_each (node, head) {
 		prefetch(node->next);
 		inode = list_entry(node, struct inode, i_hash);
@@ -505,6 +512,10 @@ static struct inode * find_inode_fast(st
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -937,7 +948,6 @@ void generic_delete_inode(struct inode *
 {
 	struct super_operations *op = inode->i_sb->s_op;
 
-	hlist_del_init(&inode->i_hash);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -956,6 +966,10 @@ void generic_delete_inode(struct inode *
 		delete(inode);
 	} else
 		clear_inode(inode);
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+	wake_up_inode(inode);
 	if (inode->i_state != I_CLEAR)
 		BUG();
 	destroy_inode(inode);
@@ -1229,6 +1243,19 @@ repeat:
 	__set_current_state(TASK_RUNNING);
 }
 
+void __wait_on_freeing_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode_lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+	spin_lock(&inode_lock);
+}
+
 void wake_up_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq = i_waitq_head(inode);

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2003-05-12  0:29 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-08  1:44 PATCH - Don't remove inode from hash until filesystem has deleted it Neil Brown
2003-05-08 20:43 ` Jan Harkes
2003-05-09  4:36   ` Neil Brown
2003-05-09  8:06 ` David Woodhouse
2003-05-09 22:24 ` Andrew Morton
2003-05-12  0:41   ` Neil Brown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.