All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
@ 2016-05-20 20:50 Omar Sandoval
  2016-05-25 20:11 ` David Sterba
  0 siblings, 1 reply; 8+ messages in thread
From: Omar Sandoval @ 2016-05-20 20:50 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Chris Mason, Al Viro, kernel-team, Omar Sandoval

From: Omar Sandoval <osandov@fb.com>

Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
backed out the conversion to ->iterate_shared() for Btrfs because the
delayed inode handling in btrfs_real_readdir() is racy. However, we can
still do readdir in parallel if there are no delayed nodes.

This is a temporary fix which upgrades the shared inode lock to an
exclusive lock only when we have delayed items until we come up with a
more complete solution. While we're here, rename the
btrfs_{get,put}_delayed_items functions to make it very clear that
they're just for readdir.

Tested with xfstests and by doing a parallel kernel build:

	while make tinyconfig && make -j4 && git clean dqfx; do
		:
	done

along with a bunch of parallel finds in another shell:

	while true; do
		for ((i=0; i<4; i++)); do
			find . >/dev/null &
		done
		wait
	done

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/btrfs/delayed-inode.c | 27 ++++++++++++++++++++++-----
 fs/btrfs/delayed-inode.h | 10 ++++++----
 fs/btrfs/inode.c         | 10 ++++++----
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6cef0062f929..d60cd17ea66b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1606,15 +1606,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode)
 	return 0;
 }
 
-void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
-			     struct list_head *del_list)
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+				     struct list_head *ins_list,
+				     struct list_head *del_list)
 {
 	struct btrfs_delayed_node *delayed_node;
 	struct btrfs_delayed_item *item;
 
 	delayed_node = btrfs_get_delayed_node(inode);
 	if (!delayed_node)
-		return;
+		return false;
+
+	/*
+	 * We can only do one readdir with delayed items at a time because of
+	 * item->readdir_list.
+	 */
+	inode_unlock_shared(inode);
+	inode_lock(inode);
 
 	mutex_lock(&delayed_node->mutex);
 	item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1641,10 +1649,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
 	 * requeue or dequeue this delayed node.
 	 */
 	atomic_dec(&delayed_node->refs);
+
+	return true;
 }
 
-void btrfs_put_delayed_items(struct list_head *ins_list,
-			     struct list_head *del_list)
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+				     struct list_head *ins_list,
+				     struct list_head *del_list)
 {
 	struct btrfs_delayed_item *curr, *next;
 
@@ -1659,6 +1670,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 		if (atomic_dec_and_test(&curr->refs))
 			kfree(curr);
 	}
+
+	/*
+	 * The VFS is going to do up_read(), so we need to downgrade back to a
+	 * read lock.
+	 */
+	downgrade_write(&inode->i_rwsem);
 }
 
 int btrfs_should_delete_dir_index(struct list_head *del_list,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 0167853c84ae..2495b3d4075f 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
 void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
 
 /* Used for readdir() */
-void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
-			     struct list_head *del_list);
-void btrfs_put_delayed_items(struct list_head *ins_list,
-			     struct list_head *del_list);
+bool btrfs_readdir_get_delayed_items(struct inode *inode,
+				     struct list_head *ins_list,
+				     struct list_head *del_list);
+void btrfs_readdir_put_delayed_items(struct inode *inode,
+				     struct list_head *ins_list,
+				     struct list_head *del_list);
 int btrfs_should_delete_dir_index(struct list_head *del_list,
 				  u64 index);
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b7fe291a174..6ab6ca195f2f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5733,6 +5733,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	int name_len;
 	int is_curr = 0;	/* ctx->pos points to the current index? */
 	bool emitted;
+	bool put = false;
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
@@ -5750,7 +5751,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		INIT_LIST_HEAD(&ins_list);
 		INIT_LIST_HEAD(&del_list);
-		btrfs_get_delayed_items(inode, &ins_list, &del_list);
+		put = btrfs_readdir_get_delayed_items(inode, &ins_list,
+						      &del_list);
 	}
 
 	key.type = key_type;
@@ -5897,8 +5899,8 @@ next:
 nopos:
 	ret = 0;
 err:
-	if (key_type == BTRFS_DIR_INDEX_KEY)
-		btrfs_put_delayed_items(&ins_list, &del_list);
+	if (put)
+		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -10181,7 +10183,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.iterate	= btrfs_real_readdir,
+	.iterate_shared	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
-- 
2.8.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-05-20 20:50 [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes Omar Sandoval
@ 2016-05-25 20:11 ` David Sterba
  2016-05-25 20:22   ` Chris Mason
  0 siblings, 1 reply; 8+ messages in thread
From: David Sterba @ 2016-05-25 20:11 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: linux-btrfs, Chris Mason, Al Viro, kernel-team, Omar Sandoval

On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> backed out the conversion to ->iterate_shared() for Btrfs because the
> delayed inode handling in btrfs_real_readdir() is racy. However, we can
> still do readdir in parallel if there are no delayed nodes.

So this is for current master (pre 4.7-rc1), I'll add an appropriate
merge point for to my for-next.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-05-25 20:11 ` David Sterba
@ 2016-05-25 20:22   ` Chris Mason
  2016-05-25 20:48     ` Al Viro
  2016-06-17 17:55     ` Omar Sandoval
  0 siblings, 2 replies; 8+ messages in thread
From: Chris Mason @ 2016-05-25 20:22 UTC (permalink / raw)
  To: dsterba, Omar Sandoval, linux-btrfs, Al Viro, kernel-team, Omar Sandoval

On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > backed out the conversion to ->iterate_shared() for Btrfs because the
> > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > still do readdir in parallel if there are no delayed nodes.
> 
> So this is for current master (pre 4.7-rc1), I'll add an appropriate
> merge point for to my for-next.

I'll get this bashed on in a big stress.sh run, but it looks good to me.

-chris


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-05-25 20:22   ` Chris Mason
@ 2016-05-25 20:48     ` Al Viro
  2016-05-25 22:11       ` Omar Sandoval
  2016-06-17 17:55     ` Omar Sandoval
  1 sibling, 1 reply; 8+ messages in thread
From: Al Viro @ 2016-05-25 20:48 UTC (permalink / raw)
  To: Chris Mason, dsterba, Omar Sandoval, linux-btrfs, kernel-team,
	Omar Sandoval

On Wed, May 25, 2016 at 04:22:26PM -0400, Chris Mason wrote:
> On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> > On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > > backed out the conversion to ->iterate_shared() for Btrfs because the
> > > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > > still do readdir in parallel if there are no delayed nodes.
> > 
> > So this is for current master (pre 4.7-rc1), I'll add an appropriate
> > merge point for to my for-next.
> 
> I'll get this bashed on in a big stress.sh run, but it looks good to me.

I really don't like that approach, TBH ;-/  Is there any reason to exclude
lookups for the duration of that thing?  Conversely, are we really OK
with changes to directory happening during that "unlock and relock exclusive"?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-05-25 20:48     ` Al Viro
@ 2016-05-25 22:11       ` Omar Sandoval
  0 siblings, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2016-05-25 22:11 UTC (permalink / raw)
  To: Al Viro; +Cc: Chris Mason, dsterba, linux-btrfs, kernel-team, Omar Sandoval

On Wed, May 25, 2016 at 09:48:21PM +0100, Al Viro wrote:
> On Wed, May 25, 2016 at 04:22:26PM -0400, Chris Mason wrote:
> > On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> > > On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > > > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > > > backed out the conversion to ->iterate_shared() for Btrfs because the
> > > > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > > > still do readdir in parallel if there are no delayed nodes.
> > > 
> > > So this is for current master (pre 4.7-rc1), I'll add an appropriate
> > > merge point for to my for-next.
> > 
> > I'll get this bashed on in a big stress.sh run, but it looks good to me.
> 
> I really don't like that approach, TBH ;-/  Is there any reason to exclude
> lookups for the duration of that thing?

Nope, no reason at all. We'll fix it properly, hopefully by 4.8, but we
at least want the parallelism for now in the easy case when we don't
have delayed nodes.

> Conversely, are we really OK
> with changes to directory happening during that "unlock and relock exclusive"?

Yeah, so at that point, the only thing we've done is checked if we had
to emit the "." and ".." entries, which is safe because of f_pos_lock,
and allocated a struct btrfs_path. So we'd be alright if something came
in and changed the directory during that window.

-- 
Omar

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-05-25 20:22   ` Chris Mason
  2016-05-25 20:48     ` Al Viro
@ 2016-06-17 17:55     ` Omar Sandoval
  2016-06-17 17:59       ` Omar Sandoval
  2016-06-20 15:07       ` David Sterba
  1 sibling, 2 replies; 8+ messages in thread
From: Omar Sandoval @ 2016-06-17 17:55 UTC (permalink / raw)
  To: Chris Mason, dsterba, linux-btrfs, Al Viro, kernel-team,
	Omar Sandoval, g

On Wed, May 25, 2016 at 04:22:26PM -0400, Chris Mason wrote:
> On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> > On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > > backed out the conversion to ->iterate_shared() for Btrfs because the
> > > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > > still do readdir in parallel if there are no delayed nodes.
> > 
> > So this is for current master (pre 4.7-rc1), I'll add an appropriate
> > merge point for to my for-next.
> 
> I'll get this bashed on in a big stress.sh run, but it looks good to me.
> 
> -chris
> 

Chris/Dave, what's the plan for this? Since it's getting late for 4.7,
should we just think harder for a proper solution for 4.8?

-- 
Omar

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-06-17 17:55     ` Omar Sandoval
@ 2016-06-17 17:59       ` Omar Sandoval
  2016-06-20 15:07       ` David Sterba
  1 sibling, 0 replies; 8+ messages in thread
From: Omar Sandoval @ 2016-06-17 17:59 UTC (permalink / raw)
  To: Chris Mason, dsterba, linux-btrfs, Al Viro, kernel-team, Omar Sandoval

On Fri, Jun 17, 2016 at 10:55:54AM -0700, Omar Sandoval wrote:
> On Wed, May 25, 2016 at 04:22:26PM -0400, Chris Mason wrote:
> > On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> > > On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > > > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > > > backed out the conversion to ->iterate_shared() for Btrfs because the
> > > > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > > > still do readdir in parallel if there are no delayed nodes.
> > > 
> > > So this is for current master (pre 4.7-rc1), I'll add an appropriate
> > > merge point for to my for-next.
> > 
> > I'll get this bashed on in a big stress.sh run, but it looks good to me.
> > 
> > -chris
> > 
> 
> Chris/Dave, what's the plan for this? Since it's getting late for 4.7,
> should we just think harder for a proper solution for 4.8?
> 
> -- 
> Omar

Whoops, I hit "g" in Mutt one too many times and sent to a bogus
recipient, ignore that guy.

-- 
Omar

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes
  2016-06-17 17:55     ` Omar Sandoval
  2016-06-17 17:59       ` Omar Sandoval
@ 2016-06-20 15:07       ` David Sterba
  1 sibling, 0 replies; 8+ messages in thread
From: David Sterba @ 2016-06-20 15:07 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: Chris Mason, dsterba, linux-btrfs, Al Viro, kernel-team, Omar Sandoval

On Fri, Jun 17, 2016 at 10:55:54AM -0700, Omar Sandoval wrote:
> On Wed, May 25, 2016 at 04:22:26PM -0400, Chris Mason wrote:
> > On Wed, May 25, 2016 at 10:11:29PM +0200, David Sterba wrote:
> > > On Fri, May 20, 2016 at 01:50:33PM -0700, Omar Sandoval wrote:
> > > > Commit fe742fd4f90f ("Revert "btrfs: switch to ->iterate_shared()"")
> > > > backed out the conversion to ->iterate_shared() for Btrfs because the
> > > > delayed inode handling in btrfs_real_readdir() is racy. However, we can
> > > > still do readdir in parallel if there are no delayed nodes.
> > > 
> > > So this is for current master (pre 4.7-rc1), I'll add an appropriate
> > > merge point for to my for-next.
> > 
> > I'll get this bashed on in a big stress.sh run, but it looks good to me.
> > 
> > -chris
> > 
> 
> Chris/Dave, what's the plan for this? Since it's getting late for 4.7,
> should we just think harder for a proper solution for 4.8?

There are a few more fixes queued for rc5 so I can add this one as well.
It's been in for-next since the first posting but I haven't run any
specific tests.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-06-20 15:54 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-20 20:50 [PATCH] Btrfs: fix ->iterate_shared() by upgrading i_rwsem for delayed nodes Omar Sandoval
2016-05-25 20:11 ` David Sterba
2016-05-25 20:22   ` Chris Mason
2016-05-25 20:48     ` Al Viro
2016-05-25 22:11       ` Omar Sandoval
2016-06-17 17:55     ` Omar Sandoval
2016-06-17 17:59       ` Omar Sandoval
2016-06-20 15:07       ` David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.