All of lore.kernel.org
 help / color / mirror / Atom feed
From: Miao Xie <miaox@cn.fujitsu.com>
To: Josef Bacik <jbacik@fusionio.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH] Btrfs: place ordered operations on a per transaction list
Date: Mon, 18 Feb 2013 19:22:09 +0800	[thread overview]
Message-ID: <51220EE1.6090607@cn.fujitsu.com> (raw)
In-Reply-To: <1360772002-8923-1-git-send-email-jbacik@fusionio.com>

On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote:
> Miao made the ordered operations stuff run async, which introduced a
> deadlock where we could get somebody (sync) racing in and committing the
> transaction while a commit was already happening.  The new committer would
> try and flush ordered operations which would hang waiting for the commit to
> finish because it is done asynchronously and no longer inherits the callers
> trans handle.  To fix this we need to make the ordered operations list a per
> transaction list.  We can get new inodes added to the ordered operation list
> by truncating them and then having another process writing to them, so this
> makes it so that anybody trying to add an ordered operation _must_ start a
> transaction in order to add itself to the list, which will keep new inodes
> from getting added to the ordered operations list after we start committing.
> This should fix the deadlock and also keeps us from doing a lot more work
> than we need to during commit.  Thanks,

Firstly, thanks to deal with the bug which was introduced by my patch.

But comparing with this fix method, I prefer the following one because:
- we won't worry the similar problem if we add more work during commit
  in the future.
- it is unnecessary to get a new handle and commit it if the transaction
  is under the commit.

Thanks
Miao

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fc03aa6..c449cb5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
-static int may_wait_transaction(struct btrfs_root *root, int type)
+static int may_wait_transaction(struct btrfs_root *root, int type,
+				bool is_joined)
 {
 	if (root->fs_info->log_root_recovering)
 		return 0;
@@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 	if (type == TRANS_USERSPACE)
 		return 1;
 
+	/*
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it. So if someone is committing the
+	 * current transaction now, it is very glad to wait it.
+	 */
+	if (is_joined && type == TRANS_ATTACH)
+		return 1;
+
 	if (type == TRANS_START &&
 	    !atomic_read(&root->fs_info->open_ioctl_trans))
 		return 1;
@@ -355,7 +364,7 @@ again:
 	if (type < TRANS_JOIN_NOLOCK)
 		sb_start_intwrite(root->fs_info->sb);
 
-	if (may_wait_transaction(root, type))
+	if (may_wait_transaction(root, type, false))
 		wait_current_trans(root);
 
 	do {
@@ -383,16 +392,26 @@ again:
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
 	h->aborted = 0;
-	h->qgroup_reserved = qgroup_reserved;
+	h->qgroup_reserved = 0;
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
-	if (cur_trans->blocked && may_wait_transaction(root, type)) {
-		btrfs_commit_transaction(h, root);
-		goto again;
+	if (cur_trans->blocked && may_wait_transaction(root, type, true)) {
+		if (cur_trans->in_commit) {
+			btrfs_end_transaction(h, root);
+			wait_current_trans(root);
+		} else {
+			btrfs_commit_transaction(h, root);
+		}
+		if (unlikely(type == TRANS_ATTACH)) {
+			ret = -ENOENT;
+			goto alloc_fail;
+		} else {
+			goto again;
+		}
 	}
 
 	if (num_bytes) {
@@ -401,6 +420,7 @@ again:
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
+	h->qgroup_reserved = qgroup_reserved;
 
 got_it:
 	btrfs_record_root_in_trans(h, root);
-- 
1.6.5.2

> 
> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
>  fs/btrfs/ctree.h        |    7 -------
>  fs/btrfs/disk-io.c      |   11 ++++++-----
>  fs/btrfs/file.c         |   15 ++++++++++++++-
>  fs/btrfs/ordered-data.c |   13 ++++++++-----
>  fs/btrfs/ordered-data.h |    3 ++-
>  fs/btrfs/transaction.c  |    5 +++--
>  fs/btrfs/transaction.h  |    1 +
>  7 files changed, 34 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0c4e4df..9f72ec8 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1408,13 +1408,6 @@ struct btrfs_fs_info {
>  	struct list_head delalloc_inodes;
>  
>  	/*
> -	 * special rename and truncate targets that must be on disk before
> -	 * we're allowed to commit.  This is basically the ext3 style
> -	 * data=ordered list.
> -	 */
> -	struct list_head ordered_operations;
> -
> -	/*
>  	 * there is a pool of worker threads for checksumming during writes
>  	 * and a pool for checksumming after reads.  This is because readers
>  	 * can run with FS locks held, and the writers may be waiting for
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 52573d0..6baa77d 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
>  static void free_fs_root(struct btrfs_root *root);
>  static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
>  				    int read_only);
> -static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
> +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
> +					     struct btrfs_root *root);
>  static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
>  static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
>  				      struct btrfs_root *root);
> @@ -2029,7 +2030,6 @@ int open_ctree(struct super_block *sb,
>  	INIT_LIST_HEAD(&fs_info->dead_roots);
>  	INIT_LIST_HEAD(&fs_info->delayed_iputs);
>  	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
> -	INIT_LIST_HEAD(&fs_info->ordered_operations);
>  	INIT_LIST_HEAD(&fs_info->caching_block_groups);
>  	spin_lock_init(&fs_info->delalloc_lock);
>  	spin_lock_init(&fs_info->trans_lock);
> @@ -3538,7 +3538,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
>  	btrfs_cleanup_transaction(root);
>  }
>  
> -static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
> +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
> +					     struct btrfs_root *root)
>  {
>  	struct btrfs_inode *btrfs_inode;
>  	struct list_head splice;
> @@ -3548,7 +3549,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
>  	mutex_lock(&root->fs_info->ordered_operations_mutex);
>  	spin_lock(&root->fs_info->ordered_extent_lock);
>  
> -	list_splice_init(&root->fs_info->ordered_operations, &splice);
> +	list_splice_init(&t->ordered_operations, &splice);
>  	while (!list_empty(&splice)) {
>  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
>  					 ordered_operations);
> @@ -3829,7 +3830,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
>  	while (!list_empty(&list)) {
>  		t = list_entry(list.next, struct btrfs_transaction, list);
>  
> -		btrfs_destroy_ordered_operations(root);
> +		btrfs_destroy_ordered_operations(t, root);
>  
>  		btrfs_destroy_ordered_extents(root);
>  
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 237bf92..848442c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1629,7 +1629,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
>  	 */
>  	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
>  			       &BTRFS_I(inode)->runtime_flags)) {
> -		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
> +		struct btrfs_trans_handle *trans;
> +		struct btrfs_root *root = BTRFS_I(inode)->root;
> +
> +		/*
> +		 * We need to block on a committing transaction to keep us from
> +		 * throwing a ordered operation on to the list and causing
> +		 * something like sync to deadlock trying to flush out this
> +		 * inode.
> +		 */
> +		trans = btrfs_start_transaction(root, 0);
> +		if (IS_ERR(trans))
> +			return PTR_ERR(trans);
> +		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
> +		btrfs_end_transaction(trans, root);
>  		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
>  			filemap_flush(inode->i_mapping);
>  	}
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index cd8f6e9..346dca0 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -612,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
>   * extra check to make sure the ordered operation list really is empty
>   * before we return
>   */
> -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
> +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
> +				 struct btrfs_root *root, int wait)
>  {
>  	struct btrfs_inode *btrfs_inode;
>  	struct inode *inode;
> +	struct btrfs_transaction *cur_trans = trans->transaction;
>  	struct list_head splice;
>  	struct list_head works;
>  	struct btrfs_delalloc_work *work, *next;
> @@ -626,7 +628,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
>  
>  	mutex_lock(&root->fs_info->ordered_operations_mutex);
>  	spin_lock(&root->fs_info->ordered_extent_lock);
> -	list_splice_init(&root->fs_info->ordered_operations, &splice);
> +	list_splice_init(&cur_trans->ordered_operations, &splice);
>  	while (!list_empty(&splice)) {
>  		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
>  				   ordered_operations);
> @@ -643,7 +645,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
>  
>  		if (!wait)
>  			list_add_tail(&BTRFS_I(inode)->ordered_operations,
> -				      &root->fs_info->ordered_operations);
> +				      &cur_trans->ordered_operations);
>  		spin_unlock(&root->fs_info->ordered_extent_lock);
>  
>  		work = btrfs_alloc_delalloc_work(inode, wait, 1);
> @@ -653,7 +655,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
>  				list_add_tail(&btrfs_inode->ordered_operations,
>  					      &splice);
>  			list_splice_tail(&splice,
> -					 &root->fs_info->ordered_operations);
> +					 &cur_trans->ordered_operations);
>  			spin_unlock(&root->fs_info->ordered_extent_lock);
>  			ret = -ENOMEM;
>  			goto out;
> @@ -1033,6 +1035,7 @@ out:
>  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
>  				 struct btrfs_root *root, struct inode *inode)
>  {
> +	struct btrfs_transaction *cur_trans = trans->transaction;
>  	u64 last_mod;
>  
>  	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
> @@ -1047,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
>  	spin_lock(&root->fs_info->ordered_extent_lock);
>  	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
>  		list_add_tail(&BTRFS_I(inode)->ordered_operations,
> -			      &root->fs_info->ordered_operations);
> +			      &cur_trans->ordered_operations);
>  	}
>  	spin_unlock(&root->fs_info->ordered_extent_lock);
>  }
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index d523dbd..267ac99 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -197,7 +197,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
>  int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
>  				struct btrfs_ordered_extent *ordered);
>  int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
> -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
> +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
> +				 struct btrfs_root *root, int wait);
>  void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
>  				 struct btrfs_root *root,
>  				 struct inode *inode);
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 49c79b3..149ce09 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -157,6 +157,7 @@ loop:
>  	spin_lock_init(&cur_trans->delayed_refs.lock);
>  
>  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
> +	INIT_LIST_HEAD(&cur_trans->ordered_operations);
>  	list_add_tail(&cur_trans->list, &fs_info->trans_list);
>  	extent_io_tree_init(&cur_trans->dirty_pages,
>  			     fs_info->btree_inode->i_mapping);
> @@ -1457,7 +1458,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
>  	 * it here and no for sure that nothing new will be added
>  	 * to the list
>  	 */
> -	ret = btrfs_run_ordered_operations(root, 1);
> +	ret = btrfs_run_ordered_operations(trans, root, 1);
>  
>  	return ret;
>  }
> @@ -1480,7 +1481,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  	int should_grow = 0;
>  	unsigned long now = get_seconds();
>  
> -	ret = btrfs_run_ordered_operations(root, 0);
> +	ret = btrfs_run_ordered_operations(trans, root, 0);
>  	if (ret) {
>  		btrfs_abort_transaction(trans, root, ret);
>  		btrfs_end_transaction(trans, root);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index 4662821..3f772fd 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -43,6 +43,7 @@ struct btrfs_transaction {
>  	wait_queue_head_t writer_wait;
>  	wait_queue_head_t commit_wait;
>  	struct list_head pending_snapshots;
> +	struct list_head ordered_operations;
>  	struct btrfs_delayed_ref_root delayed_refs;
>  	int aborted;
>  };
> 


  parent reply	other threads:[~2013-02-18 11:21 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-02-13 16:13 [PATCH] Btrfs: place ordered operations on a per transaction list Josef Bacik
2013-02-14 12:47 ` David Sterba
2013-02-18 11:22 ` Miao Xie [this message]
2013-02-18 15:33   ` Josef Bacik

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=51220EE1.6090607@cn.fujitsu.com \
    --to=miaox@cn.fujitsu.com \
    --cc=jbacik@fusionio.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.