From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from cn.fujitsu.com ([222.73.24.84]:2024 "EHLO song.cn.fujitsu.com" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1751897Ab3BRLV3 (ORCPT ); Mon, 18 Feb 2013 06:21:29 -0500 Message-ID: <51220EE1.6090607@cn.fujitsu.com> Date: Mon, 18 Feb 2013 19:22:09 +0800 From: Miao Xie Reply-To: miaox@cn.fujitsu.com MIME-Version: 1.0 To: Josef Bacik CC: linux-btrfs@vger.kernel.org Subject: Re: [PATCH] Btrfs: place ordered operations on a per transaction list References: <1360772002-8923-1-git-send-email-jbacik@fusionio.com> In-Reply-To: <1360772002-8923-1-git-send-email-jbacik@fusionio.com> Content-Type: text/plain; charset=UTF-8 Sender: linux-btrfs-owner@vger.kernel.org List-ID: On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote: > Miao made the ordered operations stuff run async, which introduced a > deadlock where we could get somebody (sync) racing in and committing the > transaction while a commit was already happening. The new committer would > try and flush ordered operations which would hang waiting for the commit to > finish because it is done asynchronously and no longer inherits the callers > trans handle. To fix this we need to make the ordered operations list a per > transaction list. We can get new inodes added to the ordered operation list > by truncating them and then having another process writing to them, so this > makes it so that anybody trying to add an ordered operation _must_ start a > transaction in order to add itself to the list, which will keep new inodes > from getting added to the ordered operations list after we start committing. > This should fix the deadlock and also keeps us from doing a lot more work > than we need to during commit. Thanks, Firstly, thanks to deal with the bug which was introduced by my patch. But comparing with this fix method, I prefer the following one because: - we won't worry the similar problem if we add more work during commit in the future. - it is unnecessary to get a new handle and commit it if the transaction is under the commit. Thanks Miao diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fc03aa6..c449cb5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root) } } -static int may_wait_transaction(struct btrfs_root *root, int type) +static int may_wait_transaction(struct btrfs_root *root, int type, + bool is_joined) { if (root->fs_info->log_root_recovering) return 0; @@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type) if (type == TRANS_USERSPACE) return 1; + /* + * If we are ATTACH, it means we just want to catch the current + * transaction and commit it. So if someone is committing the + * current transaction now, it is very glad to wait it. + */ + if (is_joined && type == TRANS_ATTACH) + return 1; + if (type == TRANS_START && !atomic_read(&root->fs_info->open_ioctl_trans)) return 1; @@ -355,7 +364,7 @@ again: if (type < TRANS_JOIN_NOLOCK) sb_start_intwrite(root->fs_info->sb); - if (may_wait_transaction(root, type)) + if (may_wait_transaction(root, type, false)) wait_current_trans(root); do { @@ -383,16 +392,26 @@ again: h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->blocked && may_wait_transaction(root, type)) { - btrfs_commit_transaction(h, root); - goto again; + if (cur_trans->blocked && may_wait_transaction(root, type, true)) { + if (cur_trans->in_commit) { + btrfs_end_transaction(h, root); + wait_current_trans(root); + } else { + btrfs_commit_transaction(h, root); + } + if (unlikely(type == TRANS_ATTACH)) { + ret = -ENOENT; + goto alloc_fail; + } else { + goto again; + } } if (num_bytes) { @@ -401,6 +420,7 @@ again: h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); -- 1.6.5.2 > > Signed-off-by: Josef Bacik > --- > fs/btrfs/ctree.h | 7 ------- > fs/btrfs/disk-io.c | 11 ++++++----- > fs/btrfs/file.c | 15 ++++++++++++++- > fs/btrfs/ordered-data.c | 13 ++++++++----- > fs/btrfs/ordered-data.h | 3 ++- > fs/btrfs/transaction.c | 5 +++-- > fs/btrfs/transaction.h | 1 + > 7 files changed, 34 insertions(+), 21 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 0c4e4df..9f72ec8 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1408,13 +1408,6 @@ struct btrfs_fs_info { > struct list_head delalloc_inodes; > > /* > - * special rename and truncate targets that must be on disk before > - * we're allowed to commit. This is basically the ext3 style > - * data=ordered list. > - */ > - struct list_head ordered_operations; > - > - /* > * there is a pool of worker threads for checksumming during writes > * and a pool for checksumming after reads. This is because readers > * can run with FS locks held, and the writers may be waiting for > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 52573d0..6baa77d 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work); > static void free_fs_root(struct btrfs_root *root); > static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, > int read_only); > -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); > +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, > + struct btrfs_root *root); > static void btrfs_destroy_ordered_extents(struct btrfs_root *root); > static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, > struct btrfs_root *root); > @@ -2029,7 +2030,6 @@ int open_ctree(struct super_block *sb, > INIT_LIST_HEAD(&fs_info->dead_roots); > INIT_LIST_HEAD(&fs_info->delayed_iputs); > INIT_LIST_HEAD(&fs_info->delalloc_inodes); > - INIT_LIST_HEAD(&fs_info->ordered_operations); > INIT_LIST_HEAD(&fs_info->caching_block_groups); > spin_lock_init(&fs_info->delalloc_lock); > spin_lock_init(&fs_info->trans_lock); > @@ -3538,7 +3538,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) > btrfs_cleanup_transaction(root); > } > > -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) > +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, > + struct btrfs_root *root) > { > struct btrfs_inode *btrfs_inode; > struct list_head splice; > @@ -3548,7 +3549,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) > mutex_lock(&root->fs_info->ordered_operations_mutex); > spin_lock(&root->fs_info->ordered_extent_lock); > > - list_splice_init(&root->fs_info->ordered_operations, &splice); > + list_splice_init(&t->ordered_operations, &splice); > while (!list_empty(&splice)) { > btrfs_inode = list_entry(splice.next, struct btrfs_inode, > ordered_operations); > @@ -3829,7 +3830,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) > while (!list_empty(&list)) { > t = list_entry(list.next, struct btrfs_transaction, list); > > - btrfs_destroy_ordered_operations(root); > + btrfs_destroy_ordered_operations(t, root); > > btrfs_destroy_ordered_extents(root); > > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index 237bf92..848442c 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -1629,7 +1629,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) > */ > if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, > &BTRFS_I(inode)->runtime_flags)) { > - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); > + struct btrfs_trans_handle *trans; > + struct btrfs_root *root = BTRFS_I(inode)->root; > + > + /* > + * We need to block on a committing transaction to keep us from > + * throwing a ordered operation on to the list and causing > + * something like sync to deadlock trying to flush out this > + * inode. > + */ > + trans = btrfs_start_transaction(root, 0); > + if (IS_ERR(trans)) > + return PTR_ERR(trans); > + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); > + btrfs_end_transaction(trans, root); > if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) > filemap_flush(inode->i_mapping); > } > diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c > index cd8f6e9..346dca0 100644 > --- a/fs/btrfs/ordered-data.c > +++ b/fs/btrfs/ordered-data.c > @@ -612,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) > * extra check to make sure the ordered operation list really is empty > * before we return > */ > -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) > +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, int wait) > { > struct btrfs_inode *btrfs_inode; > struct inode *inode; > + struct btrfs_transaction *cur_trans = trans->transaction; > struct list_head splice; > struct list_head works; > struct btrfs_delalloc_work *work, *next; > @@ -626,7 +628,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) > > mutex_lock(&root->fs_info->ordered_operations_mutex); > spin_lock(&root->fs_info->ordered_extent_lock); > - list_splice_init(&root->fs_info->ordered_operations, &splice); > + list_splice_init(&cur_trans->ordered_operations, &splice); > while (!list_empty(&splice)) { > btrfs_inode = list_entry(splice.next, struct btrfs_inode, > ordered_operations); > @@ -643,7 +645,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) > > if (!wait) > list_add_tail(&BTRFS_I(inode)->ordered_operations, > - &root->fs_info->ordered_operations); > + &cur_trans->ordered_operations); > spin_unlock(&root->fs_info->ordered_extent_lock); > > work = btrfs_alloc_delalloc_work(inode, wait, 1); > @@ -653,7 +655,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) > list_add_tail(&btrfs_inode->ordered_operations, > &splice); > list_splice_tail(&splice, > - &root->fs_info->ordered_operations); > + &cur_trans->ordered_operations); > spin_unlock(&root->fs_info->ordered_extent_lock); > ret = -ENOMEM; > goto out; > @@ -1033,6 +1035,7 @@ out: > void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, > struct btrfs_root *root, struct inode *inode) > { > + struct btrfs_transaction *cur_trans = trans->transaction; > u64 last_mod; > > last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); > @@ -1047,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, > spin_lock(&root->fs_info->ordered_extent_lock); > if (list_empty(&BTRFS_I(inode)->ordered_operations)) { > list_add_tail(&BTRFS_I(inode)->ordered_operations, > - &root->fs_info->ordered_operations); > + &cur_trans->ordered_operations); > } > spin_unlock(&root->fs_info->ordered_extent_lock); > } > diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h > index d523dbd..267ac99 100644 > --- a/fs/btrfs/ordered-data.h > +++ b/fs/btrfs/ordered-data.h > @@ -197,7 +197,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, > int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, > struct btrfs_ordered_extent *ordered); > int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); > -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); > +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, > + struct btrfs_root *root, int wait); > void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, > struct btrfs_root *root, > struct inode *inode); > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 49c79b3..149ce09 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -157,6 +157,7 @@ loop: > spin_lock_init(&cur_trans->delayed_refs.lock); > > INIT_LIST_HEAD(&cur_trans->pending_snapshots); > + INIT_LIST_HEAD(&cur_trans->ordered_operations); > list_add_tail(&cur_trans->list, &fs_info->trans_list); > extent_io_tree_init(&cur_trans->dirty_pages, > fs_info->btree_inode->i_mapping); > @@ -1457,7 +1458,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, > * it here and no for sure that nothing new will be added > * to the list > */ > - ret = btrfs_run_ordered_operations(root, 1); > + ret = btrfs_run_ordered_operations(trans, root, 1); > > return ret; > } > @@ -1480,7 +1481,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, > int should_grow = 0; > unsigned long now = get_seconds(); > > - ret = btrfs_run_ordered_operations(root, 0); > + ret = btrfs_run_ordered_operations(trans, root, 0); > if (ret) { > btrfs_abort_transaction(trans, root, ret); > btrfs_end_transaction(trans, root); > diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h > index 4662821..3f772fd 100644 > --- a/fs/btrfs/transaction.h > +++ b/fs/btrfs/transaction.h > @@ -43,6 +43,7 @@ struct btrfs_transaction { > wait_queue_head_t writer_wait; > wait_queue_head_t commit_wait; > struct list_head pending_snapshots; > + struct list_head ordered_operations; > struct btrfs_delayed_ref_root delayed_refs; > int aborted; > }; >