[PATCH] btrfs: fix dead lock while running replace and defrag concurrently

From: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
To: <linux-btrfs@vger.kernel.org>
Cc: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Subject: [PATCH] btrfs: fix dead lock while running replace and defrag concurrently
Date: Mon, 10 Nov 2014 15:36:08 +0800	[thread overview]
Message-ID: <1415604968-21916-1-git-send-email-guihc.fnst@cn.fujitsu.com> (raw)

This can be reproduced by fstests: btrfs/070

The scenario is like the following:

replace worker thread		defrag thread
---------------------		-------------
copy_nocow_pages_worker		btrfs_defrag_file
  copy_nocow_pages_for_inode	    ...
				  btrfs_writepages
  |A| lock_extent_bits		    extent_write_cache_pages
				|B|   lock_page
					__extent_writepage
		...			  writepage_delalloc
					    find_lock_delalloc_range
				|B| 	      lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page

This leads to an ABBA pattern deadlock. To fix it,
o we just change it to an AABB pattern which means to @unlock_extent_bits()
  before we @lock_page(), and in this way the @extent_read_full_page_nolock()
  is no longer in an locked context, so change it back to @extent_read_full_page()
  to regain protection.

o Since we @unlock_extent_bits() earlier, then before @write_page_nocow(),
  the extent may not really point at the physical block we want, so we
  have to check it before write.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
---
 fs/btrfs/scrub.c | 90 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa0831..4325bb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3310,6 +3310,50 @@ out:
 	scrub_pending_trans_workers_dec(sctx);
 }
 
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+				 u64 logical)
+{
+	struct extent_state *cached_state = NULL;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_io_tree *io_tree;
+	struct extent_map *em;
+	u64 lockstart = start, lockend = start + len - 1;
+	int ret = 0;
+
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		ret = 1;
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
+
+	/*
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
+	 */
+	if (em->block_start > logical ||
+	    em->block_start + em->block_len < logical + len) {
+		free_extent_map(em);
+		ret = 1;
+		goto out_unlock;
+	}
+	free_extent_map(em);
+
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
+	return ret;
+}
+
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
@@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 	struct inode *inode;
 	struct page *page;
 	struct btrfs_root *local_root;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_map *em;
-	struct extent_state *cached_state = NULL;
 	struct extent_io_tree *io_tree;
 	u64 physical_for_dev_replace;
+	u64 nocow_ctx_logical;
 	u64 len = nocow_ctx->len;
-	u64 lockstart = offset, lockend = offset + len - 1;
 	unsigned long index;
 	int srcu_index;
 	int ret = 0;
@@ -3356,30 +3397,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	io_tree = &BTRFS_I(inode)->io_tree;
+	nocow_ctx_logical = nocow_ctx->logical;
 
-	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
-	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
-	if (ordered) {
-		btrfs_put_ordered_extent(ordered);
-		goto out_unlock;
-	}
-
-	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
-	if (IS_ERR(em)) {
-		ret = PTR_ERR(em);
-		goto out_unlock;
-	}
-
-	/*
-	 * This extent does not actually cover the logical extent anymore,
-	 * move on to the next inode.
-	 */
-	if (em->block_start > nocow_ctx->logical ||
-	    em->block_start + em->block_len < nocow_ctx->logical + len) {
-		free_extent_map(em);
-		goto out_unlock;
+	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+	if (ret) {
+		ret = ret > 0 ? 0 : ret;
+		goto out;
 	}
-	free_extent_map(em);
 
 	while (len >= PAGE_CACHE_SIZE) {
 		index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +3420,7 @@ again:
 				goto next_page;
 		} else {
 			ClearPageError(page);
-			err = extent_read_full_page_nolock(io_tree, page,
+			err = extent_read_full_page(io_tree, page,
 							   btrfs_get_extent,
 							   nocow_ctx->mirror_num);
 			if (err) {
@@ -3421,6 +3445,14 @@ again:
 				goto next_page;
 			}
 		}
+
+		ret = check_extent_to_block(inode, offset, len,
+					    nocow_ctx_logical);
+		if (ret) {
+			ret = ret > 0 ? 0 : ret;
+			goto next_page;
+		}
+
 		err = write_page_nocow(nocow_ctx->sctx,
 				       physical_for_dev_replace, page);
 		if (err)
@@ -3434,12 +3466,10 @@ next_page:
 
 		offset += PAGE_CACHE_SIZE;
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		nocow_ctx_logical += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
 	ret = COPY_COMPLETE;
-out_unlock:
-	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
-			     GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);
-- 
1.8.1.4