* [PATCH v2 0/5] raid56: scrub related fixes
@ 2017-03-24  2:00 Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

This patchset can be fetched from my github repo:
https://github.com/adam900710/linux.git raid56_fixes

It's based on v4.11-rc2; the last two patches are modified according to
the advice from Liu Bo.

The patchset fixes the following bugs:

1) False alert or wrong csum error number when scrubbing RAID5/6
   The bug itself won't cause any damage to the fs; it's a pure race.

2) Corrupted data stripe rebuild corrupts P/Q
   So scrub turns one error into another, not really fixing anything.

3) Use-after-free caused by cancelling dev-replace
   This is quite a deadly bug, since cancelling dev-replace can
   cause a kernel panic.

   Can be triggered by btrfs/069.

v2:
  Use bio_counter to protect rbio against dev-replace cancel, instead of
  the original btrfs_device refcount, which is too restrictive and requires
  disabling the rbio cache. Suggested by Liu Bo.

Qu Wenruo (5):
  btrfs: scrub: Introduce full stripe lock for RAID56
  btrfs: scrub: Fix RAID56 recovery race condition
  btrfs: scrub: Don't append on-disk pages for raid56 scrub
  btrfs: dev-replace: Wait flighting bio before removing target device
  btrfs: raid56: Use bio_counter to protect scrub

 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/dev-replace.c |   2 +
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/raid56.c      |   2 +
 fs/btrfs/scrub.c       | 201 ++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 208 insertions(+), 4 deletions(-)

-- 
2.12.1




^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56
  2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
@ 2017-03-24  2:00 ` Qu Wenruo
  2017-03-27 16:38   ` David Sterba
  2017-03-24  2:00 ` [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

Unlike mirror based profiles, RAID5/6 recovery needs to read out the
whole full stripe.

And if we don't provide proper protection, it can easily cause a race
condition.

Introduce 2 new functions, lock_full_stripe() and unlock_full_stripe(),
for RAID5/6.
They maintain an rb_tree of per-full-stripe mutexes, so scrub callers
can use them to lock a full stripe and avoid races.
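
A minimal sketch of the intended caller pattern (modeled on how the next
patch in this series uses these helpers; the error handling shown is
illustrative only, not the exact final code):

	ret = lock_full_stripe(fs_info, logical, GFP_NOFS);
	if (ret < 0)
		return ret;

	/* ... recheck and repair blocks inside this full stripe ... */

	ret = unlock_full_stripe(fs_info, logical);
	if (ret < 0)
		return ret;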

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |   4 ++
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/scrub.c       | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 29b7fc28c607..4d570e1040e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -648,6 +648,10 @@ struct btrfs_block_group_cache {
 	 * Protected by free_space_lock.
 	 */
 	int needs_free_space;
+
+	/* Scrub full stripe lock tree for RAID5/6 scrub */
+	struct rb_root scrub_lock_root;
+	spinlock_t scrub_lock;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be5477676cc8..db5d9f84535e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -131,6 +131,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
+		WARN_ON(!RB_EMPTY_ROOT(&cache->scrub_lock_root));
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -9907,6 +9908,8 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
+	spin_lock_init(&cache->scrub_lock);
+	cache->scrub_lock_root = RB_ROOT;
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb1239f..38b300ac4567 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -240,6 +240,13 @@ struct scrub_warning {
 	struct btrfs_device	*dev;
 };
 
+struct scrub_full_stripe_lock {
+	struct rb_node node;
+	u64 logical;
+	u64 refs;
+	struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -349,6 +356,176 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Caller must hold cache->scrub_root_lock.
+ *
+ * Return existing full stripe and increase it refs
+ * Or return NULL, and insert @fstripe_lock into the bg cache
+ */
+static struct scrub_full_stripe_lock *
+add_scrub_lock(struct btrfs_block_group_cache *cache,
+	       struct scrub_full_stripe_lock *fstripe_lock)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct scrub_full_stripe_lock *entry;
+
+	p = &cache->scrub_lock_root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct scrub_full_stripe_lock, node);
+		if (fstripe_lock->logical < entry->logical) {
+			p = &(*p)->rb_left;
+		} else if (fstripe_lock->logical > entry->logical) {
+			p = &(*p)->rb_right;
+		} else {
+			entry->refs++;
+			return entry;
+		}
+	}
+	/* Insert new one */
+	rb_link_node(&fstripe_lock->node, parent, p);
+	rb_insert_color(&fstripe_lock->node, &cache->scrub_lock_root);
+
+	return NULL;
+}
+
+static struct scrub_full_stripe_lock *
+search_scrub_lock(struct btrfs_block_group_cache *cache, u64 bytenr)
+{
+	struct rb_node *node;
+	struct scrub_full_stripe_lock *entry;
+
+	node = cache->scrub_lock_root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct scrub_full_stripe_lock, node);
+		if (bytenr < entry->logical)
+			node = node->rb_left;
+		else if (bytenr > entry->logical)
+			node = node->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ * Thanks to the chaos of scrub structures, we need to get it all
+ * by ourselves, using btrfs_map_sblock().
+ */
+static int get_full_stripe_logical(struct btrfs_fs_info *fs_info, u64 bytenr,
+				   u64 *bytenr_ret)
+{
+	struct btrfs_bio *bbio = NULL;
+	u64 len;
+	int ret;
+
+	/* Just use map_sblock() to get full stripe logical */
+	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, bytenr,
+			       &len, &bbio, 0, 1);
+	if (ret || !bbio || !bbio->raid_map)
+		goto error;
+	*bytenr_ret = bbio->raid_map[0];
+	btrfs_put_bbio(bbio);
+	return 0;
+error:
+	btrfs_put_bbio(bbio);
+	if (ret)
+		return ret;
+	return -EIO;
+}
+
+/*
+ * To lock a full stripe to avoid concurrency of recovery and read
+ * It's only used for profiles with parities(RAID5/6), for other profiles it
+ * does nothing
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+			    gfp_t gfp_flags)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	struct scrub_full_stripe_lock *existing;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+
+	/* Mirror based profiles don't need full stripe lock */
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	fstripe_lock = kmalloc(sizeof(*fstripe_lock), gfp_flags);
+	if (!fstripe_lock) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	fstripe_lock->logical = fstripe_start;
+	fstripe_lock->refs = 1;
+	mutex_init(&fstripe_lock->mutex);
+
+	/* Now insert the full stripe lock */
+	spin_lock(&bg_cache->scrub_lock);
+	existing = add_scrub_lock(bg_cache, fstripe_lock);
+	if (existing) {
+		kfree(fstripe_lock);
+		fstripe_lock = existing;
+	}
+	spin_unlock(&bg_cache->scrub_lock);
+	mutex_lock(&fstripe_lock->mutex);
+
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&bg_cache->scrub_lock);
+	fstripe_lock = search_scrub_lock(bg_cache, fstripe_start);
+	/* This is a deadly problem, we hold a mutex but can't unlock it */
+	if (WARN_ON(!fstripe_lock)) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	mutex_unlock(&fstripe_lock->mutex);
+	if (!WARN_ON(fstripe_lock->refs == 0))
+		fstripe_lock->refs--;
+	if (fstripe_lock->refs == 0) {
+		rb_erase(&fstripe_lock->node, &bg_cache->scrub_lock_root);
+		kfree(fstripe_lock);
+	}
+unlock:
+	spin_unlock(&bg_cache->scrub_lock);
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+/*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
  */
-- 
2.12.1




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition
  2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
@ 2017-03-24  2:00 ` Qu Wenruo
  2017-03-27 16:47   ` David Sterba
  2017-03-24  2:00 ` [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub Qu Wenruo
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

When scrubbing a RAID5 which has recoverable data corruption (only one
data stripe is corrupted), sometimes scrub will report more csum errors
than expected. Sometimes even unrecoverable errors will be reported.

The problem can be easily reproduced by the following steps:
1) Create a btrfs with RAID5 data profile with 3 devs
2) Mount it with nospace_cache or space_cache=v2
   To avoid extra data space usage.
3) Create a 128K file and sync the fs, unmount it
   Now the 128K file lies at the beginning of the data chunk
4) Locate the physical bytenr of data chunk on dev3
   Dev3 is the 1st data stripe.
5) Corrupt the first 64K of the data chunk stripe on dev3
6) Mount the fs and scrub it

The correct csum error number should be 16 (assuming x86_64).
A larger csum error number can be reported with about a 1/3 chance,
and an unrecoverable error with about a 1/10 chance.

The root cause of the problem is that the RAID5/6 recovery code has a
race condition, due to the fact that a full scrub is initiated per device.

For mirror based profiles, each mirror is independent of the others,
so the race won't cause any big problem.

For example:
        Corrupted       |       Correct          |      Correct        |
|   Scrub dev3 (D1)     |    Scrub dev2 (D2)     |    Scrub dev1(P)    |
------------------------------------------------------------------------
Read out D1             |Read out D2             |Read full stripe     |
Check csum              |Check csum              |Check parity         |
Csum mismatch           |Csum match, continue    |Parity mismatch      |
handle_errored_block    |                        |handle_errored_block |
 Read out full stripe   |                        | Read out full stripe|
 D1 csum error(err++)   |                        | D1 csum error(err++)|
 Recover D1             |                        | Recover D1          |

So D1's csum error is accounted twice, just because
handle_errored_block() doesn't have enough protection and the race can
happen.

In an even worse case, if D1's recovery code is re-writing D1/D2/P while
P's recovery code is just reading out the full stripe, we can end up
with an unrecoverable error.

This patch uses the previously introduced lock_full_stripe() and
unlock_full_stripe() to protect the whole scrub_handle_errored_block()
function for RAID56 recovery, so there are no more extra csum errors
nor unrecoverable errors.

Reported-by: Goffredo Baroncelli <kreijack@libero.it>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/scrub.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 38b300ac4567..6832feece7a7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1065,6 +1065,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	unsigned int have_csum;
 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 	struct scrub_block *sblock_bad;
+	int lock_ret;
 	int ret;
 	int mirror_index;
 	int page_num;
@@ -1094,6 +1095,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	have_csum = sblock_to_check->pagev[0]->have_csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
+	/*
+	 * For RAID5/6, a race can happen between scrub threads of different
+	 * devices. For data corruption, the parity and data threads will
+	 * both try to recover the data.
+	 * The race can lead to doubly accounted csum errors, or even an
+	 * unrecoverable error.
+	 */
+	ret = lock_full_stripe(fs_info, logical, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 		sblocks_for_recheck = NULL;
 		goto nodatasum_case;
@@ -1428,6 +1440,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		kfree(sblocks_for_recheck);
 	}
 
+	lock_ret = unlock_full_stripe(fs_info, logical);
+	if (lock_ret < 0)
+		return lock_ret;
 	return 0;
 }
 
-- 
2.12.1




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub
  2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
@ 2017-03-24  2:00 ` Qu Wenruo
  2017-03-24 22:06   ` Liu Bo
  2017-03-24  2:00 ` [PATCH v2 4/5] btrfs: dev-replace: Wait flighting bio before removing target device Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub Qu Wenruo
  4 siblings, 1 reply; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

In the following situation, scrub will calculate a wrong parity and
overwrite the correct one:

RAID5 full stripe:

Before
|     Dev 1      |     Dev  2     |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
| 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

After scrubbing dev3 only:

|     Dev 1      |     Dev  2     |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
| 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

The reason is that after the raid56 read-rebuild, rbio->stripe_pages are
all correctly recovered (0xcd for data stripes).

However, when we check and repair parity in
scrub_parity_check_and_repair(), we append the pages in the
sparity->spages list to rbio->bio_pages[], which still contain the old
on-disk data.

And when we submit parity data to disk, we calculate parity using
rbio->bio_pages[] first; only if a page is not found in rbio->bio_pages[]
do we fall back to rbio->stripe_pages[].
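
The page selection can be sketched like this (a simplified illustration
of the behaviour described above; the helper name and signature are
hypothetical, the real logic lives in raid56.c):

	/* Illustrative only: prefer pages that came in via bios,
	 * otherwise fall back to the rebuilt stripe pages. */
	static struct page *pick_parity_src_page(struct btrfs_raid_bio *rbio,
						 int index)
	{
		if (rbio->bio_pages[index])
			return rbio->bio_pages[index];
		return rbio->stripe_pages[index];
	}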

The patch fixes it by not appending pages from sparity->spages, so
finish_parity_scrub() will use rbio->stripe_pages[], which is correct.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/scrub.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6832feece7a7..2a5458004279 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2957,7 +2957,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	struct bio *bio;
 	struct btrfs_raid_bio *rbio;
-	struct scrub_page *spage;
 	struct btrfs_bio *bbio = NULL;
 	u64 length;
 	int ret;
@@ -2987,9 +2986,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 	if (!rbio)
 		goto rbio_out;
 
-	list_for_each_entry(spage, &sparity->spages, list)
-		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
-
 	scrub_pending_bio_inc(sctx);
 	raid56_parity_submit_scrub_rbio(rbio);
 	return;
-- 
2.12.1




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 4/5] btrfs: dev-replace: Wait flighting bio before removing target device
  2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
                   ` (2 preceding siblings ...)
  2017-03-24  2:00 ` [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub Qu Wenruo
@ 2017-03-24  2:00 ` Qu Wenruo
  2017-03-24  2:00 ` [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub Qu Wenruo
  4 siblings, 0 replies; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

When dev replace is cancelled by a running scrub, we will free the target
device without waiting for in-flight bios, causing the following NULL
pointer dereference or general protection fault.

 BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
 IP: generic_make_request_checks+0x4d/0x610
 CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G  O    4.11.0-rc2 #72
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
 Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
 task: ffff88002875b4c0 task.stack: ffffc90001334000
 RIP: 0010:generic_make_request_checks+0x4d/0x610
 Call Trace:
  ? generic_make_request+0xc7/0x360
  generic_make_request+0x24/0x360
  ? generic_make_request+0xc7/0x360
  submit_bio+0x64/0x120
  ? page_in_rbio+0x4d/0x80 [btrfs]
  ? rbio_orig_end_io+0x80/0x80 [btrfs]
  finish_rmw+0x3f4/0x540 [btrfs]
  validate_rbio_for_rmw+0x36/0x40 [btrfs]
  raid_rmw_end_io+0x7a/0x90 [btrfs]
  bio_endio+0x56/0x60
  end_workqueue_fn+0x3c/0x40 [btrfs]
  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
  process_one_work+0x2af/0x720
  ? process_one_work+0x22b/0x720
  worker_thread+0x4b/0x4f0
  kthread+0x10f/0x150
  ? process_one_work+0x720/0x720
  ? kthread_create_on_node+0x40/0x40
  ret_from_fork+0x2e/0x40
 Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 54 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 00 00 <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
 RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8

In btrfs_dev_replace_finishing(), we call btrfs_rm_dev_replace_blocked()
to wait for bios before destroying the target device when scrub finishes
normally.

However, when the replace is aborted, either due to an error or because
it was cancelled by scrub, we don't wait for bios, and this can lead to
a use-after-free if there are bios still holding the target device.

Fix it by also calling btrfs_rm_dev_replace_blocked() before calling
btrfs_destroy_dev_replace_tgtdev().

Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/dev-replace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e653921f05d9..b9d88136b5a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&fs_info->chunk_mutex);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		mutex_unlock(&uuid_mutex);
+		btrfs_rm_dev_replace_blocked(fs_info);
 		if (tgt_device)
 			btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+		btrfs_rm_dev_replace_unblocked(fs_info);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
 		return scrub_ret;
-- 
2.12.1




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub
  2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
                   ` (3 preceding siblings ...)
  2017-03-24  2:00 ` [PATCH v2 4/5] btrfs: dev-replace: Wait flighting bio before removing target device Qu Wenruo
@ 2017-03-24  2:00 ` Qu Wenruo
  2017-03-24 23:21   ` Liu Bo
  4 siblings, 1 reply; 12+ messages in thread
From: Qu Wenruo @ 2017-03-24  2:00 UTC (permalink / raw)
  To: linux-btrfs; +Cc: bo.li.liu

Unlike other places calling btrfs_map_block(), in raid56 scrub we don't
use the bio_counter to protect against races with dev replace.

This patch uses the bio_counter to provide protection from the call to
btrfs_map_sblock() until rbio endio.

Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/raid56.c | 2 ++
 fs/btrfs/scrub.c  | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1571bf26dc07..3a083165400f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2642,6 +2642,7 @@ static void async_scrub_parity(struct btrfs_raid_bio *rbio)
 
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 {
+	rbio->generic_bio_cnt = 1;
 	if (!lock_stripe_add(rbio))
 		async_scrub_parity(rbio);
 }
@@ -2694,6 +2695,7 @@ static void async_missing_raid56(struct btrfs_raid_bio *rbio)
 
 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
 {
+	rbio->generic_bio_cnt = 1;
 	if (!lock_stripe_add(rbio))
 		async_missing_raid56(rbio);
 }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2a5458004279..265387bf3af8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2379,6 +2379,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 	int ret;
 	int i;
 
+	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
 			&length, &bbio, 0, 1);
 	if (ret || !bbio || !bbio->raid_map)
@@ -2423,6 +2424,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 rbio_out:
 	bio_put(bio);
 bbio_out:
+	btrfs_bio_counter_dec(fs_info);
 	btrfs_put_bbio(bbio);
 	spin_lock(&sctx->stat_lock);
 	sctx->stat.malloc_errors++;
@@ -2966,6 +2968,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 		goto out;
 
 	length = sparity->logic_end - sparity->logic_start;
+
+	btrfs_bio_counter_inc_blocked(fs_info);
 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
 			       &length, &bbio, 0, 1);
 	if (ret || !bbio || !bbio->raid_map)
@@ -2993,6 +2997,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 rbio_out:
 	bio_put(bio);
 bbio_out:
+	btrfs_bio_counter_dec(fs_info);
 	btrfs_put_bbio(bbio);
 	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
 		  sparity->nsectors);
-- 
2.12.1




^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub
  2017-03-24  2:00 ` [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub Qu Wenruo
@ 2017-03-24 22:06   ` Liu Bo
  0 siblings, 0 replies; 12+ messages in thread
From: Liu Bo @ 2017-03-24 22:06 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Mar 24, 2017 at 10:00:25AM +0800, Qu Wenruo wrote:
> In the following situation, scrub will calculate wrong parity to
> overwrite correct one:
> 
> RAID5 full stripe:
> 
> Before
> |     Dev 1      |     Dev  2     |     Dev 3     |
> | Data stripe 1  | Data stripe 2  | Parity Stripe |
> --------------------------------------------------- 0
> | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 4K
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> ...
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 64K
> 
> After scrubbing dev3 only:
> 
> |     Dev 1      |     Dev  2     |     Dev 3     |
> | Data stripe 1  | Data stripe 2  | Parity Stripe |
> --------------------------------------------------- 0
> | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
> --------------------------------------------------- 4K
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> ...
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 64K
> 
> The reason is after raid56 read rebuild rbio->stripe_pages are all
> correct recovered (0xcd for data stripes).
> 
> However when we check and repair parity, in
> scrub_parity_check_and_repair(), we will appending pages in
> sparity->spages list to rbio->bio_pages[], which contains old on-disk
> data.
> 
> And when we submit parity data to disk, we calculate parity using
> rbio->bio_pages[] first, if rbio->bio_pages[] not found, then fallback
> to rbio->stripe_pages[].
> 
> The patch fix it by not appending pages from sparity->spages.
> So finish_parity_scrub() will use rbio->stripe_pages[] which is correct.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo
> 
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/scrub.c | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 6832feece7a7..2a5458004279 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -2957,7 +2957,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>  	struct btrfs_fs_info *fs_info = sctx->fs_info;
>  	struct bio *bio;
>  	struct btrfs_raid_bio *rbio;
> -	struct scrub_page *spage;
>  	struct btrfs_bio *bbio = NULL;
>  	u64 length;
>  	int ret;
> @@ -2987,9 +2986,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>  	if (!rbio)
>  		goto rbio_out;
>  
> -	list_for_each_entry(spage, &sparity->spages, list)
> -		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
> -
>  	scrub_pending_bio_inc(sctx);
>  	raid56_parity_submit_scrub_rbio(rbio);
>  	return;
> -- 
> 2.12.1
> 
> 
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub
  2017-03-24  2:00 ` [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub Qu Wenruo
@ 2017-03-24 23:21   ` Liu Bo
  2017-03-27  1:37     ` Qu Wenruo
  0 siblings, 1 reply; 12+ messages in thread
From: Liu Bo @ 2017-03-24 23:21 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Mar 24, 2017 at 10:00:27AM +0800, Qu Wenruo wrote:
> Unlike other place calling btrfs_map_block(), in raid56 scrub, we don't
> use bio_counter to protect from race against dev replace.
> 
> This patch will use bio_counter to protect from the beginning of calling
> btrfs_map_sblock(), until rbio endio.
> 
> Liu Bo <bo.li.liu@oracle.com>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/raid56.c | 2 ++
>  fs/btrfs/scrub.c  | 5 +++++
>  2 files changed, 7 insertions(+)
> 
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index 1571bf26dc07..3a083165400f 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -2642,6 +2642,7 @@ static void async_scrub_parity(struct btrfs_raid_bio *rbio)
>  
>  void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
>  {
> +	rbio->generic_bio_cnt = 1;

To keep consistent with other places, can you please do this setting when
allocating rbio?

>  	if (!lock_stripe_add(rbio))
>  		async_scrub_parity(rbio);
>  }
> @@ -2694,6 +2695,7 @@ static void async_missing_raid56(struct btrfs_raid_bio *rbio)
>  
>  void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
>  {
> +	rbio->generic_bio_cnt = 1;
>  	if (!lock_stripe_add(rbio))
>  		async_missing_raid56(rbio);
>  }

Ditto.

> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 2a5458004279..265387bf3af8 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -2379,6 +2379,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
>  	int ret;
>  	int i;
>  
> +	btrfs_bio_counter_inc_blocked(fs_info);
>  	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
>  			&length, &bbio, 0, 1);
>  	if (ret || !bbio || !bbio->raid_map)
> @@ -2423,6 +2424,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
>  rbio_out:
>  	bio_put(bio);
>  bbio_out:
> +	btrfs_bio_counter_dec(fs_info);
>  	btrfs_put_bbio(bbio);
>  	spin_lock(&sctx->stat_lock);
>  	sctx->stat.malloc_errors++;
> @@ -2966,6 +2968,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>  		goto out;
>  
>  	length = sparity->logic_end - sparity->logic_start;
> +
> +	btrfs_bio_counter_inc_blocked(fs_info);
>  	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
>  			       &length, &bbio, 0, 1);
>  	if (ret || !bbio || !bbio->raid_map)
> @@ -2993,6 +2997,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>  rbio_out:
>  	bio_put(bio);
>  bbio_out:
> +	btrfs_bio_counter_dec(fs_info);
>  	btrfs_put_bbio(bbio);
>  	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
>  		  sparity->nsectors);
> -- 
> 2.12.1
> 
> 
>

If patch 4 and 5 are still supposed to fix the same problem, can you please
merge them into one patch so that a future bisect could be precise?

And while I believe this fixes the crash described in patch 4,
scrub_setup_recheck_block() also retrieves all stripes, and if we scrub
one device while another device is being replaced (so it could be freed
during scrub), is that another potential race case?

Thanks,

-liubo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub
  2017-03-24 23:21   ` Liu Bo
@ 2017-03-27  1:37     ` Qu Wenruo
  0 siblings, 0 replies; 12+ messages in thread
From: Qu Wenruo @ 2017-03-27  1:37 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/25/2017 07:21 AM, Liu Bo wrote:
> On Fri, Mar 24, 2017 at 10:00:27AM +0800, Qu Wenruo wrote:
>> Unlike other place calling btrfs_map_block(), in raid56 scrub, we don't
>> use bio_counter to protect from race against dev replace.
>>
>> This patch will use bio_counter to protect from the beginning of calling
>> btrfs_map_sblock(), until rbio endio.
>>
>> Liu Bo <bo.li.liu@oracle.com>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/raid56.c | 2 ++
>>  fs/btrfs/scrub.c  | 5 +++++
>>  2 files changed, 7 insertions(+)
>>
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index 1571bf26dc07..3a083165400f 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -2642,6 +2642,7 @@ static void async_scrub_parity(struct btrfs_raid_bio *rbio)
>>
>>  void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
>>  {
>> +	rbio->generic_bio_cnt = 1;
>
> To keep consistent with other places, can you please do this setting when
> allocating rbio?

No problem.

>
>>  	if (!lock_stripe_add(rbio))
>>  		async_scrub_parity(rbio);
>>  }
>> @@ -2694,6 +2695,7 @@ static void async_missing_raid56(struct btrfs_raid_bio *rbio)
>>
>>  void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
>>  {
>> +	rbio->generic_bio_cnt = 1;
>>  	if (!lock_stripe_add(rbio))
>>  		async_missing_raid56(rbio);
>>  }
>
> Ditto.
>
>> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
>> index 2a5458004279..265387bf3af8 100644
>> --- a/fs/btrfs/scrub.c
>> +++ b/fs/btrfs/scrub.c
>> @@ -2379,6 +2379,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
>>  	int ret;
>>  	int i;
>>
>> +	btrfs_bio_counter_inc_blocked(fs_info);
>>  	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
>>  			&length, &bbio, 0, 1);
>>  	if (ret || !bbio || !bbio->raid_map)
>> @@ -2423,6 +2424,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
>>  rbio_out:
>>  	bio_put(bio);
>>  bbio_out:
>> +	btrfs_bio_counter_dec(fs_info);
>>  	btrfs_put_bbio(bbio);
>>  	spin_lock(&sctx->stat_lock);
>>  	sctx->stat.malloc_errors++;
>> @@ -2966,6 +2968,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>>  		goto out;
>>
>>  	length = sparity->logic_end - sparity->logic_start;
>> +
>> +	btrfs_bio_counter_inc_blocked(fs_info);
>>  	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
>>  			       &length, &bbio, 0, 1);
>>  	if (ret || !bbio || !bbio->raid_map)
>> @@ -2993,6 +2997,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
>>  rbio_out:
>>  	bio_put(bio);
>>  bbio_out:
>> +	btrfs_bio_counter_dec(fs_info);
>>  	btrfs_put_bbio(bbio);
>>  	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
>>  		  sparity->nsectors);
>> --
>> 2.12.1
>>
>>
>>
>
> If patch 4 and 5 are still supposed to fix the same problem, can you please
> merge them into one patch so that a future bisect could be precise?

Yes, they are still fixing the same problem, and tests have already shown
the fix is working. (We found a physical machine on which plain btrfs/069
can easily trigger it.)

I'll merge them into one patch in next version.

>
> And while I believe this fixes the crash described in patch 4,
> scrub_setup_recheck_block() also retrives all stripes, and if we scrub
> one device, and another device is being replaced so it could be freed
> during scrub, is it another potential race case?

Seems to be another race, and it can only be triggered when a corruption
is detected, while the current test case doesn't include such a corruption
scenario (unless using a degraded mount), so we haven't encountered it yet.

Although I can fix it in the next update, I'm afraid we won't have a proper
test case for it until we have a good enough btrfs-corrupt-block.

Thanks,
Qu

>
> Thanks,
>
> -liubo
>
>



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56
  2017-03-24  2:00 ` [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
@ 2017-03-27 16:38   ` David Sterba
  2017-03-28  6:24     ` Qu Wenruo
  0 siblings, 1 reply; 12+ messages in thread
From: David Sterba @ 2017-03-27 16:38 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, bo.li.liu

On Fri, Mar 24, 2017 at 10:00:23AM +0800, Qu Wenruo wrote:
> Unlike mirror based profiles, RAID5/6 recovery needs to read out the
> whole full stripe.
> 
> And if we don't do proper protect, it can easily cause race condition.
> 
> Introduce 2 new functions: lock_full_stripe() and unlock_full_stripe()
> for RAID5/6.
> Which stores a rb_tree of mutex for full stripes, so scrub callers can
> use them to lock a full stripe to avoid race.
> 
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h       |   4 ++
>  fs/btrfs/extent-tree.c |   3 +
>  fs/btrfs/scrub.c       | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 184 insertions(+)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 29b7fc28c607..4d570e1040e6 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -648,6 +648,10 @@ struct btrfs_block_group_cache {
>  	 * Protected by free_space_lock.
>  	 */
>  	int needs_free_space;
> +
> +	/* Scrub full stripe lock tree for RAID5/6 scrub */
> +	struct rb_root scrub_lock_root;

The name scrub_lock_root is a bit confusing; it should better describe
what object the tree stores.

> +	spinlock_t scrub_lock;

And the lock could be named accordingly.

>  };
>  
>  /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index be5477676cc8..db5d9f84535e 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -131,6 +131,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
>  	if (atomic_dec_and_test(&cache->count)) {
>  		WARN_ON(cache->pinned > 0);
>  		WARN_ON(cache->reserved > 0);
> +		WARN_ON(!RB_EMPTY_ROOT(&cache->scrub_lock_root));
>  		kfree(cache->free_space_ctl);
>  		kfree(cache);
>  	}
> @@ -9907,6 +9908,8 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>  
>  	atomic_set(&cache->count, 1);
>  	spin_lock_init(&cache->lock);
> +	spin_lock_init(&cache->scrub_lock);
> +	cache->scrub_lock_root = RB_ROOT;
>  	init_rwsem(&cache->data_rwsem);
>  	INIT_LIST_HEAD(&cache->list);
>  	INIT_LIST_HEAD(&cache->cluster_list);
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index b0251eb1239f..38b300ac4567 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -240,6 +240,13 @@ struct scrub_warning {
>  	struct btrfs_device	*dev;
>  };
>  
> +struct scrub_full_stripe_lock {
> +	struct rb_node node;
> +	u64 logical;
> +	u64 refs;
> +	struct mutex mutex;
> +};
> +
>  static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
>  static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
>  static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
> @@ -349,6 +356,176 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
>  }
>  
>  /*
> + * Caller must hold cache->scrub_root_lock.
> + *
> + * Return existing full stripe and increase it refs
> + * Or return NULL, and insert @fstripe_lock into the bg cache
> + */
> +static struct scrub_full_stripe_lock *
> +add_scrub_lock(struct btrfs_block_group_cache *cache,
> +	       struct scrub_full_stripe_lock *fstripe_lock)

Please stick to the function prototype style common in that file, i.e.
type and name on the same line. The arguments can go below it.

The function name is also quite confusing.

> +{
> +	struct rb_node **p;
> +	struct rb_node *parent = NULL;
> +	struct scrub_full_stripe_lock *entry;
> +
> +	p = &cache->scrub_lock_root.rb_node;
> +	while (*p) {
> +		parent = *p;
> +		entry = rb_entry(parent, struct scrub_full_stripe_lock, node);
> +		if (fstripe_lock->logical < entry->logical) {
> +			p = &(*p)->rb_left;
> +		} else if (fstripe_lock->logical > entry->logical) {
> +			p = &(*p)->rb_right;
> +		} else {
> +			entry->refs++;
> +			return entry;
> +		}
> +	}
> +	/* Insert new one */
> +	rb_link_node(&fstripe_lock->node, parent, p);
> +	rb_insert_color(&fstripe_lock->node, &cache->scrub_lock_root);
> +
> +	return NULL
> +}
> +
> +static struct scrub_full_stripe_lock *
> +search_scrub_lock(struct btrfs_block_group_cache *cache, u64 bytenr)

Same here.

> +{
> +	struct rb_node *node;
> +	struct scrub_full_stripe_lock *entry;
> +
> +	node = cache->scrub_lock_root.rb_node;
> +	while (node) {
> +		entry = rb_entry(node, struct scrub_full_stripe_lock, node);
> +		if (bytenr < entry->logical)
> +			node = node->rb_left;
> +		else if (bytenr > entry->logical)
> +			node = node->rb_right;
> +		else
> +			return entry;
> +	}
> +	return NULL;
> +}
> +
> +/*
> + * Helper to get full stripe logical from a normal bytenr.
> + * Thanks to the chaos of scrub structures, we need to get it all
> + * by ourselves, using btrfs_map_sblock().
> + */
> +static int get_full_stripe_logical(struct btrfs_fs_info *fs_info, u64 bytenr,
> +				   u64 *bytenr_ret)
> +{
> +	struct btrfs_bio *bbio = NULL;
> +	u64 len;
> +	int ret;
> +
> +	/* Just use map_sblock() to get full stripe logical */
> +	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, bytenr,
> +			       &len, &bbio, 0, 1);
> +	if (ret || !bbio || !bbio->raid_map)
> +		goto error;
> +	*bytenr_ret = bbio->raid_map[0];
> +	btrfs_put_bbio(bbio);
> +	return 0;
> +error:
> +	btrfs_put_bbio(bbio);
> +	if (ret)
> +		return ret;
> +	return -EIO;
> +}
> +
> +/*
> + * To lock a full stripe to avoid concurrency of recovery and read
> + * It's only used for profiles with parities(RAID5/6), for other profiles it
                                               ^ insert space please

> + * does nothing
> + */
> +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
> +			    gfp_t gfp_flags)

I found only one user of the function, and it passes GFP_NOFS. Do you
plan to use it in other code with other flags? If not, the argument can
be dropped. The use of NOFS should also be evaluated to see whether we
could use GFP_KERNEL instead.

> +{
> +	struct btrfs_block_group_cache *bg_cache;
> +	struct scrub_full_stripe_lock *fstripe_lock;
> +	struct scrub_full_stripe_lock *existing;
> +	u64 fstripe_start;
> +	int ret = 0;
> +
> +	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
> +	if (!bg_cache)
> +		return -ENOENT;
> +
> +	/* Mirror based profiles don't need full stripe lock */
> +	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
> +		goto out;
> +
> +	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
> +	if (ret < 0)
> +		goto out;
> +
> +	fstripe_lock = kmalloc(sizeof(*fstripe_lock), gfp_flags);
> +	if (!fstripe_lock) {
> +		ret = -ENOENT;

The error code does not match the condition; why isn't it ENOMEM? If
there's a reason why ENOENT is ok, please add a comment here or to the
function.

> +		goto out;
> +	}
> +
> +	fstripe_lock->logical = fstripe_start;
> +	fstripe_lock->refs = 1;
> +	mutex_init(&fstripe_lock->mutex);
> +
> +	/* Now insert the full stripe lock */
> +	spin_lock(&bg_cache->scrub_lock);
> +	existing = add_scrub_lock(bg_cache, fstripe_lock);
> +	if (existing) {
> +		kfree(fstripe_lock);

I don't like the speculative allocation and then free pattern, but it
does not seem we can avoid it. Please at least move the kfree call out
of the mutex/spinlock.

> +		fstripe_lock = existing;
> +	}
> +	spin_unlock(&bg_cache->scrub_lock);
> +	mutex_lock(&fstripe_lock->mutex);

The function should be annotated that it leaves a mutex locked.

> +
> +out:
> +	btrfs_put_block_group(bg_cache);
> +	return ret;
> +}
> +
> +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr)
> +{
> +	struct btrfs_block_group_cache *bg_cache;
> +	struct scrub_full_stripe_lock *fstripe_lock;
> +	u64 fstripe_start;
> +	int ret = 0;
> +
> +	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
> +	if (!bg_cache)
> +		return -ENOENT;
> +	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
> +		goto out;
> +
> +	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
> +	if (ret < 0)
> +		goto out;
> +
> +	spin_lock(&bg_cache->scrub_lock);
> +	fstripe_lock = search_scrub_lock(bg_cache, fstripe_start);
> +	/* This is a deadly problem, we hold a mutex but can't unlock it */

Not sure that it's deadly. It would be great to add a comment explaining
why we can't unlock it, as this function is supposed to pair with the
lock, and if the mutex is not unlocked it must be handled by the caller
somehow. I find this semantic quite dangerous.

> +	if (WARN_ON(!fstripe_lock)) {

Is the WARN_ON necessary? I know it helps debugging, but users also tend
to panic when they see stacktraces in the log. Is there a better way to
report it?

> +		ret = -ENOENT;
> +		goto unlock;
> +	}
> +
> +	mutex_unlock(&fstripe_lock->mutex);

I don't like to see a mutex being unlocked inside a spinlock section.

> +	if (!WARN_ON(fstripe_lock->refs == 0))

So here it's an unexpected refcount; that's worth a warning and also some
human readable error message. Please move the WARN_ON to a separate line,
as it's harder to read when mixed with the if statement.

> +		fstripe_lock->refs--;
> +	if (fstripe_lock->refs == 0) {
> +		rb_erase(&fstripe_lock->node, &bg_cache->scrub_lock_root);
> +		kfree(fstripe_lock);

Move kfree out of all locks if possible.

> +	}
> +unlock:
> +	spin_unlock(&bg_cache->scrub_lock);
> +out:
> +	btrfs_put_block_group(bg_cache);
> +	return ret;
> +}
> +
> +/*
>   * used for workers that require transaction commits (i.e., for the
>   * NOCOW case)
>   */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition
  2017-03-24  2:00 ` [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
@ 2017-03-27 16:47   ` David Sterba
  0 siblings, 0 replies; 12+ messages in thread
From: David Sterba @ 2017-03-27 16:47 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, bo.li.liu

On Fri, Mar 24, 2017 at 10:00:24AM +0800, Qu Wenruo wrote:
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -1065,6 +1065,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
>  	unsigned int have_csum;
>  	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
>  	struct scrub_block *sblock_bad;
> +	int lock_ret;

No need to introduce another variable, just use 'ret'.

>  	int ret;
>  	int mirror_index;
>  	int page_num;
> @@ -1428,6 +1440,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
>  		kfree(sblocks_for_recheck);
>  	}
>  
> +	lock_ret = unlock_full_stripe(fs_info, logical);
> +	if (lock_ret < 0)
> +		return lock_ret;
>  	return 0;

Here.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56
  2017-03-27 16:38   ` David Sterba
@ 2017-03-28  6:24     ` Qu Wenruo
  0 siblings, 0 replies; 12+ messages in thread
From: Qu Wenruo @ 2017-03-28  6:24 UTC (permalink / raw)
  To: dsterba, linux-btrfs, bo.li.liu

Thanks for the review first.

At 03/28/2017 12:38 AM, David Sterba wrote:
> On Fri, Mar 24, 2017 at 10:00:23AM +0800, Qu Wenruo wrote:
>> Unlike mirror based profiles, RAID5/6 recovery needs to read out the
>> whole full stripe.
>>
>> And if we don't do proper protect, it can easily cause race condition.
>>
>> Introduce 2 new functions: lock_full_stripe() and unlock_full_stripe()
>> for RAID5/6.
>> Which stores a rb_tree of mutex for full stripes, so scrub callers can
>> use them to lock a full stripe to avoid race.
>>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/ctree.h       |   4 ++
>>  fs/btrfs/extent-tree.c |   3 +
>>  fs/btrfs/scrub.c       | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 184 insertions(+)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 29b7fc28c607..4d570e1040e6 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -648,6 +648,10 @@ struct btrfs_block_group_cache {
>>  	 * Protected by free_space_lock.
>>  	 */
>>  	int needs_free_space;
>> +
>> +	/* Scrub full stripe lock tree for RAID5/6 scrub */
>> +	struct rb_root scrub_lock_root;
>
> The name scrub_lock_root is a bit confusing, it should better describe
> what object the tree stores.

I'll encapsulate this rb_root and spinlock into a new structure called 
full_stripe_locks_tree.

I tried other names, but "lock" and "root/tree" make it quite hard to 
name the spinlock.
(A name like "full_stripe_locks_root_lock" is just super stupid.)

>
>> +	spinlock_t scrub_lock;
>
> And the lock could be named accordingly.
>
>>  };
>>
>>  /* delayed seq elem */
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index be5477676cc8..db5d9f84535e 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -131,6 +131,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
>>  	if (atomic_dec_and_test(&cache->count)) {
>>  		WARN_ON(cache->pinned > 0);
>>  		WARN_ON(cache->reserved > 0);
>> +		WARN_ON(!RB_EMPTY_ROOT(&cache->scrub_lock_root));
>>  		kfree(cache->free_space_ctl);
>>  		kfree(cache);
>>  	}
>> @@ -9907,6 +9908,8 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>
>>  	atomic_set(&cache->count, 1);
>>  	spin_lock_init(&cache->lock);
>> +	spin_lock_init(&cache->scrub_lock);
>> +	cache->scrub_lock_root = RB_ROOT;
>>  	init_rwsem(&cache->data_rwsem);
>>  	INIT_LIST_HEAD(&cache->list);
>>  	INIT_LIST_HEAD(&cache->cluster_list);
>> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
>> index b0251eb1239f..38b300ac4567 100644
>> --- a/fs/btrfs/scrub.c
>> +++ b/fs/btrfs/scrub.c
>> @@ -240,6 +240,13 @@ struct scrub_warning {
>>  	struct btrfs_device	*dev;
>>  };
>>
>> +struct scrub_full_stripe_lock {
>> +	struct rb_node node;
>> +	u64 logical;
>> +	u64 refs;
>> +	struct mutex mutex;
>> +};
>> +
>>  static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
>>  static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
>>  static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
>> @@ -349,6 +356,176 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
>>  }
>>
>>  /*
>> + * Caller must hold cache->scrub_root_lock.
>> + *
>> + * Return existing full stripe and increase it refs
>> + * Or return NULL, and insert @fstripe_lock into the bg cache
>> + */
>> +static struct scrub_full_stripe_lock *
>> +add_scrub_lock(struct btrfs_block_group_cache *cache,
>> +	       struct scrub_full_stripe_lock *fstripe_lock)
>
> Please stick to the function prototype common in that file, ie. type and
> name on the same line. The arguments can go below it.

Understood.

>
> The function name is also quite confusing.

What about insert_full_stripe_lock() ?
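
With the prototype style you asked for, that would become something like
(sketch only):

	static struct scrub_full_stripe_lock *insert_full_stripe_lock(
			struct btrfs_block_group_cache *cache,
			struct scrub_full_stripe_lock *fstripe_lock)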

>
>> +{
>> +	struct rb_node **p;
>> +	struct rb_node *parent = NULL;
>> +	struct scrub_full_stripe_lock *entry;
>> +
>> +	p = &cache->scrub_lock_root.rb_node;
>> +	while (*p) {
>> +		parent = *p;
>> +		entry = rb_entry(parent, struct scrub_full_stripe_lock, node);
>> +		if (fstripe_lock->logical < entry->logical) {
>> +			p = &(*p)->rb_left;
>> +		} else if (fstripe_lock->logical > entry->logical) {
>> +			p = &(*p)->rb_right;
>> +		} else {
>> +			entry->refs++;
>> +			return entry;
>> +		}
>> +	}
>> +	/* Insert new one */
>> +	rb_link_node(&fstripe_lock->node, parent, p);
>> +	rb_insert_color(&fstripe_lock->node, &cache->scrub_lock_root);
>> +
>> +	return NULL
>> +}
>> +
>> +static struct scrub_full_stripe_lock *
>> +search_scrub_lock(struct btrfs_block_group_cache *cache, u64 bytenr)
>
> Same here.

What about search_full_stripe_lock() ?

>
>> +{
>> +	struct rb_node *node;
>> +	struct scrub_full_stripe_lock *entry;
>> +
>> +	node = cache->scrub_lock_root.rb_node;
>> +	while (node) {
>> +		entry = rb_entry(node, struct scrub_full_stripe_lock, node);
>> +		if (bytenr < entry->logical)
>> +			node = node->rb_left;
>> +		else if (bytenr > entry->logical)
>> +			node = node->rb_right;
>> +		else
>> +			return entry;
>> +	}
>> +	return NULL;
>> +}
>> +
>> +/*
>> + * Helper to get full stripe logical from a normal bytenr.
>> + * Thanks to the chaos of scrub structures, we need to get it all
>> + * by ourselves, using btrfs_map_sblock().
>> + */
>> +static int get_full_stripe_logical(struct btrfs_fs_info *fs_info, u64 bytenr,
>> +				   u64 *bytenr_ret)
>> +{
>> +	struct btrfs_bio *bbio = NULL;
>> +	u64 len;
>> +	int ret;
>> +
>> +	/* Just use map_sblock() to get full stripe logical */
>> +	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, bytenr,
>> +			       &len, &bbio, 0, 1);
>> +	if (ret || !bbio || !bbio->raid_map)
>> +		goto error;
>> +	*bytenr_ret = bbio->raid_map[0];
>> +	btrfs_put_bbio(bbio);
>> +	return 0;
>> +error:
>> +	btrfs_put_bbio(bbio);
>> +	if (ret)
>> +		return ret;
>> +	return -EIO;
>> +}
>> +
>> +/*
>> + * To lock a full stripe to avoid concurrency of recovery and read
>> + * It's only used for profiles with parities(RAID5/6), for other profiles it
>                                                ^ insert space please

Old comments always have such problems.

Will be fixed.

>
>> + * does nothing
>> + */
>> +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
>> +			    gfp_t gfp_flags)
>
> I found only 1 user of the function, that passes GFP_NOFS to this
> function. Do you plan to use it in other code with other flags? If not,
> the argument can be dropped. And also the use of NOFS should be
> evaluated if we couldn't use GFP_KERNEL instead.

GFP_KERNEL will be the case.

>
>> +{
>> +	struct btrfs_block_group_cache *bg_cache;
>> +	struct scrub_full_stripe_lock *fstripe_lock;
>> +	struct scrub_full_stripe_lock *existing;
>> +	u64 fstripe_start;
>> +	int ret = 0;
>> +
>> +	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
>> +	if (!bg_cache)
>> +		return -ENOENT;
>> +
>> +	/* Mirror based profiles don't need full stripe lock */
>> +	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
>> +		goto out;
>> +
>> +	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	fstripe_lock = kmalloc(sizeof(*fstripe_lock), gfp_flags);
>> +	if (!fstripe_lock) {
>> +		ret = -ENOENT;
>
> The error code does not match the condition, why isn't it ENOMEM ? If
> there's a reason why ENOENT is ok, please add a comment here or to the
> function.
>
>> +		goto out;
>> +	}
>> +
>> +	fstripe_lock->logical = fstripe_start;
>> +	fstripe_lock->refs = 1;
>> +	mutex_init(&fstripe_lock->mutex);
>> +
>> +	/* Now insert the full stripe lock */
>> +	spin_lock(&bg_cache->scrub_lock);
>> +	existing = add_scrub_lock(bg_cache, fstripe_lock);
>> +	if (existing) {
>> +		kfree(fstripe_lock);
>
> I don't like the speculative allocation and then free pattern, but it
> does not seem we can avoid it. Please at least move the kfree call out
> of the mutex/spinlock.

This can be modified to use a mutex instead of a spinlock; then we can
allocate memory while holding the mutex.

Considering the main overhead is verifying csums, the mutex overhead
should not be a big problem.

>
>> +		fstripe_lock = existing;
>> +	}
>> +	spin_unlock(&bg_cache->scrub_lock);
>> +	mutex_lock(&fstripe_lock->mutex);
>
> The function should be annotated that it leaves a mutex locked.

I'll add a comment about it to the function's comment.

>
>> +
>> +out:
>> +	btrfs_put_block_group(bg_cache);
>> +	return ret;
>> +}
>> +
>> +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr)
>> +{
>> +	struct btrfs_block_group_cache *bg_cache;
>> +	struct scrub_full_stripe_lock *fstripe_lock;
>> +	u64 fstripe_start;
>> +	int ret = 0;
>> +
>> +	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
>> +	if (!bg_cache)
>> +		return -ENOENT;
>> +	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
>> +		goto out;
>> +
>> +	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
>> +	if (ret < 0)
>> +		goto out;
>> +
>> +	spin_lock(&bg_cache->scrub_lock);
>> +	fstripe_lock = search_scrub_lock(bg_cache, fstripe_start);
>> +	/* This is a deadly problem, we hold a mutex but can't unlock it */
>
> Not sure that it's deadly. Would be great to add comments why we can't
> unlock it, as this function is supposed to pair with lock and if the
> mutex is not unlocked, it must be handled by the caller somehow. I find
> this semantic quite dangerous.

I was too nervous about such an error.

If it happens with paired lock/unlock_full_stripe(), then memory has been
corrupted either by hardware or by other code, and there is not much we
can do.

And if it's caused by an unpaired lock/unlock_full_stripe(), then a normal
warning at unlock_full_stripe() should be enough to inform developers that
there is a wild unlock.

So the comment is overreacting.

>
>> +	if (WARN_ON(!fstripe_lock)) {
>
> Is the WARN_ON necessary? I know it helps debugging, but users also tend
> to panic when they see stacktraces in the log. Is there a better way to
> report it?

The warning can only happen if we have a wild unlock. Although it's less
of a problem than a mutex left locked, such a warning helps us expose the
problem at QA time before it hits end users.

And as long as we ensure lock and unlock are paired, I don't think it will
cause anything wrong.

>
>> +		ret = -ENOENT;
>> +		goto unlock;
>> +	}
>> +
>> +	mutex_unlock(&fstripe_lock->mutex);
>
> I don't like to see a mutex being freed inside a spinlock section,

Will be modified.

>
>> +	if (!WARN_ON(fstripe_lock->refs == 0))
>
> So here it's unexpected refcount, that's worth a warning and also some
> human readable error message. Please move the WARN_ON on a separate line
> as it's harder to read when mixed with the if statement.

OK, though personally I find such a trick quite handy :(
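
Something like this, then (a sketch of the separated form; behaviour
should be unchanged since refs is unsigned):

	WARN_ON(fstripe_lock->refs == 0);
	if (fstripe_lock->refs > 0)
		fstripe_lock->refs--;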

Thanks,
Qu

>
>> +		fstripe_lock->refs--;
>> +	if (fstripe_lock->refs == 0) {
>> +		rb_erase(&fstripe_lock->node, &bg_cache->scrub_lock_root);
>> +		kfree(fstripe_lock);
>
> Move kfree out of all locks if possible.
>
>> +	}
>> +unlock:
>> +	spin_unlock(&bg_cache->scrub_lock);
>> +out:
>> +	btrfs_put_block_group(bg_cache);
>> +	return ret;
>> +}
>> +
>> +/*
>>   * used for workers that require transaction commits (i.e., for the
>>   * NOCOW case)
>>   */
>
>



^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread

Thread overview: 12+ messages
2017-03-24  2:00 [PATCH v2 0/5] raid56: scrub related fixes Qu Wenruo
2017-03-24  2:00 ` [PATCH v2 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
2017-03-27 16:38   ` David Sterba
2017-03-28  6:24     ` Qu Wenruo
2017-03-24  2:00 ` [PATCH v2 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
2017-03-27 16:47   ` David Sterba
2017-03-24  2:00 ` [PATCH v2 3/5] btrfs: scrub: Don't append on-disk pages for raid56 scrub Qu Wenruo
2017-03-24 22:06   ` Liu Bo
2017-03-24  2:00 ` [PATCH v2 4/5] btrfs: dev-replace: Wait flighting bio before removing target device Qu Wenruo
2017-03-24  2:00 ` [PATCH v2 5/5] btrfs: raid56: Use bio_counter to protect scrub Qu Wenruo
2017-03-24 23:21   ` Liu Bo
2017-03-27  1:37     ` Qu Wenruo
