* [PATCH 0/2] RAID5/6 scrub race fix
@ 2016-11-15  2:50 Qu Wenruo
  2016-11-15  2:50 ` [PATCH 1/2] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Qu Wenruo @ 2016-11-15  2:50 UTC (permalink / raw)
  To: linux-btrfs; +Cc: kreijack

Fix the so-called famous RAID5/6 scrub error.

Thanks to Goffredo Baroncelli for reporting the bug and bringing it to
our attention.
(Yes, without the Phoronix report on this,
https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
I would never have been aware of it.)

Unlike what many of us (including myself) assumed, it's not a time bomb
buried deep in the RAID5/6 code, but a race condition in the scrub
recovery code.

The problem went unnoticed because normal mirror based profiles aren't
affected by the race, since their mirrors are independent of each other.

Although this time the fix doesn't affect the scrub code much, it should
warn us that the current scrub code is really hard to maintain.

The abuse of workqueues to delay work, combined with the per-device full
fs scrub, makes the code race prone.

An xfstest will follow a little later, as we don't yet have good enough
tools to corrupt data stripes precisely.

Qu Wenruo (2):
  btrfs: scrub: Introduce full stripe lock for RAID56
  btrfs: scrub: Fix RAID56 recovery race condition

 fs/btrfs/ctree.h       |   4 ++
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 199 insertions(+)

-- 
2.10.2




^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/2] btrfs: scrub: Introduce full stripe lock for RAID56
  2016-11-15  2:50 [PATCH 0/2] RAID5/6 scrub race fix Qu Wenruo
@ 2016-11-15  2:50 ` Qu Wenruo
  2016-11-15  2:50 ` [PATCH 2/2] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Qu Wenruo @ 2016-11-15  2:50 UTC (permalink / raw)
  To: linux-btrfs; +Cc: kreijack

Unlike mirror based profiles, RAID5/6 recovery needs to read out the
whole full stripe.

Without proper protection, it can easily cause a race condition.

Introduce 2 new functions, lock_full_stripe() and unlock_full_stripe(),
for RAID5/6.
They maintain an rb_tree of per full stripe mutexes, so scrub callers
can use them to lock a full stripe and avoid the race.
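
To illustrate the pattern outside the kernel, here is a minimal user-space
sketch: a reference-counted table of per full stripe mutexes keyed by the
full stripe start.  It uses pthreads and a plain linked list instead of an
rb_tree, and every name in it is only for illustration; the real
implementation is the kernel code below.

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

struct fstripe_lock {
	uint64_t logical;		/* full stripe start (the key) */
	unsigned int refs;		/* how many lockers use this entry */
	pthread_mutex_t mutex;		/* the actual full stripe lock */
	struct fstripe_lock *next;
};

static struct fstripe_lock *fstripe_locks;	/* stands in for the rb_tree */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Find or create the entry for @logical, then take its mutex. */
static int lock_full_stripe_sketch(uint64_t logical)
{
	struct fstripe_lock *l;

	pthread_mutex_lock(&table_lock);
	for (l = fstripe_locks; l; l = l->next)
		if (l->logical == logical)
			break;
	if (!l) {
		l = calloc(1, sizeof(*l));
		if (!l) {
			pthread_mutex_unlock(&table_lock);
			return -1;
		}
		l->logical = logical;
		pthread_mutex_init(&l->mutex, NULL);
		l->next = fstripe_locks;
		fstripe_locks = l;
	}
	l->refs++;			/* waiters keep the entry alive */
	pthread_mutex_unlock(&table_lock);

	/* Serialize everyone working on this full stripe. */
	pthread_mutex_lock(&l->mutex);
	return 0;
}

/* Drop the mutex and free the entry once the last user is gone. */
static void unlock_full_stripe_sketch(uint64_t logical)
{
	struct fstripe_lock **p, *l;

	pthread_mutex_lock(&table_lock);
	for (p = &fstripe_locks; (l = *p); p = &l->next)
		if (l->logical == logical)
			break;
	if (l) {
		pthread_mutex_unlock(&l->mutex);
		if (--l->refs == 0) {
			*p = l->next;
			pthread_mutex_destroy(&l->mutex);
			free(l);
		}
	}
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	/* Two lockers of the same full stripe serialize on one mutex. */
	lock_full_stripe_sketch(0);
	unlock_full_stripe_sketch(0);
	return 0;
}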

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |   4 ++
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/scrub.c       | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9d8edcb..37d5f29 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -638,6 +638,10 @@ struct btrfs_block_group_cache {
 	 * Protected by free_space_lock.
 	 */
 	int needs_free_space;
+
+	/* Scrub full stripe lock tree for RAID5/6 scrub */
+	struct rb_root scrub_lock_root;
+	spinlock_t scrub_lock;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4607af3..b098a1f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -132,6 +132,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
+		WARN_ON(!RB_EMPTY_ROOT(&cache->scrub_lock_root));
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -10122,6 +10123,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
+	spin_lock_init(&cache->scrub_lock);
+	cache->scrub_lock_root = RB_ROOT;
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fffb9ab..4fce415 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -240,6 +240,13 @@ struct scrub_warning {
 	struct btrfs_device	*dev;
 };
 
+struct scrub_full_stripe_lock {
+	struct rb_node node;
+	u64 logical;
+	u64 refs;
+	struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -351,6 +358,176 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Caller must hold cache->scrub_lock.
+ *
+ * Return the existing full stripe lock and increase its refs,
+ * or return NULL and insert @fstripe_lock into the bg cache.
+ */
+static struct scrub_full_stripe_lock *
+add_scrub_lock(struct btrfs_block_group_cache *cache,
+	       struct scrub_full_stripe_lock *fstripe_lock)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct scrub_full_stripe_lock *entry;
+
+	p = &cache->scrub_lock_root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct scrub_full_stripe_lock, node);
+		if (fstripe_lock->logical < entry->logical) {
+			p = &(*p)->rb_left;
+		} else if (fstripe_lock->logical > entry->logical) {
+			p = &(*p)->rb_right;
+		} else {
+			entry->refs++;
+			return entry;
+		}
+	}
+	/* Insert new one */
+	rb_link_node(&fstripe_lock->node, parent, p);
+	rb_insert_color(&fstripe_lock->node, &cache->scrub_lock_root);
+
+	return NULL;
+}
+
+static struct scrub_full_stripe_lock *
+search_scrub_lock(struct btrfs_block_group_cache *cache, u64 bytenr)
+{
+	struct rb_node *node;
+	struct scrub_full_stripe_lock *entry;
+
+	node = cache->scrub_lock_root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct scrub_full_stripe_lock, node);
+		if (bytenr < entry->logical)
+			node = node->rb_left;
+		else if (bytenr > entry->logical)
+			node = node->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ * Thanks to the chaos of scrub structures, we need to get it all
+ * by ourselves, using btrfs_map_sblock().
+ */
+static int get_full_stripe_logical(struct btrfs_fs_info *fs_info, u64 bytenr,
+				   u64 *bytenr_ret)
+{
+	struct btrfs_bio *bbio = NULL;
+	u64 len;
+	int ret;
+
+	/* Just use map_sblock() to get full stripe logical */
+	ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, bytenr, &len,
+			       &bbio, 0, 1);
+	if (ret || !bbio || !bbio->raid_map)
+		goto error;
+	*bytenr_ret = bbio->raid_map[0];
+	btrfs_put_bbio(bbio);
+	return 0;
+error:
+	btrfs_put_bbio(bbio);
+	if (ret)
+		return ret;
+	return -EIO;
+}
+
+/*
+ * Lock a full stripe to avoid concurrent recovery and read.
+ * It's only used for profiles with parity (RAID5/6); for other profiles
+ * it does nothing.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+			    gfp_t gfp_flags)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	struct scrub_full_stripe_lock *existing;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+
+	/* Mirror based profiles don't need full stripe lock */
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	fstripe_lock = kmalloc(sizeof(*fstripe_lock), gfp_flags);
+	if (!fstripe_lock) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	fstripe_lock->logical = fstripe_start;
+	fstripe_lock->refs = 1;
+	mutex_init(&fstripe_lock->mutex);
+
+	/* Now insert the full stripe lock */
+	spin_lock(&bg_cache->scrub_lock);
+	existing = add_scrub_lock(bg_cache, fstripe_lock);
+	if (existing) {
+		kfree(fstripe_lock);
+		fstripe_lock = existing;
+	}
+	spin_unlock(&bg_cache->scrub_lock);
+	mutex_lock(&fstripe_lock->mutex);
+
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&bg_cache->scrub_lock);
+	fstripe_lock = search_scrub_lock(bg_cache, fstripe_start);
+	/* This is a deadly problem, we hold a mutex but can't unlock it */
+	if (WARN_ON(!fstripe_lock)) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	mutex_unlock(&fstripe_lock->mutex);
+	if (!WARN_ON(fstripe_lock->refs == 0))
+		fstripe_lock->refs--;
+	if (fstripe_lock->refs == 0) {
+		rb_erase(&fstripe_lock->node, &bg_cache->scrub_lock_root);
+		kfree(fstripe_lock);
+	}
+unlock:
+	spin_unlock(&bg_cache->scrub_lock);
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+/*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
  */
-- 
2.10.2




^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/2] btrfs: scrub: Fix RAID56 recovery race condition
  2016-11-15  2:50 [PATCH 0/2] RAID5/6 scrub race fix Qu Wenruo
  2016-11-15  2:50 ` [PATCH 1/2] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
@ 2016-11-15  2:50 ` Qu Wenruo
  2016-11-17 18:09 ` [PATCH 0/2] RAID5/6 scrub race fix Goffredo Baroncelli
  2016-11-17 23:13 ` Zygo Blaxell
  3 siblings, 0 replies; 13+ messages in thread
From: Qu Wenruo @ 2016-11-15  2:50 UTC (permalink / raw)
  To: linux-btrfs; +Cc: kreijack

When scrubbing a RAID5 filesystem with recoverable data corruption (only
one data stripe corrupted), scrub sometimes reports more csum errors than
expected, and sometimes even reports an unrecoverable error.

The problem can easily be reproduced by the following steps:
1) Create a btrfs with RAID5 data profile across 3 devices
2) Mount it with nospace_cache or space_cache=v2
   to avoid extra data space usage
3) Create a 128K file and sync the fs, then unmount it
   Now the 128K file lies at the beginning of the data chunk
4) Locate the physical bytenr of the data chunk on dev3
   Dev3 holds the 1st data stripe.
5) Corrupt the first 64K of the data chunk stripe on dev3 (see the sketch below)
6) Mount the fs and scrub it
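
As an illustration of step 5), here is a minimal user-space sketch of such a
corruption tool.  The device path and the physical offset of the data chunk
are placeholders (take them from step 4); the 0x5a pattern is arbitrary.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define CORRUPT_LEN	(64 * 1024)	/* first 64K of the data stripe */

int main(int argc, char **argv)
{
	/* Placeholders: e.g. ./corrupt /dev/loop3 0x12300000 */
	const char *dev = argc > 1 ? argv[1] : "/dev/loop3";
	off_t chunk_phys = argc > 2 ? (off_t)strtoull(argv[2], NULL, 0) : 0;
	char buf[4096];
	off_t off;
	int fd;

	memset(buf, 0x5a, sizeof(buf));		/* garbage pattern */

	fd = open(dev, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (off = 0; off < CORRUPT_LEN; off += sizeof(buf)) {
		if (pwrite(fd, buf, sizeof(buf), chunk_phys + off) !=
		    (ssize_t)sizeof(buf)) {
			perror("pwrite");
			close(fd);
			return 1;
		}
	}
	fsync(fd);
	close(fd);
	return 0;
}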

The correct csum error count should be 16 (assuming a 4K page size, as
on x86_64).
A larger csum error count is reported with roughly a 1/3 chance, and an
unrecoverable error is reported with roughly a 1/10 chance.

The root cause of the problem is a race condition in the RAID5/6
recovery code, stemming from the fact that a full scrub is initiated
per device.

For other, mirror based profiles, each mirror is independent of the
others, so the race doesn't cause any big problem.

For example:
        Corrupted       |       Correct          |      Correct        |
|   Scrub dev3 (D1)     |    Scrub dev2 (D2)     |    Scrub dev1(P)    |
------------------------------------------------------------------------
Read out D1             |Read out D2             |Read full stripe     |
Check csum              |Check csum              |Check parity         |
Csum mismatch           |Csum match, continue    |Parity mismatch      |
handle_errored_block    |                        |handle_errored_block |
 Read out full stripe   |                        | Read out full stripe|
 D1 csum error(err++)   |                        | D1 csum error(err++)|
 Recover D1             |                        | Recover D1          |

So D1's csum error is counted twice, simply because
handle_errored_block() doesn't have enough protection and the race can
happen.

In an even worse case, if D1's recovery code is re-writing D1/D2/P while
P's recovery code is still reading out the full stripe, we can end up
with an unrecoverable error.

This patch uses the previously introduced lock_full_stripe() and
unlock_full_stripe() to protect the whole scrub_handle_errored_block()
function for RAID56 recovery, so no extra csum errors or unrecoverable
errors are reported.

Reported-by: Goffredo Baroncelli <kreijack@libero.it>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/scrub.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4fce415..1215a61 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1067,6 +1067,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	unsigned int have_csum;
 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 	struct scrub_block *sblock_bad;
+	int lock_ret;
 	int ret;
 	int mirror_index;
 	int page_num;
@@ -1096,6 +1097,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	have_csum = sblock_to_check->pagev[0]->have_csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
+	/*
+	 * For RAID5/6, a race can happen between the scrub threads of
+	 * different devices: when data is corrupted, the parity thread
+	 * and the data thread will both try to recover the same data.
+	 * The race can lead to double-counted csum errors, or even a
+	 * false unrecoverable error.
+	 */
+	ret = lock_full_stripe(fs_info, logical, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 		sblocks_for_recheck = NULL;
 		goto nodatasum_case;
@@ -1432,6 +1444,9 @@ out:
 		kfree(sblocks_for_recheck);
 	}
 
+	lock_ret = unlock_full_stripe(fs_info, logical);
+	if (lock_ret < 0)
+		return lock_ret;
 	return 0;
 }
 
-- 
2.10.2




^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-15  2:50 [PATCH 0/2] RAID5/6 scrub race fix Qu Wenruo
  2016-11-15  2:50 ` [PATCH 1/2] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
  2016-11-15  2:50 ` [PATCH 2/2] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
@ 2016-11-17 18:09 ` Goffredo Baroncelli
  2016-11-18  0:57   ` Qu Wenruo
  2016-11-17 23:13 ` Zygo Blaxell
  3 siblings, 1 reply; 13+ messages in thread
From: Goffredo Baroncelli @ 2016-11-17 18:09 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 2993 bytes --]

Hi Qu,

On 2016-11-15 03:50, Qu Wenruo wrote:
> Fix the so-called famous RAID5/6 scrub error.
> 
> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
> sight.
> (Yes, without the Phoronix report on this,
> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
> I won't ever be aware of it)
> 
> Unlike many of us(including myself) assumed, it's not a timed bomb buried
> deeply into the RAID5/6 code, but a race condition in scrub recovery
> code.
> 
> The problem is not found because normal mirror based profiles aren't
> affected by the race, since they are independent with each other.

Unfortunately, even with these patches btrfs still fails to rebuild a raid5 array in my tests.
To perform the tests I created a three-disk raid5 filesystem; then I created a file

# python -c "print 'ad'+'a'*65534+'bd'+'b'*65533" >out.txt

The filesystem layout is composed of stripes of 128k of data plus 64k of parity.

The idea is that the first third of the stripe (64k on disk0) is composed of "adaaa...";
the second third (again 64k, on disk1) is composed of "bdbbbb"; and finally the parity (disk0^disk1) is stored in the last portion of the stripe.

Doing some calculation it is possible to know where the data is physically stored.
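
For reference, a minimal sketch of that check: read the two 64k data stripes
and the 64k parity from the three devices and verify disk0 ^ disk1 == parity.
The device paths and physical offsets below are placeholders and have to be
replaced with the real chunk layout of the test images.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define STRIPE_LEN	(64 * 1024)

static void read_stripe(const char *dev, off_t phys, uint8_t *buf)
{
	int fd = open(dev, O_RDONLY);

	if (fd < 0 || pread(fd, buf, STRIPE_LEN, phys) != STRIPE_LEN) {
		perror(dev);
		exit(1);
	}
	close(fd);
}

int main(void)
{
	static uint8_t d0[STRIPE_LEN], d1[STRIPE_LEN], p[STRIPE_LEN];
	size_t i;

	/* Placeholder paths/offsets: adjust to the actual layout. */
	read_stripe("/dev/loop0", 0x0, d0);	/* "adaaa..." stripe */
	read_stripe("/dev/loop1", 0x0, d1);	/* "bdbbb..." stripe */
	read_stripe("/dev/loop2", 0x0, p);	/* parity stripe */

	for (i = 0; i < STRIPE_LEN; i++) {
		if ((d0[i] ^ d1[i]) != p[i]) {
			printf("parity mismatch at byte %zu\n", i);
			return 1;
		}
	}
	printf("parity OK\n");
	return 0;
}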

I performed 3 tests:
1) I corrupt some bytes of the data stored on disk0; mount the filesystem; run scrub; unmount the filesystem; check on all the disks whether the bytes of the stripe were corrected
2) I corrupt some bytes of the data stored on disk1; mount the filesystem; run scrub; unmount the filesystem; check on all the disks whether the bytes of the stripe were corrected
3) I corrupt some bytes of the parity stored on disk2; mount the filesystem; run scrub; unmount the filesystem; check on all the disks whether the bytes of the stripe were corrected

Before your patches, all my tests failed (not every time, but very often).
After your patches, test1 and test2 still fail. In my tests, test3 succeeded.

Enclosed is my script which performs the tests (it uses loop devices; please run it in a VM; first you have to uncomment the function make_imgs to create the disk images).

Let me know if I can help by providing more information.

BR
G.Baroncelli


> 
> Although this time the fix doesn't affect the scrub code much, it should
> warn us that current scrub code is really hard to maintain.
> 
> Abuse of workquque to delay works and the full fs scrub is race prone.
> 
> Xfstest will follow a little later, as we don't have good enough tools
> to corrupt data stripes pinpointly.
> 
> Qu Wenruo (2):
>   btrfs: scrub: Introduce full stripe lock for RAID56
>   btrfs: scrub: Fix RAID56 recovery race condition
> 
>  fs/btrfs/ctree.h       |   4 ++
>  fs/btrfs/extent-tree.c |   3 +
>  fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 199 insertions(+)
> 


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

[-- Attachment #2: test-btrfs.sh --]
[-- Type: application/x-sh, Size: 2978 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-15  2:50 [PATCH 0/2] RAID5/6 scrub race fix Qu Wenruo
                   ` (2 preceding siblings ...)
  2016-11-17 18:09 ` [PATCH 0/2] RAID5/6 scrub race fix Goffredo Baroncelli
@ 2016-11-17 23:13 ` Zygo Blaxell
  2016-11-18  1:19   ` Qu Wenruo
  2016-11-18 18:09   ` Goffredo Baroncelli
  3 siblings, 2 replies; 13+ messages in thread
From: Zygo Blaxell @ 2016-11-17 23:13 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, kreijack

[-- Attachment #1: Type: text/plain, Size: 3062 bytes --]

On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
> Fix the so-called famous RAID5/6 scrub error.
> 
> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
> sight.
> (Yes, without the Phoronix report on this,
> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
> I won't ever be aware of it)

If you're hearing about btrfs RAID5 bugs for the first time through
Phoronix, then your testing coverage is *clearly* inadequate.

Fill up a RAID5 array, start a FS stress test, pull a drive out while
that's running, let the FS stress test run for another hour, then try
to replace or delete the missing device.  If there are any crashes,
corruptions, or EIO during any part of this process (assuming all the
remaining disks are healthy), then btrfs RAID5 is still broken, and
you've found another bug to fix.

The fact that so many problems in btrfs can still be found this way
indicates to me that nobody is doing this basic level of testing
(or if they are, they're not doing anything about the results).

> Unlike many of us(including myself) assumed, it's not a timed bomb buried
> deeply into the RAID5/6 code, but a race condition in scrub recovery
> code.

I don't see how this patch fixes the write hole issue at the core of
btrfs RAID56.  It just makes the thin layer of bugs over that issue a
little thinner.  There's still the metadata RMW update timebomb at the
bottom of the bug pile that can't be fixed by scrub (the filesystem is
unrecoverably damaged when the bomb goes off, so scrub isn't possible).

> The problem is not found because normal mirror based profiles aren't
> affected by the race, since they are independent with each other.

True.

> Although this time the fix doesn't affect the scrub code much, it should
> warn us that current scrub code is really hard to maintain.

This last sentence is true.  I found and fixed three BUG_ONs in RAID5
code on the first day I started testing in degraded mode, then hit
the scrub code and had to give up.  It was like a brick wall made out
of mismatched assumptions and layering inversions, using uninitialized
kernel data as mortar (though I suppose the "uninitialized" data symptom
might just have been an unprotected memory access).

> Abuse of workquque to delay works and the full fs scrub is race prone.
> 
> Xfstest will follow a little later, as we don't have good enough tools
> to corrupt data stripes pinpointly.
> 
> Qu Wenruo (2):
>   btrfs: scrub: Introduce full stripe lock for RAID56
>   btrfs: scrub: Fix RAID56 recovery race condition
> 
>  fs/btrfs/ctree.h       |   4 ++
>  fs/btrfs/extent-tree.c |   3 +
>  fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 199 insertions(+)
> 
> -- 
> 2.10.2
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-17 18:09 ` [PATCH 0/2] RAID5/6 scrub race fix Goffredo Baroncelli
@ 2016-11-18  0:57   ` Qu Wenruo
  0 siblings, 0 replies; 13+ messages in thread
From: Qu Wenruo @ 2016-11-18  0:57 UTC (permalink / raw)
  To: kreijack, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 3775 bytes --]



At 11/18/2016 02:09 AM, Goffredo Baroncelli wrote:
> Hi Qu,
>
> On 2016-11-15 03:50, Qu Wenruo wrote:
>> Fix the so-called famous RAID5/6 scrub error.
>>
>> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
>> sight.
>> (Yes, without the Phoronix report on this,
>> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
>> I won't ever be aware of it)
>>
>> Unlike many of us(including myself) assumed, it's not a timed bomb buried
>> deeply into the RAID5/6 code, but a race condition in scrub recovery
>> code.
>>
>> The problem is not found because normal mirror based profiles aren't
>> affected by the race, since they are independent with each other.
>
> Unfortunately, even with these patches btrfs still fails to rebuild a raid5 array in my tests.
> To perform the tests I create a filesystem raid5-three disks; then I created a file
>
> # python -c "print 'ad'+'a'*65534+'bd'+'b'*65533" >out.txt
>
> The filesystem layout is composed by stripes large 128k of data, and 64k of parity.
>
> The idea is that the first third of the stripe (64k on disk0) is composed by "adaaa...";
> the second third (again 64k on disk1 is composed by "bdbbbb"; and finally the parity (disk0^disk1) is stored on the last portion of the stripe.
>
> Doing some calculation it is possible to know where the data is physically stored.
>
> I perform 3 tests:
> 1) I corrupt some bytes of the data stored on disk0; mount the filesystem; run scrub; unmount the filesystem; check all the disks if the bytes of the stripe are corrected
> 2) I corrupt some bytes of the data stored on disk1; mount the filesystem; run scrub; unmount the filesystem; check all the disks the bytes of the stripe are corrected
> 3) I corrupt some bytes of the parity stored on disk2; mount the filesystem; run scrub; unmount the filesystem; check all the disks the bytes of the stripe are corrected
>
> Before your patches, my all tests fail (not all the times, but very often).
> After your patches, test1 and test2 still fail. In my test test3 succeded.

Thanks for your report and script.

I compared my script with yours, and found the difference is that I'm
corrupting the whole data stripe (64K, and without space cache, to make
sure all data is in the 1st full stripe), while you only corrupt 5
bytes of it.

And in that case, btrfs doesn't report more errors than the expected
one, which means the race fix is working.

But the new btrfs-progs scrub will report that the recovered data
stripes are all OK, while P/Q is not correct.

This seems to be a new bug, unrelated to the RAID5/6 scrub race (whose
main symptoms are an incorrect csum error count and sometimes even an
unrecoverable error).


I assume it will be easier to fix than the race, but I could be totally
wrong; I won't know until I dig into it further.

Anyway I'll fix it in a new patchset.

Thanks,
Qu

>
> Enclosed my script which performs the tests (it uses loop device; please run in a VM; firts you have to uncomment the function make_imgs to create the disk image.).
>
> Let me know if I can help you providing more information.
>
> BR
> G.Baroncelli
>
>
>>
>> Although this time the fix doesn't affect the scrub code much, it should
>> warn us that current scrub code is really hard to maintain.
>>
>> Abuse of workquque to delay works and the full fs scrub is race prone.
>>
>> Xfstest will follow a little later, as we don't have good enough tools
>> to corrupt data stripes pinpointly.
>>
>> Qu Wenruo (2):
>>   btrfs: scrub: Introduce full stripe lock for RAID56
>>   btrfs: scrub: Fix RAID56 recovery race condition
>>
>>  fs/btrfs/ctree.h       |   4 ++
>>  fs/btrfs/extent-tree.c |   3 +
>>  fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 199 insertions(+)
>>
>
>



[-- Attachment #2: raid56_recover.sh --]
[-- Type: application/x-shellscript, Size: 612 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-17 23:13 ` Zygo Blaxell
@ 2016-11-18  1:19   ` Qu Wenruo
  2016-11-18  1:56     ` Hugo Mills
  2016-11-18 18:09   ` Goffredo Baroncelli
  1 sibling, 1 reply; 13+ messages in thread
From: Qu Wenruo @ 2016-11-18  1:19 UTC (permalink / raw)
  To: Zygo Blaxell; +Cc: linux-btrfs, kreijack



At 11/18/2016 07:13 AM, Zygo Blaxell wrote:
> On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
>> Fix the so-called famous RAID5/6 scrub error.
>>
>> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
>> sight.
>> (Yes, without the Phoronix report on this,
>> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
>> I won't ever be aware of it)
>
> If you're hearing about btrfs RAID5 bugs for the first time through
> Phoronix, then your testing coverage is *clearly* inadequate.

I'm not fixing everything, I'm just focusing on the exact bug
reported by Goffredo Baroncelli.

Although it seems that the bug he reported is in fact two bugs.
One is the race condition I'm fixing; the other is that recovery
restores the data correctly, but screws up the parity.

I just don't understand why you always want to fix everything in one step.

>
> Fill up a RAID5 array, start a FS stress test, pull a drive out while
> that's running, let the FS stress test run for another hour, then try
> to replace or delete the missing device.  If there are any crashes,
> corruptions, or EIO during any part of this process (assuming all the
> remaining disks are healthy), then btrfs RAID5 is still broken, and
> you've found another bug to fix.

Then it will be another bug to fix, not the bug I'm fixing.
Unless you can show that whatever bug you find is related to this fix,
I don't see how that helps here.

>
> The fact that so many problems in btrfs can still be found this way
> indicates to me that nobody is doing this basic level of testing
> (or if they are, they're not doing anything about the results).
>
>> Unlike many of us(including myself) assumed, it's not a timed bomb buried
>> deeply into the RAID5/6 code, but a race condition in scrub recovery
>> code.
>
> I don't see how this patch fixes the write hole issue at the core of
> btrfs RAID56.  It just makes the thin layer of bugs over that issue a
> little thinner.  There's still the metadata RMW update timebomb at the
> bottom of the bug pile that can't be fixed by scrub (the filesystem is
> unrecoverably damaged when the bomb goes off, so scrub isn't possible).

Not "the core of btrfs RAID56", but "the core of all RAID56".

And this is just another unrelated problem; I don't even want to
repeat what I've already written.

Thanks,
Qu

>
>> The problem is not found because normal mirror based profiles aren't
>> affected by the race, since they are independent with each other.
>
> True.
>
>> Although this time the fix doesn't affect the scrub code much, it should
>> warn us that current scrub code is really hard to maintain.
>
> This last sentence is true.  I found and fixed three BUG_ONs in RAID5
> code on the first day I started testing in degraded mode, then hit
> the scrub code and had to give up.  It was like a brick wall made out
> of mismatched assumptions and layering inversions, using uninitialized
> kernel data as mortar (though I suppose the "uninitialized" data symptom
> might just have been an unprotected memory access).
>
>> Abuse of workquque to delay works and the full fs scrub is race prone.
>>
>> Xfstest will follow a little later, as we don't have good enough tools
>> to corrupt data stripes pinpointly.
>>
>> Qu Wenruo (2):
>>   btrfs: scrub: Introduce full stripe lock for RAID56
>>   btrfs: scrub: Fix RAID56 recovery race condition
>>
>>  fs/btrfs/ctree.h       |   4 ++
>>  fs/btrfs/extent-tree.c |   3 +
>>  fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 199 insertions(+)
>>
>> --
>> 2.10.2
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-18  1:19   ` Qu Wenruo
@ 2016-11-18  1:56     ` Hugo Mills
  2016-11-18  2:42       ` Qu Wenruo
  0 siblings, 1 reply; 13+ messages in thread
From: Hugo Mills @ 2016-11-18  1:56 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Zygo Blaxell, linux-btrfs, kreijack

[-- Attachment #1: Type: text/plain, Size: 4542 bytes --]

On Fri, Nov 18, 2016 at 09:19:11AM +0800, Qu Wenruo wrote:
> 
> 
> At 11/18/2016 07:13 AM, Zygo Blaxell wrote:
> >On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
> >>Fix the so-called famous RAID5/6 scrub error.
> >>
> >>Thanks Goffredo Baroncelli for reporting the bug, and make it into our
> >>sight.
> >>(Yes, without the Phoronix report on this,
> >>https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
> >>I won't ever be aware of it)
> >
> >If you're hearing about btrfs RAID5 bugs for the first time through
> >Phoronix, then your testing coverage is *clearly* inadequate.
> 
> I'm not fixing everything, I'm just focusing on the exact one bug
> reported by Goffredo Baroncelli.
> 
> Although it seems that, the bug reported by him is in fact two bugs.
> One is race condition I'm fixing, another one is that recovery is
> recovering data correctly, but screwing up parity.
> 
> I just don't understand why you always want to fix everything in one step.

   Fix the important, fundamental things first, and the others
later. This, from my understanding of Zygo's comments, appears to be
one of the others.

   It's papering over the missing bricks in the wall instead of
chipping out the mortar and putting new bricks in. It may need to be
fixed, but it's not the fundamental "OMG, everything's totally broken"
problem. If anything, it's only a serious problem *because* the other
thing (write hole) is still there.

   It just seems like a piece of mis-prioritised effort.

> >Fill up a RAID5 array, start a FS stress test, pull a drive out while
> >that's running, let the FS stress test run for another hour, then try
> >to replace or delete the missing device.  If there are any crashes,
> >corruptions, or EIO during any part of this process (assuming all the
> >remaining disks are healthy), then btrfs RAID5 is still broken, and
> >you've found another bug to fix.
> 
> Then it will be another bug to fix, not the bug I'm fixing.
> Unless you can prove whatever the bug is, is relating to the fix, I
> don't see any help then.
> 
> >
> >The fact that so many problems in btrfs can still be found this way
> >indicates to me that nobody is doing this basic level of testing
> >(or if they are, they're not doing anything about the results).
> >
> >>Unlike many of us(including myself) assumed, it's not a timed bomb buried
> >>deeply into the RAID5/6 code, but a race condition in scrub recovery
> >>code.
> >
> >I don't see how this patch fixes the write hole issue at the core of
> >btrfs RAID56.  It just makes the thin layer of bugs over that issue a
> >little thinner.  There's still the metadata RMW update timebomb at the
> >bottom of the bug pile that can't be fixed by scrub (the filesystem is
> >unrecoverably damaged when the bomb goes off, so scrub isn't possible).
> 
> Not "the core of btrfs RAID56", but "the core of all RAID56".
> 
> And, this is just another unrelated problem, I didn't even want to
> repeat what I've written.

   That's the one that needs fixing *first*, though. (IMO, YMMV,
Contents may have settled in transit).

   Hugo.

> Thanks,
> Qu
> 
> >
> >>The problem is not found because normal mirror based profiles aren't
> >>affected by the race, since they are independent with each other.
> >
> >True.
> >
> >>Although this time the fix doesn't affect the scrub code much, it should
> >>warn us that current scrub code is really hard to maintain.
> >
> >This last sentence is true.  I found and fixed three BUG_ONs in RAID5
> >code on the first day I started testing in degraded mode, then hit
> >the scrub code and had to give up.  It was like a brick wall made out
> >of mismatched assumptions and layering inversions, using uninitialized
> >kernel data as mortar (though I suppose the "uninitialized" data symptom
> >might just have been an unprotected memory access).
> >
> >>Abuse of workquque to delay works and the full fs scrub is race prone.
> >>
> >>Xfstest will follow a little later, as we don't have good enough tools
> >>to corrupt data stripes pinpointly.
> >>
> >>Qu Wenruo (2):
> >>  btrfs: scrub: Introduce full stripe lock for RAID56
> >>  btrfs: scrub: Fix RAID56 recovery race condition
> >>
> >> fs/btrfs/ctree.h       |   4 ++
> >> fs/btrfs/extent-tree.c |   3 +
> >> fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
> >> 3 files changed, 199 insertions(+)
> >>

-- 
Hugo Mills             | UDP jokes: It's OK if no-one gets them.
hugo@... carfax.org.uk |
http://carfax.org.uk/  |
PGP: E2AB1DE4          |

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-18  1:56     ` Hugo Mills
@ 2016-11-18  2:42       ` Qu Wenruo
  2016-11-18  4:43         ` Zygo Blaxell
  0 siblings, 1 reply; 13+ messages in thread
From: Qu Wenruo @ 2016-11-18  2:42 UTC (permalink / raw)
  To: Hugo Mills, Zygo Blaxell, linux-btrfs, kreijack



At 11/18/2016 09:56 AM, Hugo Mills wrote:
> On Fri, Nov 18, 2016 at 09:19:11AM +0800, Qu Wenruo wrote:
>>
>>
>> At 11/18/2016 07:13 AM, Zygo Blaxell wrote:
>>> On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
>>>> Fix the so-called famous RAID5/6 scrub error.
>>>>
>>>> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
>>>> sight.
>>>> (Yes, without the Phoronix report on this,
>>>> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
>>>> I won't ever be aware of it)
>>>
>>> If you're hearing about btrfs RAID5 bugs for the first time through
>>> Phoronix, then your testing coverage is *clearly* inadequate.
>>
>> I'm not fixing everything, I'm just focusing on the exact one bug
>> reported by Goffredo Baroncelli.
>>
>> Although it seems that, the bug reported by him is in fact two bugs.
>> One is race condition I'm fixing, another one is that recovery is
>> recovering data correctly, but screwing up parity.
>>
>> I just don't understand why you always want to fix everything in one step.
>
>    Fix the important, fundamental things first, and the others
> later. This, from my understanding of Zygo's comments, appears to be
> one of the others.
>
>    It's papering over the missing bricks in the wall instead of
> chipping out the mortar and putting new bricks in. It may need to be
> fixed, but it's not the fundamental "OMG, everything's totally broken"
> problem. If anything, it's only a serious problem *because* the other
> thing (write hole) is still there.
>
>    It just seems like a piece of mis-prioritised effort.

It seems that we have different standards on priority.

For me, if some function can't work reliably in the most basic/minimal
environment, then it's a high priority bug.

In this case we have a very minimal setup, with only 128K of data spread
across a 3-device RAID5, one data stripe fully corrupted, and nothing
else interfering.
If scrub can't return the correct csum error count and can even cause a
false unrecoverable error there, then it's a high priority thing.

If a problem involves many steps like removing devices, degraded mode,
fsstress and some time, then it's not that high a priority unless
someone pins down the root cause to, for example, degraded mode itself
with a specific sequence of operations.

Yes, in fact sometimes our developers fix such bugs at high priority,
but that's because the other parts have no such obvious problems.
But if we have an obvious and easy-to-reproduce bug, then we should
start from that.

So I don't consider the problem Zygo described a high priority problem.

>
>>> Fill up a RAID5 array, start a FS stress test, pull a drive out while
>>> that's running, let the FS stress test run for another hour, then try
>>> to replace or delete the missing device.  If there are any crashes,
>>> corruptions, or EIO during any part of this process (assuming all the
>>> remaining disks are healthy), then btrfs RAID5 is still broken, and
>>> you've found another bug to fix.
>>
>> Then it will be another bug to fix, not the bug I'm fixing.
>> Unless you can prove whatever the bug is, is relating to the fix, I
>> don't see any help then.
>>
>>>
>>> The fact that so many problems in btrfs can still be found this way
>>> indicates to me that nobody is doing this basic level of testing
>>> (or if they are, they're not doing anything about the results).
>>>
>>>> Unlike many of us(including myself) assumed, it's not a timed bomb buried
>>>> deeply into the RAID5/6 code, but a race condition in scrub recovery
>>>> code.
>>>
>>> I don't see how this patch fixes the write hole issue at the core of
>>> btrfs RAID56.  It just makes the thin layer of bugs over that issue a
>>> little thinner.  There's still the metadata RMW update timebomb at the
>>> bottom of the bug pile that can't be fixed by scrub (the filesystem is
>>> unrecoverably damaged when the bomb goes off, so scrub isn't possible).
>>
>> Not "the core of btrfs RAID56", but "the core of all RAID56".
>>
>> And, this is just another unrelated problem, I didn't even want to
>> repeat what I've written.
>
>    That's the one that needs fixing *first*, though. (IMO, YMMV,
> Contents may have settled in transit).

And I forgot to mention, we don't even have an idea how to fix the
generic RAID56 RMW problem, even for mdadm RAID56.

BTW, for mdadm RAID56, they just scrub the whole array after every power
loss; we could just ask the user to do that, and btrfs csums can help
scrub do a better job than mdadm raid56.

Although, we must first make sure that btrfs scrub on raid56 won't raise
false alerts (this patchset) and won't screw up parity (which I'll fix
soon).

Thanks,
Qu

>
>    Hugo.
>
>> Thanks,
>> Qu
>>
>>>
>>>> The problem is not found because normal mirror based profiles aren't
>>>> affected by the race, since they are independent with each other.
>>>
>>> True.
>>>
>>>> Although this time the fix doesn't affect the scrub code much, it should
>>>> warn us that current scrub code is really hard to maintain.
>>>
>>> This last sentence is true.  I found and fixed three BUG_ONs in RAID5
>>> code on the first day I started testing in degraded mode, then hit
>>> the scrub code and had to give up.  It was like a brick wall made out
>>> of mismatched assumptions and layering inversions, using uninitialized
>>> kernel data as mortar (though I suppose the "uninitialized" data symptom
>>> might just have been an unprotected memory access).
>>>
>>>> Abuse of workquque to delay works and the full fs scrub is race prone.
>>>>
>>>> Xfstest will follow a little later, as we don't have good enough tools
>>>> to corrupt data stripes pinpointly.
>>>>
>>>> Qu Wenruo (2):
>>>>  btrfs: scrub: Introduce full stripe lock for RAID56
>>>>  btrfs: scrub: Fix RAID56 recovery race condition
>>>>
>>>> fs/btrfs/ctree.h       |   4 ++
>>>> fs/btrfs/extent-tree.c |   3 +
>>>> fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>> 3 files changed, 199 insertions(+)
>>>>
>



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-18  2:42       ` Qu Wenruo
@ 2016-11-18  4:43         ` Zygo Blaxell
  2016-11-18  6:27           ` Qu Wenruo
  0 siblings, 1 reply; 13+ messages in thread
From: Zygo Blaxell @ 2016-11-18  4:43 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Hugo Mills, linux-btrfs, kreijack

[-- Attachment #1: Type: text/plain, Size: 8294 bytes --]

On Fri, Nov 18, 2016 at 10:42:23AM +0800, Qu Wenruo wrote:
> 
> 
> At 11/18/2016 09:56 AM, Hugo Mills wrote:
> >On Fri, Nov 18, 2016 at 09:19:11AM +0800, Qu Wenruo wrote:
> >>
> >>
> >>At 11/18/2016 07:13 AM, Zygo Blaxell wrote:
> >>>On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
> >>>>Fix the so-called famous RAID5/6 scrub error.
> >>>>
> >>>>Thanks Goffredo Baroncelli for reporting the bug, and make it into our
> >>>>sight.
> >>>>(Yes, without the Phoronix report on this,
> >>>>https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
> >>>>I won't ever be aware of it)
> >>>
> >>>If you're hearing about btrfs RAID5 bugs for the first time through
> >>>Phoronix, then your testing coverage is *clearly* inadequate.
> >>
> >>I'm not fixing everything, I'm just focusing on the exact one bug
> >>reported by Goffredo Baroncelli.
> >>
> >>Although it seems that, the bug reported by him is in fact two bugs.
> >>One is race condition I'm fixing, another one is that recovery is
> >>recovering data correctly, but screwing up parity.
> >>
> >>I just don't understand why you always want to fix everything in one step.
> >
> >   Fix the important, fundamental things first, and the others
> >later. This, from my understanding of Zygo's comments, appears to be
> >one of the others.
> >
> >   It's papering over the missing bricks in the wall instead of
> >chipping out the mortar and putting new bricks in. It may need to be
> >fixed, but it's not the fundamental "OMG, everything's totally broken"
> >problem. If anything, it's only a serious problem *because* the other
> >thing (write hole) is still there.
> >
> >   It just seems like a piece of mis-prioritised effort.
> 
> It seems that, we have different standards on the priority.

My concern isn't priority.  Easier bugs often get fixed first.  That's
just the way Linux development works.

I am very concerned by articles like this:

	http://phoronix.com/scan.php?page=news_item&px=Btrfs-RAID5-RAID6-Fixed

with headlines like "btrfs RAID5/RAID6 support is finally fixed" when
that's very much not the case.  Only one bug has been removed for the
key use case that makes RAID5 interesting, and it's just the first of
many that still remain in the path of a user trying to recover from a
normal disk failure.

Admittedly this is Michael's (Phoronix's) problem more than Qu's, but
it's important to always be clear and _complete_ when stating bug status
because people quote statements out of context.  When the article quoted
the text

	"it's not a timed bomb buried deeply into the RAID5/6 code,
	but a race condition in scrub recovery code"

the commenters on Phoronix are clearly interpreting this to mean "famous
RAID5/6 scrub error" had been fixed *and* the issue reported by Goffredo
was the time bomb issue.  It's more accurate to say something like

	"Goffredo's issue is not the time bomb buried deeply in the
	RAID5/6 code, but a separate issue caused by a race condition
	in scrub recovery code"

Reading the Phoronix article, one might imagine RAID5 is now working
as well as RAID1 on btrfs.  To be clear, it's not--although the gap
is now significantly narrower.

> For me, if some function on the very basic/minimal environment can't work
> reliably, then it's a high priority bug.
> 
> In this case, in a very minimal setup, with only 128K data spreading on 3
> devices RAID5. With a data stripe fully corrupted, without any other thing
> interfering.
> Scrub can't return correct csum error number and even cause false
> unrecoverable error, then it's a high priority thing.

> If the problem involves too many steps like removing devices, degraded mode,
> fsstress and some time. Then it's not that priority unless one pin-downs the
> root case to, for example, degraded mode itself with special sequenced
> operations.

There are multiple bugs in the stress + remove device case.  Some are
quite easy to isolate.  They range in difficulty from simple BUG_ON
instead of error returns to finally solving the RMW update problem.

Run the test, choose any of the bugs that occur to work on, repeat until
the test stops finding new bugs for a while.  There are currently several
bugs to choose from with various levels of difficulty to fix them, and you
should hit the first level of bugs in a matter of hours if not minutes.

Using this method, you would have discovered Goffredo's bug years ago.
Instead, you only discovered it after Phoronix quoted the conclusion
of an investigation that started because of problems I reported here on
this list when I discovered half a dozen of btrfs's critical RAID5 bugs
from analyzing *one* disk failure event.

> Yes, in fact sometimes our developers are fixing such bug at high priority,
> but that's because other part has no such obvious problem.
> But if we have obvious and easy to reproduce bug, then we should start from
> that.

To be able to use a RAID5 in production it must be possible to recover
from one normal disk failure without being stopped by *any* bug in
most cases.  Until that happens, users should be aware that recovery
doesn't work yet.

> So I don't consider the problem Zygo written is a high priority problem.

I really don't care about the order the bugs get fixed.  Just please
don't say (or even suggest through omission) that you've fixed the
*last* one until you can pass a test based on typical failure modes
users will experience.

> And forget to mention, we don't even have an idea on how to fix the generic
> RAID56 RMW problem, even for mdadm RAID56.

It can be solved in the extent allocator if we can adjust it to prevent
allocation patterns that result in RMW writes.  Btrfs CoW logic will
take care of the rest.  I've written about this already on this list.
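
A rough sketch of the idea, purely illustrative (nothing below is existing
btrfs code, and full_stripe_len would be stripe_len * nr_data_stripes of
the chunk): if the allocator only hands out data extents that start and
end on full stripe boundaries, every CoW write rewrites whole stripes and
the parity never has to be rebuilt from stale on-disk data.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Would this extent force a read-modify-write of an existing stripe? */
static bool causes_rmw(uint64_t start, uint64_t len, uint64_t full_stripe_len)
{
	return (start % full_stripe_len) != 0 || (len % full_stripe_len) != 0;
}

int main(void)
{
	/* 2 data stripes * 64K, as in a 3-disk RAID5 chunk */
	uint64_t full = 128 * 1024;

	printf("%d\n", causes_rmw(0, full, full));	/* 0: full stripe write */
	printf("%d\n", causes_rmw(0, 4096, full));	/* 1: would need RMW */
	return 0;
}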

> BTW, for mdadm RAID56, they just scrub the whole array after every power
> loss, we could just call user to do that, and btrfs csum can assist scrub to
> do a better job than mdadm raid56.

mdadm did properly solve the RMW problem recently, so mdadm is currently
ahead of btrfs.  Btrfs can even solve the RMW problem two different ways.
One of these could massively outperform mdadm's brute-force solution.

> Although, we must make sure btrfs scrub on raid56 won't cause false
> alert(the patchset) and no screwed up parity(I'll fix soon) first.

With a raid5 data + raid1 metadata array this would result in a tolerable
level of data loss.  This is where I *thought* btrfs RAID5 was a year
ago when I started using it (and eight months before I started *reading*
the code), but the btrfs code at the time failed miserably when writing
to block groups in degraded mode, and I had to patch the kernel to make
any serious attempt at recovery at all.

> Thanks,
> Qu
> 
> >
> >   Hugo.
> >
> >>Thanks,
> >>Qu
> >>
> >>>
> >>>>The problem is not found because normal mirror based profiles aren't
> >>>>affected by the race, since they are independent with each other.
> >>>
> >>>True.
> >>>
> >>>>Although this time the fix doesn't affect the scrub code much, it should
> >>>>warn us that current scrub code is really hard to maintain.
> >>>
> >>>This last sentence is true.  I found and fixed three BUG_ONs in RAID5
> >>>code on the first day I started testing in degraded mode, then hit
> >>>the scrub code and had to give up.  It was like a brick wall made out
> >>>of mismatched assumptions and layering inversions, using uninitialized
> >>>kernel data as mortar (though I suppose the "uninitialized" data symptom
> >>>might just have been an unprotected memory access).
> >>>
> >>>>Abuse of workquque to delay works and the full fs scrub is race prone.
> >>>>
> >>>>Xfstest will follow a little later, as we don't have good enough tools
> >>>>to corrupt data stripes pinpointly.
> >>>>
> >>>>Qu Wenruo (2):
> >>>> btrfs: scrub: Introduce full stripe lock for RAID56
> >>>> btrfs: scrub: Fix RAID56 recovery race condition
> >>>>
> >>>>fs/btrfs/ctree.h       |   4 ++
> >>>>fs/btrfs/extent-tree.c |   3 +
> >>>>fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>3 files changed, 199 insertions(+)
> >>>>
> >
> 
> 
> 

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-18  4:43         ` Zygo Blaxell
@ 2016-11-18  6:27           ` Qu Wenruo
  0 siblings, 0 replies; 13+ messages in thread
From: Qu Wenruo @ 2016-11-18  6:27 UTC (permalink / raw)
  To: Zygo Blaxell, Qu Wenruo; +Cc: Hugo Mills, linux-btrfs, kreijack



On 11/18/2016 12:43 PM, Zygo Blaxell wrote:
> On Fri, Nov 18, 2016 at 10:42:23AM +0800, Qu Wenruo wrote:
>>
>>
>> At 11/18/2016 09:56 AM, Hugo Mills wrote:
>>> On Fri, Nov 18, 2016 at 09:19:11AM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> At 11/18/2016 07:13 AM, Zygo Blaxell wrote:
>>>>> On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
>>>>>> Fix the so-called famous RAID5/6 scrub error.
>>>>>>
>>>>>> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
>>>>>> sight.
>>>>>> (Yes, without the Phoronix report on this,
>>>>>> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
>>>>>> I won't ever be aware of it)
>>>>>
>>>>> If you're hearing about btrfs RAID5 bugs for the first time through
>>>>> Phoronix, then your testing coverage is *clearly* inadequate.
>>>>
>>>> I'm not fixing everything, I'm just focusing on the exact one bug
>>>> reported by Goffredo Baroncelli.
>>>>
>>>> Although it seems that, the bug reported by him is in fact two bugs.
>>>> One is race condition I'm fixing, another one is that recovery is
>>>> recovering data correctly, but screwing up parity.
>>>>
>>>> I just don't understand why you always want to fix everything in one step.
>>>
>>>   Fix the important, fundamental things first, and the others
>>> later. This, from my understanding of Zygo's comments, appears to be
>>> one of the others.
>>>
>>>   It's papering over the missing bricks in the wall instead of
>>> chipping out the mortar and putting new bricks in. It may need to be
>>> fixed, but it's not the fundamental "OMG, everything's totally broken"
>>> problem. If anything, it's only a serious problem *because* the other
>>> thing (write hole) is still there.
>>>
>>>   It just seems like a piece of mis-prioritised effort.
>>
>> It seems that, we have different standards on the priority.
>
> My concern isn't priority.  Easier bugs often get fixed first.  That's
> just the way Linux development works.
>
> I am very concerned by articles like this:
>
> 	http://phoronix.com/scan.php?page=news_item&px=Btrfs-RAID5-RAID6-Fixed
>
> with headlines like "btrfs RAID5/RAID6 support is finally fixed" when
> that's very much not the case.  Only one bug has been removed for the
> key use case that makes RAID5 interesting, and it's just the first of
> many that still remain in the path of a user trying to recover from a
> normal disk failure.
>
> Admittedly this is Michael's (Phoronix's) problem more than Qu's, but
> it's important to always be clear and _complete_ when stating bug status
> because people quote statements out of context.  When the article quoted
> the text
>
> 	"it's not a timed bomb buried deeply into the RAID5/6 code,
> 	but a race condition in scrub recovery code"
>
> the commenters on Phoronix are clearly interpreting this to mean "famous
> RAID5/6 scrub error" had been fixed *and* the issue reported by Goffredo
> was the time bomb issue.  It's more accurate to say something like
>
> 	"Goffredo's issue is not the time bomb buried deeply in the
> 	RAID5/6 code, but a separate issue caused by a race condition
> 	in scrub recovery code"
>
> Reading the Phoronix article, one might imagine RAID5 is now working
> as well as RAID1 on btrfs.  To be clear, it's not--although the gap
> is now significantly narrower.
>
>> For me, if some function on the very basic/minimal environment can't work
>> reliably, then it's a high priority bug.
>>
>> In this case, in a very minimal setup, with only 128K data spreading on 3
>> devices RAID5. With a data stripe fully corrupted, without any other thing
>> interfering.
>> Scrub can't return correct csum error number and even cause false
>> unrecoverable error, then it's a high priority thing.
>
>> If the problem involves too many steps like removing devices, degraded mode,
>> fsstress and some time. Then it's not that priority unless one pin-downs the
>> root case to, for example, degraded mode itself with special sequenced
>> operations.
>
> There are multiple bugs in the stress + remove device case.  Some are
> quite easy to isolate.  They range in difficulty from simple BUG_ON
> instead of error returns to finally solving the RMW update problem.
>
> Run the test, choose any of the bugs that occur to work on, repeat until
> the test stops finding new bugs for a while.  There are currently several
> bugs to choose from with various levels of difficulty to fix them, and you
> should hit the first level of bugs in a matter of hours if not minutes.
>
> Using this method, you would have discovered Goffredo's bug years ago.
> Instead, you only discovered it after Phoronix quoted the conclusion
> of an investigation that started because of problems I reported here on
> this list when I discovered half a dozen of btrfs's critical RAID5 bugs
> from analyzing *one* disk failure event.

Nope, I don't want to touch anything related to degraded mode while any
fundamental function is broken, and the bug Goffredo reported has
nothing to do with any of the steps you mentioned.

How I found out about the problem has nothing to do with the bug fix.
Yes, I found it through Phoronix or whatever other source; does that
have anything to do with the fix?

You can continue your testing, fixing or reporting, but that has nothing
to do with the problem I'm fixing.

Write that in another thread or open it as a BZ; you are not helping
review the fix, you are showing off how quickly you can find a bug.

I'm already frustrated by complaints like "you could have found the
problem years ago".

It is never fair to be blamed simply for fixing a bug that someone
reported earlier.
>
>> Yes, in fact sometimes our developers are fixing such bug at high priority,
>> but that's because other part has no such obvious problem.
>> But if we have obvious and easy to reproduce bug, then we should start from
>> that.
>
> To be able to use a RAID5 in production it must be possible to recover
> from one normal disk failure without being stopped by *any* bug in
> most cases.  Until that happens, users should be aware that recovery
> doesn't work yet.
>
>> So I don't consider the problem Zygo written is a high priority problem.
>
> I really don't care about the order the bugs get fixed.  Just please
> don't say (or even suggest through omission) that you've fixed the
> *last* one until you can pass a test based on typical failure modes
> users will experience.

I have never thought RAID5/6 is stable, and I will never use it (in
fact, I don't use btrfs at all on any of my working boxes).
If you think I'm saying anything like what you describe, that's just
your misunderstanding.

Thanks,
Qu

>
>> And forget to mention, we don't even have an idea on how to fix the generic
>> RAID56 RMW problem, even for mdadm RAID56.
>
> It can be solved in the extent allocator if we can adjust it to prevent
> allocation patterns that result in RMW writes.  Btrfs CoW logic will
> take care of the rest.  I've written about this already on this list.
>
>> BTW, mdadm RAID56 just scrubs the whole array after every power loss; we
>> could simply ask users to do the same, and btrfs csums let scrub do a
>> better job than mdadm RAID56 can.
>
> mdadm did properly solve the RMW problem recently, so mdadm is currently
> ahead of btrfs.  Btrfs can even solve the RMW problem two different ways.
> One of these could massively outperform mdadm's brute-force solution.
>
>> Although we must first make sure btrfs scrub on RAID56 won't raise false
>> alerts (this patchset) and won't screw up parity (I'll fix that soon).
>
> With a raid5 data + raid1 metadata array this would result in a tolerable
> level of data loss.  This is where I *thought* btrfs RAID5 was a year
> ago when I started using it (and eight months before I started *reading*
> the code), but the btrfs code at the time failed miserably when writing
> to block groups in degraded mode, and I had to patch the kernel to make
> any serious attempt at recovery at all.
>
>> Thanks,
>> Qu
>>
>>>
>>>   Hugo.
>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>>> The problem is not found because normal mirror based profiles aren't
>>>>>> affected by the race, since they are independent with each other.
>>>>>
>>>>> True.
>>>>>
>>>>>> Although this time the fix doesn't affect the scrub code much, it should
>>>>>> warn us that current scrub code is really hard to maintain.
>>>>>
>>>>> This last sentence is true.  I found and fixed three BUG_ONs in RAID5
>>>>> code on the first day I started testing in degraded mode, then hit
>>>>> the scrub code and had to give up.  It was like a brick wall made out
>>>>> of mismatched assumptions and layering inversions, using uninitialized
>>>>> kernel data as mortar (though I suppose the "uninitialized" data symptom
>>>>> might just have been an unprotected memory access).
>>>>>
>>>>>> Abuse of workquque to delay works and the full fs scrub is race prone.
>>>>>>
>>>>>> Xfstest will follow a little later, as we don't have good enough tools
>>>>>> to corrupt data stripes pinpointly.
>>>>>>
>>>>>> Qu Wenruo (2):
>>>>>> btrfs: scrub: Introduce full stripe lock for RAID56
>>>>>> btrfs: scrub: Fix RAID56 recovery race condition
>>>>>>
>>>>>> fs/btrfs/ctree.h       |   4 ++
>>>>>> fs/btrfs/extent-tree.c |   3 +
>>>>>> fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>> 3 files changed, 199 insertions(+)
>>>>>>
>>>
>>
>>
>>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-17 23:13 ` Zygo Blaxell
  2016-11-18  1:19   ` Qu Wenruo
@ 2016-11-18 18:09   ` Goffredo Baroncelli
  2016-11-18 22:32     ` Zygo Blaxell
  1 sibling, 1 reply; 13+ messages in thread
From: Goffredo Baroncelli @ 2016-11-18 18:09 UTC (permalink / raw)
  To: Zygo Blaxell, Qu Wenruo; +Cc: linux-btrfs

Hi Zygo
On 2016-11-18 00:13, Zygo Blaxell wrote:
> On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
>> Fix the so-called famous RAID5/6 scrub error.
>>
>> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
>> sight.
>> (Yes, without the Phoronix report on this,
>> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
>> I won't ever be aware of it)
> 
> If you're hearing about btrfs RAID5 bugs for the first time through
> Phoronix, then your testing coverage is *clearly* inadequate.
> 
> Fill up a RAID5 array, start a FS stress test, pull a drive out while
> that's running, let the FS stress test run for another hour, then try
> to replace or delete the missing device.  If there are any crashes,
> corruptions, or EIO during any part of this process (assuming all the
> remaining disks are healthy), then btrfs RAID5 is still broken, and
> you've found another bug to fix.
> 
> The fact that so many problems in btrfs can still be found this way
> indicates to me that nobody is doing this basic level of testing
> (or if they are, they're not doing anything about the results).

[...]

Sorry, but I don't find this kind of discussion useful.
Yes, BTRFS RAID5/6 needs a lot of care. Yes, *our* test coverage is far from complete; but that is not the fault of a single person, and Qu tried to solve one issue, and for this we should simply say thanks.

Even if you don't find the work of Qu (and my little one :-) ) valuable, it took some time and deserves to be respected.

Finally, I don't think we should compare the RAID hole with this kind of bug(fix). The former is a design issue; the latter is a bug in one of the basic features of a RAID system (recovering from the loss of a disk or from corruption).

Even the MD subsystem (which is far behind btrfs) tolerated the RAID hole until last year. And its solution is far from cheap (basically the MD subsystem writes the data first to the journal and then to the disk... which is exactly the kind of issue that a COW filesystem would solve).
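
To make the ordering concrete, here is a minimal user-space sketch of that journal-first write path (this is not md code; every helper name and the printed steps are invented purely for illustration):

#include <stdio.h>

/* Invented helpers standing in for "write to the journal device" and
 * "write to the final location on the array". */
static void journal_append(const char *what) { printf("journal: append %s\n", what); }
static void journal_flush(void)              { printf("journal: flush (FUA)\n"); }
static void write_in_place(const char *what) { printf("array:   write %s\n", what); }

static void journaled_stripe_update(void)
{
	/* 1. Persist the new data and the new parity in the journal first. */
	journal_append("new data blocks");
	journal_append("new parity block");
	journal_flush();

	/* 2. Only then overwrite the stripe in place.  A crash between the
	 *    two phases is handled by replaying the journal, which is what
	 *    closes the write hole. */
	write_in_place("data blocks");
	write_in_place("parity block");
}

int main(void)
{
	journaled_stripe_update();
	return 0;
}

The double write (journal first, then in place) is exactly the cost I'm pointing at, and it is the part a COW filesystem could avoid by never updating a live stripe in place.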

BR
G.Baroncelli






-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/2] RAID5/6 scrub race fix
  2016-11-18 18:09   ` Goffredo Baroncelli
@ 2016-11-18 22:32     ` Zygo Blaxell
  0 siblings, 0 replies; 13+ messages in thread
From: Zygo Blaxell @ 2016-11-18 22:32 UTC (permalink / raw)
  To: kreijack; +Cc: Qu Wenruo, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 5838 bytes --]

On Fri, Nov 18, 2016 at 07:09:34PM +0100, Goffredo Baroncelli wrote:
> Hi Zygo
> On 2016-11-18 00:13, Zygo Blaxell wrote:
> > On Tue, Nov 15, 2016 at 10:50:22AM +0800, Qu Wenruo wrote:
> >> Fix the so-called famous RAID5/6 scrub error.
> >>
> >> Thanks Goffredo Baroncelli for reporting the bug, and make it into our
> >> sight.
> >> (Yes, without the Phoronix report on this,
> >> https://www.phoronix.com/scan.php?page=news_item&px=Btrfs-RAID-56-Is-Bad,
> >> I won't ever be aware of it)
> > 
> > If you're hearing about btrfs RAID5 bugs for the first time through
> > Phoronix, then your testing coverage is *clearly* inadequate.
> > 
> > Fill up a RAID5 array, start a FS stress test, pull a drive out while
> > that's running, let the FS stress test run for another hour, then try
> > to replace or delete the missing device.  If there are any crashes,
> > corruptions, or EIO during any part of this process (assuming all the
> > remaining disks are healthy), then btrfs RAID5 is still broken, and
> > you've found another bug to fix.
> > 
> > The fact that so many problems in btrfs can still be found this way
> > indicates to me that nobody is doing this basic level of testing
> > (or if they are, they're not doing anything about the results).
> 
> [...]
> 
> Sorry, but I don't find this kind of discussion useful.  Yes, BTRFS
> RAID5/6 needs a lot of care. Yes, *our* test coverage is far from
> complete; but that is not the fault of a single person, and Qu tried to
> solve one issue, and for this we should simply say thanks.
>
> Even if you don't find the work of Qu (and my little one :-) )
> valuable, it took some time and deserves to be respected.

I do find this work valuable, and I do thank you and Qu for it.
I've been following it with great interest because I haven't had time
to dive into it myself.  It's a use case I used before and would like
to use again.

Most of my recent frustration, if directed at anyone, is really directed
at Phoronix for conflating "one bug was fixed" with "ready for production
use today," and I wanted to ensure that the latter rumor was promptly
quashed.

This is why I'm excited about Qu's work:  on my list of 7 btrfs-raid5
recovery bugs (6 I found plus yours), Qu has fixed at least 2 of them,
maybe as many as 4, with the patches so far.  I can fix 2 of the others,
for a total of 6 fixed out of 7.

Specifically, the 7 bugs I know of are:

	1-2. BUG_ONs in functions that should return errors (I had
	fixed both already when trying to recover my broken arrays)

	3. scrub can't identify which drives or files are corrupted
	(Qu might have fixed this--I won't know until I do testing)

	4-6. symptom groups related to wrong data or EIO in scrub
	recovery, including Goffredo's (Qu might have fixed all of these,
	but from a quick read of the patch I think at least two are done).

	7. the write hole.

I'll know more after I've had a chance to run Qu's patches through
testing, which I intend to do at some point.

Optimistically, this means there could be only *one* bug remaining
in the critical path for btrfs RAID56 single disk failure recovery.
That last bug is the write hole, which is why I keep going on about it.
It's the only bug I know exists in btrfs RAID56 that has neither an
existing fix nor any evidence of someone actively working on it, even
at the design proposal stage.  Please, I'd love to be wrong about this.

When I described the situation recently as "a thin layer of bugs on
top of a design defect", I was not trying to be mean.  I was trying to
describe the situation *precisely*.

The thin layer of bugs is much thinner thanks to Qu's work, and thanks
in part to his work, I now have confidence that further investment in
this area won't be wasted.

> Finally, I don't think we should compare the RAID hole with this
> kind of bug(fix). The former is a design issue; the latter is a bug
> in one of the basic features of a RAID system (recovering from
> the loss of a disk or from corruption).
>
> Even the MD subsystem (which is far behind btrfs) tolerated
> the RAID hole until last year. 

My frustration against this point is the attitude that mdadm was ever
good enough, much less a model to emulate in the future.  It's 2016--there
have been some advancements in the state of the art since the IBM patent
describing RAID5 30 years ago, yet in the btrfs world, we seem to insist
on repeating all the same mistakes in the same order.

"We're as good as some existing broken-by-design thing" isn't a really
useful attitude.  We should aspire to do *better* than the existing
broken-by-design things.  If we didn't, we wouldn't be here, we'd all
be lurking on some other list, running ext4 or xfs on mdadm or lvm.

> And its solution is far from cheap
> (basically the MD subsystem writes the data first to the journal
> and then to the disk... which is exactly the kind of issue that a COW
> filesystem would solve).

Journalling isn't required.  It's sufficient to fix the interaction
between the existing CoW and RAID5 layers (except for some datacow and
PREALLOC cases).  This is "easy" in the sense that it requires only
changes to the allocator (no on-disk format change), but "hard" in the
sense that it requires changes to the allocator.

See https://www.spinics.net/lists/linux-btrfs/msg59684.html
(and look a couple of references upthread)
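
As a rough sketch of what "changes to the allocator" could mean in practice (this is an assumption on my part, not btrfs code: the 3-disk geometry and every identifier below are invented), the idea is that if every allocated extent starts and ends on a full stripe boundary, CoW writes never land in a partially filled stripe, so parity never needs a read-modify-write update:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NR_DATA_STRIPES	2			/* 3-disk RAID5: 2 data + 1 parity */
#define STRIPE_LEN	(64ULL * 1024)
#define FULL_STRIPE_LEN	(NR_DATA_STRIPES * STRIPE_LEN)

/* Would writing [start, start + len) force a read-modify-write of parity? */
static bool write_needs_rmw(uint64_t start, uint64_t len)
{
	return (start % FULL_STRIPE_LEN) != 0 || (len % FULL_STRIPE_LEN) != 0;
}

/* Allocator-side fix: round a candidate start up to a full stripe boundary. */
static uint64_t align_to_full_stripe(uint64_t hint)
{
	uint64_t rem = hint % FULL_STRIPE_LEN;

	return rem ? hint + FULL_STRIPE_LEN - rem : hint;
}

int main(void)
{
	uint64_t hint = 3 * STRIPE_LEN;		/* candidate landing mid-stripe */
	uint64_t start = align_to_full_stripe(hint);

	printf("hint %llu needs RMW: %d\n",
	       (unsigned long long)hint, write_needs_rmw(hint, FULL_STRIPE_LEN));
	printf("aligned start %llu needs RMW: %d\n",
	       (unsigned long long)start, write_needs_rmw(start, FULL_STRIPE_LEN));
	return 0;
}

Under this sketch's assumptions the obvious cost is internal fragmentation: writes shorter than a full stripe would have to be padded, batched, or steered to a different block group, which hints at why even an allocator-only change is non-trivial.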


> BR G.Baroncelli
>
>
>
>
>
>
> -- gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5 --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
> in the body of a message to majordomo@vger.kernel.org More majordomo
> info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2016-11-18 22:32 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-15  2:50 [PATCH 0/2] RAID5/6 scrub race fix Qu Wenruo
2016-11-15  2:50 ` [PATCH 1/2] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
2016-11-15  2:50 ` [PATCH 2/2] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
2016-11-17 18:09 ` [PATCH 0/2] RAID5/6 scrub race fix Goffredo Baroncelli
2016-11-18  0:57   ` Qu Wenruo
2016-11-17 23:13 ` Zygo Blaxell
2016-11-18  1:19   ` Qu Wenruo
2016-11-18  1:56     ` Hugo Mills
2016-11-18  2:42       ` Qu Wenruo
2016-11-18  4:43         ` Zygo Blaxell
2016-11-18  6:27           ` Qu Wenruo
2016-11-18 18:09   ` Goffredo Baroncelli
2016-11-18 22:32     ` Zygo Blaxell
