All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] raid56: variant bug fixes
@ 2017-02-03  8:20 Qu Wenruo
  2017-02-03  8:20 ` [PATCH 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
                   ` (5 more replies)
  0 siblings, 6 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

This patchset can be fetched from my github repo:
https://github.com/adam900710/linux.git raid56_fixes

It's based on v4.10-rc6 and none of the patch is modified after its first
appearance in mail list.

The patchset fixes the following bugs:

1) False alert or wrong csum error number when scrubbing RAID5/6
   The bug itself won't cause any damage to fs, just pure race.

2) Corrupted data stripe rebuild corrupts P/Q
   So scrub makes one error into another, not really fixing anything

3) Use-after-free caused by cancelling dev-replace 
   This is quite a deadly bug, since cancelling dev-replace can easily
   cause kernel panic, and thanks to raid bio steal, it makes the race
   windows quite large.

   Can be triggered by btrfs/069.

After all the fixes applied, no scrub/replace related regression can be
detected.

Qu Wenruo (5):
  btrfs: scrub: Introduce full stripe lock for RAID56
  btrfs: scrub: Fix RAID56 recovery race condition
  btrfs: raid56: Use correct stolen pages to calculate P/Q
  btrfs: raid56: Don't keep rbio for later steal
  btrfs: replace: Use ref counts to avoid destroying target device when
    canceled

 fs/btrfs/ctree.h       |   4 ++
 fs/btrfs/dev-replace.c |   7 +-
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/raid56.c      |  80 +++++++++++++++------
 fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c     |  36 +++++++++-
 fs/btrfs/volumes.h     |  10 +++
 7 files changed, 309 insertions(+), 23 deletions(-)

-- 
2.11.0




^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/5] btrfs: scrub: Introduce full stripe lock for RAID56
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
@ 2017-02-03  8:20 ` Qu Wenruo
  2017-02-03  8:20 ` [PATCH 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

Unlike mirror based profiles, RAID5/6 recovery needs to read out the
whole full stripe.

And if we don't do proper protect, it can easily cause race condition.

Introduce 2 new functions: lock_full_stripe() and unlock_full_stripe()
for RAID5/6.
Which stores a rb_tree of mutex for full stripes, so scrub callers can
use them to lock a full stripe to avoid race.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |   4 ++
 fs/btrfs/extent-tree.c |   3 +
 fs/btrfs/scrub.c       | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6a823719b6c5..0dc0b113a691 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -639,6 +639,10 @@ struct btrfs_block_group_cache {
 	 * Protected by free_space_lock.
 	 */
 	int needs_free_space;
+
+	/* Scrub full stripe lock tree for RAID5/6 scrub */
+	struct rb_root scrub_lock_root;
+	spinlock_t scrub_lock;
 };
 
 /* delayed seq elem */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dcd2e798767e..79769c168230 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -130,6 +130,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
+		WARN_ON(!RB_EMPTY_ROOT(&cache->scrub_lock_root));
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -9910,6 +9911,8 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
+	spin_lock_init(&cache->scrub_lock);
+	cache->scrub_lock_root = RB_ROOT;
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9a94670536a6..e68369d425b0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -240,6 +240,13 @@ struct scrub_warning {
 	struct btrfs_device	*dev;
 };
 
+struct scrub_full_stripe_lock {
+	struct rb_node node;
+	u64 logical;
+	u64 refs;
+	struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -351,6 +358,176 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Caller must hold cache->scrub_root_lock.
+ *
+ * Return existing full stripe and increase it refs
+ * Or return NULL, and insert @fstripe_lock into the bg cache
+ */
+static struct scrub_full_stripe_lock *
+add_scrub_lock(struct btrfs_block_group_cache *cache,
+	       struct scrub_full_stripe_lock *fstripe_lock)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct scrub_full_stripe_lock *entry;
+
+	p = &cache->scrub_lock_root.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct scrub_full_stripe_lock, node);
+		if (fstripe_lock->logical < entry->logical) {
+			p = &(*p)->rb_left;
+		} else if (fstripe_lock->logical > entry->logical) {
+			p = &(*p)->rb_right;
+		} else {
+			entry->refs++;
+			return entry;
+		}
+	}
+	/* Insert new one */
+	rb_link_node(&fstripe_lock->node, parent, p);
+	rb_insert_color(&fstripe_lock->node, &cache->scrub_lock_root);
+
+	return NULL;
+}
+
+static struct scrub_full_stripe_lock *
+search_scrub_lock(struct btrfs_block_group_cache *cache, u64 bytenr)
+{
+	struct rb_node *node;
+	struct scrub_full_stripe_lock *entry;
+
+	node = cache->scrub_lock_root.rb_node;
+	while (node) {
+		entry = rb_entry(node, struct scrub_full_stripe_lock, node);
+		if (bytenr < entry->logical)
+			node = node->rb_left;
+		else if (bytenr > entry->logical)
+			node = node->rb_right;
+		else
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ * Thanks to the chaos of scrub structures, we need to get it all
+ * by ourselves, using btrfs_map_sblock().
+ */
+static int get_full_stripe_logical(struct btrfs_fs_info *fs_info, u64 bytenr,
+				   u64 *bytenr_ret)
+{
+	struct btrfs_bio *bbio = NULL;
+	u64 len;
+	int ret;
+
+	/* Just use map_sblock() to get full stripe logical */
+	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, bytenr,
+			       &len, &bbio, 0, 1);
+	if (ret || !bbio || !bbio->raid_map)
+		goto error;
+	*bytenr_ret = bbio->raid_map[0];
+	btrfs_put_bbio(bbio);
+	return 0;
+error:
+	btrfs_put_bbio(bbio);
+	if (ret)
+		return ret;
+	return -EIO;
+}
+
+/*
+ * To lock a full stripe to avoid concurrency of recovery and read
+ * It's only used for profiles with parities(RAID5/6), for other profiles it
+ * does nothing
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+			    gfp_t gfp_flags)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	struct scrub_full_stripe_lock *existing;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+
+	/* Mirror based profiles don't need full stripe lock */
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	fstripe_lock = kmalloc(sizeof(*fstripe_lock), gfp_flags);
+	if (!fstripe_lock) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	fstripe_lock->logical = fstripe_start;
+	fstripe_lock->refs = 1;
+	mutex_init(&fstripe_lock->mutex);
+
+	/* Now insert the full stripe lock */
+	spin_lock(&bg_cache->scrub_lock);
+	existing = add_scrub_lock(bg_cache, fstripe_lock);
+	if (existing) {
+		kfree(fstripe_lock);
+		fstripe_lock = existing;
+	}
+	spin_unlock(&bg_cache->scrub_lock);
+	mutex_lock(&fstripe_lock->mutex);
+
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_block_group_cache *bg_cache;
+	struct scrub_full_stripe_lock *fstripe_lock;
+	u64 fstripe_start;
+	int ret = 0;
+
+	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+	if (!bg_cache)
+		return -ENOENT;
+	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+		goto out;
+
+	ret = get_full_stripe_logical(fs_info, bytenr, &fstripe_start);
+	if (ret < 0)
+		goto out;
+
+	spin_lock(&bg_cache->scrub_lock);
+	fstripe_lock = search_scrub_lock(bg_cache, fstripe_start);
+	/* This is a deadly problem, we hold a mutex but can't unlock it */
+	if (WARN_ON(!fstripe_lock)) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	mutex_unlock(&fstripe_lock->mutex);
+	if (!WARN_ON(fstripe_lock->refs == 0))
+		fstripe_lock->refs--;
+	if (fstripe_lock->refs == 0) {
+		rb_erase(&fstripe_lock->node, &bg_cache->scrub_lock_root);
+		kfree(fstripe_lock);
+	}
+unlock:
+	spin_unlock(&bg_cache->scrub_lock);
+out:
+	btrfs_put_block_group(bg_cache);
+	return ret;
+}
+
+/*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
  */
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/5] btrfs: scrub: Fix RAID56 recovery race condition
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
  2017-02-03  8:20 ` [PATCH 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
@ 2017-02-03  8:20 ` Qu Wenruo
  2017-02-03  8:20 ` [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q Qu Wenruo
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

When scrubbing a RAID5 which has recoverable data corruption (only one
data stripe is corrupted), sometimes scrub will report more csum errors
than expected. Sometimes even unrecoverable error will be reported.

The problem can be easily reproduced by the following steps:
1) Create a btrfs with RAID5 data profile with 3 devs
2) Mount it with nospace_cache or space_cache=v2
   To avoid extra data space usage.
3) Create a 128K file and sync the fs, unmount it
   Now the 128K file lies at the beginning of the data chunk
4) Locate the physical bytenr of data chunk on dev3
   Dev3 is the 1st data stripe.
5) Corrupt the first 64K of the data chunk stripe on dev3
6) Mount the fs and scrub it

The correct csum error number should be 16(assuming using x86_64).
Larger csum error number can be reported in a 1/3 chance.
And unrecoverable error can also be reported in a 1/10 chance.

The root cause of the problem is RAID5/6 recover code has race
condition, due to the fact that full scrub is initiated per device.

While for other mirror based profiles, each mirror is independent with
each other, so race won't cause any big problem.

For example:
        Corrupted       |       Correct          |      Correct        |
|   Scrub dev3 (D1)     |    Scrub dev2 (D2)     |    Scrub dev1(P)    |
------------------------------------------------------------------------
Read out D1             |Read out D2             |Read full stripe     |
Check csum              |Check csum              |Check parity         |
Csum mismatch           |Csum match, continue    |Parity mismatch      |
handle_errored_block    |                        |handle_errored_block |
 Read out full stripe   |                        | Read out full stripe|
 D1 csum error(err++)   |                        | D1 csum error(err++)|
 Recover D1             |                        | Recover D1          |

So D1's csum error is accounted twice, just because
handle_errored_block() doesn't have enough protect, and race can happen.

On even worse case, for example D1's recovery code is re-writing
D1/D2/P, and P's recovery code is just reading out full stripe, then we
can cause unrecoverable error.

This patch will use previously introduced lock_full_stripe() and
unlock_full_stripe() to protect the whole scrub_handle_errored_block()
function for RAID56 recovery.
So no extra csum error nor unrecoverable error.

Reported-by: Goffredo Baroncelli <kreijack@libero.it>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/scrub.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e68369d425b0..2be7f2546e3a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1067,6 +1067,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	unsigned int have_csum;
 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 	struct scrub_block *sblock_bad;
+	int lock_ret;
 	int ret;
 	int mirror_index;
 	int page_num;
@@ -1096,6 +1097,17 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	have_csum = sblock_to_check->pagev[0]->have_csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
+	/*
+	 * For RAID5/6 race can happen for different dev scrub thread.
+	 * For data corruption, Parity and Data thread will both try
+	 * to recovery the data.
+	 * Race can lead to double added csum error, or even unrecoverable
+	 * error.
+	 */
+	ret = lock_full_stripe(fs_info, logical, GFP_NOFS);
+	if (ret < 0)
+		return ret;
+
 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 		sblocks_for_recheck = NULL;
 		goto nodatasum_case;
@@ -1430,6 +1442,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		kfree(sblocks_for_recheck);
 	}
 
+	lock_ret = unlock_full_stripe(fs_info, logical);
+	if (lock_ret < 0)
+		return lock_ret;
 	return 0;
 }
 
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
  2017-02-03  8:20 ` [PATCH 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
  2017-02-03  8:20 ` [PATCH 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
@ 2017-02-03  8:20 ` Qu Wenruo
  2017-03-16  5:36   ` Liu Bo
  2017-02-03  8:20 ` [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal Qu Wenruo
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

In the following situation, scrub will calculate wrong parity to
overwrite correct one:

RAID5 full stripe:

Before
|     Dev 1      |     Dev  2     |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
| 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

After scrubbing dev3 only:

|     Dev 1      |     Dev  2     |     Dev 3     |
| Data stripe 1  | Data stripe 2  | Parity Stripe |
--------------------------------------------------- 0
| 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
--------------------------------------------------- 4K
|     0xcdcd     |     0xcdcd     |     0x0000    |
...
|     0xcdcd     |     0xcdcd     |     0x0000    |
--------------------------------------------------- 64K

The calltrace of such corruption is as following:

scrub_bio_end_io_worker() get called for each extent read out
|- scriub_block_complete()
   |- Data extent csum mismatch
   |- scrub_handle_errored_block
      |- scrub_recheck_block()
         |- scrub_submit_raid56_bio_wait()
            |- raid56_parity_recover()

Now we have a rbio with correct data stripe 1 recovered.
Let's call it "good_rbio".

scrub_parity_check_and_repair()
|- raid56_parity_submit_scrub_rbio()
   |- lock_stripe_add()
   |  |- steal_rbio()
   |     |- Recovered data are steal from "good_rbio", stored into
   |        rbio->stripe_pages[]
   |        Now rbio->bio_pages[] are bad data read from disk.
   |- async_scrub_parity()
      |- scrub_parity_work() (delayed_call to scrub_parity_work)

scrub_parity_work()
|- raid56_parity_scrub_stripe()
   |- validate_rbio_for_parity_scrub()
      |- finish_parity_scrub()
         |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
            So good parity is overwritten with *BAD* one

The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
btrfs_raid_bio, to info scrub code to use correct data pages to
re-calculate parity.

Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d2a9a1ee5361..453eefdcb591 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -133,6 +133,16 @@ struct btrfs_raid_bio {
 	/* second bad stripe (for raid6 use) */
 	int failb;
 
+	/*
+	 * For steal_rbio, we can steal recovered correct page,
+	 * but in finish_parity_scrub(), we still use bad on-disk
+	 * page to calculate parity.
+	 * Use these members to info finish_parity_scrub() to use
+	 * correct pages
+	 */
+	int bad_ondisk_a;
+	int bad_ondisk_b;
+
 	int scrubp;
 	/*
 	 * number of pages needed to represent the full
@@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 		return;
 
+	/* Record recovered stripe number */
+	if (src->faila != -1)
+		dest->bad_ondisk_a = src->faila;
+	if (src->failb != -1)
+		dest->bad_ondisk_b = src->failb;
+
 	for (i = 0; i < dest->nr_pages; i++) {
 		s = src->stripe_pages[i];
 		if (!s || !PageUptodate(s)) {
@@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 	rbio->stripe_npages = stripe_npages;
 	rbio->faila = -1;
 	rbio->failb = -1;
+	rbio->bad_ondisk_a = -1;
+	rbio->bad_ondisk_b = -1;
 	atomic_set(&rbio->refs, 1);
 	atomic_set(&rbio->error, 0);
 	atomic_set(&rbio->stripes_pending, 0);
@@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 	int bit;
 	int index;
 	struct page *page;
+	struct page *bio_page;
+	void *ptr;
+	void *bio_ptr;
 
 	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
 		for (i = 0; i < rbio->real_stripes; i++) {
@@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 			if (!page)
 				return -ENOMEM;
+
+			/*
+			 * It's possible that only several pages need recover,
+			 * and rest are all good.
+			 * In that case we need to copy good bio pages into
+			 * stripe_pages[], as we will use stripe_pages[] other
+			 * than bio_pages[] in finish_parity_scrub().
+			 */
+			bio_page = page_in_rbio(rbio, i, bit, 0);
+			if ((i == rbio->bad_ondisk_a ||
+			     i == rbio->bad_ondisk_b) && bio_page) {
+				ptr = kmap(page);
+				bio_ptr = kmap(bio_page);
+				memcpy(ptr, bio_ptr, PAGE_SIZE);
+				kunmap(bio_page);
+				kunmap(page);
+			}
 			rbio->stripe_pages[index] = page;
 		}
 	}
@@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 {
 	struct btrfs_bio *bbio = rbio->bbio;
 	void *pointers[rbio->real_stripes];
+	struct page *mapped_pages[rbio->real_stripes];
 	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
 	int nr_data = rbio->nr_data;
 	int stripe;
@@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 		void *parity;
 		/* first collect one page from each data stripe */
 		for (stripe = 0; stripe < nr_data; stripe++) {
-			p = page_in_rbio(rbio, stripe, pagenr, 0);
+
+			/*
+			 * Use stolen recovered page other than bad
+			 * on disk pages
+			 */
+			if (stripe == rbio->bad_ondisk_a ||
+			    stripe == rbio->bad_ondisk_b)
+				p = rbio_stripe_page(rbio, stripe, pagenr);
+			else
+				p = page_in_rbio(rbio, stripe, pagenr, 0);
 			pointers[stripe] = kmap(p);
+			mapped_pages[stripe] = p;
 		}
 
 		/* then add the parity stripe */
-		pointers[stripe++] = kmap(p_page);
+		pointers[stripe] = kmap(p_page);
+		mapped_pages[stripe] = p_page;
+		stripe++;
 
 		if (q_stripe != -1) {
 
@@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 			 * raid6, add the qstripe and call the
 			 * library function to fill in our p/q
 			 */
-			pointers[stripe++] = kmap(q_page);
+			pointers[stripe] = kmap(q_page);
+			mapped_pages[stripe] = q_page;
+			stripe++;
 
 			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
 						pointers);
@@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 			bitmap_clear(rbio->dbitmap, pagenr, 1);
 		kunmap(p);
 
+		/* Free mapped pages */
 		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
-			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+			kunmap(mapped_pages[stripe]);
 	}
 
 	__free_page(p_page);
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
                   ` (2 preceding siblings ...)
  2017-02-03  8:20 ` [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q Qu Wenruo
@ 2017-02-03  8:20 ` Qu Wenruo
  2017-03-17  4:44   ` Liu Bo
  2017-02-03  8:20 ` [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled Qu Wenruo
  2017-03-07  3:48 ` [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
  5 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
done.
This may save some time allocating rbio, but it can cause deadly
use-after-free bug, for the following case:

Original fs: 4 devices RAID5

       Process A                 |          Process B
--------------------------------------------------------------------------
                                 |  Start device replace
                                 |    Now the fs has 5 devices
                                 |    devid 0: replace device
                                 |    devid 1~4: old devices
btrfs_map_bio()                  |
|- __btrfs_map_block()           |
|    bbio has 5 stripes          |
|    including devid 0           |
|- raid56_parity_write()         |
                                 |
raid_write_end_io()              |
|- rbio_orig_end_io()            |
   |- unlock_stripe()            |
       Keeps the old rbio for    |
       later steal, which has    |
       stripe for devid 0        |
                                 |  Cancel device replace
                                 |    Now the fs has 4 devices
                                 |    devid 0 is freed
Some IO happens                  |
raid_write_end_io()              |
|- rbio_orig_end_io()            |
   |- unlock_stripe()            |
      |- steal_rbio()            |
           Use old rbio, whose   |
           bbio has freed devid 0|
           stripe                |
Any access to rbio->bbio will    |
cause general protection or NULL |
pointer dereference              |

Such bug can already be triggered by fstests btrfs/069 for RAID5/6
profiles.

Fix it by not keeping old rbio in unlock_stripe(), so we just free the
finished rbio and rbio->bbio, so above problem wont' happen.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/raid56.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 453eefdcb591..aba82b95ec73 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	int bucket;
 	struct btrfs_stripe_hash *h;
 	unsigned long flags;
-	int keep_cache = 0;
 
 	bucket = rbio_bucket(rbio);
 	h = rbio->fs_info->stripe_hash_table->table + bucket;
@@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 	spin_lock(&rbio->bio_list_lock);
 
 	if (!list_empty(&rbio->hash_list)) {
-		/*
-		 * if we're still cached and there is no other IO
-		 * to perform, just leave this rbio here for others
-		 * to steal from later
-		 */
-		if (list_empty(&rbio->plug_list) &&
-		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
-			keep_cache = 1;
-			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
-			BUG_ON(!bio_list_empty(&rbio->bio_list));
-			goto done;
-		}
-
 		list_del_init(&rbio->hash_list);
 		atomic_dec(&rbio->refs);
 
@@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			goto done_nolock;
 		}
 	}
-done:
 	spin_unlock(&rbio->bio_list_lock);
 	spin_unlock_irqrestore(&h->lock, flags);
 
 done_nolock:
-	if (!keep_cache)
-		remove_rbio_from_cache(rbio);
+	remove_rbio_from_cache(rbio);
 }
 
 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
                   ` (3 preceding siblings ...)
  2017-02-03  8:20 ` [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal Qu Wenruo
@ 2017-02-03  8:20 ` Qu Wenruo
  2017-03-18  2:12   ` Liu Bo
  2017-03-07  3:48 ` [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
  5 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-02-03  8:20 UTC (permalink / raw)
  To: linux-btrfs

When dev-replace and scrub are run at the same time, dev-replace can be
canceled by scrub. It's quite common for btrfs/069.

The backtrace would be like:
general protection fault: 0000 [#1] SMP
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
RIP: 0010:[<ffffffff813a2fa8>]  [<ffffffff813a2fa8>] generic_make_request_checks+0x198/0x5a0
Call Trace:
 [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
 [<ffffffff813a4c54>] generic_make_request+0x24/0x290
 [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
 [<ffffffff813a4f2e>] submit_bio+0x6e/0x120
 [<ffffffffa087279d>] ? page_in_rbio+0x4d/0x80 [btrfs]
 [<ffffffffa08737d0>] ? rbio_orig_end_io+0x80/0x80 [btrfs]
 [<ffffffffa0873e31>] finish_rmw+0x401/0x550 [btrfs]
 [<ffffffffa0874fc6>] validate_rbio_for_rmw+0x36/0x40 [btrfs]
 [<ffffffffa087504d>] raid_rmw_end_io+0x7d/0x90 [btrfs]
 [<ffffffff8139c536>] bio_endio+0x56/0x60
 [<ffffffffa07e6e5c>] end_workqueue_fn+0x3c/0x40 [btrfs]
 [<ffffffffa08285bf>] btrfs_scrubparity_helper+0xef/0x610 [btrfs]
 [<ffffffffa0828b9e>] btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
 [<ffffffff810ec8df>] process_one_work+0x2af/0x720
 [<ffffffff810ec85b>] ? process_one_work+0x22b/0x720
 [<ffffffff810ecd9b>] worker_thread+0x4b/0x4f0
 [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
 [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
 [<ffffffff810f39d3>] kthread+0xf3/0x110
 [<ffffffff810f38e0>] ? kthread_park+0x60/0x60
 [<ffffffff81857647>] ret_from_fork+0x27/0x40

While in that case, target device can be destroyed at cancel time,
leading to a user-after-free bug:

     Process A (dev-replace)         |         Process B(scrub)
----------------------------------------------------------------------
                                     |(Any RW is OK)
                                     |scrub_setup_recheck_block()
                                     ||- btrfs_map_sblock()
                                     |   Got a bbio with tgtdev
btrfs_dev_replace_finishing()        |
|- btrfs_destory_dev_replace_tgtdev()|
   |- call_rcu(free_device)          |
      |- __free_device()             |
         |- kfree(device)            |
                                     | Scrub worker:
                                     | Access bbio->stripes[], which
                                     | contains tgtdev.
                                     | This triggers general protection.

The bug is mostly obvious for RAID5/6 since raid56 choose to keep old
rbio and rbio->bbio for later steal, this hugely enlarged the race
window and makes it much easier to trigger the bug.

This patch introduces 'tgtdev_refs' and 'tgtdev_wait' for btrfs_device
to wait for all its user released the target device.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
---
 fs/btrfs/dev-replace.c |  7 ++++++-
 fs/btrfs/volumes.c     | 36 +++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h     | 10 ++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5de280b9ad73..794a6a0bedf2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -558,7 +558,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			  rcu_str_deref(src_device->name),
 			  src_device->devid,
 			  rcu_str_deref(tgt_device->name));
-	tgt_device->is_tgtdev_for_dev_replace = 0;
 	tgt_device->devid = src_device->devid;
 	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
@@ -579,6 +578,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	btrfs_dev_replace_unlock(dev_replace, 1);
 
+	/*
+	 * Only change is_tgtdev_for_dev_replace flag after all its
+	 * users get released.
+	 */
+	wait_target_device(tgt_device);
+	tgt_device->is_tgtdev_for_dev_replace = 0;
 	btrfs_rm_dev_replace_blocked(fs_info);
 
 	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3c3c69c0eee4..84db9fb22b7d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2064,6 +2064,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	WARN_ON(!tgtdev);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 
+	wait_target_device(tgtdev);
 	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
 
 	if (tgtdev->bdev)
@@ -2598,6 +2599,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->is_tgtdev_for_dev_replace = 1;
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
+	atomic_set(&device->tgtdev_refs, 0);
+	init_waitqueue_head(&device->tgtdev_wait);
 	set_blocksize(device->bdev, 4096);
 	device->fs_devices = fs_info->fs_devices;
 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
@@ -2624,6 +2627,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 	tgtdev->sector_size = sectorsize;
 	tgtdev->fs_info = fs_info;
 	tgtdev->in_fs_metadata = 1;
+	atomic_set(&tgtdev->tgtdev_refs, 0);
+	init_waitqueue_head(&tgtdev->tgtdev_wait);
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -5302,6 +5307,32 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
 	return bbio;
 }
 
+static void pin_bbio_target_device(struct btrfs_bio *bbio)
+{
+	int i;
+
+	for (i = 0; i < bbio->num_stripes; i++) {
+		struct btrfs_device *device = bbio->stripes[i].dev;
+
+		if (device->is_tgtdev_for_dev_replace)
+			atomic_inc(&device->tgtdev_refs);
+	}
+}
+
+static void unpin_bbio_target_device(struct btrfs_bio *bbio)
+{
+	int i;
+
+	for (i = 0; i < bbio->num_stripes; i++) {
+		struct btrfs_device *device = bbio->stripes[i].dev;
+
+		if (device->is_tgtdev_for_dev_replace) {
+			atomic_dec(&device->tgtdev_refs);
+			wake_up(&device->tgtdev_wait);
+		}
+	}
+}
+
 void btrfs_get_bbio(struct btrfs_bio *bbio)
 {
 	WARN_ON(!atomic_read(&bbio->refs));
@@ -5312,8 +5343,10 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
 {
 	if (!bbio)
 		return;
-	if (atomic_dec_and_test(&bbio->refs))
+	if (atomic_dec_and_test(&bbio->refs)) {
+		unpin_bbio_target_device(bbio);
 		kfree(bbio);
+	}
 }
 
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -5868,6 +5901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	pin_bbio_target_device(bbio);
 out:
 	if (dev_replace_is_ongoing) {
 		btrfs_dev_replace_clear_lock_blocking(dev_replace);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 24ba6bc3ec34..702361182597 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -149,6 +149,10 @@ struct btrfs_device {
 	/* Counter to record the change of device stats */
 	atomic_t dev_stats_ccnt;
 	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
+
+	/* To ensure we wait this target device before destroying it */
+	atomic_t tgtdev_refs;
+	wait_queue_head_t tgtdev_wait;
 };
 
 /*
@@ -538,4 +542,10 @@ struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
 
+static inline void wait_target_device(struct btrfs_device *tgtdev)
+{
+	if (!tgtdev || !tgtdev->is_tgtdev_for_dev_replace)
+		return;
+	wait_event(tgtdev->tgtdev_wait, atomic_read(&tgtdev->tgtdev_refs) == 0);
+}
 #endif
-- 
2.11.0




^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/5] raid56: variant bug fixes
  2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
                   ` (4 preceding siblings ...)
  2017-02-03  8:20 ` [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled Qu Wenruo
@ 2017-03-07  3:48 ` Qu Wenruo
  2017-03-14 13:47   ` David Sterba
  5 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-07  3:48 UTC (permalink / raw)
  To: linux-btrfs, David Sterba, Chris Mason

So raid56 bug fixes are the same case as qgroup fixes now?

No reviewer so no merge?

I understand we need enough reviewer, however there is never enough 
reviewer for *minor* functions, like qgroup or raid56.

Such situation will just make such functions starve, bugs makes fewer 
tester and users, fewer users leads to even fewer developers, causing a 
minus spiral.

Thanks,
Qu

At 02/03/2017 04:20 PM, Qu Wenruo wrote:
> This patchset can be fetched from my github repo:
> https://github.com/adam900710/linux.git raid56_fixes
>
> It's based on v4.10-rc6 and none of the patch is modified after its first
> appearance in mail list.
>
> The patchset fixes the following bugs:
>
> 1) False alert or wrong csum error number when scrubbing RAID5/6
>    The bug itself won't cause any damage to fs, just pure race.
>
> 2) Corrupted data stripe rebuild corrupts P/Q
>    So scrub makes one error into another, not really fixing anything
>
> 3) Use-after-free caused by cancelling dev-replace
>    This is quite a deadly bug, since cancelling dev-replace can easily
>    cause kernel panic, and thanks to raid bio steal, it makes the race
>    windows quite large.
>
>    Can be triggered by btrfs/069.
>
> After all the fixes applied, no scrub/replace related regression can be
> detected.
>
> Qu Wenruo (5):
>   btrfs: scrub: Introduce full stripe lock for RAID56
>   btrfs: scrub: Fix RAID56 recovery race condition
>   btrfs: raid56: Use correct stolen pages to calculate P/Q
>   btrfs: raid56: Don't keep rbio for later steal
>   btrfs: replace: Use ref counts to avoid destroying target device when
>     canceled
>
>  fs/btrfs/ctree.h       |   4 ++
>  fs/btrfs/dev-replace.c |   7 +-
>  fs/btrfs/extent-tree.c |   3 +
>  fs/btrfs/raid56.c      |  80 +++++++++++++++------
>  fs/btrfs/scrub.c       | 192 +++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c     |  36 +++++++++-
>  fs/btrfs/volumes.h     |  10 +++
>  7 files changed, 309 insertions(+), 23 deletions(-)
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/5] raid56: variant bug fixes
  2017-03-07  3:48 ` [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
@ 2017-03-14 13:47   ` David Sterba
  2017-03-14 21:30     ` Goffredo Baroncelli
  0 siblings, 1 reply; 27+ messages in thread
From: David Sterba @ 2017-03-14 13:47 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, David Sterba, Chris Mason

On Tue, Mar 07, 2017 at 11:48:37AM +0800, Qu Wenruo wrote:
> So raid56 bug fixes are the same case as qgroup fixes now?

Not now, it's been like that forever. Or should have been, we're not
perfect, but should strive not to skip reviews just because we want to
let new code in.

> No reviewer so no merge?
> 
> I understand we need enough reviewer, however there is never enough 
> reviewer for *minor* functions, like qgroup or raid56.

I would not call them minor. Qgroups are hooked to the core of the
operations, raid56 is a compartment, but can become complex regarding
the error modes and safety constraints.

> Such situation will just make such functions starve, bugs makes fewer 
> tester and users, fewer users leads to even fewer developers, causing a 
> minus spiral.

Unreviewed code is more buggy, leads to the same. You've been around for
a long time, I'm sure you'll remember times when an underreviewed
patchset caused headaches for several major releases. All developers
have record of a major problem in their code, that's why we must do the
reviews.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 0/5] raid56: variant bug fixes
  2017-03-14 13:47   ` David Sterba
@ 2017-03-14 21:30     ` Goffredo Baroncelli
  0 siblings, 0 replies; 27+ messages in thread
From: Goffredo Baroncelli @ 2017-03-14 21:30 UTC (permalink / raw)
  To: David Sterba; +Cc: Qu Wenruo, linux-btrfs, Chris Mason

On 2017-03-14 14:47, David Sterba wrote:
> On Tue, Mar 07, 2017 at 11:48:37AM +0800, Qu Wenruo wrote:
>> So raid56 bug fixes are the same case as qgroup fixes now?
> 
> Not now, it's been like that forever. Or should have been, we're not
> perfect, but should strive not to skip reviews just because we want to
> let new code in.
> 
>> No reviewer so no merge?
>>
>> I understand we need enough reviewer, however there is never enough 
>> reviewer for *minor* functions, like qgroup or raid56.
> 
> I would not call them minor. Qgroups are hooked to the core of the
> operations, raid56 is a compartment, but can become complex regarding
> the error modes and safety constraints.
> 
>> Such situation will just make such functions starve, bugs makes fewer 
>> tester and users, fewer users leads to even fewer developers, causing a 
>> minus spiral.
> 
> Unreviewed code is more buggy, leads to the same. You've been around for
> a long time, I'm sure you'll remember times when an underreviewed
> patchset caused headaches for several major releases. All developers
> have record of a major problem in their code, that's why we must do the
> reviews.

David, I appreciate your "conservative" approach; but ... how we could exit from this loop ? 

Qu is in the right stating that if there are bugs, nobody uses raid5/6; nobody is interested in the review; and the patches are unaccepted; but doing so the problem will not be not solved.

I am not skilled enough to say if the Qu's patches are completely wrong or these are able to solve the bugs. But which are the other options ?

We are not talking about enhancement, we are talking about bugs in the recovery path of a failing raid5/6 file-system. If we cannot ensure this functionality, raid5/6 is not useful at all.

I think that we should discuss about an exit strategy from this issue. If we aren't able to develop and deploy possible fixes, we should consider deprecating raid5/6 altogether (i.e. prevent btrfs-progs from creating a raid5/6 filesystem, and after some time prevent the kernel to mount this kind of filesystem at all).

I hope that these issues will be addressed and BTRFS will gain a good raid5/6 support. But otherwise I think that it is better to deprecate it than support a badly implementation.


BR
G.Baroncelli


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-02-03  8:20 ` [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q Qu Wenruo
@ 2017-03-16  5:36   ` Liu Bo
  2017-03-16  8:30     ` Qu Wenruo
  2017-03-17  6:31     ` Qu Wenruo
  0 siblings, 2 replies; 27+ messages in thread
From: Liu Bo @ 2017-03-16  5:36 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Feb 03, 2017 at 04:20:21PM +0800, Qu Wenruo wrote:
> In the following situation, scrub will calculate wrong parity to
> overwrite correct one:
> 
> RAID5 full stripe:
> 
> Before
> |     Dev 1      |     Dev  2     |     Dev 3     |
> | Data stripe 1  | Data stripe 2  | Parity Stripe |
> --------------------------------------------------- 0
> | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 4K
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> ...
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 64K
> 
> After scrubbing dev3 only:
> 
> |     Dev 1      |     Dev  2     |     Dev 3     |
> | Data stripe 1  | Data stripe 2  | Parity Stripe |
> --------------------------------------------------- 0
> | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
> --------------------------------------------------- 4K
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> ...
> |     0xcdcd     |     0xcdcd     |     0x0000    |
> --------------------------------------------------- 64K
> 
> The calltrace of such corruption is as following:
> 
> scrub_bio_end_io_worker() get called for each extent read out
> |- scriub_block_complete()
>    |- Data extent csum mismatch
>    |- scrub_handle_errored_block
>       |- scrub_recheck_block()
>          |- scrub_submit_raid56_bio_wait()
>             |- raid56_parity_recover()
> 
> Now we have a rbio with correct data stripe 1 recovered.
> Let's call it "good_rbio".
> 
> scrub_parity_check_and_repair()
> |- raid56_parity_submit_scrub_rbio()
>    |- lock_stripe_add()
>    |  |- steal_rbio()
>    |     |- Recovered data are steal from "good_rbio", stored into
>    |        rbio->stripe_pages[]
>    |        Now rbio->bio_pages[] are bad data read from disk.

At this point, we should have already known that whether
rbio->bio_pages are corrupted because rbio->bio_pages are indexed from
the list sparity->pages, and we only do scrub_parity_put after
finishing the endio of reading all pages linked at sparity->pages.

Since the previous checksuming failure has made a recovery and we
got the correct data on that rbio, instead of adding this corrupted
page into the new rbio, it'd be fine to skip it and we use all
rbio->stripe_pages which can be stolen from the previous good rbio.

Thanks,

-liubo

>    |- async_scrub_parity()
>       |- scrub_parity_work() (delayed_call to scrub_parity_work)
> 
> scrub_parity_work()
> |- raid56_parity_scrub_stripe()
>    |- validate_rbio_for_parity_scrub()
>       |- finish_parity_scrub()
>          |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
>             So good parity is overwritten with *BAD* one
> 
> The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
> btrfs_raid_bio, to info scrub code to use correct data pages to
> re-calculate parity.
> 
> Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 58 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index d2a9a1ee5361..453eefdcb591 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -133,6 +133,16 @@ struct btrfs_raid_bio {
>  	/* second bad stripe (for raid6 use) */
>  	int failb;
>  
> +	/*
> +	 * For steal_rbio, we can steal recovered correct page,
> +	 * but in finish_parity_scrub(), we still use bad on-disk
> +	 * page to calculate parity.
> +	 * Use these members to info finish_parity_scrub() to use
> +	 * correct pages
> +	 */
> +	int bad_ondisk_a;
> +	int bad_ondisk_b;
> +
>  	int scrubp;
>  	/*
>  	 * number of pages needed to represent the full
> @@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
>  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
>  		return;
>  
> +	/* Record recovered stripe number */
> +	if (src->faila != -1)
> +		dest->bad_ondisk_a = src->faila;
> +	if (src->failb != -1)
> +		dest->bad_ondisk_b = src->failb;
> +
>  	for (i = 0; i < dest->nr_pages; i++) {
>  		s = src->stripe_pages[i];
>  		if (!s || !PageUptodate(s)) {
> @@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
>  	rbio->stripe_npages = stripe_npages;
>  	rbio->faila = -1;
>  	rbio->failb = -1;
> +	rbio->bad_ondisk_a = -1;
> +	rbio->bad_ondisk_b = -1;
>  	atomic_set(&rbio->refs, 1);
>  	atomic_set(&rbio->error, 0);
>  	atomic_set(&rbio->stripes_pending, 0);
> @@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>  	int bit;
>  	int index;
>  	struct page *page;
> +	struct page *bio_page;
> +	void *ptr;
> +	void *bio_ptr;
>  
>  	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
>  		for (i = 0; i < rbio->real_stripes; i++) {
> @@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>  			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
>  			if (!page)
>  				return -ENOMEM;
> +
> +			/*
> +			 * It's possible that only several pages need recover,
> +			 * and rest are all good.
> +			 * In that case we need to copy good bio pages into
> +			 * stripe_pages[], as we will use stripe_pages[] other
> +			 * than bio_pages[] in finish_parity_scrub().
> +			 */
> +			bio_page = page_in_rbio(rbio, i, bit, 0);
> +			if ((i == rbio->bad_ondisk_a ||
> +			     i == rbio->bad_ondisk_b) && bio_page) {
> +				ptr = kmap(page);
> +				bio_ptr = kmap(bio_page);
> +				memcpy(ptr, bio_ptr, PAGE_SIZE);
> +				kunmap(bio_page);
> +				kunmap(page);
> +			}
>  			rbio->stripe_pages[index] = page;
>  		}
>  	}
> @@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>  {
>  	struct btrfs_bio *bbio = rbio->bbio;
>  	void *pointers[rbio->real_stripes];
> +	struct page *mapped_pages[rbio->real_stripes];
>  	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
>  	int nr_data = rbio->nr_data;
>  	int stripe;
> @@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>  		void *parity;
>  		/* first collect one page from each data stripe */
>  		for (stripe = 0; stripe < nr_data; stripe++) {
> -			p = page_in_rbio(rbio, stripe, pagenr, 0);
> +
> +			/*
> +			 * Use stolen recovered page other than bad
> +			 * on disk pages
> +			 */
> +			if (stripe == rbio->bad_ondisk_a ||
> +			    stripe == rbio->bad_ondisk_b)
> +				p = rbio_stripe_page(rbio, stripe, pagenr);
> +			else
> +				p = page_in_rbio(rbio, stripe, pagenr, 0);
>  			pointers[stripe] = kmap(p);
> +			mapped_pages[stripe] = p;
>  		}
>  
>  		/* then add the parity stripe */
> -		pointers[stripe++] = kmap(p_page);
> +		pointers[stripe] = kmap(p_page);
> +		mapped_pages[stripe] = p_page;
> +		stripe++;
>  
>  		if (q_stripe != -1) {
>  
> @@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>  			 * raid6, add the qstripe and call the
>  			 * library function to fill in our p/q
>  			 */
> -			pointers[stripe++] = kmap(q_page);
> +			pointers[stripe] = kmap(q_page);
> +			mapped_pages[stripe] = q_page;
> +			stripe++;
>  
>  			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
>  						pointers);
> @@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>  			bitmap_clear(rbio->dbitmap, pagenr, 1);
>  		kunmap(p);
>  
> +		/* Free mapped pages */
>  		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
> -			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
> +			kunmap(mapped_pages[stripe]);
>  	}
>  
>  	__free_page(p_page);
> -- 
> 2.11.0
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-03-16  5:36   ` Liu Bo
@ 2017-03-16  8:30     ` Qu Wenruo
  2017-03-17  6:31     ` Qu Wenruo
  1 sibling, 0 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-03-16  8:30 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/16/2017 01:36 PM, Liu Bo wrote:
> On Fri, Feb 03, 2017 at 04:20:21PM +0800, Qu Wenruo wrote:
>> In the following situation, scrub will calculate wrong parity to
>> overwrite correct one:
>>
>> RAID5 full stripe:
>>
>> Before
>> |     Dev 1      |     Dev  2     |     Dev 3     |
>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>> --------------------------------------------------- 0
>> | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 4K
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> ...
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 64K
>>
>> After scrubbing dev3 only:
>>
>> |     Dev 1      |     Dev  2     |     Dev 3     |
>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>> --------------------------------------------------- 0
>> | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
>> --------------------------------------------------- 4K
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> ...
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 64K
>>
>> The calltrace of such corruption is as following:
>>
>> scrub_bio_end_io_worker() get called for each extent read out
>> |- scriub_block_complete()
>>    |- Data extent csum mismatch
>>    |- scrub_handle_errored_block
>>       |- scrub_recheck_block()
>>          |- scrub_submit_raid56_bio_wait()
>>             |- raid56_parity_recover()
>>
>> Now we have a rbio with correct data stripe 1 recovered.
>> Let's call it "good_rbio".
>>
>> scrub_parity_check_and_repair()
>> |- raid56_parity_submit_scrub_rbio()
>>    |- lock_stripe_add()
>>    |  |- steal_rbio()
>>    |     |- Recovered data are steal from "good_rbio", stored into
>>    |        rbio->stripe_pages[]
>>    |        Now rbio->bio_pages[] are bad data read from disk.

Thanks for the review first.

>
> At this point, we should have already known that whether
> rbio->bio_pages are corrupted because rbio->bio_pages are indexed from
> the list sparity->pages, and we only do scrub_parity_put after
> finishing the endio of reading all pages linked at sparity->pages.
>
> Since the previous checksuming failure has made a recovery and we
> got the correct data on that rbio, instead of adding this corrupted
> page into the new rbio, it'd be fine to skip it and we use all
> rbio->stripe_pages which can be stolen from the previous good rbio.

Right, this makes the whole check based on bad_ondisk_a/b redundant.

I'll update the patch.

Thanks,
Qu
>
> Thanks,
>
> -liubo
>
>>    |- async_scrub_parity()
>>       |- scrub_parity_work() (delayed_call to scrub_parity_work)
>>
>> scrub_parity_work()
>> |- raid56_parity_scrub_stripe()
>>    |- validate_rbio_for_parity_scrub()
>>       |- finish_parity_scrub()
>>          |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
>>             So good parity is overwritten with *BAD* one
>>
>> The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
>> btrfs_raid_bio, to info scrub code to use correct data pages to
>> re-calculate parity.
>>
>> Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 58 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index d2a9a1ee5361..453eefdcb591 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -133,6 +133,16 @@ struct btrfs_raid_bio {
>>  	/* second bad stripe (for raid6 use) */
>>  	int failb;
>>
>> +	/*
>> +	 * For steal_rbio, we can steal recovered correct page,
>> +	 * but in finish_parity_scrub(), we still use bad on-disk
>> +	 * page to calculate parity.
>> +	 * Use these members to info finish_parity_scrub() to use
>> +	 * correct pages
>> +	 */
>> +	int bad_ondisk_a;
>> +	int bad_ondisk_b;
>> +
>>  	int scrubp;
>>  	/*
>>  	 * number of pages needed to represent the full
>> @@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
>>  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
>>  		return;
>>
>> +	/* Record recovered stripe number */
>> +	if (src->faila != -1)
>> +		dest->bad_ondisk_a = src->faila;
>> +	if (src->failb != -1)
>> +		dest->bad_ondisk_b = src->failb;
>> +
>>  	for (i = 0; i < dest->nr_pages; i++) {
>>  		s = src->stripe_pages[i];
>>  		if (!s || !PageUptodate(s)) {
>> @@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
>>  	rbio->stripe_npages = stripe_npages;
>>  	rbio->faila = -1;
>>  	rbio->failb = -1;
>> +	rbio->bad_ondisk_a = -1;
>> +	rbio->bad_ondisk_b = -1;
>>  	atomic_set(&rbio->refs, 1);
>>  	atomic_set(&rbio->error, 0);
>>  	atomic_set(&rbio->stripes_pending, 0);
>> @@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>  	int bit;
>>  	int index;
>>  	struct page *page;
>> +	struct page *bio_page;
>> +	void *ptr;
>> +	void *bio_ptr;
>>
>>  	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
>>  		for (i = 0; i < rbio->real_stripes; i++) {
>> @@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>  			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
>>  			if (!page)
>>  				return -ENOMEM;
>> +
>> +			/*
>> +			 * It's possible that only several pages need recover,
>> +			 * and rest are all good.
>> +			 * In that case we need to copy good bio pages into
>> +			 * stripe_pages[], as we will use stripe_pages[] other
>> +			 * than bio_pages[] in finish_parity_scrub().
>> +			 */
>> +			bio_page = page_in_rbio(rbio, i, bit, 0);
>> +			if ((i == rbio->bad_ondisk_a ||
>> +			     i == rbio->bad_ondisk_b) && bio_page) {
>> +				ptr = kmap(page);
>> +				bio_ptr = kmap(bio_page);
>> +				memcpy(ptr, bio_ptr, PAGE_SIZE);
>> +				kunmap(bio_page);
>> +				kunmap(page);
>> +			}
>>  			rbio->stripe_pages[index] = page;
>>  		}
>>  	}
>> @@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  {
>>  	struct btrfs_bio *bbio = rbio->bbio;
>>  	void *pointers[rbio->real_stripes];
>> +	struct page *mapped_pages[rbio->real_stripes];
>>  	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
>>  	int nr_data = rbio->nr_data;
>>  	int stripe;
>> @@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  		void *parity;
>>  		/* first collect one page from each data stripe */
>>  		for (stripe = 0; stripe < nr_data; stripe++) {
>> -			p = page_in_rbio(rbio, stripe, pagenr, 0);
>> +
>> +			/*
>> +			 * Use stolen recovered page other than bad
>> +			 * on disk pages
>> +			 */
>> +			if (stripe == rbio->bad_ondisk_a ||
>> +			    stripe == rbio->bad_ondisk_b)
>> +				p = rbio_stripe_page(rbio, stripe, pagenr);
>> +			else
>> +				p = page_in_rbio(rbio, stripe, pagenr, 0);
>>  			pointers[stripe] = kmap(p);
>> +			mapped_pages[stripe] = p;
>>  		}
>>
>>  		/* then add the parity stripe */
>> -		pointers[stripe++] = kmap(p_page);
>> +		pointers[stripe] = kmap(p_page);
>> +		mapped_pages[stripe] = p_page;
>> +		stripe++;
>>
>>  		if (q_stripe != -1) {
>>
>> @@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  			 * raid6, add the qstripe and call the
>>  			 * library function to fill in our p/q
>>  			 */
>> -			pointers[stripe++] = kmap(q_page);
>> +			pointers[stripe] = kmap(q_page);
>> +			mapped_pages[stripe] = q_page;
>> +			stripe++;
>>
>>  			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
>>  						pointers);
>> @@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  			bitmap_clear(rbio->dbitmap, pagenr, 1);
>>  		kunmap(p);
>>
>> +		/* Free mapped pages */
>>  		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
>> -			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
>> +			kunmap(mapped_pages[stripe]);
>>  	}
>>
>>  	__free_page(p_page);
>> --
>> 2.11.0
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-02-03  8:20 ` [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal Qu Wenruo
@ 2017-03-17  4:44   ` Liu Bo
  2017-03-17  5:28     ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-17  4:44 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
> done.
> This may save some time allocating rbio, but it can cause deadly
> use-after-free bug, for the following case:
> 
> Original fs: 4 devices RAID5
> 
>        Process A                 |          Process B
> --------------------------------------------------------------------------
>                                  |  Start device replace
>                                  |    Now the fs has 5 devices
>                                  |    devid 0: replace device
>                                  |    devid 1~4: old devices
> btrfs_map_bio()                  |
> |- __btrfs_map_block()           |
> |    bbio has 5 stripes          |
> |    including devid 0           |
> |- raid56_parity_write()         |
>                                  |
> raid_write_end_io()              |
> |- rbio_orig_end_io()            |
>    |- unlock_stripe()            |
>        Keeps the old rbio for    |
>        later steal, which has    |
>        stripe for devid 0        |
>                                  |  Cancel device replace
>                                  |    Now the fs has 4 devices
>                                  |    devid 0 is freed
> Some IO happens                  |
> raid_write_end_io()              |
> |- rbio_orig_end_io()            |
>    |- unlock_stripe()            |
>       |- steal_rbio()            |
>            Use old rbio, whose   |
>            bbio has freed devid 0|
>            stripe                |
> Any access to rbio->bbio will    |
> cause general protection or NULL |
> pointer dereference              |
> 
> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
> profiles.
> 
> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
> finished rbio and rbio->bbio, so above problem wont' happen.
>

I don't think this is acceptable, keeping a cache is important for
raid56 write performance, could you please fix it by checking if the
device is missing?

Thanks,

-liubo

> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/raid56.c | 18 +-----------------
>  1 file changed, 1 insertion(+), 17 deletions(-)
> 
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index 453eefdcb591..aba82b95ec73 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>  	int bucket;
>  	struct btrfs_stripe_hash *h;
>  	unsigned long flags;
> -	int keep_cache = 0;
>  
>  	bucket = rbio_bucket(rbio);
>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>  	spin_lock(&rbio->bio_list_lock);
>  
>  	if (!list_empty(&rbio->hash_list)) {
> -		/*
> -		 * if we're still cached and there is no other IO
> -		 * to perform, just leave this rbio here for others
> -		 * to steal from later
> -		 */
> -		if (list_empty(&rbio->plug_list) &&
> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
> -			keep_cache = 1;
> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
> -			goto done;
> -		}
> -
>  		list_del_init(&rbio->hash_list);
>  		atomic_dec(&rbio->refs);
>  
> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>  			goto done_nolock;
>  		}
>  	}
> -done:
>  	spin_unlock(&rbio->bio_list_lock);
>  	spin_unlock_irqrestore(&h->lock, flags);
>  
>  done_nolock:
> -	if (!keep_cache)
> -		remove_rbio_from_cache(rbio);
> +	remove_rbio_from_cache(rbio);
>  }
>  
>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
> -- 
> 2.11.0
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-17  4:44   ` Liu Bo
@ 2017-03-17  5:28     ` Qu Wenruo
  2017-03-18  2:03       ` Liu Bo
  0 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-17  5:28 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/17/2017 12:44 PM, Liu Bo wrote:
> On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
>> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
>> done.
>> This may save some time allocating rbio, but it can cause deadly
>> use-after-free bug, for the following case:
>>
>> Original fs: 4 devices RAID5
>>
>>        Process A                 |          Process B
>> --------------------------------------------------------------------------
>>                                  |  Start device replace
>>                                  |    Now the fs has 5 devices
>>                                  |    devid 0: replace device
>>                                  |    devid 1~4: old devices
>> btrfs_map_bio()                  |
>> |- __btrfs_map_block()           |
>> |    bbio has 5 stripes          |
>> |    including devid 0           |
>> |- raid56_parity_write()         |
>>                                  |
>> raid_write_end_io()              |
>> |- rbio_orig_end_io()            |
>>    |- unlock_stripe()            |
>>        Keeps the old rbio for    |
>>        later steal, which has    |
>>        stripe for devid 0        |
>>                                  |  Cancel device replace
>>                                  |    Now the fs has 4 devices
>>                                  |    devid 0 is freed
>> Some IO happens                  |
>> raid_write_end_io()              |
>> |- rbio_orig_end_io()            |
>>    |- unlock_stripe()            |
>>       |- steal_rbio()            |
>>            Use old rbio, whose   |
>>            bbio has freed devid 0|
>>            stripe                |
>> Any access to rbio->bbio will    |
>> cause general protection or NULL |
>> pointer dereference              |
>>
>> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
>> profiles.
>>
>> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
>> finished rbio and rbio->bbio, so above problem wont' happen.
>>
>
> I don't think this is acceptable, keeping a cache is important for
> raid56 write performance, could you please fix it by checking if the
> device is missing?

Not possible, as it's keeping the btrfs_device pointer and never release 
it, the stolen rbio can be kept forever until umount.

And I think the logical is very strange, if RAID5/6 is unstable, there 
is no meaning to keep it fast.

Keep it stable first, and then consider the performance.

Thanks,
Qu

>
> Thanks,
>
> -liubo
>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/raid56.c | 18 +-----------------
>>  1 file changed, 1 insertion(+), 17 deletions(-)
>>
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index 453eefdcb591..aba82b95ec73 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>  	int bucket;
>>  	struct btrfs_stripe_hash *h;
>>  	unsigned long flags;
>> -	int keep_cache = 0;
>>
>>  	bucket = rbio_bucket(rbio);
>>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
>> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>  	spin_lock(&rbio->bio_list_lock);
>>
>>  	if (!list_empty(&rbio->hash_list)) {
>> -		/*
>> -		 * if we're still cached and there is no other IO
>> -		 * to perform, just leave this rbio here for others
>> -		 * to steal from later
>> -		 */
>> -		if (list_empty(&rbio->plug_list) &&
>> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
>> -			keep_cache = 1;
>> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
>> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
>> -			goto done;
>> -		}
>> -
>>  		list_del_init(&rbio->hash_list);
>>  		atomic_dec(&rbio->refs);
>>
>> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>  			goto done_nolock;
>>  		}
>>  	}
>> -done:
>>  	spin_unlock(&rbio->bio_list_lock);
>>  	spin_unlock_irqrestore(&h->lock, flags);
>>
>>  done_nolock:
>> -	if (!keep_cache)
>> -		remove_rbio_from_cache(rbio);
>> +	remove_rbio_from_cache(rbio);
>>  }
>>
>>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
>> --
>> 2.11.0
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-03-16  5:36   ` Liu Bo
  2017-03-16  8:30     ` Qu Wenruo
@ 2017-03-17  6:31     ` Qu Wenruo
  2017-03-17 22:19       ` Liu Bo
  1 sibling, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-17  6:31 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/16/2017 01:36 PM, Liu Bo wrote:
> On Fri, Feb 03, 2017 at 04:20:21PM +0800, Qu Wenruo wrote:
>> In the following situation, scrub will calculate wrong parity to
>> overwrite correct one:
>>
>> RAID5 full stripe:
>>
>> Before
>> |     Dev 1      |     Dev  2     |     Dev 3     |
>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>> --------------------------------------------------- 0
>> | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 4K
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> ...
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 64K
>>
>> After scrubbing dev3 only:
>>
>> |     Dev 1      |     Dev  2     |     Dev 3     |
>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>> --------------------------------------------------- 0
>> | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
>> --------------------------------------------------- 4K
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> ...
>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>> --------------------------------------------------- 64K
>>
>> The calltrace of such corruption is as following:
>>
>> scrub_bio_end_io_worker() get called for each extent read out
>> |- scriub_block_complete()
>>    |- Data extent csum mismatch
>>    |- scrub_handle_errored_block
>>       |- scrub_recheck_block()
>>          |- scrub_submit_raid56_bio_wait()
>>             |- raid56_parity_recover()
>>
>> Now we have a rbio with correct data stripe 1 recovered.
>> Let's call it "good_rbio".
>>
>> scrub_parity_check_and_repair()
>> |- raid56_parity_submit_scrub_rbio()
>>    |- lock_stripe_add()
>>    |  |- steal_rbio()
>>    |     |- Recovered data are steal from "good_rbio", stored into
>>    |        rbio->stripe_pages[]
>>    |        Now rbio->bio_pages[] are bad data read from disk.
>
> At this point, we should have already known that whether
> rbio->bio_pages are corrupted because rbio->bio_pages are indexed from
> the list sparity->pages, and we only do scrub_parity_put after
> finishing the endio of reading all pages linked at sparity->pages.
>
> Since the previous checksuming failure has made a recovery and we
> got the correct data on that rbio, instead of adding this corrupted
> page into the new rbio, it'd be fine to skip it and we use all
> rbio->stripe_pages which can be stolen from the previous good rbio.

Unfortunately, the rbio->bio_pages[] are pages stored in rbio->bio_list.

While we don't have good function to remove page from a bio, nor the 
function to remove a bio from a bio_list (although it's quite easy to 
implement), it will be quite a mess to do it in steal_rbio() or in btrfs.

That's why I use new bad_on_disk_a/b other than directly stealing pages 
from rbio->bio_pages[].

Thanks,
Qu
>
> Thanks,
>
> -liubo
>
>>    |- async_scrub_parity()
>>       |- scrub_parity_work() (delayed_call to scrub_parity_work)
>>
>> scrub_parity_work()
>> |- raid56_parity_scrub_stripe()
>>    |- validate_rbio_for_parity_scrub()
>>       |- finish_parity_scrub()
>>          |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
>>             So good parity is overwritten with *BAD* one
>>
>> The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
>> btrfs_raid_bio, to info scrub code to use correct data pages to
>> re-calculate parity.
>>
>> Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 58 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>> index d2a9a1ee5361..453eefdcb591 100644
>> --- a/fs/btrfs/raid56.c
>> +++ b/fs/btrfs/raid56.c
>> @@ -133,6 +133,16 @@ struct btrfs_raid_bio {
>>  	/* second bad stripe (for raid6 use) */
>>  	int failb;
>>
>> +	/*
>> +	 * For steal_rbio, we can steal recovered correct page,
>> +	 * but in finish_parity_scrub(), we still use bad on-disk
>> +	 * page to calculate parity.
>> +	 * Use these members to info finish_parity_scrub() to use
>> +	 * correct pages
>> +	 */
>> +	int bad_ondisk_a;
>> +	int bad_ondisk_b;
>> +
>>  	int scrubp;
>>  	/*
>>  	 * number of pages needed to represent the full
>> @@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
>>  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
>>  		return;
>>
>> +	/* Record recovered stripe number */
>> +	if (src->faila != -1)
>> +		dest->bad_ondisk_a = src->faila;
>> +	if (src->failb != -1)
>> +		dest->bad_ondisk_b = src->failb;
>> +
>>  	for (i = 0; i < dest->nr_pages; i++) {
>>  		s = src->stripe_pages[i];
>>  		if (!s || !PageUptodate(s)) {
>> @@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
>>  	rbio->stripe_npages = stripe_npages;
>>  	rbio->faila = -1;
>>  	rbio->failb = -1;
>> +	rbio->bad_ondisk_a = -1;
>> +	rbio->bad_ondisk_b = -1;
>>  	atomic_set(&rbio->refs, 1);
>>  	atomic_set(&rbio->error, 0);
>>  	atomic_set(&rbio->stripes_pending, 0);
>> @@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>  	int bit;
>>  	int index;
>>  	struct page *page;
>> +	struct page *bio_page;
>> +	void *ptr;
>> +	void *bio_ptr;
>>
>>  	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
>>  		for (i = 0; i < rbio->real_stripes; i++) {
>> @@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>  			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
>>  			if (!page)
>>  				return -ENOMEM;
>> +
>> +			/*
>> +			 * It's possible that only several pages need recover,
>> +			 * and rest are all good.
>> +			 * In that case we need to copy good bio pages into
>> +			 * stripe_pages[], as we will use stripe_pages[] other
>> +			 * than bio_pages[] in finish_parity_scrub().
>> +			 */
>> +			bio_page = page_in_rbio(rbio, i, bit, 0);
>> +			if ((i == rbio->bad_ondisk_a ||
>> +			     i == rbio->bad_ondisk_b) && bio_page) {
>> +				ptr = kmap(page);
>> +				bio_ptr = kmap(bio_page);
>> +				memcpy(ptr, bio_ptr, PAGE_SIZE);
>> +				kunmap(bio_page);
>> +				kunmap(page);
>> +			}
>>  			rbio->stripe_pages[index] = page;
>>  		}
>>  	}
>> @@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  {
>>  	struct btrfs_bio *bbio = rbio->bbio;
>>  	void *pointers[rbio->real_stripes];
>> +	struct page *mapped_pages[rbio->real_stripes];
>>  	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
>>  	int nr_data = rbio->nr_data;
>>  	int stripe;
>> @@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  		void *parity;
>>  		/* first collect one page from each data stripe */
>>  		for (stripe = 0; stripe < nr_data; stripe++) {
>> -			p = page_in_rbio(rbio, stripe, pagenr, 0);
>> +
>> +			/*
>> +			 * Use stolen recovered page other than bad
>> +			 * on disk pages
>> +			 */
>> +			if (stripe == rbio->bad_ondisk_a ||
>> +			    stripe == rbio->bad_ondisk_b)
>> +				p = rbio_stripe_page(rbio, stripe, pagenr);
>> +			else
>> +				p = page_in_rbio(rbio, stripe, pagenr, 0);
>>  			pointers[stripe] = kmap(p);
>> +			mapped_pages[stripe] = p;
>>  		}
>>
>>  		/* then add the parity stripe */
>> -		pointers[stripe++] = kmap(p_page);
>> +		pointers[stripe] = kmap(p_page);
>> +		mapped_pages[stripe] = p_page;
>> +		stripe++;
>>
>>  		if (q_stripe != -1) {
>>
>> @@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  			 * raid6, add the qstripe and call the
>>  			 * library function to fill in our p/q
>>  			 */
>> -			pointers[stripe++] = kmap(q_page);
>> +			pointers[stripe] = kmap(q_page);
>> +			mapped_pages[stripe] = q_page;
>> +			stripe++;
>>
>>  			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
>>  						pointers);
>> @@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>  			bitmap_clear(rbio->dbitmap, pagenr, 1);
>>  		kunmap(p);
>>
>> +		/* Free mapped pages */
>>  		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
>> -			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
>> +			kunmap(mapped_pages[stripe]);
>>  	}
>>
>>  	__free_page(p_page);
>> --
>> 2.11.0
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-03-17  6:31     ` Qu Wenruo
@ 2017-03-17 22:19       ` Liu Bo
  2017-03-20  4:33         ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-17 22:19 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Mar 17, 2017 at 02:31:08PM +0800, Qu Wenruo wrote:
> 
> 
> At 03/16/2017 01:36 PM, Liu Bo wrote:
> > On Fri, Feb 03, 2017 at 04:20:21PM +0800, Qu Wenruo wrote:
> > > In the following situation, scrub will calculate wrong parity to
> > > overwrite correct one:
> > > 
> > > RAID5 full stripe:
> > > 
> > > Before
> > > |     Dev 1      |     Dev  2     |     Dev 3     |
> > > | Data stripe 1  | Data stripe 2  | Parity Stripe |
> > > --------------------------------------------------- 0
> > > | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
> > > --------------------------------------------------- 4K
> > > |     0xcdcd     |     0xcdcd     |     0x0000    |
> > > ...
> > > |     0xcdcd     |     0xcdcd     |     0x0000    |
> > > --------------------------------------------------- 64K
> > > 
> > > After scrubbing dev3 only:
> > > 
> > > |     Dev 1      |     Dev  2     |     Dev 3     |
> > > | Data stripe 1  | Data stripe 2  | Parity Stripe |
> > > --------------------------------------------------- 0
> > > | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
> > > --------------------------------------------------- 4K
> > > |     0xcdcd     |     0xcdcd     |     0x0000    |
> > > ...
> > > |     0xcdcd     |     0xcdcd     |     0x0000    |
> > > --------------------------------------------------- 64K
> > > 
> > > The calltrace of such corruption is as following:
> > > 
> > > scrub_bio_end_io_worker() get called for each extent read out
> > > |- scriub_block_complete()
> > >    |- Data extent csum mismatch
> > >    |- scrub_handle_errored_block
> > >       |- scrub_recheck_block()
> > >          |- scrub_submit_raid56_bio_wait()
> > >             |- raid56_parity_recover()
> > > 
> > > Now we have a rbio with correct data stripe 1 recovered.
> > > Let's call it "good_rbio".
> > > 
> > > scrub_parity_check_and_repair()
> > > |- raid56_parity_submit_scrub_rbio()
> > >    |- lock_stripe_add()
> > >    |  |- steal_rbio()
> > >    |     |- Recovered data are steal from "good_rbio", stored into
> > >    |        rbio->stripe_pages[]
> > >    |        Now rbio->bio_pages[] are bad data read from disk.
> > 
> > At this point, we should have already known that whether
> > rbio->bio_pages are corrupted because rbio->bio_pages are indexed from
> > the list sparity->pages, and we only do scrub_parity_put after
> > finishing the endio of reading all pages linked at sparity->pages.
> > 
> > Since the previous checksuming failure has made a recovery and we
> > got the correct data on that rbio, instead of adding this corrupted
> > page into the new rbio, it'd be fine to skip it and we use all
> > rbio->stripe_pages which can be stolen from the previous good rbio.
> 
> Unfortunately, the rbio->bio_pages[] are pages stored in rbio->bio_list.
> 
> While we don't have good function to remove page from a bio, nor the
> function to remove a bio from a bio_list (although it's quite easy to
> implement), it will be quite a mess to do it in steal_rbio() or in btrfs.
>

That's not what I was suggesting.

This bug is about the case that we have corrupted data on one of the stripes,
not corrupted parity.  So raid56 is a little bit different with raid1, for raid1
we're good to go after writing back good copy to the position that has bad copy,
while for raid56 we need to check/repair the parity after writing back good
copy, however, the pages listed at sparity->pages have the original bad copy we
read from the disk, not the good copy from re-checking all mirrors, the parity
then ends up being corrupted by taking these bad copy's pages to do the xor.

If the raid56 read rebuild was successful after scrub_handle_errored_block(),
then we got all good data across this stripe, and they're cached in
rbio->stripe_pages for any later read/write to steal from.  So the bug can be
fixed by not adding these bad copy's pages into the new rbio.

There is another case that we have a broken parity, then if all data copy are
good, we're good to go directly check and repair anything wrong on parity
without adding pages listed at sparity_pages to the new rbio, if there is some
bad data, then we're not able to fix up the data or parity so eventually we get
an unrecoverable error, meanwhile both ebitmap and dbitmap got updated
respectively so that it won't update that horizonal stripe on disk.

In a word, by the time we go check and repair parity, the raid56 stripe on disk
should have correct data, there is no need to put pages into rbio.

So my point of view is to remove the code of adding pages at sparity->pages to
the rbio which is with operation BTRFS_RBIO_PARITY_SCRUB.

Thanks,

-liubo

> That's why I use new bad_on_disk_a/b other than directly stealing pages from
> rbio->bio_pages[].
> 
> Thanks,
> Qu
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > >    |- async_scrub_parity()
> > >       |- scrub_parity_work() (delayed_call to scrub_parity_work)
> > > 
> > > scrub_parity_work()
> > > |- raid56_parity_scrub_stripe()
> > >    |- validate_rbio_for_parity_scrub()
> > >       |- finish_parity_scrub()
> > >          |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
> > >             So good parity is overwritten with *BAD* one
> > > 
> > > The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
> > > btrfs_raid_bio, to info scrub code to use correct data pages to
> > > re-calculate parity.
> > > 
> > > Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
> > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > ---
> > >  fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
> > >  1 file changed, 58 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> > > index d2a9a1ee5361..453eefdcb591 100644
> > > --- a/fs/btrfs/raid56.c
> > > +++ b/fs/btrfs/raid56.c
> > > @@ -133,6 +133,16 @@ struct btrfs_raid_bio {
> > >  	/* second bad stripe (for raid6 use) */
> > >  	int failb;
> > > 
> > > +	/*
> > > +	 * For steal_rbio, we can steal recovered correct page,
> > > +	 * but in finish_parity_scrub(), we still use bad on-disk
> > > +	 * page to calculate parity.
> > > +	 * Use these members to info finish_parity_scrub() to use
> > > +	 * correct pages
> > > +	 */
> > > +	int bad_ondisk_a;
> > > +	int bad_ondisk_b;
> > > +
> > >  	int scrubp;
> > >  	/*
> > >  	 * number of pages needed to represent the full
> > > @@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
> > >  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
> > >  		return;
> > > 
> > > +	/* Record recovered stripe number */
> > > +	if (src->faila != -1)
> > > +		dest->bad_ondisk_a = src->faila;
> > > +	if (src->failb != -1)
> > > +		dest->bad_ondisk_b = src->failb;
> > > +
> > >  	for (i = 0; i < dest->nr_pages; i++) {
> > >  		s = src->stripe_pages[i];
> > >  		if (!s || !PageUptodate(s)) {
> > > @@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
> > >  	rbio->stripe_npages = stripe_npages;
> > >  	rbio->faila = -1;
> > >  	rbio->failb = -1;
> > > +	rbio->bad_ondisk_a = -1;
> > > +	rbio->bad_ondisk_b = -1;
> > >  	atomic_set(&rbio->refs, 1);
> > >  	atomic_set(&rbio->error, 0);
> > >  	atomic_set(&rbio->stripes_pending, 0);
> > > @@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
> > >  	int bit;
> > >  	int index;
> > >  	struct page *page;
> > > +	struct page *bio_page;
> > > +	void *ptr;
> > > +	void *bio_ptr;
> > > 
> > >  	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
> > >  		for (i = 0; i < rbio->real_stripes; i++) {
> > > @@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
> > >  			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
> > >  			if (!page)
> > >  				return -ENOMEM;
> > > +
> > > +			/*
> > > +			 * It's possible that only several pages need recover,
> > > +			 * and rest are all good.
> > > +			 * In that case we need to copy good bio pages into
> > > +			 * stripe_pages[], as we will use stripe_pages[] other
> > > +			 * than bio_pages[] in finish_parity_scrub().
> > > +			 */
> > > +			bio_page = page_in_rbio(rbio, i, bit, 0);
> > > +			if ((i == rbio->bad_ondisk_a ||
> > > +			     i == rbio->bad_ondisk_b) && bio_page) {
> > > +				ptr = kmap(page);
> > > +				bio_ptr = kmap(bio_page);
> > > +				memcpy(ptr, bio_ptr, PAGE_SIZE);
> > > +				kunmap(bio_page);
> > > +				kunmap(page);
> > > +			}
> > >  			rbio->stripe_pages[index] = page;
> > >  		}
> > >  	}
> > > @@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
> > >  {
> > >  	struct btrfs_bio *bbio = rbio->bbio;
> > >  	void *pointers[rbio->real_stripes];
> > > +	struct page *mapped_pages[rbio->real_stripes];
> > >  	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
> > >  	int nr_data = rbio->nr_data;
> > >  	int stripe;
> > > @@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
> > >  		void *parity;
> > >  		/* first collect one page from each data stripe */
> > >  		for (stripe = 0; stripe < nr_data; stripe++) {
> > > -			p = page_in_rbio(rbio, stripe, pagenr, 0);
> > > +
> > > +			/*
> > > +			 * Use stolen recovered page other than bad
> > > +			 * on disk pages
> > > +			 */
> > > +			if (stripe == rbio->bad_ondisk_a ||
> > > +			    stripe == rbio->bad_ondisk_b)
> > > +				p = rbio_stripe_page(rbio, stripe, pagenr);
> > > +			else
> > > +				p = page_in_rbio(rbio, stripe, pagenr, 0);
> > >  			pointers[stripe] = kmap(p);
> > > +			mapped_pages[stripe] = p;
> > >  		}
> > > 
> > >  		/* then add the parity stripe */
> > > -		pointers[stripe++] = kmap(p_page);
> > > +		pointers[stripe] = kmap(p_page);
> > > +		mapped_pages[stripe] = p_page;
> > > +		stripe++;
> > > 
> > >  		if (q_stripe != -1) {
> > > 
> > > @@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
> > >  			 * raid6, add the qstripe and call the
> > >  			 * library function to fill in our p/q
> > >  			 */
> > > -			pointers[stripe++] = kmap(q_page);
> > > +			pointers[stripe] = kmap(q_page);
> > > +			mapped_pages[stripe] = q_page;
> > > +			stripe++;
> > > 
> > >  			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
> > >  						pointers);
> > > @@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
> > >  			bitmap_clear(rbio->dbitmap, pagenr, 1);
> > >  		kunmap(p);
> > > 
> > > +		/* Free mapped pages */
> > >  		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
> > > -			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
> > > +			kunmap(mapped_pages[stripe]);
> > >  	}
> > > 
> > >  	__free_page(p_page);
> > > --
> > > 2.11.0
> > > 
> > > 
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-17  5:28     ` Qu Wenruo
@ 2017-03-18  2:03       ` Liu Bo
  2017-03-20  6:21         ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-18  2:03 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
> 
> 
> At 03/17/2017 12:44 PM, Liu Bo wrote:
> > On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
> > > Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
> > > done.
> > > This may save some time allocating rbio, but it can cause deadly
> > > use-after-free bug, for the following case:
> > > 
> > > Original fs: 4 devices RAID5
> > > 
> > >        Process A                 |          Process B
> > > --------------------------------------------------------------------------
> > >                                  |  Start device replace
> > >                                  |    Now the fs has 5 devices
> > >                                  |    devid 0: replace device
> > >                                  |    devid 1~4: old devices
> > > btrfs_map_bio()                  |
> > > |- __btrfs_map_block()           |
> > > |    bbio has 5 stripes          |
> > > |    including devid 0           |
> > > |- raid56_parity_write()         |
> > >                                  |
> > > raid_write_end_io()              |
> > > |- rbio_orig_end_io()            |
> > >    |- unlock_stripe()            |
> > >        Keeps the old rbio for    |
> > >        later steal, which has    |
> > >        stripe for devid 0        |
> > >                                  |  Cancel device replace
> > >                                  |    Now the fs has 4 devices
> > >                                  |    devid 0 is freed
> > > Some IO happens                  |
> > > raid_write_end_io()              |
> > > |- rbio_orig_end_io()            |
> > >    |- unlock_stripe()            |
> > >       |- steal_rbio()            |
> > >            Use old rbio, whose   |
> > >            bbio has freed devid 0|
> > >            stripe                |
> > > Any access to rbio->bbio will    |
> > > cause general protection or NULL |
> > > pointer dereference              |
> > > 
> > > Such bug can already be triggered by fstests btrfs/069 for RAID5/6
> > > profiles.
> > > 
> > > Fix it by not keeping old rbio in unlock_stripe(), so we just free the
> > > finished rbio and rbio->bbio, so above problem wont' happen.
> > > 
> > 
> > I don't think this is acceptable, keeping a cache is important for
> > raid56 write performance, could you please fix it by checking if the
> > device is missing?
> 
> Not possible, as it's keeping the btrfs_device pointer and never release it,
> the stolen rbio can be kept forever until umount.
>

steal_rbio() only takes pages from rbio->stripe_pages, so the cached
rbio->bbio is not going to the next IO's rbio because the cached one
got freed immediately in steal_rbio(), where could we dereference
rbio->bbio?

Thanks,

-liubo

> And I think the logical is very strange, if RAID5/6 is unstable, there is no
> meaning to keep it fast.
> 
> Keep it stable first, and then consider the performance.
> 
> Thanks,
> Qu
> 
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > ---
> > >  fs/btrfs/raid56.c | 18 +-----------------
> > >  1 file changed, 1 insertion(+), 17 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> > > index 453eefdcb591..aba82b95ec73 100644
> > > --- a/fs/btrfs/raid56.c
> > > +++ b/fs/btrfs/raid56.c
> > > @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > >  	int bucket;
> > >  	struct btrfs_stripe_hash *h;
> > >  	unsigned long flags;
> > > -	int keep_cache = 0;
> > > 
> > >  	bucket = rbio_bucket(rbio);
> > >  	h = rbio->fs_info->stripe_hash_table->table + bucket;
> > > @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > >  	spin_lock(&rbio->bio_list_lock);
> > > 
> > >  	if (!list_empty(&rbio->hash_list)) {
> > > -		/*
> > > -		 * if we're still cached and there is no other IO
> > > -		 * to perform, just leave this rbio here for others
> > > -		 * to steal from later
> > > -		 */
> > > -		if (list_empty(&rbio->plug_list) &&
> > > -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
> > > -			keep_cache = 1;
> > > -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
> > > -			BUG_ON(!bio_list_empty(&rbio->bio_list));
> > > -			goto done;
> > > -		}
> > > -
> > >  		list_del_init(&rbio->hash_list);
> > >  		atomic_dec(&rbio->refs);
> > > 
> > > @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > >  			goto done_nolock;
> > >  		}
> > >  	}
> > > -done:
> > >  	spin_unlock(&rbio->bio_list_lock);
> > >  	spin_unlock_irqrestore(&h->lock, flags);
> > > 
> > >  done_nolock:
> > > -	if (!keep_cache)
> > > -		remove_rbio_from_cache(rbio);
> > > +	remove_rbio_from_cache(rbio);
> > >  }
> > > 
> > >  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
> > > --
> > > 2.11.0
> > > 
> > > 
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled
  2017-02-03  8:20 ` [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled Qu Wenruo
@ 2017-03-18  2:12   ` Liu Bo
  2017-03-20  6:30     ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-18  2:12 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Fri, Feb 03, 2017 at 04:20:23PM +0800, Qu Wenruo wrote:
> When dev-replace and scrub are run at the same time, dev-replace can be
> canceled by scrub. It's quite common for btrfs/069.
> 
> The backtrace would be like:
> general protection fault: 0000 [#1] SMP
> Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
> RIP: 0010:[<ffffffff813a2fa8>]  [<ffffffff813a2fa8>] generic_make_request_checks+0x198/0x5a0
> Call Trace:
>  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
>  [<ffffffff813a4c54>] generic_make_request+0x24/0x290
>  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
>  [<ffffffff813a4f2e>] submit_bio+0x6e/0x120
>  [<ffffffffa087279d>] ? page_in_rbio+0x4d/0x80 [btrfs]
>  [<ffffffffa08737d0>] ? rbio_orig_end_io+0x80/0x80 [btrfs]
>  [<ffffffffa0873e31>] finish_rmw+0x401/0x550 [btrfs]
>  [<ffffffffa0874fc6>] validate_rbio_for_rmw+0x36/0x40 [btrfs]
>  [<ffffffffa087504d>] raid_rmw_end_io+0x7d/0x90 [btrfs]
>  [<ffffffff8139c536>] bio_endio+0x56/0x60
>  [<ffffffffa07e6e5c>] end_workqueue_fn+0x3c/0x40 [btrfs]
>  [<ffffffffa08285bf>] btrfs_scrubparity_helper+0xef/0x610 [btrfs]
>  [<ffffffffa0828b9e>] btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
>  [<ffffffff810ec8df>] process_one_work+0x2af/0x720
>  [<ffffffff810ec85b>] ? process_one_work+0x22b/0x720
>  [<ffffffff810ecd9b>] worker_thread+0x4b/0x4f0
>  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
>  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
>  [<ffffffff810f39d3>] kthread+0xf3/0x110
>  [<ffffffff810f38e0>] ? kthread_park+0x60/0x60
>  [<ffffffff81857647>] ret_from_fork+0x27/0x40
> 
> While in that case, target device can be destroyed at cancel time,
> leading to a user-after-free bug:
> 
>      Process A (dev-replace)         |         Process B(scrub)
> ----------------------------------------------------------------------
>                                      |(Any RW is OK)
>                                      |scrub_setup_recheck_block()
>                                      ||- btrfs_map_sblock()
>                                      |   Got a bbio with tgtdev
> btrfs_dev_replace_finishing()        |
> |- btrfs_destory_dev_replace_tgtdev()|
>    |- call_rcu(free_device)          |
>       |- __free_device()             |
>          |- kfree(device)            |
>                                      | Scrub worker:
>                                      | Access bbio->stripes[], which
>                                      | contains tgtdev.
>                                      | This triggers general protection.
>

We already have bio_counter to block device replace to avoid
use-after-free problem, is there any particular reason of not using
it?

Thanks,

-liubo

> The bug is mostly obvious for RAID5/6 since raid56 choose to keep old
> rbio and rbio->bbio for later steal, this hugely enlarged the race
> window and makes it much easier to trigger the bug.
> 
> This patch introduces 'tgtdev_refs' and 'tgtdev_wait' for btrfs_device
> to wait for all its user released the target device.
> 
> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> ---
>  fs/btrfs/dev-replace.c |  7 ++++++-
>  fs/btrfs/volumes.c     | 36 +++++++++++++++++++++++++++++++++++-
>  fs/btrfs/volumes.h     | 10 ++++++++++
>  3 files changed, 51 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index 5de280b9ad73..794a6a0bedf2 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -558,7 +558,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
>  			  rcu_str_deref(src_device->name),
>  			  src_device->devid,
>  			  rcu_str_deref(tgt_device->name));
> -	tgt_device->is_tgtdev_for_dev_replace = 0;
>  	tgt_device->devid = src_device->devid;
>  	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
>  	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
> @@ -579,6 +578,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
>  
>  	btrfs_dev_replace_unlock(dev_replace, 1);
>  
> +	/*
> +	 * Only change is_tgtdev_for_dev_replace flag after all its
> +	 * users get released.
> +	 */
> +	wait_target_device(tgt_device);
> +	tgt_device->is_tgtdev_for_dev_replace = 0;
>  	btrfs_rm_dev_replace_blocked(fs_info);
>  
>  	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 3c3c69c0eee4..84db9fb22b7d 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2064,6 +2064,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>  	WARN_ON(!tgtdev);
>  	mutex_lock(&fs_info->fs_devices->device_list_mutex);
>  
> +	wait_target_device(tgtdev);
>  	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
>  
>  	if (tgtdev->bdev)
> @@ -2598,6 +2599,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>  	device->is_tgtdev_for_dev_replace = 1;
>  	device->mode = FMODE_EXCL;
>  	device->dev_stats_valid = 1;
> +	atomic_set(&device->tgtdev_refs, 0);
> +	init_waitqueue_head(&device->tgtdev_wait);
>  	set_blocksize(device->bdev, 4096);
>  	device->fs_devices = fs_info->fs_devices;
>  	list_add(&device->dev_list, &fs_info->fs_devices->devices);
> @@ -2624,6 +2627,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
>  	tgtdev->sector_size = sectorsize;
>  	tgtdev->fs_info = fs_info;
>  	tgtdev->in_fs_metadata = 1;
> +	atomic_set(&tgtdev->tgtdev_refs, 0);
> +	init_waitqueue_head(&tgtdev->tgtdev_wait);
>  }
>  
>  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
> @@ -5302,6 +5307,32 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
>  	return bbio;
>  }
>  
> +static void pin_bbio_target_device(struct btrfs_bio *bbio)
> +{
> +	int i;
> +
> +	for (i = 0; i < bbio->num_stripes; i++) {
> +		struct btrfs_device *device = bbio->stripes[i].dev;
> +
> +		if (device->is_tgtdev_for_dev_replace)
> +			atomic_inc(&device->tgtdev_refs);
> +	}
> +}
> +
> +static void unpin_bbio_target_device(struct btrfs_bio *bbio)
> +{
> +	int i;
> +
> +	for (i = 0; i < bbio->num_stripes; i++) {
> +		struct btrfs_device *device = bbio->stripes[i].dev;
> +
> +		if (device->is_tgtdev_for_dev_replace) {
> +			atomic_dec(&device->tgtdev_refs);
> +			wake_up(&device->tgtdev_wait);
> +		}
> +	}
> +}
> +
>  void btrfs_get_bbio(struct btrfs_bio *bbio)
>  {
>  	WARN_ON(!atomic_read(&bbio->refs));
> @@ -5312,8 +5343,10 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
>  {
>  	if (!bbio)
>  		return;
> -	if (atomic_dec_and_test(&bbio->refs))
> +	if (atomic_dec_and_test(&bbio->refs)) {
> +		unpin_bbio_target_device(bbio);
>  		kfree(bbio);
> +	}
>  }
>  
>  static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
> @@ -5868,6 +5901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
>  		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
>  		bbio->mirror_num = map->num_stripes + 1;
>  	}
> +	pin_bbio_target_device(bbio);
>  out:
>  	if (dev_replace_is_ongoing) {
>  		btrfs_dev_replace_clear_lock_blocking(dev_replace);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 24ba6bc3ec34..702361182597 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -149,6 +149,10 @@ struct btrfs_device {
>  	/* Counter to record the change of device stats */
>  	atomic_t dev_stats_ccnt;
>  	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
> +
> +	/* To ensure we wait this target device before destroying it */
> +	atomic_t tgtdev_refs;
> +	wait_queue_head_t tgtdev_wait;
>  };
>  
>  /*
> @@ -538,4 +542,10 @@ struct list_head *btrfs_get_fs_uuids(void);
>  void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
>  void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
>  
> +static inline void wait_target_device(struct btrfs_device *tgtdev)
> +{
> +	if (!tgtdev || !tgtdev->is_tgtdev_for_dev_replace)
> +		return;
> +	wait_event(tgtdev->tgtdev_wait, atomic_read(&tgtdev->tgtdev_refs) == 0);
> +}
>  #endif
> -- 
> 2.11.0
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q
  2017-03-17 22:19       ` Liu Bo
@ 2017-03-20  4:33         ` Qu Wenruo
  0 siblings, 0 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-03-20  4:33 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/18/2017 06:19 AM, Liu Bo wrote:
> On Fri, Mar 17, 2017 at 02:31:08PM +0800, Qu Wenruo wrote:
>>
>>
>> At 03/16/2017 01:36 PM, Liu Bo wrote:
>>> On Fri, Feb 03, 2017 at 04:20:21PM +0800, Qu Wenruo wrote:
>>>> In the following situation, scrub will calculate wrong parity to
>>>> overwrite correct one:
>>>>
>>>> RAID5 full stripe:
>>>>
>>>> Before
>>>> |     Dev 1      |     Dev  2     |     Dev 3     |
>>>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>>>> --------------------------------------------------- 0
>>>> | 0x0000 (Bad)   |     0xcdcd     |     0x0000    |
>>>> --------------------------------------------------- 4K
>>>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>>>> ...
>>>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>>>> --------------------------------------------------- 64K
>>>>
>>>> After scrubbing dev3 only:
>>>>
>>>> |     Dev 1      |     Dev  2     |     Dev 3     |
>>>> | Data stripe 1  | Data stripe 2  | Parity Stripe |
>>>> --------------------------------------------------- 0
>>>> | 0xcdcd (Good)  |     0xcdcd     | 0xcdcd (Bad)  |
>>>> --------------------------------------------------- 4K
>>>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>>>> ...
>>>> |     0xcdcd     |     0xcdcd     |     0x0000    |
>>>> --------------------------------------------------- 64K
>>>>
>>>> The calltrace of such corruption is as following:
>>>>
>>>> scrub_bio_end_io_worker() get called for each extent read out
>>>> |- scriub_block_complete()
>>>>    |- Data extent csum mismatch
>>>>    |- scrub_handle_errored_block
>>>>       |- scrub_recheck_block()
>>>>          |- scrub_submit_raid56_bio_wait()
>>>>             |- raid56_parity_recover()
>>>>
>>>> Now we have a rbio with correct data stripe 1 recovered.
>>>> Let's call it "good_rbio".
>>>>
>>>> scrub_parity_check_and_repair()
>>>> |- raid56_parity_submit_scrub_rbio()
>>>>    |- lock_stripe_add()
>>>>    |  |- steal_rbio()
>>>>    |     |- Recovered data are steal from "good_rbio", stored into
>>>>    |        rbio->stripe_pages[]
>>>>    |        Now rbio->bio_pages[] are bad data read from disk.
>>>
>>> At this point, we should have already known that whether
>>> rbio->bio_pages are corrupted because rbio->bio_pages are indexed from
>>> the list sparity->pages, and we only do scrub_parity_put after
>>> finishing the endio of reading all pages linked at sparity->pages.
>>>
>>> Since the previous checksuming failure has made a recovery and we
>>> got the correct data on that rbio, instead of adding this corrupted
>>> page into the new rbio, it'd be fine to skip it and we use all
>>> rbio->stripe_pages which can be stolen from the previous good rbio.
>>
>> Unfortunately, the rbio->bio_pages[] are pages stored in rbio->bio_list.
>>
>> While we don't have good function to remove page from a bio, nor the
>> function to remove a bio from a bio_list (although it's quite easy to
>> implement), it will be quite a mess to do it in steal_rbio() or in btrfs.
>>
>
> That's not what I was suggesting.
>
> This bug is about the case that we have corrupted data on one of the stripes,
> not corrupted parity.  So raid56 is a little bit different with raid1, for raid1
> we're good to go after writing back good copy to the position that has bad copy,
> while for raid56 we need to check/repair the parity after writing back good
> copy, however, the pages listed at sparity->pages have the original bad copy we
> read from the disk, not the good copy from re-checking all mirrors, the parity
> then ends up being corrupted by taking these bad copy's pages to do the xor.
>
> If the raid56 read rebuild was successful after scrub_handle_errored_block(),
> then we got all good data across this stripe, and they're cached in
> rbio->stripe_pages for any later read/write to steal from.  So the bug can be
> fixed by not adding these bad copy's pages into the new rbio.
>
> There is another case that we have a broken parity, then if all data copy are
> good, we're good to go directly check and repair anything wrong on parity
> without adding pages listed at sparity_pages to the new rbio, if there is some
> bad data, then we're not able to fix up the data or parity so eventually we get
> an unrecoverable error, meanwhile both ebitmap and dbitmap got updated
> respectively so that it won't update that horizonal stripe on disk.
>
> In a word, by the time we go check and repair parity, the raid56 stripe on disk
> should have correct data, there is no need to put pages into rbio.
>
> So my point of view is to remove the code of adding pages at sparity->pages to
> the rbio which is with operation BTRFS_RBIO_PARITY_SCRUB.

Now I understand.

Indeed at finish_parity_scrub() time, rbio->stripes_pages[] are 
recovered correct pages.

It's the added pages causing the bug.

I'll use this method in next update.

Thanks,
Qu
>
> Thanks,
>
> -liubo
>
>> That's why I use new bad_on_disk_a/b other than directly stealing pages from
>> rbio->bio_pages[].
>>
>> Thanks,
>> Qu
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>>    |- async_scrub_parity()
>>>>       |- scrub_parity_work() (delayed_call to scrub_parity_work)
>>>>
>>>> scrub_parity_work()
>>>> |- raid56_parity_scrub_stripe()
>>>>    |- validate_rbio_for_parity_scrub()
>>>>       |- finish_parity_scrub()
>>>>          |- Recalculate parity using *BAD* pages in rbio->bio_pages[]
>>>>             So good parity is overwritten with *BAD* one
>>>>
>>>> The fix is to introduce 2 new members, bad_ondisk_a/b, to struct
>>>> btrfs_raid_bio, to info scrub code to use correct data pages to
>>>> re-calculate parity.
>>>>
>>>> Reported-by: Goffredo Baroncelli <kreijack@inwind.it>
>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> ---
>>>>  fs/btrfs/raid56.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>  1 file changed, 58 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>>>> index d2a9a1ee5361..453eefdcb591 100644
>>>> --- a/fs/btrfs/raid56.c
>>>> +++ b/fs/btrfs/raid56.c
>>>> @@ -133,6 +133,16 @@ struct btrfs_raid_bio {
>>>>  	/* second bad stripe (for raid6 use) */
>>>>  	int failb;
>>>>
>>>> +	/*
>>>> +	 * For steal_rbio, we can steal recovered correct page,
>>>> +	 * but in finish_parity_scrub(), we still use bad on-disk
>>>> +	 * page to calculate parity.
>>>> +	 * Use these members to info finish_parity_scrub() to use
>>>> +	 * correct pages
>>>> +	 */
>>>> +	int bad_ondisk_a;
>>>> +	int bad_ondisk_b;
>>>> +
>>>>  	int scrubp;
>>>>  	/*
>>>>  	 * number of pages needed to represent the full
>>>> @@ -310,6 +320,12 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
>>>>  	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
>>>>  		return;
>>>>
>>>> +	/* Record recovered stripe number */
>>>> +	if (src->faila != -1)
>>>> +		dest->bad_ondisk_a = src->faila;
>>>> +	if (src->failb != -1)
>>>> +		dest->bad_ondisk_b = src->failb;
>>>> +
>>>>  	for (i = 0; i < dest->nr_pages; i++) {
>>>>  		s = src->stripe_pages[i];
>>>>  		if (!s || !PageUptodate(s)) {
>>>> @@ -999,6 +1015,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
>>>>  	rbio->stripe_npages = stripe_npages;
>>>>  	rbio->faila = -1;
>>>>  	rbio->failb = -1;
>>>> +	rbio->bad_ondisk_a = -1;
>>>> +	rbio->bad_ondisk_b = -1;
>>>>  	atomic_set(&rbio->refs, 1);
>>>>  	atomic_set(&rbio->error, 0);
>>>>  	atomic_set(&rbio->stripes_pending, 0);
>>>> @@ -2261,6 +2279,9 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>>>  	int bit;
>>>>  	int index;
>>>>  	struct page *page;
>>>> +	struct page *bio_page;
>>>> +	void *ptr;
>>>> +	void *bio_ptr;
>>>>
>>>>  	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
>>>>  		for (i = 0; i < rbio->real_stripes; i++) {
>>>> @@ -2271,6 +2292,23 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
>>>>  			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
>>>>  			if (!page)
>>>>  				return -ENOMEM;
>>>> +
>>>> +			/*
>>>> +			 * It's possible that only several pages need recover,
>>>> +			 * and rest are all good.
>>>> +			 * In that case we need to copy good bio pages into
>>>> +			 * stripe_pages[], as we will use stripe_pages[] other
>>>> +			 * than bio_pages[] in finish_parity_scrub().
>>>> +			 */
>>>> +			bio_page = page_in_rbio(rbio, i, bit, 0);
>>>> +			if ((i == rbio->bad_ondisk_a ||
>>>> +			     i == rbio->bad_ondisk_b) && bio_page) {
>>>> +				ptr = kmap(page);
>>>> +				bio_ptr = kmap(bio_page);
>>>> +				memcpy(ptr, bio_ptr, PAGE_SIZE);
>>>> +				kunmap(bio_page);
>>>> +				kunmap(page);
>>>> +			}
>>>>  			rbio->stripe_pages[index] = page;
>>>>  		}
>>>>  	}
>>>> @@ -2282,6 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>>>  {
>>>>  	struct btrfs_bio *bbio = rbio->bbio;
>>>>  	void *pointers[rbio->real_stripes];
>>>> +	struct page *mapped_pages[rbio->real_stripes];
>>>>  	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
>>>>  	int nr_data = rbio->nr_data;
>>>>  	int stripe;
>>>> @@ -2342,12 +2381,24 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>>>  		void *parity;
>>>>  		/* first collect one page from each data stripe */
>>>>  		for (stripe = 0; stripe < nr_data; stripe++) {
>>>> -			p = page_in_rbio(rbio, stripe, pagenr, 0);
>>>> +
>>>> +			/*
>>>> +			 * Use stolen recovered page other than bad
>>>> +			 * on disk pages
>>>> +			 */
>>>> +			if (stripe == rbio->bad_ondisk_a ||
>>>> +			    stripe == rbio->bad_ondisk_b)
>>>> +				p = rbio_stripe_page(rbio, stripe, pagenr);
>>>> +			else
>>>> +				p = page_in_rbio(rbio, stripe, pagenr, 0);
>>>>  			pointers[stripe] = kmap(p);
>>>> +			mapped_pages[stripe] = p;
>>>>  		}
>>>>
>>>>  		/* then add the parity stripe */
>>>> -		pointers[stripe++] = kmap(p_page);
>>>> +		pointers[stripe] = kmap(p_page);
>>>> +		mapped_pages[stripe] = p_page;
>>>> +		stripe++;
>>>>
>>>>  		if (q_stripe != -1) {
>>>>
>>>> @@ -2355,7 +2406,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>>>  			 * raid6, add the qstripe and call the
>>>>  			 * library function to fill in our p/q
>>>>  			 */
>>>> -			pointers[stripe++] = kmap(q_page);
>>>> +			pointers[stripe] = kmap(q_page);
>>>> +			mapped_pages[stripe] = q_page;
>>>> +			stripe++;
>>>>
>>>>  			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
>>>>  						pointers);
>>>> @@ -2375,8 +2428,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
>>>>  			bitmap_clear(rbio->dbitmap, pagenr, 1);
>>>>  		kunmap(p);
>>>>
>>>> +		/* Free mapped pages */
>>>>  		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
>>>> -			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
>>>> +			kunmap(mapped_pages[stripe]);
>>>>  	}
>>>>
>>>>  	__free_page(p_page);
>>>> --
>>>> 2.11.0
>>>>
>>>>
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>
>>
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-18  2:03       ` Liu Bo
@ 2017-03-20  6:21         ` Qu Wenruo
  2017-03-20 20:23           ` Liu Bo
  0 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-20  6:21 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/18/2017 10:03 AM, Liu Bo wrote:
> On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
>>
>>
>> At 03/17/2017 12:44 PM, Liu Bo wrote:
>>> On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
>>>> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
>>>> done.
>>>> This may save some time allocating rbio, but it can cause deadly
>>>> use-after-free bug, for the following case:
>>>>
>>>> Original fs: 4 devices RAID5
>>>>
>>>>        Process A                 |          Process B
>>>> --------------------------------------------------------------------------
>>>>                                  |  Start device replace
>>>>                                  |    Now the fs has 5 devices
>>>>                                  |    devid 0: replace device
>>>>                                  |    devid 1~4: old devices
>>>> btrfs_map_bio()                  |
>>>> |- __btrfs_map_block()           |
>>>> |    bbio has 5 stripes          |
>>>> |    including devid 0           |
>>>> |- raid56_parity_write()         |
>>>>                                  |
>>>> raid_write_end_io()              |
>>>> |- rbio_orig_end_io()            |
>>>>    |- unlock_stripe()            |
>>>>        Keeps the old rbio for    |
>>>>        later steal, which has    |
>>>>        stripe for devid 0        |
>>>>                                  |  Cancel device replace
>>>>                                  |    Now the fs has 4 devices
>>>>                                  |    devid 0 is freed
>>>> Some IO happens                  |
>>>> raid_write_end_io()              |
>>>> |- rbio_orig_end_io()            |
>>>>    |- unlock_stripe()            |
>>>>       |- steal_rbio()            |
>>>>            Use old rbio, whose   |
>>>>            bbio has freed devid 0|
>>>>            stripe                |
>>>> Any access to rbio->bbio will    |
>>>> cause general protection or NULL |
>>>> pointer dereference              |
>>>>
>>>> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
>>>> profiles.
>>>>
>>>> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
>>>> finished rbio and rbio->bbio, so above problem wont' happen.
>>>>
>>>
>>> I don't think this is acceptable, keeping a cache is important for
>>> raid56 write performance, could you please fix it by checking if the
>>> device is missing?
>>
>> Not possible, as it's keeping the btrfs_device pointer and never release it,
>> the stolen rbio can be kept forever until umount.
>>
>
> steal_rbio() only takes pages from rbio->stripe_pages, so the cached
> rbio->bbio is not going to the next IO's rbio because the cached one
> got freed immediately in steal_rbio(), where could we dereference
> rbio->bbio?

Did you mean in unlock_stripe(), we still keep the rbio in cache, while 
release its bbio?

This sounds quite good but I'm afraid it may cause more problems.

Quite a lot of places are accessing rbio->bbio either for their btrfs 
logical address or stripes or even stripe->dev.

Thanks,
Qu


>
> Thanks,
>
> -liubo
>
>> And I think the logical is very strange, if RAID5/6 is unstable, there is no
>> meaning to keep it fast.
>>
>> Keep it stable first, and then consider the performance.
>>
>> Thanks,
>> Qu
>>
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>> ---
>>>>  fs/btrfs/raid56.c | 18 +-----------------
>>>>  1 file changed, 1 insertion(+), 17 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>>>> index 453eefdcb591..aba82b95ec73 100644
>>>> --- a/fs/btrfs/raid56.c
>>>> +++ b/fs/btrfs/raid56.c
>>>> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>  	int bucket;
>>>>  	struct btrfs_stripe_hash *h;
>>>>  	unsigned long flags;
>>>> -	int keep_cache = 0;
>>>>
>>>>  	bucket = rbio_bucket(rbio);
>>>>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
>>>> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>  	spin_lock(&rbio->bio_list_lock);
>>>>
>>>>  	if (!list_empty(&rbio->hash_list)) {
>>>> -		/*
>>>> -		 * if we're still cached and there is no other IO
>>>> -		 * to perform, just leave this rbio here for others
>>>> -		 * to steal from later
>>>> -		 */
>>>> -		if (list_empty(&rbio->plug_list) &&
>>>> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
>>>> -			keep_cache = 1;
>>>> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
>>>> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
>>>> -			goto done;
>>>> -		}
>>>> -
>>>>  		list_del_init(&rbio->hash_list);
>>>>  		atomic_dec(&rbio->refs);
>>>>
>>>> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>  			goto done_nolock;
>>>>  		}
>>>>  	}
>>>> -done:
>>>>  	spin_unlock(&rbio->bio_list_lock);
>>>>  	spin_unlock_irqrestore(&h->lock, flags);
>>>>
>>>>  done_nolock:
>>>> -	if (!keep_cache)
>>>> -		remove_rbio_from_cache(rbio);
>>>> +	remove_rbio_from_cache(rbio);
>>>>  }
>>>>
>>>>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
>>>> --
>>>> 2.11.0
>>>>
>>>>
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>
>>
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled
  2017-03-18  2:12   ` Liu Bo
@ 2017-03-20  6:30     ` Qu Wenruo
  2017-03-20 19:31       ` Liu Bo
  0 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-20  6:30 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/18/2017 10:12 AM, Liu Bo wrote:
> On Fri, Feb 03, 2017 at 04:20:23PM +0800, Qu Wenruo wrote:
>> When dev-replace and scrub are run at the same time, dev-replace can be
>> canceled by scrub. It's quite common for btrfs/069.
>>
>> The backtrace would be like:
>> general protection fault: 0000 [#1] SMP
>> Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
>> RIP: 0010:[<ffffffff813a2fa8>]  [<ffffffff813a2fa8>] generic_make_request_checks+0x198/0x5a0
>> Call Trace:
>>  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
>>  [<ffffffff813a4c54>] generic_make_request+0x24/0x290
>>  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
>>  [<ffffffff813a4f2e>] submit_bio+0x6e/0x120
>>  [<ffffffffa087279d>] ? page_in_rbio+0x4d/0x80 [btrfs]
>>  [<ffffffffa08737d0>] ? rbio_orig_end_io+0x80/0x80 [btrfs]
>>  [<ffffffffa0873e31>] finish_rmw+0x401/0x550 [btrfs]
>>  [<ffffffffa0874fc6>] validate_rbio_for_rmw+0x36/0x40 [btrfs]
>>  [<ffffffffa087504d>] raid_rmw_end_io+0x7d/0x90 [btrfs]
>>  [<ffffffff8139c536>] bio_endio+0x56/0x60
>>  [<ffffffffa07e6e5c>] end_workqueue_fn+0x3c/0x40 [btrfs]
>>  [<ffffffffa08285bf>] btrfs_scrubparity_helper+0xef/0x610 [btrfs]
>>  [<ffffffffa0828b9e>] btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
>>  [<ffffffff810ec8df>] process_one_work+0x2af/0x720
>>  [<ffffffff810ec85b>] ? process_one_work+0x22b/0x720
>>  [<ffffffff810ecd9b>] worker_thread+0x4b/0x4f0
>>  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
>>  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
>>  [<ffffffff810f39d3>] kthread+0xf3/0x110
>>  [<ffffffff810f38e0>] ? kthread_park+0x60/0x60
>>  [<ffffffff81857647>] ret_from_fork+0x27/0x40
>>
>> While in that case, target device can be destroyed at cancel time,
>> leading to a user-after-free bug:
>>
>>      Process A (dev-replace)         |         Process B(scrub)
>> ----------------------------------------------------------------------
>>                                      |(Any RW is OK)
>>                                      |scrub_setup_recheck_block()
>>                                      ||- btrfs_map_sblock()
>>                                      |   Got a bbio with tgtdev
>> btrfs_dev_replace_finishing()        |
>> |- btrfs_destory_dev_replace_tgtdev()|
>>    |- call_rcu(free_device)          |
>>       |- __free_device()             |
>>          |- kfree(device)            |
>>                                      | Scrub worker:
>>                                      | Access bbio->stripes[], which
>>                                      | contains tgtdev.
>>                                      | This triggers general protection.
>>
>
> We already have bio_counter to block device replace to avoid
> use-after-free problem, is there any particular reason of not using
> it?

Thanks for pointing this out.

And I found raid56 is already using bio_counter().
But we still trigger such use-after-free bug because bio_counter only 
ensure we won't delete/replace a device when there is still bio running.

Here we are facing a difference situation, where we holds a pointer to 
btrfs_dev which can be freed since there is no pending bios for the 
whole fs.

So here I'm afraid we need refcount based solution for each btrfs_dev 
structure, other than global bio_counter.

Thanks,
Qu
>
> Thanks,
>
> -liubo
>
>> The bug is mostly obvious for RAID5/6 since raid56 choose to keep old
>> rbio and rbio->bbio for later steal, this hugely enlarged the race
>> window and makes it much easier to trigger the bug.
>>
>> This patch introduces 'tgtdev_refs' and 'tgtdev_wait' for btrfs_device
>> to wait for all its user released the target device.
>>
>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>> ---
>>  fs/btrfs/dev-replace.c |  7 ++++++-
>>  fs/btrfs/volumes.c     | 36 +++++++++++++++++++++++++++++++++++-
>>  fs/btrfs/volumes.h     | 10 ++++++++++
>>  3 files changed, 51 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index 5de280b9ad73..794a6a0bedf2 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -558,7 +558,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
>>  			  rcu_str_deref(src_device->name),
>>  			  src_device->devid,
>>  			  rcu_str_deref(tgt_device->name));
>> -	tgt_device->is_tgtdev_for_dev_replace = 0;
>>  	tgt_device->devid = src_device->devid;
>>  	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
>>  	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
>> @@ -579,6 +578,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
>>
>>  	btrfs_dev_replace_unlock(dev_replace, 1);
>>
>> +	/*
>> +	 * Only change is_tgtdev_for_dev_replace flag after all its
>> +	 * users get released.
>> +	 */
>> +	wait_target_device(tgt_device);
>> +	tgt_device->is_tgtdev_for_dev_replace = 0;
>>  	btrfs_rm_dev_replace_blocked(fs_info);
>>
>>  	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 3c3c69c0eee4..84db9fb22b7d 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2064,6 +2064,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>  	WARN_ON(!tgtdev);
>>  	mutex_lock(&fs_info->fs_devices->device_list_mutex);
>>
>> +	wait_target_device(tgtdev);
>>  	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
>>
>>  	if (tgtdev->bdev)
>> @@ -2598,6 +2599,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
>>  	device->is_tgtdev_for_dev_replace = 1;
>>  	device->mode = FMODE_EXCL;
>>  	device->dev_stats_valid = 1;
>> +	atomic_set(&device->tgtdev_refs, 0);
>> +	init_waitqueue_head(&device->tgtdev_wait);
>>  	set_blocksize(device->bdev, 4096);
>>  	device->fs_devices = fs_info->fs_devices;
>>  	list_add(&device->dev_list, &fs_info->fs_devices->devices);
>> @@ -2624,6 +2627,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
>>  	tgtdev->sector_size = sectorsize;
>>  	tgtdev->fs_info = fs_info;
>>  	tgtdev->in_fs_metadata = 1;
>> +	atomic_set(&tgtdev->tgtdev_refs, 0);
>> +	init_waitqueue_head(&tgtdev->tgtdev_wait);
>>  }
>>
>>  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
>> @@ -5302,6 +5307,32 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
>>  	return bbio;
>>  }
>>
>> +static void pin_bbio_target_device(struct btrfs_bio *bbio)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < bbio->num_stripes; i++) {
>> +		struct btrfs_device *device = bbio->stripes[i].dev;
>> +
>> +		if (device->is_tgtdev_for_dev_replace)
>> +			atomic_inc(&device->tgtdev_refs);
>> +	}
>> +}
>> +
>> +static void unpin_bbio_target_device(struct btrfs_bio *bbio)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < bbio->num_stripes; i++) {
>> +		struct btrfs_device *device = bbio->stripes[i].dev;
>> +
>> +		if (device->is_tgtdev_for_dev_replace) {
>> +			atomic_dec(&device->tgtdev_refs);
>> +			wake_up(&device->tgtdev_wait);
>> +		}
>> +	}
>> +}
>> +
>>  void btrfs_get_bbio(struct btrfs_bio *bbio)
>>  {
>>  	WARN_ON(!atomic_read(&bbio->refs));
>> @@ -5312,8 +5343,10 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
>>  {
>>  	if (!bbio)
>>  		return;
>> -	if (atomic_dec_and_test(&bbio->refs))
>> +	if (atomic_dec_and_test(&bbio->refs)) {
>> +		unpin_bbio_target_device(bbio);
>>  		kfree(bbio);
>> +	}
>>  }
>>
>>  static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
>> @@ -5868,6 +5901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
>>  		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
>>  		bbio->mirror_num = map->num_stripes + 1;
>>  	}
>> +	pin_bbio_target_device(bbio);
>>  out:
>>  	if (dev_replace_is_ongoing) {
>>  		btrfs_dev_replace_clear_lock_blocking(dev_replace);
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 24ba6bc3ec34..702361182597 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -149,6 +149,10 @@ struct btrfs_device {
>>  	/* Counter to record the change of device stats */
>>  	atomic_t dev_stats_ccnt;
>>  	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
>> +
>> +	/* To ensure we wait this target device before destroying it */
>> +	atomic_t tgtdev_refs;
>> +	wait_queue_head_t tgtdev_wait;
>>  };
>>
>>  /*
>> @@ -538,4 +542,10 @@ struct list_head *btrfs_get_fs_uuids(void);
>>  void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
>>  void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
>>
>> +static inline void wait_target_device(struct btrfs_device *tgtdev)
>> +{
>> +	if (!tgtdev || !tgtdev->is_tgtdev_for_dev_replace)
>> +		return;
>> +	wait_event(tgtdev->tgtdev_wait, atomic_read(&tgtdev->tgtdev_refs) == 0);
>> +}
>>  #endif
>> --
>> 2.11.0
>>
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled
  2017-03-20  6:30     ` Qu Wenruo
@ 2017-03-20 19:31       ` Liu Bo
  0 siblings, 0 replies; 27+ messages in thread
From: Liu Bo @ 2017-03-20 19:31 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Mon, Mar 20, 2017 at 02:30:48PM +0800, Qu Wenruo wrote:
> 
> 
> At 03/18/2017 10:12 AM, Liu Bo wrote:
> > On Fri, Feb 03, 2017 at 04:20:23PM +0800, Qu Wenruo wrote:
> > > When dev-replace and scrub are run at the same time, dev-replace can be
> > > canceled by scrub. It's quite common for btrfs/069.
> > > 
> > > The backtrace would be like:
> > > general protection fault: 0000 [#1] SMP
> > > Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
> > > RIP: 0010:[<ffffffff813a2fa8>]  [<ffffffff813a2fa8>] generic_make_request_checks+0x198/0x5a0
> > > Call Trace:
> > >  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
> > >  [<ffffffff813a4c54>] generic_make_request+0x24/0x290
> > >  [<ffffffff813a4cff>] ? generic_make_request+0xcf/0x290
> > >  [<ffffffff813a4f2e>] submit_bio+0x6e/0x120
> > >  [<ffffffffa087279d>] ? page_in_rbio+0x4d/0x80 [btrfs]
> > >  [<ffffffffa08737d0>] ? rbio_orig_end_io+0x80/0x80 [btrfs]
> > >  [<ffffffffa0873e31>] finish_rmw+0x401/0x550 [btrfs]
> > >  [<ffffffffa0874fc6>] validate_rbio_for_rmw+0x36/0x40 [btrfs]
> > >  [<ffffffffa087504d>] raid_rmw_end_io+0x7d/0x90 [btrfs]
> > >  [<ffffffff8139c536>] bio_endio+0x56/0x60
> > >  [<ffffffffa07e6e5c>] end_workqueue_fn+0x3c/0x40 [btrfs]
> > >  [<ffffffffa08285bf>] btrfs_scrubparity_helper+0xef/0x610 [btrfs]
> > >  [<ffffffffa0828b9e>] btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
> > >  [<ffffffff810ec8df>] process_one_work+0x2af/0x720
> > >  [<ffffffff810ec85b>] ? process_one_work+0x22b/0x720
> > >  [<ffffffff810ecd9b>] worker_thread+0x4b/0x4f0
> > >  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
> > >  [<ffffffff810ecd50>] ? process_one_work+0x720/0x720
> > >  [<ffffffff810f39d3>] kthread+0xf3/0x110
> > >  [<ffffffff810f38e0>] ? kthread_park+0x60/0x60
> > >  [<ffffffff81857647>] ret_from_fork+0x27/0x40
> > > 
> > > While in that case, target device can be destroyed at cancel time,
> > > leading to a user-after-free bug:
> > > 
> > >      Process A (dev-replace)         |         Process B(scrub)
> > > ----------------------------------------------------------------------
> > >                                      |(Any RW is OK)
> > >                                      |scrub_setup_recheck_block()
> > >                                      ||- btrfs_map_sblock()
> > >                                      |   Got a bbio with tgtdev
> > > btrfs_dev_replace_finishing()        |
> > > |- btrfs_destory_dev_replace_tgtdev()|
> > >    |- call_rcu(free_device)          |
> > >       |- __free_device()             |
> > >          |- kfree(device)            |
> > >                                      | Scrub worker:
> > >                                      | Access bbio->stripes[], which
> > >                                      | contains tgtdev.
> > >                                      | This triggers general protection.
> > > 
> > 
> > We already have bio_counter to block device replace to avoid
> > use-after-free problem, is there any particular reason of not using
> > it?
> 
> Thanks for pointing this out.
> 
> And I found raid56 is already using bio_counter().
> But we still trigger such use-after-free bug because bio_counter only ensure
> we won't delete/replace a device when there is still bio running.
> 
> Here we are facing a difference situation, where we holds a pointer to
> btrfs_dev which can be freed since there is no pending bios for the whole
> fs.
>

It's not just for in-flight bios, check btrfs_discard_extent(), anyone
who expects target devices to be alive can use bio_counter to avoid
the race against device replace.

You could inc/dec bio_counter along with scrub_recover as it takes a
pointer of bbio and maintains a reference.

Thanks,

-liubo

> So here I'm afraid we need refcount based solution for each btrfs_dev
> structure, other than global bio_counter.
> 
> Thanks,
> Qu
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > The bug is mostly obvious for RAID5/6 since raid56 choose to keep old
> > > rbio and rbio->bbio for later steal, this hugely enlarged the race
> > > window and makes it much easier to trigger the bug.
> > > 
> > > This patch introduces 'tgtdev_refs' and 'tgtdev_wait' for btrfs_device
> > > to wait for all its user released the target device.
> > > 
> > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > ---
> > >  fs/btrfs/dev-replace.c |  7 ++++++-
> > >  fs/btrfs/volumes.c     | 36 +++++++++++++++++++++++++++++++++++-
> > >  fs/btrfs/volumes.h     | 10 ++++++++++
> > >  3 files changed, 51 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> > > index 5de280b9ad73..794a6a0bedf2 100644
> > > --- a/fs/btrfs/dev-replace.c
> > > +++ b/fs/btrfs/dev-replace.c
> > > @@ -558,7 +558,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
> > >  			  rcu_str_deref(src_device->name),
> > >  			  src_device->devid,
> > >  			  rcu_str_deref(tgt_device->name));
> > > -	tgt_device->is_tgtdev_for_dev_replace = 0;
> > >  	tgt_device->devid = src_device->devid;
> > >  	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
> > >  	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
> > > @@ -579,6 +578,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
> > > 
> > >  	btrfs_dev_replace_unlock(dev_replace, 1);
> > > 
> > > +	/*
> > > +	 * Only change is_tgtdev_for_dev_replace flag after all its
> > > +	 * users get released.
> > > +	 */
> > > +	wait_target_device(tgt_device);
> > > +	tgt_device->is_tgtdev_for_dev_replace = 0;
> > >  	btrfs_rm_dev_replace_blocked(fs_info);
> > > 
> > >  	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
> > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > > index 3c3c69c0eee4..84db9fb22b7d 100644
> > > --- a/fs/btrfs/volumes.c
> > > +++ b/fs/btrfs/volumes.c
> > > @@ -2064,6 +2064,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
> > >  	WARN_ON(!tgtdev);
> > >  	mutex_lock(&fs_info->fs_devices->device_list_mutex);
> > > 
> > > +	wait_target_device(tgtdev);
> > >  	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
> > > 
> > >  	if (tgtdev->bdev)
> > > @@ -2598,6 +2599,8 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
> > >  	device->is_tgtdev_for_dev_replace = 1;
> > >  	device->mode = FMODE_EXCL;
> > >  	device->dev_stats_valid = 1;
> > > +	atomic_set(&device->tgtdev_refs, 0);
> > > +	init_waitqueue_head(&device->tgtdev_wait);
> > >  	set_blocksize(device->bdev, 4096);
> > >  	device->fs_devices = fs_info->fs_devices;
> > >  	list_add(&device->dev_list, &fs_info->fs_devices->devices);
> > > @@ -2624,6 +2627,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
> > >  	tgtdev->sector_size = sectorsize;
> > >  	tgtdev->fs_info = fs_info;
> > >  	tgtdev->in_fs_metadata = 1;
> > > +	atomic_set(&tgtdev->tgtdev_refs, 0);
> > > +	init_waitqueue_head(&tgtdev->tgtdev_wait);
> > >  }
> > > 
> > >  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
> > > @@ -5302,6 +5307,32 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
> > >  	return bbio;
> > >  }
> > > 
> > > +static void pin_bbio_target_device(struct btrfs_bio *bbio)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; i < bbio->num_stripes; i++) {
> > > +		struct btrfs_device *device = bbio->stripes[i].dev;
> > > +
> > > +		if (device->is_tgtdev_for_dev_replace)
> > > +			atomic_inc(&device->tgtdev_refs);
> > > +	}
> > > +}
> > > +
> > > +static void unpin_bbio_target_device(struct btrfs_bio *bbio)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; i < bbio->num_stripes; i++) {
> > > +		struct btrfs_device *device = bbio->stripes[i].dev;
> > > +
> > > +		if (device->is_tgtdev_for_dev_replace) {
> > > +			atomic_dec(&device->tgtdev_refs);
> > > +			wake_up(&device->tgtdev_wait);
> > > +		}
> > > +	}
> > > +}
> > > +
> > >  void btrfs_get_bbio(struct btrfs_bio *bbio)
> > >  {
> > >  	WARN_ON(!atomic_read(&bbio->refs));
> > > @@ -5312,8 +5343,10 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
> > >  {
> > >  	if (!bbio)
> > >  		return;
> > > -	if (atomic_dec_and_test(&bbio->refs))
> > > +	if (atomic_dec_and_test(&bbio->refs)) {
> > > +		unpin_bbio_target_device(bbio);
> > >  		kfree(bbio);
> > > +	}
> > >  }
> > > 
> > >  static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
> > > @@ -5868,6 +5901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
> > >  		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
> > >  		bbio->mirror_num = map->num_stripes + 1;
> > >  	}
> > > +	pin_bbio_target_device(bbio);
> > >  out:
> > >  	if (dev_replace_is_ongoing) {
> > >  		btrfs_dev_replace_clear_lock_blocking(dev_replace);
> > > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> > > index 24ba6bc3ec34..702361182597 100644
> > > --- a/fs/btrfs/volumes.h
> > > +++ b/fs/btrfs/volumes.h
> > > @@ -149,6 +149,10 @@ struct btrfs_device {
> > >  	/* Counter to record the change of device stats */
> > >  	atomic_t dev_stats_ccnt;
> > >  	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
> > > +
> > > +	/* To ensure we wait this target device before destroying it */
> > > +	atomic_t tgtdev_refs;
> > > +	wait_queue_head_t tgtdev_wait;
> > >  };
> > > 
> > >  /*
> > > @@ -538,4 +542,10 @@ struct list_head *btrfs_get_fs_uuids(void);
> > >  void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
> > >  void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
> > > 
> > > +static inline void wait_target_device(struct btrfs_device *tgtdev)
> > > +{
> > > +	if (!tgtdev || !tgtdev->is_tgtdev_for_dev_replace)
> > > +		return;
> > > +	wait_event(tgtdev->tgtdev_wait, atomic_read(&tgtdev->tgtdev_refs) == 0);
> > > +}
> > >  #endif
> > > --
> > > 2.11.0
> > > 
> > > 
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-20  6:21         ` Qu Wenruo
@ 2017-03-20 20:23           ` Liu Bo
  2017-03-21  0:44             ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-20 20:23 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
> 
> 
> At 03/18/2017 10:03 AM, Liu Bo wrote:
> > On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > At 03/17/2017 12:44 PM, Liu Bo wrote:
> > > > On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
> > > > > Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
> > > > > done.
> > > > > This may save some time allocating rbio, but it can cause deadly
> > > > > use-after-free bug, for the following case:
> > > > > 
> > > > > Original fs: 4 devices RAID5
> > > > > 
> > > > >        Process A                 |          Process B
> > > > > --------------------------------------------------------------------------
> > > > >                                  |  Start device replace
> > > > >                                  |    Now the fs has 5 devices
> > > > >                                  |    devid 0: replace device
> > > > >                                  |    devid 1~4: old devices
> > > > > btrfs_map_bio()                  |
> > > > > |- __btrfs_map_block()           |
> > > > > |    bbio has 5 stripes          |
> > > > > |    including devid 0           |
> > > > > |- raid56_parity_write()         |
> > > > >                                  |
> > > > > raid_write_end_io()              |
> > > > > |- rbio_orig_end_io()            |
> > > > >    |- unlock_stripe()            |
> > > > >        Keeps the old rbio for    |
> > > > >        later steal, which has    |
> > > > >        stripe for devid 0        |
> > > > >                                  |  Cancel device replace
> > > > >                                  |    Now the fs has 4 devices
> > > > >                                  |    devid 0 is freed
> > > > > Some IO happens                  |
> > > > > raid_write_end_io()              |
> > > > > |- rbio_orig_end_io()            |
> > > > >    |- unlock_stripe()            |
> > > > >       |- steal_rbio()            |
> > > > >            Use old rbio, whose   |
> > > > >            bbio has freed devid 0|
> > > > >            stripe                |
> > > > > Any access to rbio->bbio will    |
> > > > > cause general protection or NULL |
> > > > > pointer dereference              |
> > > > > 
> > > > > Such bug can already be triggered by fstests btrfs/069 for RAID5/6
> > > > > profiles.
> > > > > 
> > > > > Fix it by not keeping old rbio in unlock_stripe(), so we just free the
> > > > > finished rbio and rbio->bbio, so above problem wont' happen.
> > > > > 
> > > > 
> > > > I don't think this is acceptable, keeping a cache is important for
> > > > raid56 write performance, could you please fix it by checking if the
> > > > device is missing?
> > > 
> > > Not possible, as it's keeping the btrfs_device pointer and never release it,
> > > the stolen rbio can be kept forever until umount.
> > > 
> > 
> > steal_rbio() only takes pages from rbio->stripe_pages, so the cached
> > rbio->bbio is not going to the next IO's rbio because the cached one
> > got freed immediately in steal_rbio(), where could we dereference
> > rbio->bbio?
> 
> Did you mean in unlock_stripe(), we still keep the rbio in cache, while
> release its bbio?
>

I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
the current rbio and doens't free it.  But the point about
steal_rbio() still stands, steal_rbio() is supposed to take uptodate
pages from the cached old rbio to the current processing rbio, but it
doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
afterwards, instead it is the current processing rbio that would use
its bbio for writing into target devices, but it has increased its own
bio_counter.

> This sounds quite good but I'm afraid it may cause more problems.
> 
> Quite a lot of places are accessing rbio->bbio either for their btrfs
> logical address or stripes or even stripe->dev.
>

I'm confused, could you please specify the call trace of general
protection you got in the commit log?

I wonder if patch 4 and 5 are fixing the same use-after-free problem?

Thanks,

-liubo

> Thanks,
> Qu
> 
> 
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > And I think the logical is very strange, if RAID5/6 is unstable, there is no
> > > meaning to keep it fast.
> > > 
> > > Keep it stable first, and then consider the performance.
> > > 
> > > Thanks,
> > > Qu
> > > 
> > > > 
> > > > Thanks,
> > > > 
> > > > -liubo
> > > > 
> > > > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > > > ---
> > > > >  fs/btrfs/raid56.c | 18 +-----------------
> > > > >  1 file changed, 1 insertion(+), 17 deletions(-)
> > > > > 
> > > > > diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> > > > > index 453eefdcb591..aba82b95ec73 100644
> > > > > --- a/fs/btrfs/raid56.c
> > > > > +++ b/fs/btrfs/raid56.c
> > > > > @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > >  	int bucket;
> > > > >  	struct btrfs_stripe_hash *h;
> > > > >  	unsigned long flags;
> > > > > -	int keep_cache = 0;
> > > > > 
> > > > >  	bucket = rbio_bucket(rbio);
> > > > >  	h = rbio->fs_info->stripe_hash_table->table + bucket;
> > > > > @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > >  	spin_lock(&rbio->bio_list_lock);
> > > > > 
> > > > >  	if (!list_empty(&rbio->hash_list)) {
> > > > > -		/*
> > > > > -		 * if we're still cached and there is no other IO
> > > > > -		 * to perform, just leave this rbio here for others
> > > > > -		 * to steal from later
> > > > > -		 */
> > > > > -		if (list_empty(&rbio->plug_list) &&
> > > > > -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
> > > > > -			keep_cache = 1;
> > > > > -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
> > > > > -			BUG_ON(!bio_list_empty(&rbio->bio_list));
> > > > > -			goto done;
> > > > > -		}
> > > > > -
> > > > >  		list_del_init(&rbio->hash_list);
> > > > >  		atomic_dec(&rbio->refs);
> > > > > 
> > > > > @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > >  			goto done_nolock;
> > > > >  		}
> > > > >  	}
> > > > > -done:
> > > > >  	spin_unlock(&rbio->bio_list_lock);
> > > > >  	spin_unlock_irqrestore(&h->lock, flags);
> > > > > 
> > > > >  done_nolock:
> > > > > -	if (!keep_cache)
> > > > > -		remove_rbio_from_cache(rbio);
> > > > > +	remove_rbio_from_cache(rbio);
> > > > >  }
> > > > > 
> > > > >  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
> > > > > --
> > > > > 2.11.0
> > > > > 
> > > > > 
> > > > > 
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > 
> > > > 
> > > 
> > > 
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-20 20:23           ` Liu Bo
@ 2017-03-21  0:44             ` Qu Wenruo
  2017-03-21  2:08               ` Liu Bo
  0 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-21  0:44 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/21/2017 04:23 AM, Liu Bo wrote:
> On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
>>
>>
>> At 03/18/2017 10:03 AM, Liu Bo wrote:
>>> On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> At 03/17/2017 12:44 PM, Liu Bo wrote:
>>>>> On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
>>>>>> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
>>>>>> done.
>>>>>> This may save some time allocating rbio, but it can cause deadly
>>>>>> use-after-free bug, for the following case:
>>>>>>
>>>>>> Original fs: 4 devices RAID5
>>>>>>
>>>>>>        Process A                 |          Process B
>>>>>> --------------------------------------------------------------------------
>>>>>>                                  |  Start device replace
>>>>>>                                  |    Now the fs has 5 devices
>>>>>>                                  |    devid 0: replace device
>>>>>>                                  |    devid 1~4: old devices
>>>>>> btrfs_map_bio()                  |
>>>>>> |- __btrfs_map_block()           |
>>>>>> |    bbio has 5 stripes          |
>>>>>> |    including devid 0           |
>>>>>> |- raid56_parity_write()         |
>>>>>>                                  |
>>>>>> raid_write_end_io()              |
>>>>>> |- rbio_orig_end_io()            |
>>>>>>    |- unlock_stripe()            |
>>>>>>        Keeps the old rbio for    |
>>>>>>        later steal, which has    |
>>>>>>        stripe for devid 0        |
>>>>>>                                  |  Cancel device replace
>>>>>>                                  |    Now the fs has 4 devices
>>>>>>                                  |    devid 0 is freed
>>>>>> Some IO happens                  |
>>>>>> raid_write_end_io()              |
>>>>>> |- rbio_orig_end_io()            |
>>>>>>    |- unlock_stripe()            |
>>>>>>       |- steal_rbio()            |
>>>>>>            Use old rbio, whose   |
>>>>>>            bbio has freed devid 0|
>>>>>>            stripe                |
>>>>>> Any access to rbio->bbio will    |
>>>>>> cause general protection or NULL |
>>>>>> pointer dereference              |
>>>>>>
>>>>>> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
>>>>>> profiles.
>>>>>>
>>>>>> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
>>>>>> finished rbio and rbio->bbio, so above problem wont' happen.
>>>>>>
>>>>>
>>>>> I don't think this is acceptable, keeping a cache is important for
>>>>> raid56 write performance, could you please fix it by checking if the
>>>>> device is missing?
>>>>
>>>> Not possible, as it's keeping the btrfs_device pointer and never release it,
>>>> the stolen rbio can be kept forever until umount.
>>>>
>>>
>>> steal_rbio() only takes pages from rbio->stripe_pages, so the cached
>>> rbio->bbio is not going to the next IO's rbio because the cached one
>>> got freed immediately in steal_rbio(), where could we dereference
>>> rbio->bbio?
>>
>> Did you mean in unlock_stripe(), we still keep the rbio in cache, while
>> release its bbio?
>>
>
> I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
> the current rbio and doens't free it.  But the point about
> steal_rbio() still stands, steal_rbio() is supposed to take uptodate
> pages from the cached old rbio to the current processing rbio, but it
> doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
> afterwards, instead it is the current processing rbio that would use
> its bbio for writing into target devices, but it has increased its own
> bio_counter.
>
>> This sounds quite good but I'm afraid it may cause more problems.
>>
>> Quite a lot of places are accessing rbio->bbio either for their btrfs
>> logical address or stripes or even stripe->dev.
>>
>
> I'm confused, could you please specify the call trace of general
> protection you got in the commit log?

The 4th and 5th patches are designed to fix the same problem.

If one applies 5th patch only and run btrfs/069, it will cause hang when 
aborting replace, since btrfs_device of dev 0 is hold in bbio->stripes[] 
and never get released.

The 4th patch is used to solve such hang.

And the kernel NULL pointer access is like this when running modified 
btrfs/069 (only run on RAID5, and improve the duration of fsstress):

[  884.877421] BUG: unable to handle kernel NULL pointer dereference at 
00000000000005e0
[  884.878206] IP: generic_make_request_checks+0x4d/0x610
[  884.878541] PGD 2d45a067
[  884.878542] PUD 3ba0e067
[  884.878857] PMD 0
[  884.879189]
[  884.879899] Oops: 0000 [#1] SMP
[  884.880207] Modules linked in: btrfs(O) ext4 jbd2 mbcache xor 
raid6_pq netconsole xfs [last unloaded: btrfs]
[  884.880845] CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G 
  O    4.11.0-rc2 #72
[  884.881455] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.2-20170228_101828-anatol 04/01/2014
[  884.882119] Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper 
[btrfs]
[  884.883089] task: ffff88002875b4c0 task.stack: ffffc90001334000
[  884.883527] RIP: 0010:generic_make_request_checks+0x4d/0x610
[  884.883855] RSP: 0018:ffffc90001337bb8 EFLAGS: 00010283
[  884.884186] RAX: 0000000000000000 RBX: ffff8800126503e8 RCX: 
0000000000218800
[  884.884539] RDX: 0000000000000040 RSI: 0000000000000001 RDI: 
ffff88003d8116c0
[  884.884897] RBP: ffffc90001337c18 R08: 0000000000000001 R09: 
0000000000000001
[  884.885778] R10: 0000000000000000 R11: 00000000000162b9 R12: 
0000000000000040
[  884.886346] R13: ffff8800126503e8 R14: 00000000ffffffff R15: 
0000000000000010
[  884.887146] FS:  0000000000000000(0000) GS:ffff88003e400000(0000) 
knlGS:0000000000000000
[  884.888457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  884.888792] CR2: 00000000000005e0 CR3: 000000003ad30000 CR4: 
00000000000006e0
[  884.889212] Call Trace:
[  884.889526]  ? generic_make_request+0xc7/0x360
[  884.889841]  generic_make_request+0x24/0x360
[  884.890163]  ? generic_make_request+0xc7/0x360
[  884.890486]  submit_bio+0x64/0x120
[  884.890828]  ? page_in_rbio+0x4d/0x80 [btrfs]
[  884.891206]  ? rbio_orig_end_io+0x80/0x80 [btrfs]
[  884.891543]  finish_rmw+0x3f4/0x540 [btrfs]
[  884.891875]  validate_rbio_for_rmw+0x36/0x40 [btrfs]
[  884.892213]  raid_rmw_end_io+0x7a/0x90 [btrfs]
[  884.892565]  bio_endio+0x56/0x60
[  884.892891]  end_workqueue_fn+0x3c/0x40 [btrfs]
[  884.893265]  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
[  884.893698]  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
[  884.894101]  process_one_work+0x2af/0x720
[  884.894837]  ? process_one_work+0x22b/0x720
[  884.895278]  worker_thread+0x4b/0x4f0
[  884.895760]  kthread+0x10f/0x150
[  884.896106]  ? process_one_work+0x720/0x720
[  884.896448]  ? kthread_create_on_node+0x40/0x40
[  884.896803]  ret_from_fork+0x2e/0x40
[  884.897148] Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 
54 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 
00 00 <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
[  884.898449] RIP: generic_make_request_checks+0x4d/0x610 RSP: 
ffffc90001337bb8
[  884.899223] CR2: 00000000000005e0
[  884.900223] ---[ end trace 307e118b57a9995e ]---

Thanks,
Qu

>
> I wonder if patch 4 and 5 are fixing the same use-after-free problem?
>
> Thanks,
>
> -liubo
>
>> Thanks,
>> Qu
>>
>>
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>> And I think the logical is very strange, if RAID5/6 is unstable, there is no
>>>> meaning to keep it fast.
>>>>
>>>> Keep it stable first, and then consider the performance.
>>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> -liubo
>>>>>
>>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>> ---
>>>>>>  fs/btrfs/raid56.c | 18 +-----------------
>>>>>>  1 file changed, 1 insertion(+), 17 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>>>>>> index 453eefdcb591..aba82b95ec73 100644
>>>>>> --- a/fs/btrfs/raid56.c
>>>>>> +++ b/fs/btrfs/raid56.c
>>>>>> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>  	int bucket;
>>>>>>  	struct btrfs_stripe_hash *h;
>>>>>>  	unsigned long flags;
>>>>>> -	int keep_cache = 0;
>>>>>>
>>>>>>  	bucket = rbio_bucket(rbio);
>>>>>>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
>>>>>> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>  	spin_lock(&rbio->bio_list_lock);
>>>>>>
>>>>>>  	if (!list_empty(&rbio->hash_list)) {
>>>>>> -		/*
>>>>>> -		 * if we're still cached and there is no other IO
>>>>>> -		 * to perform, just leave this rbio here for others
>>>>>> -		 * to steal from later
>>>>>> -		 */
>>>>>> -		if (list_empty(&rbio->plug_list) &&
>>>>>> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
>>>>>> -			keep_cache = 1;
>>>>>> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
>>>>>> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
>>>>>> -			goto done;
>>>>>> -		}
>>>>>> -
>>>>>>  		list_del_init(&rbio->hash_list);
>>>>>>  		atomic_dec(&rbio->refs);
>>>>>>
>>>>>> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>  			goto done_nolock;
>>>>>>  		}
>>>>>>  	}
>>>>>> -done:
>>>>>>  	spin_unlock(&rbio->bio_list_lock);
>>>>>>  	spin_unlock_irqrestore(&h->lock, flags);
>>>>>>
>>>>>>  done_nolock:
>>>>>> -	if (!keep_cache)
>>>>>> -		remove_rbio_from_cache(rbio);
>>>>>> +	remove_rbio_from_cache(rbio);
>>>>>>  }
>>>>>>
>>>>>>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
>>>>>> --
>>>>>> 2.11.0
>>>>>>
>>>>>>
>>>>>>
>>>>>> --
>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-21  0:44             ` Qu Wenruo
@ 2017-03-21  2:08               ` Liu Bo
  2017-03-21  2:23                 ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-21  2:08 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Tue, Mar 21, 2017 at 08:44:18AM +0800, Qu Wenruo wrote:
> 
> 
> At 03/21/2017 04:23 AM, Liu Bo wrote:
> > On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > At 03/18/2017 10:03 AM, Liu Bo wrote:
> > > > On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
> > > > > 
> > > > > 
> > > > > At 03/17/2017 12:44 PM, Liu Bo wrote:
> > > > > > On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
> > > > > > > Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
> > > > > > > done.
> > > > > > > This may save some time allocating rbio, but it can cause deadly
> > > > > > > use-after-free bug, for the following case:
> > > > > > > 
> > > > > > > Original fs: 4 devices RAID5
> > > > > > > 
> > > > > > >        Process A                 |          Process B
> > > > > > > --------------------------------------------------------------------------
> > > > > > >                                  |  Start device replace
> > > > > > >                                  |    Now the fs has 5 devices
> > > > > > >                                  |    devid 0: replace device
> > > > > > >                                  |    devid 1~4: old devices
> > > > > > > btrfs_map_bio()                  |
> > > > > > > |- __btrfs_map_block()           |
> > > > > > > |    bbio has 5 stripes          |
> > > > > > > |    including devid 0           |
> > > > > > > |- raid56_parity_write()         |
> > > > > > >                                  |
> > > > > > > raid_write_end_io()              |
> > > > > > > |- rbio_orig_end_io()            |
> > > > > > >    |- unlock_stripe()            |
> > > > > > >        Keeps the old rbio for    |
> > > > > > >        later steal, which has    |
> > > > > > >        stripe for devid 0        |
> > > > > > >                                  |  Cancel device replace
> > > > > > >                                  |    Now the fs has 4 devices
> > > > > > >                                  |    devid 0 is freed
> > > > > > > Some IO happens                  |
> > > > > > > raid_write_end_io()              |
> > > > > > > |- rbio_orig_end_io()            |
> > > > > > >    |- unlock_stripe()            |
> > > > > > >       |- steal_rbio()            |
> > > > > > >            Use old rbio, whose   |
> > > > > > >            bbio has freed devid 0|
> > > > > > >            stripe                |
> > > > > > > Any access to rbio->bbio will    |
> > > > > > > cause general protection or NULL |
> > > > > > > pointer dereference              |
> > > > > > > 
> > > > > > > Such bug can already be triggered by fstests btrfs/069 for RAID5/6
> > > > > > > profiles.
> > > > > > > 
> > > > > > > Fix it by not keeping old rbio in unlock_stripe(), so we just free the
> > > > > > > finished rbio and rbio->bbio, so above problem wont' happen.
> > > > > > > 
> > > > > > 
> > > > > > I don't think this is acceptable, keeping a cache is important for
> > > > > > raid56 write performance, could you please fix it by checking if the
> > > > > > device is missing?
> > > > > 
> > > > > Not possible, as it's keeping the btrfs_device pointer and never release it,
> > > > > the stolen rbio can be kept forever until umount.
> > > > > 
> > > > 
> > > > steal_rbio() only takes pages from rbio->stripe_pages, so the cached
> > > > rbio->bbio is not going to the next IO's rbio because the cached one
> > > > got freed immediately in steal_rbio(), where could we dereference
> > > > rbio->bbio?
> > > 
> > > Did you mean in unlock_stripe(), we still keep the rbio in cache, while
> > > release its bbio?
> > > 
> > 
> > I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
> > the current rbio and doens't free it.  But the point about
> > steal_rbio() still stands, steal_rbio() is supposed to take uptodate
> > pages from the cached old rbio to the current processing rbio, but it
> > doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
> > afterwards, instead it is the current processing rbio that would use
> > its bbio for writing into target devices, but it has increased its own
> > bio_counter.
> > 
> > > This sounds quite good but I'm afraid it may cause more problems.
> > > 
> > > Quite a lot of places are accessing rbio->bbio either for their btrfs
> > > logical address or stripes or even stripe->dev.
> > > 
> > 
> > I'm confused, could you please specify the call trace of general
> > protection you got in the commit log?
> 
> The 4th and 5th patches are designed to fix the same problem.
> 
> If one applies 5th patch only and run btrfs/069, it will cause hang when
> aborting replace, since btrfs_device of dev 0 is hold in bbio->stripes[] and
> never get released.
> 
> The 4th patch is used to solve such hang.
>

OK, I see.

Firstly, the above commit log is misleading people a bit because it
says that steal_rbio() dereferences the device of the cached rbio and
that device has got free'd, but steal_rbio() actually doesn't.  Yes,
the cached rbio holds a reference on the free'd device, but I think
the below 'NULL pointer dereference' comes from writing back pages
into target devices when doing RMW with the current rbio instead of
the old cached one, right?

Secondly, if it is the current rio that ends up this 'NULL pointer
dereference', it could hold a bio_counter and let the replace thread
canceled by scrub wait for this bio_counter to be zero.  Although
btrfs_dev_replace_finishing() has flushed delalloc IO and committed
transaction, seems like scrub is an exception because it can continued
after committing transaction.

Thirdly, it is possible that this canceled dev-replace could make
fstrim get a 'general protection' or 'NULL pointer dereference' since
it could access target devices and is not sychoronized by committing
transaction.

Please correct me if I'm wrong since I failed to reproduce it.

Thanks,

-liubo

> And the kernel NULL pointer access is like this when running modified
> btrfs/069 (only run on RAID5, and improve the duration of fsstress):
> 
> [  884.877421] BUG: unable to handle kernel NULL pointer dereference at
> 00000000000005e0
> [  884.878206] IP: generic_make_request_checks+0x4d/0x610
> [  884.878541] PGD 2d45a067
> [  884.878542] PUD 3ba0e067
> [  884.878857] PMD 0
> [  884.879189]
> [  884.879899] Oops: 0000 [#1] SMP
> [  884.880207] Modules linked in: btrfs(O) ext4 jbd2 mbcache xor raid6_pq
> netconsole xfs [last unloaded: btrfs]
> [  884.880845] CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G  O
> 4.11.0-rc2 #72
> [  884.881455] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
> 1.10.2-20170228_101828-anatol 04/01/2014
> [  884.882119] Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper
> [btrfs]
> [  884.883089] task: ffff88002875b4c0 task.stack: ffffc90001334000
> [  884.883527] RIP: 0010:generic_make_request_checks+0x4d/0x610
> [  884.883855] RSP: 0018:ffffc90001337bb8 EFLAGS: 00010283
> [  884.884186] RAX: 0000000000000000 RBX: ffff8800126503e8 RCX:
> 0000000000218800
> [  884.884539] RDX: 0000000000000040 RSI: 0000000000000001 RDI:
> ffff88003d8116c0
> [  884.884897] RBP: ffffc90001337c18 R08: 0000000000000001 R09:
> 0000000000000001
> [  884.885778] R10: 0000000000000000 R11: 00000000000162b9 R12:
> 0000000000000040
> [  884.886346] R13: ffff8800126503e8 R14: 00000000ffffffff R15:
> 0000000000000010
> [  884.887146] FS:  0000000000000000(0000) GS:ffff88003e400000(0000)
> knlGS:0000000000000000
> [  884.888457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  884.888792] CR2: 00000000000005e0 CR3: 000000003ad30000 CR4:
> 00000000000006e0
> [  884.889212] Call Trace:
> [  884.889526]  ? generic_make_request+0xc7/0x360
> [  884.889841]  generic_make_request+0x24/0x360
> [  884.890163]  ? generic_make_request+0xc7/0x360
> [  884.890486]  submit_bio+0x64/0x120
> [  884.890828]  ? page_in_rbio+0x4d/0x80 [btrfs]
> [  884.891206]  ? rbio_orig_end_io+0x80/0x80 [btrfs]
> [  884.891543]  finish_rmw+0x3f4/0x540 [btrfs]
> [  884.891875]  validate_rbio_for_rmw+0x36/0x40 [btrfs]
> [  884.892213]  raid_rmw_end_io+0x7a/0x90 [btrfs]
> [  884.892565]  bio_endio+0x56/0x60
> [  884.892891]  end_workqueue_fn+0x3c/0x40 [btrfs]
> [  884.893265]  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
> [  884.893698]  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
> [  884.894101]  process_one_work+0x2af/0x720
> [  884.894837]  ? process_one_work+0x22b/0x720
> [  884.895278]  worker_thread+0x4b/0x4f0
> [  884.895760]  kthread+0x10f/0x150
> [  884.896106]  ? process_one_work+0x720/0x720
> [  884.896448]  ? kthread_create_on_node+0x40/0x40
> [  884.896803]  ret_from_fork+0x2e/0x40
> [  884.897148] Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 54
> 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 00 00
> <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
> [  884.898449] RIP: generic_make_request_checks+0x4d/0x610 RSP:
> ffffc90001337bb8
> [  884.899223] CR2: 00000000000005e0
> [  884.900223] ---[ end trace 307e118b57a9995e ]---
> 
> Thanks,
> Qu
> 
> > 
> > I wonder if patch 4 and 5 are fixing the same use-after-free problem?
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > Thanks,
> > > Qu
> > > 
> > > 
> > > > 
> > > > Thanks,
> > > > 
> > > > -liubo
> > > > 
> > > > > And I think the logical is very strange, if RAID5/6 is unstable, there is no
> > > > > meaning to keep it fast.
> > > > > 
> > > > > Keep it stable first, and then consider the performance.
> > > > > 
> > > > > Thanks,
> > > > > Qu
> > > > > 
> > > > > > 
> > > > > > Thanks,
> > > > > > 
> > > > > > -liubo
> > > > > > 
> > > > > > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > > > > > ---
> > > > > > >  fs/btrfs/raid56.c | 18 +-----------------
> > > > > > >  1 file changed, 1 insertion(+), 17 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> > > > > > > index 453eefdcb591..aba82b95ec73 100644
> > > > > > > --- a/fs/btrfs/raid56.c
> > > > > > > +++ b/fs/btrfs/raid56.c
> > > > > > > @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > >  	int bucket;
> > > > > > >  	struct btrfs_stripe_hash *h;
> > > > > > >  	unsigned long flags;
> > > > > > > -	int keep_cache = 0;
> > > > > > > 
> > > > > > >  	bucket = rbio_bucket(rbio);
> > > > > > >  	h = rbio->fs_info->stripe_hash_table->table + bucket;
> > > > > > > @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > >  	spin_lock(&rbio->bio_list_lock);
> > > > > > > 
> > > > > > >  	if (!list_empty(&rbio->hash_list)) {
> > > > > > > -		/*
> > > > > > > -		 * if we're still cached and there is no other IO
> > > > > > > -		 * to perform, just leave this rbio here for others
> > > > > > > -		 * to steal from later
> > > > > > > -		 */
> > > > > > > -		if (list_empty(&rbio->plug_list) &&
> > > > > > > -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
> > > > > > > -			keep_cache = 1;
> > > > > > > -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
> > > > > > > -			BUG_ON(!bio_list_empty(&rbio->bio_list));
> > > > > > > -			goto done;
> > > > > > > -		}
> > > > > > > -
> > > > > > >  		list_del_init(&rbio->hash_list);
> > > > > > >  		atomic_dec(&rbio->refs);
> > > > > > > 
> > > > > > > @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > >  			goto done_nolock;
> > > > > > >  		}
> > > > > > >  	}
> > > > > > > -done:
> > > > > > >  	spin_unlock(&rbio->bio_list_lock);
> > > > > > >  	spin_unlock_irqrestore(&h->lock, flags);
> > > > > > > 
> > > > > > >  done_nolock:
> > > > > > > -	if (!keep_cache)
> > > > > > > -		remove_rbio_from_cache(rbio);
> > > > > > > +	remove_rbio_from_cache(rbio);
> > > > > > >  }
> > > > > > > 
> > > > > > >  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
> > > > > > > --
> > > > > > > 2.11.0
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > --
> > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > 
> > > > > > 
> > > > > 
> > > > > 
> > > > 
> > > > 
> > > 
> > > 
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-21  2:08               ` Liu Bo
@ 2017-03-21  2:23                 ` Qu Wenruo
  2017-03-21  5:45                   ` Liu Bo
  0 siblings, 1 reply; 27+ messages in thread
From: Qu Wenruo @ 2017-03-21  2:23 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/21/2017 10:08 AM, Liu Bo wrote:
> On Tue, Mar 21, 2017 at 08:44:18AM +0800, Qu Wenruo wrote:
>>
>>
>> At 03/21/2017 04:23 AM, Liu Bo wrote:
>>> On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> At 03/18/2017 10:03 AM, Liu Bo wrote:
>>>>> On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
>>>>>>
>>>>>>
>>>>>> At 03/17/2017 12:44 PM, Liu Bo wrote:
>>>>>>> On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
>>>>>>>> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
>>>>>>>> done.
>>>>>>>> This may save some time allocating rbio, but it can cause deadly
>>>>>>>> use-after-free bug, for the following case:
>>>>>>>>
>>>>>>>> Original fs: 4 devices RAID5
>>>>>>>>
>>>>>>>>        Process A                 |          Process B
>>>>>>>> --------------------------------------------------------------------------
>>>>>>>>                                  |  Start device replace
>>>>>>>>                                  |    Now the fs has 5 devices
>>>>>>>>                                  |    devid 0: replace device
>>>>>>>>                                  |    devid 1~4: old devices
>>>>>>>> btrfs_map_bio()                  |
>>>>>>>> |- __btrfs_map_block()           |
>>>>>>>> |    bbio has 5 stripes          |
>>>>>>>> |    including devid 0           |
>>>>>>>> |- raid56_parity_write()         |
>>>>>>>>                                  |
>>>>>>>> raid_write_end_io()              |
>>>>>>>> |- rbio_orig_end_io()            |
>>>>>>>>    |- unlock_stripe()            |
>>>>>>>>        Keeps the old rbio for    |
>>>>>>>>        later steal, which has    |
>>>>>>>>        stripe for devid 0        |
>>>>>>>>                                  |  Cancel device replace
>>>>>>>>                                  |    Now the fs has 4 devices
>>>>>>>>                                  |    devid 0 is freed
>>>>>>>> Some IO happens                  |
>>>>>>>> raid_write_end_io()              |
>>>>>>>> |- rbio_orig_end_io()            |
>>>>>>>>    |- unlock_stripe()            |
>>>>>>>>       |- steal_rbio()            |
>>>>>>>>            Use old rbio, whose   |
>>>>>>>>            bbio has freed devid 0|
>>>>>>>>            stripe                |
>>>>>>>> Any access to rbio->bbio will    |
>>>>>>>> cause general protection or NULL |
>>>>>>>> pointer dereference              |
>>>>>>>>
>>>>>>>> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
>>>>>>>> profiles.
>>>>>>>>
>>>>>>>> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
>>>>>>>> finished rbio and rbio->bbio, so above problem wont' happen.
>>>>>>>>
>>>>>>>
>>>>>>> I don't think this is acceptable, keeping a cache is important for
>>>>>>> raid56 write performance, could you please fix it by checking if the
>>>>>>> device is missing?
>>>>>>
>>>>>> Not possible, as it's keeping the btrfs_device pointer and never release it,
>>>>>> the stolen rbio can be kept forever until umount.
>>>>>>
>>>>>
>>>>> steal_rbio() only takes pages from rbio->stripe_pages, so the cached
>>>>> rbio->bbio is not going to the next IO's rbio because the cached one
>>>>> got freed immediately in steal_rbio(), where could we dereference
>>>>> rbio->bbio?
>>>>
>>>> Did you mean in unlock_stripe(), we still keep the rbio in cache, while
>>>> release its bbio?
>>>>
>>>
>>> I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
>>> the current rbio and doens't free it.  But the point about
>>> steal_rbio() still stands, steal_rbio() is supposed to take uptodate
>>> pages from the cached old rbio to the current processing rbio, but it
>>> doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
>>> afterwards, instead it is the current processing rbio that would use
>>> its bbio for writing into target devices, but it has increased its own
>>> bio_counter.
>>>
>>>> This sounds quite good but I'm afraid it may cause more problems.
>>>>
>>>> Quite a lot of places are accessing rbio->bbio either for their btrfs
>>>> logical address or stripes or even stripe->dev.
>>>>
>>>
>>> I'm confused, could you please specify the call trace of general
>>> protection you got in the commit log?
>>
>> The 4th and 5th patches are designed to fix the same problem.
>>
>> If one applies 5th patch only and run btrfs/069, it will cause hang when
>> aborting replace, since btrfs_device of dev 0 is hold in bbio->stripes[] and
>> never get released.
>>
>> The 4th patch is used to solve such hang.
>>
>
> OK, I see.
>
> Firstly, the above commit log is misleading people a bit because it
> says that steal_rbio() dereferences the device of the cached rbio and
> that device has got free'd, but steal_rbio() actually doesn't.  Yes,
> the cached rbio holds a reference on the free'd device, but I think
> the below 'NULL pointer dereference' comes from writing back pages
> into target devices when doing RMW with the current rbio instead of
> the old cached one, right?

Yes, steal_bio() is not related to this problem, sorry for the confusion.

>
> Secondly, if it is the current rio that ends up this 'NULL pointer
> dereference', it could hold a bio_counter and let the replace thread
> canceled by scrub wait for this bio_counter to be zero.  Although
> btrfs_dev_replace_finishing() has flushed delalloc IO and committed
> transaction, seems like scrub is an exception because it can continued
> after committing transaction.

If I understand it correctly, did you mean hold bio_counter when rbio 
holds bbio?

That's OK, but we still need the 4th patch, or it will block replace 
cancel forever.

>
> Thirdly, it is possible that this canceled dev-replace could make
> fstrim get a 'general protection' or 'NULL pointer dereference' since
> it could access target devices and is not sychoronized by committing
> transaction.

So I'm using the refcount for btrfs_device to do full protection for it.
As long as btrfs_dev can only be freed when no on holds it, instead of 
no bio pending for it, it should be safer.

>
> Please correct me if I'm wrong since I failed to reproduce it.

The bug is harder to trigger in v4.11-rc2 now.

I modified btrfs/069 to make it easier to trigger, but it's still quite 
hard to reproduce it.

Even with modification, the possibility is still low, at about 10~20%.

Hopes the diff could help you to trigger it.

Thanks,
Qu

diff --git a/tests/btrfs/069 b/tests/btrfs/069
index d939cd8..f9ff945 100755
--- a/tests/btrfs/069
+++ b/tests/btrfs/069
@@ -74,7 +74,7 @@ run_test()
         _scratch_mount >>$seqres.full 2>&1
         SCRATCH_DEV_POOL=$saved_scratch_dev_pool

-       args=`_scale_fsstress_args -p 20 -n 100 $FSSTRESS_AVOID -d 
$SCRATCH_MNT/stressdir`
+       args=`_scale_fsstress_args -p 20 -n 1000 $FSSTRESS_AVOID -d 
$SCRATCH_MNT/stressdir`
         echo "Run fsstress $args" >>$seqres.full
         $FSSTRESS_PROG $args >/dev/null 2>&1 &
         fsstress_pid=$!
@@ -115,9 +115,10 @@ run_test()
  }

  echo "Silence is golden"
-for t in "${_btrfs_profile_configs[@]}"; do
-       run_test "$t"
-done
+#for t in "${_btrfs_profile_configs[@]}"; do
+#      run_test "$t"
+#done
+run_test "-d raid5 -m raid5"

  status=0
  exit

>
> Thanks,
>
> -liubo
>
>> And the kernel NULL pointer access is like this when running modified
>> btrfs/069 (only run on RAID5, and improve the duration of fsstress):
>>
>> [  884.877421] BUG: unable to handle kernel NULL pointer dereference at
>> 00000000000005e0
>> [  884.878206] IP: generic_make_request_checks+0x4d/0x610
>> [  884.878541] PGD 2d45a067
>> [  884.878542] PUD 3ba0e067
>> [  884.878857] PMD 0
>> [  884.879189]
>> [  884.879899] Oops: 0000 [#1] SMP
>> [  884.880207] Modules linked in: btrfs(O) ext4 jbd2 mbcache xor raid6_pq
>> netconsole xfs [last unloaded: btrfs]
>> [  884.880845] CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G  O
>> 4.11.0-rc2 #72
>> [  884.881455] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
>> 1.10.2-20170228_101828-anatol 04/01/2014
>> [  884.882119] Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper
>> [btrfs]
>> [  884.883089] task: ffff88002875b4c0 task.stack: ffffc90001334000
>> [  884.883527] RIP: 0010:generic_make_request_checks+0x4d/0x610
>> [  884.883855] RSP: 0018:ffffc90001337bb8 EFLAGS: 00010283
>> [  884.884186] RAX: 0000000000000000 RBX: ffff8800126503e8 RCX:
>> 0000000000218800
>> [  884.884539] RDX: 0000000000000040 RSI: 0000000000000001 RDI:
>> ffff88003d8116c0
>> [  884.884897] RBP: ffffc90001337c18 R08: 0000000000000001 R09:
>> 0000000000000001
>> [  884.885778] R10: 0000000000000000 R11: 00000000000162b9 R12:
>> 0000000000000040
>> [  884.886346] R13: ffff8800126503e8 R14: 00000000ffffffff R15:
>> 0000000000000010
>> [  884.887146] FS:  0000000000000000(0000) GS:ffff88003e400000(0000)
>> knlGS:0000000000000000
>> [  884.888457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [  884.888792] CR2: 00000000000005e0 CR3: 000000003ad30000 CR4:
>> 00000000000006e0
>> [  884.889212] Call Trace:
>> [  884.889526]  ? generic_make_request+0xc7/0x360
>> [  884.889841]  generic_make_request+0x24/0x360
>> [  884.890163]  ? generic_make_request+0xc7/0x360
>> [  884.890486]  submit_bio+0x64/0x120
>> [  884.890828]  ? page_in_rbio+0x4d/0x80 [btrfs]
>> [  884.891206]  ? rbio_orig_end_io+0x80/0x80 [btrfs]
>> [  884.891543]  finish_rmw+0x3f4/0x540 [btrfs]
>> [  884.891875]  validate_rbio_for_rmw+0x36/0x40 [btrfs]
>> [  884.892213]  raid_rmw_end_io+0x7a/0x90 [btrfs]
>> [  884.892565]  bio_endio+0x56/0x60
>> [  884.892891]  end_workqueue_fn+0x3c/0x40 [btrfs]
>> [  884.893265]  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
>> [  884.893698]  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
>> [  884.894101]  process_one_work+0x2af/0x720
>> [  884.894837]  ? process_one_work+0x22b/0x720
>> [  884.895278]  worker_thread+0x4b/0x4f0
>> [  884.895760]  kthread+0x10f/0x150
>> [  884.896106]  ? process_one_work+0x720/0x720
>> [  884.896448]  ? kthread_create_on_node+0x40/0x40
>> [  884.896803]  ret_from_fork+0x2e/0x40
>> [  884.897148] Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 54
>> 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 00 00
>> <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
>> [  884.898449] RIP: generic_make_request_checks+0x4d/0x610 RSP:
>> ffffc90001337bb8
>> [  884.899223] CR2: 00000000000005e0
>> [  884.900223] ---[ end trace 307e118b57a9995e ]---
>>
>> Thanks,
>> Qu
>>
>>>
>>> I wonder if patch 4 and 5 are fixing the same use-after-free problem?
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> -liubo
>>>>>
>>>>>> And I think the logical is very strange, if RAID5/6 is unstable, there is no
>>>>>> meaning to keep it fast.
>>>>>>
>>>>>> Keep it stable first, and then consider the performance.
>>>>>>
>>>>>> Thanks,
>>>>>> Qu
>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> -liubo
>>>>>>>
>>>>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>>>> ---
>>>>>>>>  fs/btrfs/raid56.c | 18 +-----------------
>>>>>>>>  1 file changed, 1 insertion(+), 17 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>>>>>>>> index 453eefdcb591..aba82b95ec73 100644
>>>>>>>> --- a/fs/btrfs/raid56.c
>>>>>>>> +++ b/fs/btrfs/raid56.c
>>>>>>>> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>  	int bucket;
>>>>>>>>  	struct btrfs_stripe_hash *h;
>>>>>>>>  	unsigned long flags;
>>>>>>>> -	int keep_cache = 0;
>>>>>>>>
>>>>>>>>  	bucket = rbio_bucket(rbio);
>>>>>>>>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
>>>>>>>> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>  	spin_lock(&rbio->bio_list_lock);
>>>>>>>>
>>>>>>>>  	if (!list_empty(&rbio->hash_list)) {
>>>>>>>> -		/*
>>>>>>>> -		 * if we're still cached and there is no other IO
>>>>>>>> -		 * to perform, just leave this rbio here for others
>>>>>>>> -		 * to steal from later
>>>>>>>> -		 */
>>>>>>>> -		if (list_empty(&rbio->plug_list) &&
>>>>>>>> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
>>>>>>>> -			keep_cache = 1;
>>>>>>>> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
>>>>>>>> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
>>>>>>>> -			goto done;
>>>>>>>> -		}
>>>>>>>> -
>>>>>>>>  		list_del_init(&rbio->hash_list);
>>>>>>>>  		atomic_dec(&rbio->refs);
>>>>>>>>
>>>>>>>> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>  			goto done_nolock;
>>>>>>>>  		}
>>>>>>>>  	}
>>>>>>>> -done:
>>>>>>>>  	spin_unlock(&rbio->bio_list_lock);
>>>>>>>>  	spin_unlock_irqrestore(&h->lock, flags);
>>>>>>>>
>>>>>>>>  done_nolock:
>>>>>>>> -	if (!keep_cache)
>>>>>>>> -		remove_rbio_from_cache(rbio);
>>>>>>>> +	remove_rbio_from_cache(rbio);
>>>>>>>>  }
>>>>>>>>
>>>>>>>>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
>>>>>>>> --
>>>>>>>> 2.11.0
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> --
>>>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>
>



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-21  2:23                 ` Qu Wenruo
@ 2017-03-21  5:45                   ` Liu Bo
  2017-03-21  7:00                     ` Qu Wenruo
  0 siblings, 1 reply; 27+ messages in thread
From: Liu Bo @ 2017-03-21  5:45 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Tue, Mar 21, 2017 at 10:23:56AM +0800, Qu Wenruo wrote:
> 
> 
> At 03/21/2017 10:08 AM, Liu Bo wrote:
> > On Tue, Mar 21, 2017 at 08:44:18AM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > At 03/21/2017 04:23 AM, Liu Bo wrote:
> > > > On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
> > > > > 
> > > > > 
> > > > > At 03/18/2017 10:03 AM, Liu Bo wrote:
> > > > > > On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > At 03/17/2017 12:44 PM, Liu Bo wrote:
> > > > > > > > On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
> > > > > > > > > Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
> > > > > > > > > done.
> > > > > > > > > This may save some time allocating rbio, but it can cause deadly
> > > > > > > > > use-after-free bug, for the following case:
> > > > > > > > > 
> > > > > > > > > Original fs: 4 devices RAID5
> > > > > > > > > 
> > > > > > > > >        Process A                 |          Process B
> > > > > > > > > --------------------------------------------------------------------------
> > > > > > > > >                                  |  Start device replace
> > > > > > > > >                                  |    Now the fs has 5 devices
> > > > > > > > >                                  |    devid 0: replace device
> > > > > > > > >                                  |    devid 1~4: old devices
> > > > > > > > > btrfs_map_bio()                  |
> > > > > > > > > |- __btrfs_map_block()           |
> > > > > > > > > |    bbio has 5 stripes          |
> > > > > > > > > |    including devid 0           |
> > > > > > > > > |- raid56_parity_write()         |
> > > > > > > > >                                  |
> > > > > > > > > raid_write_end_io()              |
> > > > > > > > > |- rbio_orig_end_io()            |
> > > > > > > > >    |- unlock_stripe()            |
> > > > > > > > >        Keeps the old rbio for    |
> > > > > > > > >        later steal, which has    |
> > > > > > > > >        stripe for devid 0        |
> > > > > > > > >                                  |  Cancel device replace
> > > > > > > > >                                  |    Now the fs has 4 devices
> > > > > > > > >                                  |    devid 0 is freed
> > > > > > > > > Some IO happens                  |
> > > > > > > > > raid_write_end_io()              |
> > > > > > > > > |- rbio_orig_end_io()            |
> > > > > > > > >    |- unlock_stripe()            |
> > > > > > > > >       |- steal_rbio()            |
> > > > > > > > >            Use old rbio, whose   |
> > > > > > > > >            bbio has freed devid 0|
> > > > > > > > >            stripe                |
> > > > > > > > > Any access to rbio->bbio will    |
> > > > > > > > > cause general protection or NULL |
> > > > > > > > > pointer dereference              |
> > > > > > > > > 
> > > > > > > > > Such bug can already be triggered by fstests btrfs/069 for RAID5/6
> > > > > > > > > profiles.
> > > > > > > > > 
> > > > > > > > > Fix it by not keeping old rbio in unlock_stripe(), so we just free the
> > > > > > > > > finished rbio and rbio->bbio, so above problem wont' happen.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > I don't think this is acceptable, keeping a cache is important for
> > > > > > > > raid56 write performance, could you please fix it by checking if the
> > > > > > > > device is missing?
> > > > > > > 
> > > > > > > Not possible, as it's keeping the btrfs_device pointer and never release it,
> > > > > > > the stolen rbio can be kept forever until umount.
> > > > > > > 
> > > > > > 
> > > > > > steal_rbio() only takes pages from rbio->stripe_pages, so the cached
> > > > > > rbio->bbio is not going to the next IO's rbio because the cached one
> > > > > > got freed immediately in steal_rbio(), where could we dereference
> > > > > > rbio->bbio?
> > > > > 
> > > > > Did you mean in unlock_stripe(), we still keep the rbio in cache, while
> > > > > release its bbio?
> > > > > 
> > > > 
> > > > I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
> > > > the current rbio and doens't free it.  But the point about
> > > > steal_rbio() still stands, steal_rbio() is supposed to take uptodate
> > > > pages from the cached old rbio to the current processing rbio, but it
> > > > doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
> > > > afterwards, instead it is the current processing rbio that would use
> > > > its bbio for writing into target devices, but it has increased its own
> > > > bio_counter.
> > > > 
> > > > > This sounds quite good but I'm afraid it may cause more problems.
> > > > > 
> > > > > Quite a lot of places are accessing rbio->bbio either for their btrfs
> > > > > logical address or stripes or even stripe->dev.
> > > > > 
> > > > 
> > > > I'm confused, could you please specify the call trace of general
> > > > protection you got in the commit log?
> > > 
> > > The 4th and 5th patches are designed to fix the same problem.
> > > 
> > > If one applies 5th patch only and run btrfs/069, it will cause hang when
> > > aborting replace, since btrfs_device of dev 0 is hold in bbio->stripes[] and
> > > never get released.
> > > 
> > > The 4th patch is used to solve such hang.
> > > 
> > 
> > OK, I see.
> > 
> > Firstly, the above commit log is misleading people a bit because it
> > says that steal_rbio() dereferences the device of the cached rbio and
> > that device has got free'd, but steal_rbio() actually doesn't.  Yes,
> > the cached rbio holds a reference on the free'd device, but I think
> > the below 'NULL pointer dereference' comes from writing back pages
> > into target devices when doing RMW with the current rbio instead of
> > the old cached one, right?
> 
> Yes, steal_bio() is not related to this problem, sorry for the confusion.
> 
> > 
> > Secondly, if it is the current rio that ends up this 'NULL pointer
> > dereference', it could hold a bio_counter and let the replace thread
> > canceled by scrub wait for this bio_counter to be zero.  Although
> > btrfs_dev_replace_finishing() has flushed delalloc IO and committed
> > transaction, seems like scrub is an exception because it can continued
> > after committing transaction.
> 
> If I understand it correctly, did you mean hold bio_counter when rbio holds
> bbio?
>

We have bio_counter to prevent the race against a successful dev
replace (such as btrfs_map_bio and btrfs_discard_extent), but for a
canceled dev replace (either it's canceled by scrub or by 'btrfs
replace cancel'), bio_counter could also be utilized to avoid the
race.

Now that scrub, which does raid parity recover, is the only one
without increasing bio_counter while accessing devices and stripes.

> That's OK, but we still need the 4th patch, or it will block replace cancel
> forever.
>

It is not the current code but patch 5 that needs patch 4 to avoid
blocking replace threads.

> > 
> > Thirdly, it is possible that this canceled dev-replace could make
> > fstrim get a 'general protection' or 'NULL pointer dereference' since
> > it could access target devices and is not sychoronized by committing
> > transaction.
> 
> So I'm using the refcount for btrfs_device to do full protection for it.
> As long as btrfs_dev can only be freed when no on holds it, instead of no
> bio pending for it, it should be safer.
>

If increading bio_counter could fix the bug like other use-after-free dev-replace bugs, e.g.
4245215d6a8d Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56
,
I'd prefer it, a fix with keeping the benefit of caching rbio.

> > 
> > Please correct me if I'm wrong since I failed to reproduce it.
> 
> The bug is harder to trigger in v4.11-rc2 now.
> 
> I modified btrfs/069 to make it easier to trigger, but it's still quite hard
> to reproduce it.
> 
> Even with modification, the possibility is still low, at about 10~20%.
> 
> Hopes the diff could help you to trigger it.

I'm using -n 2000 but no luck.

Thanks,

-liubo

> 
> Thanks,
> Qu
> 
> diff --git a/tests/btrfs/069 b/tests/btrfs/069
> index d939cd8..f9ff945 100755
> --- a/tests/btrfs/069
> +++ b/tests/btrfs/069
> @@ -74,7 +74,7 @@ run_test()
>         _scratch_mount >>$seqres.full 2>&1
>         SCRATCH_DEV_POOL=$saved_scratch_dev_pool
> 
> -       args=`_scale_fsstress_args -p 20 -n 100 $FSSTRESS_AVOID -d
> $SCRATCH_MNT/stressdir`
> +       args=`_scale_fsstress_args -p 20 -n 1000 $FSSTRESS_AVOID -d
> $SCRATCH_MNT/stressdir`
>         echo "Run fsstress $args" >>$seqres.full
>         $FSSTRESS_PROG $args >/dev/null 2>&1 &
>         fsstress_pid=$!
> @@ -115,9 +115,10 @@ run_test()
>  }
> 
>  echo "Silence is golden"
> -for t in "${_btrfs_profile_configs[@]}"; do
> -       run_test "$t"
> -done
> +#for t in "${_btrfs_profile_configs[@]}"; do
> +#      run_test "$t"
> +#done
> +run_test "-d raid5 -m raid5"
> 
>  status=0
>  exit
> 
> > 
> > Thanks,
> > 
> > -liubo
> > 
> > > And the kernel NULL pointer access is like this when running modified
> > > btrfs/069 (only run on RAID5, and improve the duration of fsstress):
> > > 
> > > [  884.877421] BUG: unable to handle kernel NULL pointer dereference at
> > > 00000000000005e0
> > > [  884.878206] IP: generic_make_request_checks+0x4d/0x610
> > > [  884.878541] PGD 2d45a067
> > > [  884.878542] PUD 3ba0e067
> > > [  884.878857] PMD 0
> > > [  884.879189]
> > > [  884.879899] Oops: 0000 [#1] SMP
> > > [  884.880207] Modules linked in: btrfs(O) ext4 jbd2 mbcache xor raid6_pq
> > > netconsole xfs [last unloaded: btrfs]
> > > [  884.880845] CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G  O
> > > 4.11.0-rc2 #72
> > > [  884.881455] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
> > > 1.10.2-20170228_101828-anatol 04/01/2014
> > > [  884.882119] Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper
> > > [btrfs]
> > > [  884.883089] task: ffff88002875b4c0 task.stack: ffffc90001334000
> > > [  884.883527] RIP: 0010:generic_make_request_checks+0x4d/0x610
> > > [  884.883855] RSP: 0018:ffffc90001337bb8 EFLAGS: 00010283
> > > [  884.884186] RAX: 0000000000000000 RBX: ffff8800126503e8 RCX:
> > > 0000000000218800
> > > [  884.884539] RDX: 0000000000000040 RSI: 0000000000000001 RDI:
> > > ffff88003d8116c0
> > > [  884.884897] RBP: ffffc90001337c18 R08: 0000000000000001 R09:
> > > 0000000000000001
> > > [  884.885778] R10: 0000000000000000 R11: 00000000000162b9 R12:
> > > 0000000000000040
> > > [  884.886346] R13: ffff8800126503e8 R14: 00000000ffffffff R15:
> > > 0000000000000010
> > > [  884.887146] FS:  0000000000000000(0000) GS:ffff88003e400000(0000)
> > > knlGS:0000000000000000
> > > [  884.888457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [  884.888792] CR2: 00000000000005e0 CR3: 000000003ad30000 CR4:
> > > 00000000000006e0
> > > [  884.889212] Call Trace:
> > > [  884.889526]  ? generic_make_request+0xc7/0x360
> > > [  884.889841]  generic_make_request+0x24/0x360
> > > [  884.890163]  ? generic_make_request+0xc7/0x360
> > > [  884.890486]  submit_bio+0x64/0x120
> > > [  884.890828]  ? page_in_rbio+0x4d/0x80 [btrfs]
> > > [  884.891206]  ? rbio_orig_end_io+0x80/0x80 [btrfs]
> > > [  884.891543]  finish_rmw+0x3f4/0x540 [btrfs]
> > > [  884.891875]  validate_rbio_for_rmw+0x36/0x40 [btrfs]
> > > [  884.892213]  raid_rmw_end_io+0x7a/0x90 [btrfs]
> > > [  884.892565]  bio_endio+0x56/0x60
> > > [  884.892891]  end_workqueue_fn+0x3c/0x40 [btrfs]
> > > [  884.893265]  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
> > > [  884.893698]  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
> > > [  884.894101]  process_one_work+0x2af/0x720
> > > [  884.894837]  ? process_one_work+0x22b/0x720
> > > [  884.895278]  worker_thread+0x4b/0x4f0
> > > [  884.895760]  kthread+0x10f/0x150
> > > [  884.896106]  ? process_one_work+0x720/0x720
> > > [  884.896448]  ? kthread_create_on_node+0x40/0x40
> > > [  884.896803]  ret_from_fork+0x2e/0x40
> > > [  884.897148] Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 54
> > > 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 00 00
> > > <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
> > > [  884.898449] RIP: generic_make_request_checks+0x4d/0x610 RSP:
> > > ffffc90001337bb8
> > > [  884.899223] CR2: 00000000000005e0
> > > [  884.900223] ---[ end trace 307e118b57a9995e ]---
> > > 
> > > Thanks,
> > > Qu
> > > 
> > > > 
> > > > I wonder if patch 4 and 5 are fixing the same use-after-free problem?
> > > > 
> > > > Thanks,
> > > > 
> > > > -liubo
> > > > 
> > > > > Thanks,
> > > > > Qu
> > > > > 
> > > > > 
> > > > > > 
> > > > > > Thanks,
> > > > > > 
> > > > > > -liubo
> > > > > > 
> > > > > > > And I think the logical is very strange, if RAID5/6 is unstable, there is no
> > > > > > > meaning to keep it fast.
> > > > > > > 
> > > > > > > Keep it stable first, and then consider the performance.
> > > > > > > 
> > > > > > > Thanks,
> > > > > > > Qu
> > > > > > > 
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > > 
> > > > > > > > -liubo
> > > > > > > > 
> > > > > > > > > Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
> > > > > > > > > ---
> > > > > > > > >  fs/btrfs/raid56.c | 18 +-----------------
> > > > > > > > >  1 file changed, 1 insertion(+), 17 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> > > > > > > > > index 453eefdcb591..aba82b95ec73 100644
> > > > > > > > > --- a/fs/btrfs/raid56.c
> > > > > > > > > +++ b/fs/btrfs/raid56.c
> > > > > > > > > @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > > > >  	int bucket;
> > > > > > > > >  	struct btrfs_stripe_hash *h;
> > > > > > > > >  	unsigned long flags;
> > > > > > > > > -	int keep_cache = 0;
> > > > > > > > > 
> > > > > > > > >  	bucket = rbio_bucket(rbio);
> > > > > > > > >  	h = rbio->fs_info->stripe_hash_table->table + bucket;
> > > > > > > > > @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > > > >  	spin_lock(&rbio->bio_list_lock);
> > > > > > > > > 
> > > > > > > > >  	if (!list_empty(&rbio->hash_list)) {
> > > > > > > > > -		/*
> > > > > > > > > -		 * if we're still cached and there is no other IO
> > > > > > > > > -		 * to perform, just leave this rbio here for others
> > > > > > > > > -		 * to steal from later
> > > > > > > > > -		 */
> > > > > > > > > -		if (list_empty(&rbio->plug_list) &&
> > > > > > > > > -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
> > > > > > > > > -			keep_cache = 1;
> > > > > > > > > -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
> > > > > > > > > -			BUG_ON(!bio_list_empty(&rbio->bio_list));
> > > > > > > > > -			goto done;
> > > > > > > > > -		}
> > > > > > > > > -
> > > > > > > > >  		list_del_init(&rbio->hash_list);
> > > > > > > > >  		atomic_dec(&rbio->refs);
> > > > > > > > > 
> > > > > > > > > @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
> > > > > > > > >  			goto done_nolock;
> > > > > > > > >  		}
> > > > > > > > >  	}
> > > > > > > > > -done:
> > > > > > > > >  	spin_unlock(&rbio->bio_list_lock);
> > > > > > > > >  	spin_unlock_irqrestore(&h->lock, flags);
> > > > > > > > > 
> > > > > > > > >  done_nolock:
> > > > > > > > > -	if (!keep_cache)
> > > > > > > > > -		remove_rbio_from_cache(rbio);
> > > > > > > > > +	remove_rbio_from_cache(rbio);
> > > > > > > > >  }
> > > > > > > > > 
> > > > > > > > >  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
> > > > > > > > > --
> > > > > > > > > 2.11.0
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > --
> > > > > > > > > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > > > > > > > > the body of a message to majordomo@vger.kernel.org
> > > > > > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > > 
> > > > > 
> > > > > 
> > > > 
> > > > 
> > > 
> > > 
> > 
> > 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal
  2017-03-21  5:45                   ` Liu Bo
@ 2017-03-21  7:00                     ` Qu Wenruo
  0 siblings, 0 replies; 27+ messages in thread
From: Qu Wenruo @ 2017-03-21  7:00 UTC (permalink / raw)
  To: bo.li.liu; +Cc: linux-btrfs



At 03/21/2017 01:45 PM, Liu Bo wrote:
> On Tue, Mar 21, 2017 at 10:23:56AM +0800, Qu Wenruo wrote:
>>
>>
>> At 03/21/2017 10:08 AM, Liu Bo wrote:
>>> On Tue, Mar 21, 2017 at 08:44:18AM +0800, Qu Wenruo wrote:
>>>>
>>>>
>>>> At 03/21/2017 04:23 AM, Liu Bo wrote:
>>>>> On Mon, Mar 20, 2017 at 02:21:48PM +0800, Qu Wenruo wrote:
>>>>>>
>>>>>>
>>>>>> At 03/18/2017 10:03 AM, Liu Bo wrote:
>>>>>>> On Fri, Mar 17, 2017 at 01:28:45PM +0800, Qu Wenruo wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> At 03/17/2017 12:44 PM, Liu Bo wrote:
>>>>>>>>> On Fri, Feb 03, 2017 at 04:20:22PM +0800, Qu Wenruo wrote:
>>>>>>>>>> Before this patch, btrfs raid56 will keep raid56 rbio even all its IO is
>>>>>>>>>> done.
>>>>>>>>>> This may save some time allocating rbio, but it can cause deadly
>>>>>>>>>> use-after-free bug, for the following case:
>>>>>>>>>>
>>>>>>>>>> Original fs: 4 devices RAID5
>>>>>>>>>>
>>>>>>>>>>        Process A                 |          Process B
>>>>>>>>>> --------------------------------------------------------------------------
>>>>>>>>>>                                  |  Start device replace
>>>>>>>>>>                                  |    Now the fs has 5 devices
>>>>>>>>>>                                  |    devid 0: replace device
>>>>>>>>>>                                  |    devid 1~4: old devices
>>>>>>>>>> btrfs_map_bio()                  |
>>>>>>>>>> |- __btrfs_map_block()           |
>>>>>>>>>> |    bbio has 5 stripes          |
>>>>>>>>>> |    including devid 0           |
>>>>>>>>>> |- raid56_parity_write()         |
>>>>>>>>>>                                  |
>>>>>>>>>> raid_write_end_io()              |
>>>>>>>>>> |- rbio_orig_end_io()            |
>>>>>>>>>>    |- unlock_stripe()            |
>>>>>>>>>>        Keeps the old rbio for    |
>>>>>>>>>>        later steal, which has    |
>>>>>>>>>>        stripe for devid 0        |
>>>>>>>>>>                                  |  Cancel device replace
>>>>>>>>>>                                  |    Now the fs has 4 devices
>>>>>>>>>>                                  |    devid 0 is freed
>>>>>>>>>> Some IO happens                  |
>>>>>>>>>> raid_write_end_io()              |
>>>>>>>>>> |- rbio_orig_end_io()            |
>>>>>>>>>>    |- unlock_stripe()            |
>>>>>>>>>>       |- steal_rbio()            |
>>>>>>>>>>            Use old rbio, whose   |
>>>>>>>>>>            bbio has freed devid 0|
>>>>>>>>>>            stripe                |
>>>>>>>>>> Any access to rbio->bbio will    |
>>>>>>>>>> cause general protection or NULL |
>>>>>>>>>> pointer dereference              |
>>>>>>>>>>
>>>>>>>>>> Such bug can already be triggered by fstests btrfs/069 for RAID5/6
>>>>>>>>>> profiles.
>>>>>>>>>>
>>>>>>>>>> Fix it by not keeping old rbio in unlock_stripe(), so we just free the
>>>>>>>>>> finished rbio and rbio->bbio, so above problem wont' happen.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't think this is acceptable, keeping a cache is important for
>>>>>>>>> raid56 write performance, could you please fix it by checking if the
>>>>>>>>> device is missing?
>>>>>>>>
>>>>>>>> Not possible, as it's keeping the btrfs_device pointer and never release it,
>>>>>>>> the stolen rbio can be kept forever until umount.
>>>>>>>>
>>>>>>>
>>>>>>> steal_rbio() only takes pages from rbio->stripe_pages, so the cached
>>>>>>> rbio->bbio is not going to the next IO's rbio because the cached one
>>>>>>> got freed immediately in steal_rbio(), where could we dereference
>>>>>>> rbio->bbio?
>>>>>>
>>>>>> Did you mean in unlock_stripe(), we still keep the rbio in cache, while
>>>>>> release its bbio?
>>>>>>
>>>>>
>>>>> I thought it was lock_stripe_add(), OK, so unlock_stripe() just caches
>>>>> the current rbio and doens't free it.  But the point about
>>>>> steal_rbio() still stands, steal_rbio() is supposed to take uptodate
>>>>> pages from the cached old rbio to the current processing rbio, but it
>>>>> doesn't touch the cached old rbio's bbio, nor uses the cached old rbio
>>>>> afterwards, instead it is the current processing rbio that would use
>>>>> its bbio for writing into target devices, but it has increased its own
>>>>> bio_counter.
>>>>>
>>>>>> This sounds quite good but I'm afraid it may cause more problems.
>>>>>>
>>>>>> Quite a lot of places are accessing rbio->bbio either for their btrfs
>>>>>> logical address or stripes or even stripe->dev.
>>>>>>
>>>>>
>>>>> I'm confused, could you please specify the call trace of general
>>>>> protection you got in the commit log?
>>>>
>>>> The 4th and 5th patches are designed to fix the same problem.
>>>>
>>>> If one applies 5th patch only and run btrfs/069, it will cause hang when
>>>> aborting replace, since btrfs_device of dev 0 is hold in bbio->stripes[] and
>>>> never get released.
>>>>
>>>> The 4th patch is used to solve such hang.
>>>>
>>>
>>> OK, I see.
>>>
>>> Firstly, the above commit log is misleading people a bit because it
>>> says that steal_rbio() dereferences the device of the cached rbio and
>>> that device has got free'd, but steal_rbio() actually doesn't.  Yes,
>>> the cached rbio holds a reference on the free'd device, but I think
>>> the below 'NULL pointer dereference' comes from writing back pages
>>> into target devices when doing RMW with the current rbio instead of
>>> the old cached one, right?
>>
>> Yes, steal_bio() is not related to this problem, sorry for the confusion.
>>
>>>
>>> Secondly, if it is the current rio that ends up this 'NULL pointer
>>> dereference', it could hold a bio_counter and let the replace thread
>>> canceled by scrub wait for this bio_counter to be zero.  Although
>>> btrfs_dev_replace_finishing() has flushed delalloc IO and committed
>>> transaction, seems like scrub is an exception because it can continued
>>> after committing transaction.
>>
>> If I understand it correctly, did you mean hold bio_counter when rbio holds
>> bbio?
>>
>
> We have bio_counter to prevent the race against a successful dev
> replace (such as btrfs_map_bio and btrfs_discard_extent), but for a
> canceled dev replace (either it's canceled by scrub or by 'btrfs
> replace cancel'), bio_counter could also be utilized to avoid the
> race.
>
> Now that scrub, which does raid parity recover, is the only one
> without increasing bio_counter while accessing devices and stripes.
>
>> That's OK, but we still need the 4th patch, or it will block replace cancel
>> forever.
>>
>
> It is not the current code but patch 5 that needs patch 4 to avoid
> blocking replace threads.
>
>>>
>>> Thirdly, it is possible that this canceled dev-replace could make
>>> fstrim get a 'general protection' or 'NULL pointer dereference' since
>>> it could access target devices and is not sychoronized by committing
>>> transaction.
>>
>> So I'm using the refcount for btrfs_device to do full protection for it.
>> As long as btrfs_dev can only be freed when no on holds it, instead of no
>> bio pending for it, it should be safer.
>>
>
> If increading bio_counter could fix the bug like other use-after-free dev-replace bugs, e.g.
> 4245215d6a8d Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56
> ,
> I'd prefer it, a fix with keeping the benefit of caching rbio.

Thanks, I'll try to bio_counter to fix.

It's never a bad idea to use existing facilities.

Thanks,
Qu

>
>>>
>>> Please correct me if I'm wrong since I failed to reproduce it.
>>
>> The bug is harder to trigger in v4.11-rc2 now.
>>
>> I modified btrfs/069 to make it easier to trigger, but it's still quite hard
>> to reproduce it.
>>
>> Even with modification, the possibility is still low, at about 10~20%.
>>
>> Hopes the diff could help you to trigger it.
>
> I'm using -n 2000 but no luck.
>
> Thanks,
>
> -liubo
>
>>
>> Thanks,
>> Qu
>>
>> diff --git a/tests/btrfs/069 b/tests/btrfs/069
>> index d939cd8..f9ff945 100755
>> --- a/tests/btrfs/069
>> +++ b/tests/btrfs/069
>> @@ -74,7 +74,7 @@ run_test()
>>         _scratch_mount >>$seqres.full 2>&1
>>         SCRATCH_DEV_POOL=$saved_scratch_dev_pool
>>
>> -       args=`_scale_fsstress_args -p 20 -n 100 $FSSTRESS_AVOID -d
>> $SCRATCH_MNT/stressdir`
>> +       args=`_scale_fsstress_args -p 20 -n 1000 $FSSTRESS_AVOID -d
>> $SCRATCH_MNT/stressdir`
>>         echo "Run fsstress $args" >>$seqres.full
>>         $FSSTRESS_PROG $args >/dev/null 2>&1 &
>>         fsstress_pid=$!
>> @@ -115,9 +115,10 @@ run_test()
>>  }
>>
>>  echo "Silence is golden"
>> -for t in "${_btrfs_profile_configs[@]}"; do
>> -       run_test "$t"
>> -done
>> +#for t in "${_btrfs_profile_configs[@]}"; do
>> +#      run_test "$t"
>> +#done
>> +run_test "-d raid5 -m raid5"
>>
>>  status=0
>>  exit
>>
>>>
>>> Thanks,
>>>
>>> -liubo
>>>
>>>> And the kernel NULL pointer access is like this when running modified
>>>> btrfs/069 (only run on RAID5, and improve the duration of fsstress):
>>>>
>>>> [  884.877421] BUG: unable to handle kernel NULL pointer dereference at
>>>> 00000000000005e0
>>>> [  884.878206] IP: generic_make_request_checks+0x4d/0x610
>>>> [  884.878541] PGD 2d45a067
>>>> [  884.878542] PUD 3ba0e067
>>>> [  884.878857] PMD 0
>>>> [  884.879189]
>>>> [  884.879899] Oops: 0000 [#1] SMP
>>>> [  884.880207] Modules linked in: btrfs(O) ext4 jbd2 mbcache xor raid6_pq
>>>> netconsole xfs [last unloaded: btrfs]
>>>> [  884.880845] CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G  O
>>>> 4.11.0-rc2 #72
>>>> [  884.881455] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
>>>> 1.10.2-20170228_101828-anatol 04/01/2014
>>>> [  884.882119] Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper
>>>> [btrfs]
>>>> [  884.883089] task: ffff88002875b4c0 task.stack: ffffc90001334000
>>>> [  884.883527] RIP: 0010:generic_make_request_checks+0x4d/0x610
>>>> [  884.883855] RSP: 0018:ffffc90001337bb8 EFLAGS: 00010283
>>>> [  884.884186] RAX: 0000000000000000 RBX: ffff8800126503e8 RCX:
>>>> 0000000000218800
>>>> [  884.884539] RDX: 0000000000000040 RSI: 0000000000000001 RDI:
>>>> ffff88003d8116c0
>>>> [  884.884897] RBP: ffffc90001337c18 R08: 0000000000000001 R09:
>>>> 0000000000000001
>>>> [  884.885778] R10: 0000000000000000 R11: 00000000000162b9 R12:
>>>> 0000000000000040
>>>> [  884.886346] R13: ffff8800126503e8 R14: 00000000ffffffff R15:
>>>> 0000000000000010
>>>> [  884.887146] FS:  0000000000000000(0000) GS:ffff88003e400000(0000)
>>>> knlGS:0000000000000000
>>>> [  884.888457] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>>> [  884.888792] CR2: 00000000000005e0 CR3: 000000003ad30000 CR4:
>>>> 00000000000006e0
>>>> [  884.889212] Call Trace:
>>>> [  884.889526]  ? generic_make_request+0xc7/0x360
>>>> [  884.889841]  generic_make_request+0x24/0x360
>>>> [  884.890163]  ? generic_make_request+0xc7/0x360
>>>> [  884.890486]  submit_bio+0x64/0x120
>>>> [  884.890828]  ? page_in_rbio+0x4d/0x80 [btrfs]
>>>> [  884.891206]  ? rbio_orig_end_io+0x80/0x80 [btrfs]
>>>> [  884.891543]  finish_rmw+0x3f4/0x540 [btrfs]
>>>> [  884.891875]  validate_rbio_for_rmw+0x36/0x40 [btrfs]
>>>> [  884.892213]  raid_rmw_end_io+0x7a/0x90 [btrfs]
>>>> [  884.892565]  bio_endio+0x56/0x60
>>>> [  884.892891]  end_workqueue_fn+0x3c/0x40 [btrfs]
>>>> [  884.893265]  btrfs_scrubparity_helper+0xef/0x620 [btrfs]
>>>> [  884.893698]  btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
>>>> [  884.894101]  process_one_work+0x2af/0x720
>>>> [  884.894837]  ? process_one_work+0x22b/0x720
>>>> [  884.895278]  worker_thread+0x4b/0x4f0
>>>> [  884.895760]  kthread+0x10f/0x150
>>>> [  884.896106]  ? process_one_work+0x720/0x720
>>>> [  884.896448]  ? kthread_create_on_node+0x40/0x40
>>>> [  884.896803]  ret_from_fork+0x2e/0x40
>>>> [  884.897148] Code: 67 28 48 c7 c7 27 7f c9 81 e8 90 6c d4 ff e8 0b bb 54
>>>> 00 41 c1 ec 09 48 8b 7b 08 45 85 e4 0f 85 be 00 00 00 48 8b 87 00 01 00 00
>>>> <4c> 8b b0 e0 05 00 00 4d 85 f6 0f 84 86 01 00 00 4c 8b af f0 00
>>>> [  884.898449] RIP: generic_make_request_checks+0x4d/0x610 RSP:
>>>> ffffc90001337bb8
>>>> [  884.899223] CR2: 00000000000005e0
>>>> [  884.900223] ---[ end trace 307e118b57a9995e ]---
>>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>> I wonder if patch 4 and 5 are fixing the same use-after-free problem?
>>>>>
>>>>> Thanks,
>>>>>
>>>>> -liubo
>>>>>
>>>>>> Thanks,
>>>>>> Qu
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> -liubo
>>>>>>>
>>>>>>>> And I think the logical is very strange, if RAID5/6 is unstable, there is no
>>>>>>>> meaning to keep it fast.
>>>>>>>>
>>>>>>>> Keep it stable first, and then consider the performance.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Qu
>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>>
>>>>>>>>> -liubo
>>>>>>>>>
>>>>>>>>>> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
>>>>>>>>>> ---
>>>>>>>>>>  fs/btrfs/raid56.c | 18 +-----------------
>>>>>>>>>>  1 file changed, 1 insertion(+), 17 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
>>>>>>>>>> index 453eefdcb591..aba82b95ec73 100644
>>>>>>>>>> --- a/fs/btrfs/raid56.c
>>>>>>>>>> +++ b/fs/btrfs/raid56.c
>>>>>>>>>> @@ -776,7 +776,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>>>  	int bucket;
>>>>>>>>>>  	struct btrfs_stripe_hash *h;
>>>>>>>>>>  	unsigned long flags;
>>>>>>>>>> -	int keep_cache = 0;
>>>>>>>>>>
>>>>>>>>>>  	bucket = rbio_bucket(rbio);
>>>>>>>>>>  	h = rbio->fs_info->stripe_hash_table->table + bucket;
>>>>>>>>>> @@ -788,19 +787,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>>>  	spin_lock(&rbio->bio_list_lock);
>>>>>>>>>>
>>>>>>>>>>  	if (!list_empty(&rbio->hash_list)) {
>>>>>>>>>> -		/*
>>>>>>>>>> -		 * if we're still cached and there is no other IO
>>>>>>>>>> -		 * to perform, just leave this rbio here for others
>>>>>>>>>> -		 * to steal from later
>>>>>>>>>> -		 */
>>>>>>>>>> -		if (list_empty(&rbio->plug_list) &&
>>>>>>>>>> -		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
>>>>>>>>>> -			keep_cache = 1;
>>>>>>>>>> -			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
>>>>>>>>>> -			BUG_ON(!bio_list_empty(&rbio->bio_list));
>>>>>>>>>> -			goto done;
>>>>>>>>>> -		}
>>>>>>>>>> -
>>>>>>>>>>  		list_del_init(&rbio->hash_list);
>>>>>>>>>>  		atomic_dec(&rbio->refs);
>>>>>>>>>>
>>>>>>>>>> @@ -848,13 +834,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
>>>>>>>>>>  			goto done_nolock;
>>>>>>>>>>  		}
>>>>>>>>>>  	}
>>>>>>>>>> -done:
>>>>>>>>>>  	spin_unlock(&rbio->bio_list_lock);
>>>>>>>>>>  	spin_unlock_irqrestore(&h->lock, flags);
>>>>>>>>>>
>>>>>>>>>>  done_nolock:
>>>>>>>>>> -	if (!keep_cache)
>>>>>>>>>> -		remove_rbio_from_cache(rbio);
>>>>>>>>>> +	remove_rbio_from_cache(rbio);
>>>>>>>>>>  }
>>>>>>>>>>
>>>>>>>>>>  static void __free_raid_bio(struct btrfs_raid_bio *rbio)
>>>>>>>>>> --
>>>>>>>>>> 2.11.0
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> --
>>>>>>>>>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>>>>>>>>> the body of a message to majordomo@vger.kernel.org
>>>>>>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>
>



^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2017-03-21  7:01 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-02-03  8:20 [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
2017-02-03  8:20 ` [PATCH 1/5] btrfs: scrub: Introduce full stripe lock for RAID56 Qu Wenruo
2017-02-03  8:20 ` [PATCH 2/5] btrfs: scrub: Fix RAID56 recovery race condition Qu Wenruo
2017-02-03  8:20 ` [PATCH 3/5] btrfs: raid56: Use correct stolen pages to calculate P/Q Qu Wenruo
2017-03-16  5:36   ` Liu Bo
2017-03-16  8:30     ` Qu Wenruo
2017-03-17  6:31     ` Qu Wenruo
2017-03-17 22:19       ` Liu Bo
2017-03-20  4:33         ` Qu Wenruo
2017-02-03  8:20 ` [PATCH 4/5] btrfs: raid56: Don't keep rbio for later steal Qu Wenruo
2017-03-17  4:44   ` Liu Bo
2017-03-17  5:28     ` Qu Wenruo
2017-03-18  2:03       ` Liu Bo
2017-03-20  6:21         ` Qu Wenruo
2017-03-20 20:23           ` Liu Bo
2017-03-21  0:44             ` Qu Wenruo
2017-03-21  2:08               ` Liu Bo
2017-03-21  2:23                 ` Qu Wenruo
2017-03-21  5:45                   ` Liu Bo
2017-03-21  7:00                     ` Qu Wenruo
2017-02-03  8:20 ` [PATCH 5/5] btrfs: replace: Use ref counts to avoid destroying target device when canceled Qu Wenruo
2017-03-18  2:12   ` Liu Bo
2017-03-20  6:30     ` Qu Wenruo
2017-03-20 19:31       ` Liu Bo
2017-03-07  3:48 ` [PATCH 0/5] raid56: variant bug fixes Qu Wenruo
2017-03-14 13:47   ` David Sterba
2017-03-14 21:30     ` Goffredo Baroncelli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.