linux-btrfs.vger.kernel.org archive mirror
* [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
@ 2019-11-07  6:27 Qu Wenruo
  2019-11-07  6:27 ` [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Qu Wenruo
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Qu Wenruo @ 2019-11-07  6:27 UTC (permalink / raw)
  To: linux-btrfs

This patchset makes btrfs degraded mount more intelligent and provides
a more consistent profile-keeping behavior.

One of the most problematic aspects of degraded mount is that btrfs may
create unwanted profiles.

 # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
 # wipefs -fa /dev/test/scratch2
 # mount -o degraded /dev/test/scratch1 /mnt/btrfs
 # fallocate -l 1G /mnt/btrfs/foobar
 # btrfs ins dump-tree -t chunk /dev/test/scratch1
        item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
                length 536870912 owner 2 stripe_len 65536 type DATA
 New data chunks will fall back to SINGLE or DUP.


The cause is pretty simple: when mounted degraded, missing devices can't
be used for chunk allocation, so btrfs has to fall back to the SINGLE
profile.

This patchset makes btrfs consider missing devices as a last resort if
the current rw devices can't fulfil the profile request.

This should provide a good balance between treating all missing devices
as RW and completely ruling out missing devices (the current mainline
behavior).
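
For illustration only, below is a minimal standalone C model of that
policy (this is not kernel code; devs_min/ndevs merely mirror the names
used by the allocator in this series): stripes are gathered from rw
devices first, and missing devices are used only when the profile
minimum cannot be met otherwise.

#include <stdio.h>

struct dev {
	int devid;
	int missing;	/* 1 if the device is currently missing */
};

/* Gather devices matching @want_missing until devs_min is reached. */
static int gather(const struct dev *devs, int nr_devs, int devs_min,
		  int want_missing, int ndevs)
{
	for (int i = 0; i < nr_devs && ndevs < devs_min; i++)
		if (devs[i].missing == want_missing)
			ndevs++;
	return ndevs;
}

int main(void)
{
	/* 2-device RAID1 mounted degraded: one device is missing. */
	struct dev devs[] = { { 1, 0 }, { 2, 1 } };
	int devs_min = 2;	/* RAID1 wants stripes on 2 devices */
	int ndevs;

	ndevs = gather(devs, 2, devs_min, 0, 0);	/* rw pass */
	if (ndevs < devs_min)				/* last resort */
		ndevs = gather(devs, 2, devs_min, 1, ndevs);

	printf("stripes placed on %d device(s)%s\n", ndevs,
	       ndevs >= devs_min ? ", i.e. a degraded RAID1 chunk" : "");
	return 0;
}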

Qu Wenruo (3):
  btrfs: volumes: Refactor device holes gathering into a separate
    function
  btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing
    devices
  btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a
    chunk

 fs/btrfs/block-group.c |  10 ++-
 fs/btrfs/volumes.c     | 170 ++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h     |   6 ++
 3 files changed, 121 insertions(+), 65 deletions(-)

-- 
2.24.0



* [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function
  2019-11-07  6:27 [PATCH 0/3] btrfs: More intelligent degraded chunk allocator Qu Wenruo
@ 2019-11-07  6:27 ` Qu Wenruo
  2019-11-07  9:20   ` Johannes Thumshirn
  2019-11-07  6:27 ` [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Qu Wenruo
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-07  6:27 UTC (permalink / raw)
  To: linux-btrfs

In __btrfs_alloc_chunk() we need to iterate through all rw devices and
gather their hole sizes.

This code can be refactored into a new function, gather_dev_holes(),
which gathers hole info from a given list_head.

This provides the basis for the later degraded chunk feature.
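
To make the intent concrete, the calling pattern this series converges
on looks like the following (simplified from this patch and patch 3/3,
with error handling dropped):

	ndevs = 0;
	/* Regular pass: rw devices from fs_devices->alloc_list. */
	ret = gather_dev_holes(info, devices_info, &ndevs,
			&fs_devices->alloc_list, fs_devices->rw_devices,
			max_stripe_size, dev_stripes);

	/* Patch 3/3: last-resort pass over fs_devices->missing_list. */
	if (ndevs < devs_min)
		ret = gather_dev_holes(info, devices_info + ndevs, &ndevs,
				&fs_devices->missing_list,
				fs_devices->missing_devices,
				max_stripe_size, dev_stripes);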

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 129 ++++++++++++++++++++++++++-------------------
 1 file changed, 74 insertions(+), 55 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdd7af424033..eee5fc1d11f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4898,17 +4898,84 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 	btrfs_set_fs_incompat(info, RAID56);
 }
 
+static int gather_dev_holes(struct btrfs_fs_info *info,
+			    struct btrfs_device_info *devices_info,
+			    int *index, struct list_head *list,
+			    int max_nr_devs, u64 stripe_size, int dev_stripes)
+{
+	struct btrfs_device *device;
+	int ret;
+	int ndevs = 0;
+
+	list_for_each_entry(device, list, dev_alloc_list) {
+		u64 max_avail;
+		u64 dev_offset;
+		u64 total_avail;
+
+		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+			WARN(1, KERN_ERR
+			       "BTRFS: read-only device in alloc_list\n");
+			continue;
+		}
+
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+					&device->dev_state) ||
+		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+			continue;
+
+		if (device->total_bytes > device->bytes_used)
+			total_avail = device->total_bytes - device->bytes_used;
+		else
+			total_avail = 0;
+
+		/* If there is no space on this device, skip it. */
+		if (total_avail == 0)
+			continue;
+
+		ret = find_free_dev_extent(device,
+					   stripe_size * dev_stripes,
+					   &dev_offset, &max_avail);
+		if (ret && ret != -ENOSPC)
+			break;
+
+		if (ret == 0)
+			max_avail = stripe_size * dev_stripes;
+
+		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+			if (btrfs_test_opt(info, ENOSPC_DEBUG))
+				btrfs_debug(info,
+			"%s: devid %llu has no free space, have=%llu want=%u",
+					    __func__, device->devid, max_avail,
+					    BTRFS_STRIPE_LEN * dev_stripes);
+			continue;
+		}
+
+		if (ndevs == max_nr_devs) {
+			WARN(1, "%s: found more than %u devices\n", __func__,
+			     max_nr_devs);
+			break;
+		}
+		ret = 0;
+		devices_info[ndevs].dev_offset = dev_offset;
+		devices_info[ndevs].max_avail = max_avail;
+		devices_info[ndevs].total_avail = total_avail;
+		devices_info[ndevs].dev = device;
+		++ndevs;
+	}
+	*index += ndevs;
+	return 0;
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       u64 start, u64 type)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
-	struct btrfs_device *device;
 	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
 	struct btrfs_device_info *devices_info = NULL;
-	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
 	int data_stripes;	/* number of stripes that count for
 				   block group size */
@@ -4983,59 +5050,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 * about the available holes on each device.
 	 */
 	ndevs = 0;
-	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
-		u64 max_avail;
-		u64 dev_offset;
-
-		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
-			WARN(1, KERN_ERR
-			       "BTRFS: read-only device in alloc_list\n");
-			continue;
-		}
-
-		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-					&device->dev_state) ||
-		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
-			continue;
-
-		if (device->total_bytes > device->bytes_used)
-			total_avail = device->total_bytes - device->bytes_used;
-		else
-			total_avail = 0;
-
-		/* If there is no space on this device, skip it. */
-		if (total_avail == 0)
-			continue;
-
-		ret = find_free_dev_extent(device,
-					   max_stripe_size * dev_stripes,
-					   &dev_offset, &max_avail);
-		if (ret && ret != -ENOSPC)
-			goto error;
-
-		if (ret == 0)
-			max_avail = max_stripe_size * dev_stripes;
-
-		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
-			if (btrfs_test_opt(info, ENOSPC_DEBUG))
-				btrfs_debug(info,
-			"%s: devid %llu has no free space, have=%llu want=%u",
-					    __func__, device->devid, max_avail,
-					    BTRFS_STRIPE_LEN * dev_stripes);
-			continue;
-		}
-
-		if (ndevs == fs_devices->rw_devices) {
-			WARN(1, "%s: found more than %llu devices\n",
-			     __func__, fs_devices->rw_devices);
-			break;
-		}
-		devices_info[ndevs].dev_offset = dev_offset;
-		devices_info[ndevs].max_avail = max_avail;
-		devices_info[ndevs].total_avail = total_avail;
-		devices_info[ndevs].dev = device;
-		++ndevs;
-	}
+	ret = gather_dev_holes(info, devices_info, &ndevs,
+			&fs_devices->alloc_list, fs_devices->rw_devices,
+			max_stripe_size, dev_stripes);
+	if (ret < 0)
+		goto error;
 
 	/*
 	 * now sort the devices by hole size / available space
-- 
2.24.0



* [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
  2019-11-07  6:27 [PATCH 0/3] btrfs: More intelligent degraded chunk allocator Qu Wenruo
  2019-11-07  6:27 ` [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Qu Wenruo
@ 2019-11-07  6:27 ` Qu Wenruo
  2019-11-07  9:31   ` Johannes Thumshirn
  2019-11-19 10:03   ` Anand Jain
  2019-11-07  6:27 ` [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk Qu Wenruo
  2019-11-18 20:18 ` [PATCH 0/3] btrfs: More intelligent degraded chunk allocator David Sterba
  3 siblings, 2 replies; 26+ messages in thread
From: Qu Wenruo @ 2019-11-07  6:27 UTC (permalink / raw)
  To: linux-btrfs

This enables btrfs to iterate missing devices separately, without
iterating all of fs_devices.

This provides the basis for the later degraded chunk enhancement.

The change includes (see the sketch after this list):
- Add missing devices to btrfs_fs_devices::missing_list
  This happens in add_missing_dev() and at the other locations where
  missing_devices is increased.

- Remove missing devices from btrfs_fs_devices::missing_list
  This needs to cover all locations where missing_devices is decreased.
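
As a rough standalone sketch of the bookkeeping rule this patch enforces
(not kernel code, and the structs are heavily simplified): whenever the
MISSING state of a device flips, its list membership flips with it, so
missing_list always stays in sync with the missing_devices counter.

#include <stdio.h>
#include <stdbool.h>

struct dev {
	int devid;
	bool missing;
	struct dev *next;		/* singly linked for brevity */
};

struct fs_devices {
	struct dev *alloc_list;		/* rw devices usable for allocation */
	struct dev *missing_list;	/* fallback list for degraded mounts */
	int missing_devices;
};

static void unlink_dev(struct dev **head, struct dev *d)
{
	while (*head && *head != d)
		head = &(*head)->next;
	if (*head)
		*head = d->next;
}

static void set_missing(struct fs_devices *fs, struct dev *d, bool missing)
{
	if (d->missing == missing)
		return;
	/* Leave the old list, join the other one, adjust the counter. */
	unlink_dev(missing ? &fs->alloc_list : &fs->missing_list, d);
	d->missing = missing;
	if (missing) {
		d->next = fs->missing_list;
		fs->missing_list = d;
		fs->missing_devices++;
	} else {
		d->next = fs->alloc_list;
		fs->alloc_list = d;
		fs->missing_devices--;
	}
}

int main(void)
{
	struct dev d1 = { 1, false, NULL };
	struct fs_devices fs = { &d1, NULL, 0 };

	set_missing(&fs, &d1, true);	/* device disappears */
	printf("missing_devices=%d\n", fs.missing_devices);	/* 1 */
	set_missing(&fs, &d1, false);	/* device reappears */
	printf("missing_devices=%d\n", fs.missing_devices);	/* 0 */
	return 0;
}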

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
 fs/btrfs/volumes.h |  6 ++++++
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eee5fc1d11f0..a462d8de5d2a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -324,6 +324,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 
 	INIT_LIST_HEAD(&fs_devs->devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
+	INIT_LIST_HEAD(&fs_devs->missing_list);
 	INIT_LIST_HEAD(&fs_devs->fs_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -1089,6 +1090,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 			fs_devices->missing_devices--;
 			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+			list_del_init(&device->dev_alloc_list);
 		}
 	}
 
@@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 	if (device->bdev)
 		fs_devices->open_devices--;
 
+	list_del_init(&device->dev_alloc_list);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
-	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
-		list_del_init(&device->dev_alloc_list);
+	    device->devid != BTRFS_DEV_REPLACE_DEVID)
 		fs_devices->rw_devices--;
-	}
 
 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		fs_devices->missing_devices--;
@@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		device->fs_devices->rw_devices--;
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+		mutex_lock(&fs_info->chunk_mutex);
+		list_del_init(&device->dev_alloc_list);
+		device->fs_devices->missing_devices--;
+		mutex_unlock(&fs_info->chunk_mutex);
+	}
 
 	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
@@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	if (cur_devices != fs_devices)
 		fs_devices->total_devices--;
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
-		cur_devices->missing_devices--;
-
 	btrfs_assign_next_active_device(device, NULL);
 
 	if (device->bdev) {
@@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		device->fs_devices->rw_devices++;
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+		mutex_lock(&fs_info->chunk_mutex);
+		list_add(&device->dev_alloc_list,
+			 &fs_devices->missing_list);
+		device->fs_devices->missing_devices++;
+		mutex_unlock(&fs_info->chunk_mutex);
+	}
 	goto out;
 }
 
@@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	seed_devices->opened = 1;
 	INIT_LIST_HEAD(&seed_devices->devices);
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	INIT_LIST_HEAD(&seed_devices->missing_list);
 	mutex_init(&seed_devices->device_list_mutex);
 
 	mutex_lock(&fs_devices->device_list_mutex);
@@ -6640,6 +6652,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
 	fs_devices->num_devices++;
 
 	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+	list_add(&device->dev_alloc_list, &fs_devices->missing_list);
 	fs_devices->missing_devices++;
 
 	return device;
@@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf,
 			 */
 			device->fs_devices->missing_devices++;
 			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+			list_add(&device->dev_alloc_list, &fs_devices->missing_list);
 		}
 
 		/* Move the device to its own fs_devices */
@@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf,
 
 			device->fs_devices->missing_devices--;
 			fs_devices->missing_devices++;
+			list_move(&device->dev_alloc_list, &fs_devices->missing_list);
 
 			device->fs_devices = fs_devices;
 		}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a7da1f3e3627..9cef4dc4b5be 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
 	 */
 	struct list_head alloc_list;
 
+	/*
+	 * Devices which can't be found. Projected by chunk_mutex.
+	 * This acts as a fallback allocation list for certain degraded mount.
+	 */
+	struct list_head missing_list;
+
 	struct btrfs_fs_devices *seed;
 	int seeding;
 
-- 
2.24.0



* [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-07  6:27 [PATCH 0/3] btrfs: More intelligent degraded chunk allocator Qu Wenruo
  2019-11-07  6:27 ` [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Qu Wenruo
  2019-11-07  6:27 ` [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Qu Wenruo
@ 2019-11-07  6:27 ` Qu Wenruo
  2019-11-19 10:05   ` Anand Jain
  2019-11-18 20:18 ` [PATCH 0/3] btrfs: More intelligent degraded chunk allocator David Sterba
  3 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-07  6:27 UTC (permalink / raw)
  To: linux-btrfs

[PROBLEM]
Btrfs degraded mount will fall back to the SINGLE profile if there are
not enough devices:

 # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
 # wipefs -fa /dev/test/scratch2
 # mount -o degraded /dev/test/scratch1 /mnt/btrfs
 # fallocate -l 1G /mnt/btrfs/foobar
 # btrfs ins dump-tree -t chunk /dev/test/scratch1
        item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
                length 536870912 owner 2 stripe_len 65536 type DATA
 New data chunks will fall back to SINGLE.

If the user doesn't balance those SINGLE chunks, then even with the
missing devices replaced, the fs is no longer fully RAID1, and a further
missing device can break the tolerance.

[CAUSE]
The cause is pretty simple: when mounted degraded, missing devices
can't be used for chunk allocation, so btrfs has to fall back to the
SINGLE profile.

[ENHANCEMENT]
To avoid this problem, this patch will:
- Make all profile reducers/updaters consider missing devices as part
  of num_devices
- Make the chunk allocator fall back to missing_list as a last resort

If we have enough rw_devices, the regular chunk allocation code is used,
which avoids allocating degraded chunks.
E.g. for a 3-device RAID1 degraded mount, we will use the 2 existing
devices to allocate chunks and avoid degraded chunks.

But if we don't have enough rw_devices, then we check missing devices to
allocate degraded chunks.
E.g. for a 2-device RAID1 degraded mount, we have to allocate degraded
chunks to keep the RAID1 profile.
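
For illustration, a tiny standalone check (not kernel code) of when the
last-resort pass triggers, using RAID1's minimum of two stripes on
distinct devices:

#include <stdio.h>

static const char *alloc_mode(int rw, int missing, int devs_min)
{
	if (rw >= devs_min)
		return "regular chunk, missing devices untouched";
	if (rw + missing >= devs_min)
		return "degraded chunk, missing_list used as last resort";
	return "profile cannot be fulfilled";
}

int main(void)
{
	/* RAID1 needs stripes on at least 2 distinct devices. */
	printf("3-dev RAID1, 1 missing: %s\n", alloc_mode(2, 1, 2));
	printf("2-dev RAID1, 1 missing: %s\n", alloc_mode(1, 1, 2));
	return 0;
}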

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/block-group.c | 10 +++++++---
 fs/btrfs/volumes.c     | 18 +++++++++++++++---
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bf7e3f23bba7..1686fd31679b 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -52,11 +52,13 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  */
 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 {
-	u64 num_devices = fs_info->fs_devices->rw_devices;
+	u64 num_devices;
 	u64 target;
 	u64 raid_type;
 	u64 allowed = 0;
 
+	num_devices = fs_info->fs_devices->rw_devices +
+		      fs_info->fs_devices->missing_devices;
 	/*
 	 * See if restripe for this chunk_type is in progress, if so try to
 	 * reduce to the target profile
@@ -1986,7 +1988,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
 	if (stripped)
 		return extended_to_chunk(stripped);
 
-	num_devices = fs_info->fs_devices->rw_devices;
+	num_devices = fs_info->fs_devices->rw_devices +
+		      fs_info->fs_devices->missing_devices;
 
 	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
 		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
@@ -2981,7 +2984,8 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 
 	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
 	if (!num_dev)
-		num_dev = fs_info->fs_devices->rw_devices;
+		num_dev = fs_info->fs_devices->rw_devices +
+			  fs_info->fs_devices->missing_devices;
 
 	return num_dev;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a462d8de5d2a..4dee1974ceb7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5052,8 +5052,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
-	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
-			       GFP_NOFS);
+	devices_info = kcalloc(fs_devices->rw_devices +
+			       fs_devices->missing_devices,
+			       sizeof(*devices_info), GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
 
@@ -5067,7 +5068,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			max_stripe_size, dev_stripes);
 	if (ret < 0)
 		goto error;
-
+	/*
+	 * If rw devices can't fullfil the request, fallback to missing devices
+	 * as last resort.
+	 */
+	if (ndevs < devs_min) {
+		ret = gather_dev_holes(info, devices_info + ndevs, &ndevs,
+				&fs_devices->missing_list,
+				fs_devices->missing_devices,
+				max_stripe_size, dev_stripes);
+		if (ret < 0)
+			goto error;
+	}
 	/*
 	 * now sort the devices by hole size / available space
 	 */
-- 
2.24.0



* Re: [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function
  2019-11-07  6:27 ` [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Qu Wenruo
@ 2019-11-07  9:20   ` Johannes Thumshirn
  2019-11-07  9:33     ` Qu Wenruo
  0 siblings, 1 reply; 26+ messages in thread
From: Johannes Thumshirn @ 2019-11-07  9:20 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 07/11/2019 07:27, Qu Wenruo wrote:
> +static int gather_dev_holes(struct btrfs_fs_info *info,
> +			    struct btrfs_device_info *devices_info,
> +			    int *index, struct list_head *list,
> +			    int max_nr_devs, u64 stripe_size, int dev_stripes)
> +{

Hi Qu,

What do you think of

static int gather_dev_holes(struct btrfs_fs_info *info,
			    int *index,	u64 stripe_size,
                            int dev_stripes)
{
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device *device;
	int ret;
	int ndevs = 0;

	list_for_each_entry(device, &fs_devices->alloc_list,
			    dev_alloc_list) {

[...]


btrfs_device_info can be derived from btrfs_fs_info, and *list and
max_nr_devs can be derived from btrfs_device_info.

This reduces the number of arguments by 3, and we don't need to pass
that odd 'struct list_head *list', whose purpose isn't really clear
from the type alone.


Byte,
	Johannes
-- 
Johannes Thumshirn                            SUSE Labs Filesystems
jthumshirn@suse.de                                +49 911 74053 689
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5
90409 Nürnberg
Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


* Re: [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
  2019-11-07  6:27 ` [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Qu Wenruo
@ 2019-11-07  9:31   ` Johannes Thumshirn
  2019-11-19 10:03   ` Anand Jain
  1 sibling, 0 replies; 26+ messages in thread
From: Johannes Thumshirn @ 2019-11-07  9:31 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 07/11/2019 07:27, Qu Wenruo wrote:
> +	/*
> +	 * Devices which can't be found. Projected by chunk_mutex.
> +	 * This acts as a fallback allocation list for certain degraded mount.
> +	 */
> +	struct list_head missing_list;

From a quick glance, s/Projected/Protected/

-- 
Johannes Thumshirn                            SUSE Labs Filesystems
jthumshirn@suse.de                                +49 911 74053 689
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5
90409 Nürnberg
Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


* Re: [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function
  2019-11-07  9:20   ` Johannes Thumshirn
@ 2019-11-07  9:33     ` Qu Wenruo
  2019-11-07  9:45       ` Johannes Thumshirn
  0 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-07  9:33 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs





On 2019/11/7 5:20 PM, Johannes Thumshirn wrote:
> On 07/11/2019 07:27, Qu Wenruo wrote:
>> +static int gather_dev_holes(struct btrfs_fs_info *info,
>> +			    struct btrfs_device_info *devices_info,
>> +			    int *index, struct list_head *list,
>> +			    int max_nr_devs, u64 stripe_size, int dev_stripes)
>> +{
> 
> Hi Qu,
> 
> What do you think of
> 
> static int gather_dev_holes(struct btrfs_fs_info *info,
> 			    int *index,	u64 stripe_size,
>                             int dev_stripes)
> {
> 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
> 	struct btrfs_device *device;
> 	int ret;
> 	int ndevs = 0;
> 
> 	list_for_each_entry(device, &fs_devices->alloc_list,
> 			    dev_alloc_list) {
> 
> [...]
> 
> 
> btrfs_device_info can be derived from btrfs_fs_info, and *list and
> max_nr_devs can be derived from btrfs_device_info.
> 
> This reduces the number of arguments by 3 and we don't need to pass that
> odd 'struct list_head *list' which isn't really clear from the type what
> it is referring to.

The objective of this refactor is to accept different lists, not only
fs_devices->alloc_list but also, later, fs_devices->missing_list.

So I'm afraid it's not that easy to reduce the number of parameters.

Thanks,
Qu

> 
> 
> Byte,
> 	Johannes
> 




* Re: [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function
  2019-11-07  9:33     ` Qu Wenruo
@ 2019-11-07  9:45       ` Johannes Thumshirn
  0 siblings, 0 replies; 26+ messages in thread
From: Johannes Thumshirn @ 2019-11-07  9:45 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 07/11/2019 10:33, Qu Wenruo wrote:
> 
> 
> On 2019/11/7 下午5:20, Johannes Thumshirn wrote:
>> On 07/11/2019 07:27, Qu Wenruo wrote:
>>> +static int gather_dev_holes(struct btrfs_fs_info *info,
>>> +			    struct btrfs_device_info *devices_info,
>>> +			    int *index, struct list_head *list,
>>> +			    int max_nr_devs, u64 stripe_size, int dev_stripes)
>>> +{
>>
>> Hi Qu,
>>
>> What do you think of
>>
>> static int gather_dev_holes(struct btrfs_fs_info *info,
>> 			    int *index,	u64 stripe_size,
>>                             int dev_stripes)
>> {
>> 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
>> 	struct btrfs_device *device;
>> 	int ret;
>> 	int ndevs = 0;
>>
>> 	list_for_each_entry(device, &fs_devices->alloc_list,
>> 			    dev_alloc_list) {
>>
>> [...]
>>
>>
>> btrfs_device_info can be derived from btrfs_fs_info, and *list and
>> max_nr_devs can be derived from btrfs_device_info.
>>
>> This reduces the number of arguments by 3 and we don't need to pass that
>> odd 'struct list_head *list' which isn't really clear from the type what
>> it is referring to.
> 
> The objective of this refactor is to accept different list, not only
> fs_devices->alloc_list, but also later fs_devices->missing_list.
> 
> So I'm afraid it's not that easy to reduce the number of paramters.


Hmm, this makes the API of gather_dev_holes() even less transparent.

Looking at patch 3/3 I see why you're doing it, though.

-- 
Johannes Thumshirn                            SUSE Labs Filesystems
jthumshirn@suse.de                                +49 911 74053 689
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5
90409 Nürnberg
Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-11-07  6:27 [PATCH 0/3] btrfs: More intelligent degraded chunk allocator Qu Wenruo
                   ` (2 preceding siblings ...)
  2019-11-07  6:27 ` [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk Qu Wenruo
@ 2019-11-18 20:18 ` David Sterba
  2019-11-18 23:32   ` Qu Wenruo
  3 siblings, 1 reply; 26+ messages in thread
From: David Sterba @ 2019-11-18 20:18 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
> This patchset will make btrfs degraded mount more intelligent and
> provide more consistent profile keeping function.
> 
> One of the most problematic aspect of degraded mount is, btrfs may
> create unwanted profiles.
> 
>  # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>  # wipefs -fa /dev/test/scratch2
>  # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>  # fallocate -l 1G /mnt/btrfs/foobar
>  # btrfs ins dump-tree -t chunk /dev/test/scratch1
>         item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
>                 length 536870912 owner 2 stripe_len 65536 type DATA
>  New data chunk will fallback to SINGLE or DUP.
> 
> 
> The cause is pretty simple, when mounted degraded, missing devices can't
> be used for chunk allocation.
> Thus btrfs has to fall back to SINGLE profile.
> 
> This patchset will make btrfs to consider missing devices as last resort if
> current rw devices can't fulfil the profile request.
> 
> This should provide a good balance between considering all missing
> device as RW and completely ruling out missing devices (current mainline
> behavior).

Thanks. This is going to change the behaviour with a missing device, so
the question is whether we should make this configurable first and then
switch the default.

How does this work with scrub? Eg. if there are 2 devices in RAID1, one
goes missing and then scrub is started. It makes no sense to try to
repair the missing blocks, but given the logic in the patches all the
data will be rewritten, right?


* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-11-18 20:18 ` [PATCH 0/3] btrfs: More intelligent degraded chunk allocator David Sterba
@ 2019-11-18 23:32   ` Qu Wenruo
  2019-11-19  5:18     ` Alberto Bursi
  2019-12-02  3:22     ` Zygo Blaxell
  0 siblings, 2 replies; 26+ messages in thread
From: Qu Wenruo @ 2019-11-18 23:32 UTC (permalink / raw)
  To: dsterba, Qu Wenruo, linux-btrfs





On 2019/11/19 4:18 AM, David Sterba wrote:
> On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
>> This patchset will make btrfs degraded mount more intelligent and
>> provide more consistent profile keeping function.
>>
>> One of the most problematic aspect of degraded mount is, btrfs may
>> create unwanted profiles.
>>
>>  # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>>  # wipefs -fa /dev/test/scratch2
>>  # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>>  # fallocate -l 1G /mnt/btrfs/foobar
>>  # btrfs ins dump-tree -t chunk /dev/test/scratch1
>>         item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
>>                 length 536870912 owner 2 stripe_len 65536 type DATA
>>  New data chunk will fallback to SINGLE or DUP.
>>
>>
>> The cause is pretty simple, when mounted degraded, missing devices can't
>> be used for chunk allocation.
>> Thus btrfs has to fall back to SINGLE profile.
>>
>> This patchset will make btrfs to consider missing devices as last resort if
>> current rw devices can't fulfil the profile request.
>>
>> This should provide a good balance between considering all missing
>> device as RW and completely ruling out missing devices (current mainline
>> behavior).
> 
> Thanks. This is going to change the behaviour with a missing device, so
> the question is if we should make this configurable first and then
> switch the default.

Configurable-then-switch makes sense for most cases, but for this
degraded chunk case, IIRC the new behavior is superior in all cases.

For a 2-device RAID1 with one missing device (the main concern), the old
behavior will create SINGLE/DUP chunks, which have no tolerance for
extra missing devices.

The new behavior will create degraded RAID1 chunks, which still lack
tolerance for extra missing devices.

The difference is that for degraded chunks, once we have the device back
and do a proper scrub, we're completely back to proper RAID1.
No extra balance/convert is needed, only a scrub.

So the new behavior is kind of a superset of the old behavior, and using
the new behavior by default should not cause extra concern.

> 
> How does this work with scrub? Eg. if there are 2 devices in RAID1, one
> goes missing and then scrub is started. It makes no sense to try to
> repair the missing blocks, but given the logic in the patches all the
> data will be rewritten, right?

Scrub is not changed at all.

A missing device will not go through scrub, as scrub is per-device
based; missing devices are ruled out at the very beginning of scrub.

Thanks,
Qu
> 




* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-11-18 23:32   ` Qu Wenruo
@ 2019-11-19  5:18     ` Alberto Bursi
  2019-11-27 19:26       ` David Sterba
  2019-12-02  3:22     ` Zygo Blaxell
  1 sibling, 1 reply; 26+ messages in thread
From: Alberto Bursi @ 2019-11-19  5:18 UTC (permalink / raw)
  To: Qu Wenruo, dsterba, linux-btrfs


On 19/11/19 07:32, Qu Wenruo wrote:
>
> On 2019/11/19 上午4:18, David Sterba wrote:
>> On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
>>> This patchset will make btrfs degraded mount more intelligent and
>>> provide more consistent profile keeping function.
>>>
>>> One of the most problematic aspect of degraded mount is, btrfs may
>>> create unwanted profiles.
>>>
>>>   # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>>>   # wipefs -fa /dev/test/scratch2
>>>   # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>>>   # fallocate -l 1G /mnt/btrfs/foobar
>>>   # btrfs ins dump-tree -t chunk /dev/test/scratch1
>>>          item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
>>>                  length 536870912 owner 2 stripe_len 65536 type DATA
>>>   New data chunk will fallback to SINGLE or DUP.
>>>
>>>
>>> The cause is pretty simple, when mounted degraded, missing devices can't
>>> be used for chunk allocation.
>>> Thus btrfs has to fall back to SINGLE profile.
>>>
>>> This patchset will make btrfs to consider missing devices as last resort if
>>> current rw devices can't fulfil the profile request.
>>>
>>> This should provide a good balance between considering all missing
>>> device as RW and completely ruling out missing devices (current mainline
>>> behavior).
>> Thanks. This is going to change the behaviour with a missing device, so
>> the question is if we should make this configurable first and then
>> switch the default.
> Configurable then switch makes sense for most cases, but for this
> degraded chunk case, IIRC the new behavior is superior in all cases.
>
> For 2 devices RAID1 with one missing device (the main concern), old
> behavior will create SINGLE/DUP chunk, which has no tolerance for extra
> missing devices.
>
> The new behavior will create degraded RAID1, which still lacks tolerance
> for extra missing devices.
>
> The difference is, for degraded chunk, if we have the device back, and
> do proper scrub, then we're completely back to proper RAID1.
> No need to do extra balance/convert, only scrub is needed.
>
> So the new behavior is kinda of a super set of old behavior, using the
> new behavior by default should not cause extra concern.


I think most users will see this as a bug fix, as the current behavior
of creating SINGLE chunks is very annoying and can cause confusion: it
is NOT the expected behavior for a classic (mdadm or hardware) degraded
RAID array.


-Alberto



* Re: [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
  2019-11-07  6:27 ` [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Qu Wenruo
  2019-11-07  9:31   ` Johannes Thumshirn
@ 2019-11-19 10:03   ` Anand Jain
  2019-11-19 10:29     ` Qu Wenruo
  1 sibling, 1 reply; 26+ messages in thread
From: Anand Jain @ 2019-11-19 10:03 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 11/7/19 2:27 PM, Qu Wenruo wrote:
> This enables btrfs to iterate missing devices separately, without
> iterating all fs_devices.

  IMO we don't need another list to maintain the missing devices. We
  already have good enough device lists.
  The way it's implemented today:
  alloc_list is the only list from which we allocate chunks.
  Missing is a state of the device.
  A device in the alloc list can be in the Missing state.

  If there is a missing_list, that means a device in the missing list
  is also a possible candidate for allocation, which is messy.
  Also, it's not typical to have a large enough number of missing
  devices to constitute its own list.

Thanks, Anand


> This provides the basis for later degraded chunk enhancement.
> 
> The change includes:
> - Add missing devices to btrfs_fs_devices::missing_list
>    This happens at add_missing_dev() and other locations where
>    missing_devices get increased.
> 
> - Remove missing devices from btrfs_fs_devices::missing_list
>    This needs to cover all locations where missing_devices get decreased.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
>   fs/btrfs/volumes.h |  6 ++++++
>   2 files changed, 27 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index eee5fc1d11f0..a462d8de5d2a 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -324,6 +324,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
>   
>   	INIT_LIST_HEAD(&fs_devs->devices);
>   	INIT_LIST_HEAD(&fs_devs->alloc_list);
> +	INIT_LIST_HEAD(&fs_devs->missing_list);
>   	INIT_LIST_HEAD(&fs_devs->fs_list);
>   	if (fsid)
>   		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
> @@ -1089,6 +1090,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>   		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>   			fs_devices->missing_devices--;
>   			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +			list_del_init(&device->dev_alloc_list);
>   		}
>   	}
>   
> @@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>   	if (device->bdev)
>   		fs_devices->open_devices--;
>   
> +	list_del_init(&device->dev_alloc_list);
>   	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
> -	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
> -		list_del_init(&device->dev_alloc_list);
> +	    device->devid != BTRFS_DEV_REPLACE_DEVID)
>   		fs_devices->rw_devices--;
> -	}
>   
>   	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>   		fs_devices->missing_devices--;
> @@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   		device->fs_devices->rw_devices--;
>   		mutex_unlock(&fs_info->chunk_mutex);
>   	}
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
> +		mutex_lock(&fs_info->chunk_mutex);
> +		list_del_init(&device->dev_alloc_list);
> +		device->fs_devices->missing_devices--;
> +		mutex_unlock(&fs_info->chunk_mutex);
> +	}
>   
>   	mutex_unlock(&uuid_mutex);
>   	ret = btrfs_shrink_device(device, 0);
> @@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   	if (cur_devices != fs_devices)
>   		fs_devices->total_devices--;
>   
> -	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
> -		cur_devices->missing_devices--;
> -
>   	btrfs_assign_next_active_device(device, NULL);
>   
>   	if (device->bdev) {
> @@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   		device->fs_devices->rw_devices++;
>   		mutex_unlock(&fs_info->chunk_mutex);
>   	}
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
> +		mutex_lock(&fs_info->chunk_mutex);
> +		list_add(&device->dev_alloc_list,
> +			 &fs_devices->missing_list);
> +		device->fs_devices->missing_devices++;
> +		mutex_unlock(&fs_info->chunk_mutex);
> +	}
>   	goto out;
>   }
>   
> @@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
>   	seed_devices->opened = 1;
>   	INIT_LIST_HEAD(&seed_devices->devices);
>   	INIT_LIST_HEAD(&seed_devices->alloc_list);
> +	INIT_LIST_HEAD(&seed_devices->missing_list);
>   	mutex_init(&seed_devices->device_list_mutex);
>   
>   	mutex_lock(&fs_devices->device_list_mutex);
> @@ -6640,6 +6652,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
>   	fs_devices->num_devices++;
>   
>   	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +	list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>   	fs_devices->missing_devices++;
>   
>   	return device;
> @@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>   			 */
>   			device->fs_devices->missing_devices++;
>   			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +			list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>   		}
>   
>   		/* Move the device to its own fs_devices */
> @@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>   
>   			device->fs_devices->missing_devices--;
>   			fs_devices->missing_devices++;
> +			list_move(&device->dev_alloc_list, &fs_devices->missing_list);
>   
>   			device->fs_devices = fs_devices;
>   		}
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index a7da1f3e3627..9cef4dc4b5be 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -253,6 +253,12 @@ struct btrfs_fs_devices {
>   	 */
>   	struct list_head alloc_list;
>   
> +	/*
> +	 * Devices which can't be found. Projected by chunk_mutex.
> +	 * This acts as a fallback allocation list for certain degraded mount.
> +	 */
> +	struct list_head missing_list;
> +
>   	struct btrfs_fs_devices *seed;
>   	int seeding;
>   
> 



* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-07  6:27 ` [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk Qu Wenruo
@ 2019-11-19 10:05   ` Anand Jain
  2019-11-19 10:41     ` Qu Wenruo
  0 siblings, 1 reply; 26+ messages in thread
From: Anand Jain @ 2019-11-19 10:05 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 11/7/19 2:27 PM, Qu Wenruo wrote:
> [PROBLEM]
> Btrfs degraded mount will fallback to SINGLE profile if there are not
> enough devices:

  It's better to keep it like this for now, until there is a fix for the
  write hole. Otherwise hitting the write hole bug in the degraded
  raid1 case will be more prevalent.

  I proposed an RFC a while back [1] (in there, there is also
  a commit id which turned the degraded raid1 profile into the single
  profile, without much write-up on it).

    [1] [PATCH 0/2] [RFC] btrfs: create degraded-RAID1 chunks

  Similarly, the patch related to the reappearing missing device [2]
  falls under the same category. I will push for its integration after
  the write hole fix.

    [2] [PATCH] btrfs: handle dynamically reappearing missing device
    (test case 154).

  If you look closely enough, the original author quite nicely made
  sure the write hole bug is very difficult to hit. These fixes
  will make it easier to hit. So it's better to work on the write hole
  first.

  I am trying to fix the write hole. The first attempt had limited
  success (it works fine in two-disk raid1 only); now I am trying other
  ways to fix it.

>   # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>   # wipefs -fa /dev/test/scratch2
>   # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>   # fallocate -l 1G /mnt/btrfs/foobar
>   # btrfs ins dump-tree -t chunk /dev/test/scratch1
>          item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
>                  length 536870912 owner 2 stripe_len 65536 type DATA
>   New data chunk will fallback to SINGLE.
> 
> If user doesn't balance those SINGLE chunks, even with missing devices
> replaced, the fs is no longer full RAID1, and a missing device can break
> the tolerance.

  As has been discussed quite a few times before, the current
  re-silver/recovery approach for degraded raid1 (with offload to Single)
  requires a balance. It's kind of a known limitation.

Thanks, Anand


> [CAUSE]
> The cause is pretty simple, when mounted degraded, missing devices can't
> be used for chunk allocation.
> Thus btrfs has to fall back to SINGLE profile.
> 
> [ENHANCEMENT]
> To avoid such problem, this patch will:
> - Make all profiler reducer/updater to consider missing devices as part
>    of num_devices
> - Make chunk allocator to fallback to missing_list as last resort
> 
> If we have enough rw_devices, then go regular chunk allocation code.

> This can avoid allocating degraded chunks.
> E.g. for 3 devices RAID1 degraded mount, we will use the 2 existing
> devices to allocate chunk, avoid degraded chunk.

> But if we don't have enough rw_devices, then we check missing devices to
> allocate degraded chunks.
> E.g. for 2 devices RAID1 degraded mount, we have to allocate degraded
> chunks to keep the RAID1 profile.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/block-group.c | 10 +++++++---
>   fs/btrfs/volumes.c     | 18 +++++++++++++++---
>   2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index bf7e3f23bba7..1686fd31679b 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -52,11 +52,13 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
>    */
>   static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
>   {
> -	u64 num_devices = fs_info->fs_devices->rw_devices;
> +	u64 num_devices;
>   	u64 target;
>   	u64 raid_type;
>   	u64 allowed = 0;
>   
> +	num_devices = fs_info->fs_devices->rw_devices +
> +		      fs_info->fs_devices->missing_devices;
>   	/*
>   	 * See if restripe for this chunk_type is in progress, if so try to
>   	 * reduce to the target profile
> @@ -1986,7 +1988,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
>   	if (stripped)
>   		return extended_to_chunk(stripped);
>   
> -	num_devices = fs_info->fs_devices->rw_devices;
> +	num_devices = fs_info->fs_devices->rw_devices +
> +		      fs_info->fs_devices->missing_devices;
>   
>   	stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
>   		BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
> @@ -2981,7 +2984,8 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
>   
>   	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
>   	if (!num_dev)
> -		num_dev = fs_info->fs_devices->rw_devices;
> +		num_dev = fs_info->fs_devices->rw_devices +
> +			  fs_info->fs_devices->missing_devices;
>   
>   	return num_dev;
>   }
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index a462d8de5d2a..4dee1974ceb7 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5052,8 +5052,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>   	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
>   			     max_chunk_size);
>   
> -	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
> -			       GFP_NOFS);
> +	devices_info = kcalloc(fs_devices->rw_devices +
> +			       fs_devices->missing_devices,
> +			       sizeof(*devices_info), GFP_NOFS);
>   	if (!devices_info)
>   		return -ENOMEM;
>   
> @@ -5067,7 +5068,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>   			max_stripe_size, dev_stripes);
>   	if (ret < 0)
>   		goto error;
> -
> +	/*
> +	 * If rw devices can't fullfil the request, fallback to missing devices
> +	 * as last resort.
> +	 */
> +	if (ndevs < devs_min) {
> +		ret = gather_dev_holes(info, devices_info + ndevs, &ndevs,
> +				&fs_devices->missing_list,
> +				fs_devices->missing_devices,
> +				max_stripe_size, dev_stripes);
> +		if (ret < 0)
> +			goto error;
> +	}
>   	/*
>   	 * now sort the devices by hole size / available space
>   	 */
> 



* Re: [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
  2019-11-19 10:03   ` Anand Jain
@ 2019-11-19 10:29     ` Qu Wenruo
  2019-11-27 19:36       ` David Sterba
  0 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-19 10:29 UTC (permalink / raw)
  To: Anand Jain, Qu Wenruo, linux-btrfs





On 2019/11/19 6:03 PM, Anand Jain wrote:
> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>> This enables btrfs to iterate missing devices separately, without
>> iterating all fs_devices.
> 
>  IMO.
>  We don't need another list to maintain the missing device. We already
>  have good enough device lists.
>  The way its been implemented is
>  Allo_list is the only list from which we shall alloc the chunks.
>  Missing is a state of the device.
>  A device in the alloc list can be in the Missing state.

That would cause problems, especially when you only want to use missing
devices as a last-resort method.

IIRC it was you who mentioned this was a problem in my original design
(which put all missing devices into alloc_list). Or was it David?

> 
>  If there is missing_list that means the device in the missing list
>  is also possible candidate for the alloc that's messy.

But when you want to avoid missing devices, having both alloc_list and
missing_list makes sense.

E.g. for a 6-device RAID5 with one missing device, we should *avoid*
using missing devices, as we have enough (5) devices to allocate from.

>  Also its not typical to have a larger number of missing devices
>  to constitute its own list.

That's just for now.

If we're going to allow RAID10 to lose half of its devices, then it
would be a problem.

Thanks,
Qu

> 
> Thanks, Anand
> 
> 
>> This provides the basis for later degraded chunk enhancement.
>>
>> The change includes:
>> - Add missing devices to btrfs_fs_devices::missing_list
>>    This happens at add_missing_dev() and other locations where
>>    missing_devices get increased.
>>
>> - Remove missing devices from btrfs_fs_devices::missing_list
>>    This needs to cover all locations where missing_devices get decreased.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
>>   fs/btrfs/volumes.h |  6 ++++++
>>   2 files changed, 27 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index eee5fc1d11f0..a462d8de5d2a 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -324,6 +324,7 @@ static struct btrfs_fs_devices
>> *alloc_fs_devices(const u8 *fsid,
>>         INIT_LIST_HEAD(&fs_devs->devices);
>>       INIT_LIST_HEAD(&fs_devs->alloc_list);
>> +    INIT_LIST_HEAD(&fs_devs->missing_list);
>>       INIT_LIST_HEAD(&fs_devs->fs_list);
>>       if (fsid)
>>           memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
>> @@ -1089,6 +1090,7 @@ static noinline struct btrfs_device
>> *device_list_add(const char *path,
>>           if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>>               fs_devices->missing_devices--;
>>               clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +            list_del_init(&device->dev_alloc_list);
>>           }
>>       }
>>   @@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct
>> btrfs_device *device)
>>       if (device->bdev)
>>           fs_devices->open_devices--;
>>   +    list_del_init(&device->dev_alloc_list);
>>       if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
>> -        device->devid != BTRFS_DEV_REPLACE_DEVID) {
>> -        list_del_init(&device->dev_alloc_list);
>> +        device->devid != BTRFS_DEV_REPLACE_DEVID)
>>           fs_devices->rw_devices--;
>> -    }
>>         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>>           fs_devices->missing_devices--;
>> @@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>           device->fs_devices->rw_devices--;
>>           mutex_unlock(&fs_info->chunk_mutex);
>>       }
>> +    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>> +        mutex_lock(&fs_info->chunk_mutex);
>> +        list_del_init(&device->dev_alloc_list);
>> +        device->fs_devices->missing_devices--;
>> +        mutex_unlock(&fs_info->chunk_mutex);
>> +    }
>>         mutex_unlock(&uuid_mutex);
>>       ret = btrfs_shrink_device(device, 0);
>> @@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>       if (cur_devices != fs_devices)
>>           fs_devices->total_devices--;
>>   -    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>> -        cur_devices->missing_devices--;
>> -
>>       btrfs_assign_next_active_device(device, NULL);
>>         if (device->bdev) {
>> @@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>           device->fs_devices->rw_devices++;
>>           mutex_unlock(&fs_info->chunk_mutex);
>>       }
>> +    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>> +        mutex_lock(&fs_info->chunk_mutex);
>> +        list_add(&device->dev_alloc_list,
>> +             &fs_devices->missing_list);
>> +        device->fs_devices->missing_devices++;
>> +        mutex_unlock(&fs_info->chunk_mutex);
>> +    }
>>       goto out;
>>   }
>>   @@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct
>> btrfs_fs_info *fs_info)
>>       seed_devices->opened = 1;
>>       INIT_LIST_HEAD(&seed_devices->devices);
>>       INIT_LIST_HEAD(&seed_devices->alloc_list);
>> +    INIT_LIST_HEAD(&seed_devices->missing_list);
>>       mutex_init(&seed_devices->device_list_mutex);
>>         mutex_lock(&fs_devices->device_list_mutex);
>> @@ -6640,6 +6652,7 @@ static struct btrfs_device
>> *add_missing_dev(struct btrfs_fs_devices *fs_devices,
>>       fs_devices->num_devices++;
>>         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +    list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>>       fs_devices->missing_devices++;
>>         return device;
>> @@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>>                */
>>               device->fs_devices->missing_devices++;
>>               set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +            list_add(&device->dev_alloc_list,
>> &fs_devices->missing_list);
>>           }
>>             /* Move the device to its own fs_devices */
>> @@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>>                 device->fs_devices->missing_devices--;
>>               fs_devices->missing_devices++;
>> +            list_move(&device->dev_alloc_list,
>> &fs_devices->missing_list);
>>                 device->fs_devices = fs_devices;
>>           }
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index a7da1f3e3627..9cef4dc4b5be 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -253,6 +253,12 @@ struct btrfs_fs_devices {
>>        */
>>       struct list_head alloc_list;
>>   +    /*
>> +     * Devices which can't be found. Projected by chunk_mutex.
>> +     * This acts as a fallback allocation list for certain degraded
>> mount.
>> +     */
>> +    struct list_head missing_list;
>> +
>>       struct btrfs_fs_devices *seed;
>>       int seeding;
>>  
> 




* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-19 10:05   ` Anand Jain
@ 2019-11-19 10:41     ` Qu Wenruo
  2019-11-27 19:23       ` David Sterba
  0 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-19 10:41 UTC (permalink / raw)
  To: Anand Jain, Qu Wenruo, linux-btrfs





On 2019/11/19 6:05 PM, Anand Jain wrote:
> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>> [PROBLEM]
>> Btrfs degraded mount will fallback to SINGLE profile if there are not
>> enough devices:
> 
>  Its better to keep it like this for now until there is a fix for the
>  write hole. Otherwise hitting the write hole bug in case of degraded
>  raid1 will be more prevalent.

The write hole is a problem for RAID5/6, not for the degraded chunk
feature itself.

Furthermore, this design tries to avoid allocating chunks on missing
devices.
So even for a 3-device RAID5, new chunks will be allocated using the
existing devices (as 2-device RAID5), so no new write hole is introduced.

> 
>  I proposed a RFC a long time before [1] (also in there, there
>  is a commit id which turned the degraded raid1 profile into single
>  profile (without much write-up on it)).
> 
>    [1] [PATCH 0/2] [RFC] btrfs: create degraded-RAID1 chunks

My point for this patchset is:
- Create regular chunks if we have enough devices
- Create degraded chunks only when we don't have enough devices

I guess since you didn't get the point of my preparation patches, your
patches aren't that good at avoiding missing devices.

> 
>  Similarly the patch related to the reappearing missing device [2]
>  falls under the same category. Will push for the integration after
>  the write hole fix.
> 
>    [2] [PATCH] btrfs: handle dynamically reappearing missing device
>    (test case 154).

That's another case, and I don't see how it affects this feature.

> 
>  If you look close enough the original author has quite nicely made
>  sure write hole bug will be very difficultly to hit. These fixes
>  shall make it easy to hit. So its better to work on the write hole
>  first.

If you're talking about RAID5/6, you are talking in the wrong thread.
Go implement a write-ahead log for RAID5/6, or mark all degraded
RAID5/6 chunks read-only at mount time.

> 
>  I am trying to fix write hole. First attempt has limited success
>  (works fine in two disk raid1 only). Now trying other ways to fix.
> 
>>   # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>>   # wipefs -fa /dev/test/scratch2
>>   # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>>   # fallocate -l 1G /mnt/btrfs/foobar
>>   # btrfs ins dump-tree -t chunk /dev/test/scratch1
>>          item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff
>> 15511 itemsize 80
>>                  length 536870912 owner 2 stripe_len 65536 type DATA
>>   New data chunk will fallback to SINGLE.
>>
>> If user doesn't balance those SINGLE chunks, even with missing devices
>> replaced, the fs is no longer full RAID1, and a missing device can break
>> the tolerance.
> 
>  As it's been discussed quite a few times before, the current
>  re-silver/recovery approach for degraded-raid1 (with offload to Single)
>  requires balance. It's kind of known.

I'd call such "well-known" behavior BS.

Every other raid1 implementation can accept a single-device RAID1 and
resilver itself into a full RAID1 setup once more devices are added.

But for BTRFS you're calling the SINGLE profile fallback "well-known"?
It's "well-known" because it's not working properly; that's why I'm
trying to fix it.


> 
> Thanks, Anand
> 
> 
>> [CAUSE]
>> The cause is pretty simple, when mounted degraded, missing devices can't
>> be used for chunk allocation.
>> Thus btrfs has to fall back to SINGLE profile.
>>
>> [ENHANCEMENT]
>> To avoid such problems, this patch will:
>> - Make all profile reducers/updaters consider missing devices as part
>>    of num_devices
>> - Make the chunk allocator fall back to missing_list as a last resort
>>
>> If we have enough rw_devices, then we go through the regular chunk
>> allocation code.
> 
>> This can avoid allocating degraded chunks.
>> E.g. for a 3-device RAID1 degraded mount, we will use the 2 existing
>> devices to allocate the chunk, avoiding a degraded chunk.
> 
>> But if we don't have enough rw_devices, then we check missing devices to
>> allocate degraded chunks.
>> E.g. for a 2-device RAID1 degraded mount, we have to allocate degraded
>> chunks to keep the RAID1 profile.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/block-group.c | 10 +++++++---
>>   fs/btrfs/volumes.c     | 18 +++++++++++++++---
>>   2 files changed, 22 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index bf7e3f23bba7..1686fd31679b 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -52,11 +52,13 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
>>    */
>>   static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
>>   {
>> -    u64 num_devices = fs_info->fs_devices->rw_devices;
>> +    u64 num_devices;
>>       u64 target;
>>       u64 raid_type;
>>       u64 allowed = 0;
>>  
>> +    num_devices = fs_info->fs_devices->rw_devices +
>> +              fs_info->fs_devices->missing_devices;
>>       /*
>>        * See if restripe for this chunk_type is in progress, if so try to
>>        * reduce to the target profile
>> @@ -1986,7 +1988,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
>>       if (stripped)
>>           return extended_to_chunk(stripped);
>>  
>> -    num_devices = fs_info->fs_devices->rw_devices;
>> +    num_devices = fs_info->fs_devices->rw_devices +
>> +              fs_info->fs_devices->missing_devices;
>>  
>>       stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
>>           BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
>> @@ -2981,7 +2984,8 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
>>  
>>       num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
>>       if (!num_dev)
>> -        num_dev = fs_info->fs_devices->rw_devices;
>> +        num_dev = fs_info->fs_devices->rw_devices +
>> +              fs_info->fs_devices->missing_devices;
>>  
>>       return num_dev;
>>   }
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index a462d8de5d2a..4dee1974ceb7 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -5052,8 +5052,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>>       max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
>>                    max_chunk_size);
>>  
>> -    devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
>> -                   GFP_NOFS);
>> +    devices_info = kcalloc(fs_devices->rw_devices +
>> +                   fs_devices->missing_devices,
>> +                   sizeof(*devices_info), GFP_NOFS);
>>       if (!devices_info)
>>           return -ENOMEM;
>>  
>> @@ -5067,7 +5068,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>>               max_stripe_size, dev_stripes);
>>       if (ret < 0)
>>           goto error;
>> -
>> +    /*
>> +     * If rw devices can't fullfil the request, fallback to missing devices
>> +     * as last resort.
>> +     */
>> +    if (ndevs < devs_min) {
>> +        ret = gather_dev_holes(info, devices_info + ndevs, &ndevs,
>> +                &fs_devices->missing_list,
>> +                fs_devices->missing_devices,
>> +                max_stripe_size, dev_stripes);
>> +        if (ret < 0)
>> +            goto error;
>> +    }
>>       /*
>>        * now sort the devices by hole size / available space
>>        */
>>
> 



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-19 10:41     ` Qu Wenruo
@ 2019-11-27 19:23       ` David Sterba
  2019-11-27 23:36         ` Qu Wenruo
  0 siblings, 1 reply; 26+ messages in thread
From: David Sterba @ 2019-11-27 19:23 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Anand Jain, Qu Wenruo, linux-btrfs

On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
> On 2019/11/19 下午6:05, Anand Jain wrote:
> > On 11/7/19 2:27 PM, Qu Wenruo wrote:
> >> [PROBLEM]
> >> Btrfs degraded mount will fallback to SINGLE profile if there are not
> >> enough devices:
> > 
> >  Its better to keep it like this for now until there is a fix for the
> >  write hole. Otherwise hitting the write hole bug in case of degraded
> >  raid1 will be more prevalent.
> 
> Write hole should be a problem for RAID5/6, not the degraded chunk
> feature itself.
> 
> Furthermore, this design will try to avoid allocating chunks using
> missing devices.
> So even for 3 devices RAID5, new chunks will be allocated by using
> existing devices (2 devices RAID5), so no new write hole is introduced.

That this would allow a 2-device raid5 (from the expected 3) is similar
to the reduced chunks, but now hidden because we don't have a detailed
report for stripes on devices. And a rebalance would be needed to make
sure that the filesystem is again on 3 devices (and tolerant of 1 lost
device).

This is different from the 1-device-missing case for raid1, where scrub
can fix that (expected), but the balance IMHO is not.

I'd suggest to allow allocation from missing devices only for the
profiles with redundancy. For now.

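A minimal sketch of such a gate (assuming the btrfs_raid_array table and
the btrfs_bg_flags_to_raid_index() helper already used in this series;
untested, field name recalled from memory):

    const int index = btrfs_bg_flags_to_raid_index(type);

    /* Only profiles with redundancy may fall back to missing devices. */
    if (ndevs < devs_min &&
        btrfs_raid_array[index].tolerated_failures > 0) {
            ret = gather_dev_holes(info, devices_info + ndevs, &ndevs,
                                   &fs_devices->missing_list,
                                   fs_devices->missing_devices,
                                   max_stripe_size, dev_stripes);
            if (ret < 0)
                    goto error;
    }
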
^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-11-19  5:18     ` Alberto Bursi
@ 2019-11-27 19:26       ` David Sterba
  0 siblings, 0 replies; 26+ messages in thread
From: David Sterba @ 2019-11-27 19:26 UTC (permalink / raw)
  To: Alberto Bursi; +Cc: Qu Wenruo, dsterba, linux-btrfs

On Tue, Nov 19, 2019 at 01:18:21PM +0800, Alberto Bursi wrote:
> >> Thanks. This is going to change the behaviour with a missing device, so
> >> the question is if we should make this configurable first and then
> >> switch the default.
> > Configurable then switch makes sense for most cases, but for this
> > degraded chunk case, IIRC the new behavior is superior in all cases.
> >
> > For 2 devices RAID1 with one missing device (the main concern), old
> > behavior will create SINGLE/DUP chunk, which has no tolerance for extra
> > missing devices.
> >
> > The new behavior will create degraded RAID1, which still lacks tolerance
> > for extra missing devices.
> >
> > The difference is, for degraded chunk, if we have the device back, and
> > do proper scrub, then we're completely back to proper RAID1.
> > No need to do extra balance/convert, only scrub is needed.
> >
> > So the new behavior is kinda of a super set of old behavior, using the
> > new behavior by default should not cause extra concern.
> 
> I think most users will see this as a bug fix, as the current behavior
> of creating SINGLE chunks is very annoying and can cause confusion as
> it is NOT an expected behavior for a classic (mdadm or hardware)
> degraded RAID array.

Thanks for the feedback, I agree with that. It's good to have
confirmation from somebody outside of the developer group.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
  2019-11-19 10:29     ` Qu Wenruo
@ 2019-11-27 19:36       ` David Sterba
  0 siblings, 0 replies; 26+ messages in thread
From: David Sterba @ 2019-11-27 19:36 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Anand Jain, Qu Wenruo, linux-btrfs

On Tue, Nov 19, 2019 at 06:29:46PM +0800, Qu Wenruo wrote:
> On 2019/11/19 下午6:03, Anand Jain wrote:
> > On 11/7/19 2:27 PM, Qu Wenruo wrote:
> >> This enables btrfs to iterate missing devices separately, without
> >> iterating all fs_devices.
> > 
> >  IMO.
> >  We don't need another list to maintain the missing device. We already
> >  have good enough device lists.
> >  The way it's been implemented is:
> >  alloc_list is the only list from which we shall alloc the chunks.
> >  Missing is a state of the device.
> >  A device in the alloc list can be in the Missing state.
> 
> That would cause problems, especially when you only want to use missing
> devices as a last-resort method.
> 
> IIRC it was you who mentioned this is a problem in my original design
> (which put all missing devices into alloc_list). Or was it David?
> 
> > 
> >  If there is a missing_list, that means a device in the missing list
> >  is also a possible candidate for the alloc; that's messy.
> 
> But when you want to avoid missing devices, separate alloc_list and
> missing_list make sense.
> 
> E.g. for a 6-device RAID5 with one missing device, we should *avoid*
> using the missing device as we have enough (5) devices to allocate from.

I tend to agree that adding more lists would make things messy. This
needs to keep the missing state bit and the presence in the list in
sync, and there's also the counter of missing devices. And that there
are typically only very few missing devices is also something to
consider.

The device selection in __btrfs_alloc_chunk can avoid that. There's an
array allocated with some size-related data, then it's passed to qsort
so that the first N drives will be used for the chunk.

In case the degraded allocation is allowed (as mentioned in the other
mail, only for the mirrored profiles):

* add the missing device to the array
* update the comparison function btrfs_cmp_device_info to order missing
  devices to the end

Then the same logic "first N" would work here.
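A rough sketch of that comparator change (assuming the existing
btrfs_cmp_device_info() fields and the BTRFS_DEV_STATE_MISSING bit;
untested):

    static int btrfs_cmp_device_info(const void *a, const void *b)
    {
            const struct btrfs_device_info *di_a = a;
            const struct btrfs_device_info *di_b = b;
            int a_missing = test_bit(BTRFS_DEV_STATE_MISSING,
                                     &di_a->dev->dev_state);
            int b_missing = test_bit(BTRFS_DEV_STATE_MISSING,
                                     &di_b->dev->dev_state);

            /* Missing devices sort last, so "first N" prefers rw devices. */
            if (a_missing != b_missing)
                    return a_missing - b_missing;
            if (di_a->max_avail > di_b->max_avail)
                    return -1;
            if (di_a->max_avail < di_b->max_avail)
                    return 1;
            if (di_a->total_avail > di_b->total_avail)
                    return -1;
            if (di_a->total_avail < di_b->total_avail)
                    return 1;
            return 0;
    }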

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-27 19:23       ` David Sterba
@ 2019-11-27 23:36         ` Qu Wenruo
  2019-11-28 11:24           ` David Sterba
  0 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-11-27 23:36 UTC (permalink / raw)
  To: dsterba, Anand Jain, Qu Wenruo, linux-btrfs





On 2019/11/28 上午3:23, David Sterba wrote:
> On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
>> On 2019/11/19 下午6:05, Anand Jain wrote:
>>> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>>>> [PROBLEM]
>>>> Btrfs degraded mount will fallback to SINGLE profile if there are not
>>>> enough devices:
>>>
>>>  Its better to keep it like this for now until there is a fix for the
>>>  write hole. Otherwise hitting the write hole bug in case of degraded
>>>  raid1 will be more prevalent.
>>
>> Write hole should be a problem for RAID5/6, not the degraded chunk
>> feature itself.
>>
>> Furthermore, this design will try to avoid allocating chunks using
>> missing devices.
>> So even for 3 devices RAID5, new chunks will be allocated by using
>> existing devices (2 devices RAID5), so no new write hole is introduced.
> 
> That this would allow a 2 device raid5 (from expected 3) is similar to
> the reduced chunks, but now hidden because we don't have a detailed
> report for stripes on devices. And rebalance would be needed to make
> sure that's the filesystem is again 3 devices (and 1 device lost
> tolerant).
> 
> This is different to the 1 device missing for raid1, where scrub can
> fix that (expected), but the balance is IMHO not.
> 
> I'd suggest to allow allocation from missing devices only from the
> profiles with redundancy. For now.

But RAID5 itself supports 2 devices, right?
And even a 2-device RAID5 can still tolerate 1 missing device.

The tolerance hasn't changed in that case, the disk usage just becomes
unbalanced.

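(For reference, the raid5 entry in btrfs_raid_array -- values recalled
from memory, so double-check against volumes.c -- is along the lines of:

    [BTRFS_RAID_RAID5] = {
            .devs_min           = 2,
            .tolerated_failures = 1,
            ...
    },

i.e. exactly the 2-device-minimum, 1-failure-tolerated case above.)
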
Thanks,
Qu



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-27 23:36         ` Qu Wenruo
@ 2019-11-28 11:24           ` David Sterba
  2019-11-28 12:29             ` Qu Wenruo
  2019-11-28 12:30             ` Qu WenRuo
  0 siblings, 2 replies; 26+ messages in thread
From: David Sterba @ 2019-11-28 11:24 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Anand Jain, Qu Wenruo, linux-btrfs

On Thu, Nov 28, 2019 at 07:36:41AM +0800, Qu Wenruo wrote:
> On 2019/11/28 上午3:23, David Sterba wrote:
> > On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
> >> On 2019/11/19 下午6:05, Anand Jain wrote:
> >>> On 11/7/19 2:27 PM, Qu Wenruo wrote:
> >>>> [PROBLEM]
> >>>> Btrfs degraded mount will fallback to SINGLE profile if there are not
> >>>> enough devices:
> >>>
> >>>  Its better to keep it like this for now until there is a fix for the
> >>>  write hole. Otherwise hitting the write hole bug in case of degraded
> >>>  raid1 will be more prevalent.
> >>
> >> Write hole should be a problem for RAID5/6, not the degraded chunk
> >> feature itself.
> >>
> >> Furthermore, this design will try to avoid allocating chunks using
> >> missing devices.
> >> So even for 3 devices RAID5, new chunks will be allocated by using
> >> existing devices (2 devices RAID5), so no new write hole is introduced.
> > 
> > That this would allow a 2 device raid5 (from expected 3) is similar to
> > the reduced chunks, but now hidden because we don't have a detailed
> > report for stripes on devices. And rebalance would be needed to make
> > sure that's the filesystem is again 3 devices (and 1 device lost
> > tolerant).
> > 
> > This is different to the 1 device missing for raid1, where scrub can
> > fix that (expected), but the balance is IMHO not.
> > 
> > I'd suggest to allow allocation from missing devices only from the
> > profiles with redundancy. For now.
> 
> But RAID5 itself supports 2 devices, right?
> And even 2 devices RAID5 can still tolerant 1 missing device.

> The tolerance hasn't changed in that case, just unbalanced disk usage then.

Ah right, the constraints are still fine. That the usage is unbalanced
is something I'd still consider a problem, because it silently changes
the layout from the one that was set by the user.

As there are two conflicting ways to continue from the missing device state:

- try to use remaining devices to allow writes but change the layout
- don't allow writes, let user/admin sort it out

I'd rather have more time to understand the implications and try to
experiment with that.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-28 11:24           ` David Sterba
@ 2019-11-28 12:29             ` Qu Wenruo
  2019-11-28 12:30             ` Qu WenRuo
  1 sibling, 0 replies; 26+ messages in thread
From: Qu Wenruo @ 2019-11-28 12:29 UTC (permalink / raw)
  To: dsterba, Anand Jain, Qu Wenruo, linux-btrfs





On 2019/11/28 下午7:24, David Sterba wrote:
> On Thu, Nov 28, 2019 at 07:36:41AM +0800, Qu Wenruo wrote:
>> On 2019/11/28 上午3:23, David Sterba wrote:
>>> On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
>>>> On 2019/11/19 下午6:05, Anand Jain wrote:
>>>>> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>>>>>> [PROBLEM]
>>>>>> Btrfs degraded mount will fallback to SINGLE profile if there are not
>>>>>> enough devices:
>>>>>
>>>>>  Its better to keep it like this for now until there is a fix for the
>>>>>  write hole. Otherwise hitting the write hole bug in case of degraded
>>>>>  raid1 will be more prevalent.
>>>>
>>>> Write hole should be a problem for RAID5/6, not the degraded chunk
>>>> feature itself.
>>>>
>>>> Furthermore, this design will try to avoid allocating chunks using
>>>> missing devices.
>>>> So even for 3 devices RAID5, new chunks will be allocated by using
>>>> existing devices (2 devices RAID5), so no new write hole is introduced.
>>>
>>> That this would allow a 2 device raid5 (from expected 3) is similar to
>>> the reduced chunks, but now hidden because we don't have a detailed
>>> report for stripes on devices. And rebalance would be needed to make
>>> sure that's the filesystem is again 3 devices (and 1 device lost
>>> tolerant).
>>>
>>> This is different to the 1 device missing for raid1, where scrub can
>>> fix that (expected), but the balance is IMHO not.
>>>
>>> I'd suggest to allow allocation from missing devices only from the
>>> profiles with redundancy. For now.
>>
>> But RAID5 itself supports 2 devices, right?
>> And even 2 devices RAID5 can still tolerant 1 missing device.
> 
>> The tolerance hasn't changed in that case, just unbalanced disk usage then.
> 
> Ah right, the constraints are still fine. That the usage is unbalanced
> is something I'd still consider a problem because it's silently changing
> the layout from the one that was set by user.

Then it's a trade-off between:
- Keeping exactly the same layout
- Keeping just the tolerance

I'd say that even without a missing device, btrfs by its nature can't
ensure all chunks are allocated using the same device layout.

E.g. a 4-disk raid5 with 1T + 2T + 2T + 2T devices: there will be a
point where btrfs can only go 3-disk RAID5 rather than 4.

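To put rough numbers on that (ignoring metadata and rounding): with
4-disk raid5 every chunk takes an equal slice from each of the 4 disks,
so the 1T disk is exhausted once about 1T has been allocated per disk,
i.e. after roughly 3T of usable data (3 data + 1 parity per stripe).
From then on only the three 2T disks have free space (about 1T each), so
all further chunks can only be 3-disk raid5, giving about 2T more of
usable space.
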
> 
> As there are two conflicting ways to continue from the missing device state:
> 
> - try to use remaining devices to allow writes but change the layout
> - don't allow writes, let user/admin sort it out
> 
> I'd rather have more time to understand the implications and try to
> experiment with that.

Sure, no problem.

Thanks,
Qu



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-28 11:24           ` David Sterba
  2019-11-28 12:29             ` Qu Wenruo
@ 2019-11-28 12:30             ` Qu WenRuo
  2019-11-28 12:39               ` Qu Wenruo
  1 sibling, 1 reply; 26+ messages in thread
From: Qu WenRuo @ 2019-11-28 12:30 UTC (permalink / raw)
  To: dsterba, Qu Wenruo, Anand Jain, linux-btrfs



On 2019/11/28 下午7:24, David Sterba wrote:
> On Thu, Nov 28, 2019 at 07:36:41AM +0800, Qu Wenruo wrote:
>> On 2019/11/28 上午3:23, David Sterba wrote:
>>> On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
>>>> On 2019/11/19 下午6:05, Anand Jain wrote:
>>>>> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>>>>>> [PROBLEM]
>>>>>> Btrfs degraded mount will fallback to SINGLE profile if there are not
>>>>>> enough devices:
>>>>>
>>>>>  Its better to keep it like this for now until there is a fix for the
>>>>>  write hole. Otherwise hitting the write hole bug in case of degraded
>>>>>  raid1 will be more prevalent.
>>>>
>>>> Write hole should be a problem for RAID5/6, not the degraded chunk
>>>> feature itself.
>>>>
>>>> Furthermore, this design will try to avoid allocating chunks using
>>>> missing devices.
>>>> So even for 3 devices RAID5, new chunks will be allocated by using
>>>> existing devices (2 devices RAID5), so no new write hole is introduced.
>>>
>>> That this would allow a 2 device raid5 (from expected 3) is similar to
>>> the reduced chunks, but now hidden because we don't have a detailed
>>> report for stripes on devices. And rebalance would be needed to make
>>> sure that's the filesystem is again 3 devices (and 1 device lost
>>> tolerant).
>>>
>>> This is different to the 1 device missing for raid1, where scrub can
>>> fix that (expected), but the balance is IMHO not.
>>>
>>> I'd suggest to allow allocation from missing devices only from the
>>> profiles with redundancy. For now.
>>
>> But RAID5 itself supports 2 devices, right?
>> And even 2 devices RAID5 can still tolerant 1 missing device.
> 
>> The tolerance hasn't changed in that case, just unbalanced disk usage then.
> 
> Ah right, the constraints are still fine. That the usage is unbalanced
> is something I'd still consider a problem because it's silently changing
> the layout from the one that was set by user.
> 
> As there are two conflicting ways to continue from the missing device state:
> 
> - try to use remaining devices to allow writes but change the layout
> - don't allow writes, let user/admin sort it out
> 
> I'd rather have more time to understand the implications and try to
> experiment with that.
> 
Ah, makes sense.

So no need for a new version.

Reviewed-by: Qu Wenruo <wqu@suse.com>

Thanks,
Qu

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk
  2019-11-28 12:30             ` Qu WenRuo
@ 2019-11-28 12:39               ` Qu Wenruo
  0 siblings, 0 replies; 26+ messages in thread
From: Qu Wenruo @ 2019-11-28 12:39 UTC (permalink / raw)
  To: Qu WenRuo, dsterba, Anand Jain, linux-btrfs





On 2019/11/28 下午8:30, Qu WenRuo wrote:
> 
> 
> On 2019/11/28 下午7:24, David Sterba wrote:
>> On Thu, Nov 28, 2019 at 07:36:41AM +0800, Qu Wenruo wrote:
>>> On 2019/11/28 上午3:23, David Sterba wrote:
>>>> On Tue, Nov 19, 2019 at 06:41:49PM +0800, Qu Wenruo wrote:
>>>>> On 2019/11/19 下午6:05, Anand Jain wrote:
>>>>>> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>>>>>>> [PROBLEM]
>>>>>>> Btrfs degraded mount will fallback to SINGLE profile if there are not
>>>>>>> enough devices:
>>>>>>
>>>>>>  Its better to keep it like this for now until there is a fix for the
>>>>>>  write hole. Otherwise hitting the write hole bug in case of degraded
>>>>>>  raid1 will be more prevalent.
>>>>>
>>>>> Write hole should be a problem for RAID5/6, not the degraded chunk
>>>>> feature itself.
>>>>>
>>>>> Furthermore, this design will try to avoid allocating chunks using
>>>>> missing devices.
>>>>> So even for 3 devices RAID5, new chunks will be allocated by using
>>>>> existing devices (2 devices RAID5), so no new write hole is introduced.
>>>>
>>>> That this would allow a 2 device raid5 (from expected 3) is similar to
>>>> the reduced chunks, but now hidden because we don't have a detailed
>>>> report for stripes on devices. And rebalance would be needed to make
>>>> sure that's the filesystem is again 3 devices (and 1 device lost
>>>> tolerant).
>>>>
>>>> This is different to the 1 device missing for raid1, where scrub can
>>>> fix that (expected), but the balance is IMHO not.
>>>>
>>>> I'd suggest to allow allocation from missing devices only from the
>>>> profiles with redundancy. For now.
>>>
>>> But RAID5 itself supports 2 devices, right?
>>> And even 2 devices RAID5 can still tolerant 1 missing device.
>>
>>> The tolerance hasn't changed in that case, just unbalanced disk usage then.
>>
>> Ah right, the constraints are still fine. That the usage is unbalanced
>> is something I'd still consider a problem because it's silently changing
>> the layout from the one that was set by user.
>>
>> As there are two conflicting ways to continue from the missing device state:
>>
>> - try to use remaining devices to allow writes but change the layout
>> - don't allow writes, let user/admin sort it out
>>
>> I'd rather have more time to understand the implications and try to
>> experiment with that.
>>
> Ah, makes sense.
> 
> So no need for a new version.
> 
> Reviewed-by: Qu Wenruo <wqu@suse.com>
> 
> Thanks,
> Qu
> 
Facepalm, that's for another thread....

Reviewing patch from myself, WTF....



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-11-18 23:32   ` Qu Wenruo
  2019-11-19  5:18     ` Alberto Bursi
@ 2019-12-02  3:22     ` Zygo Blaxell
  2019-12-02  4:41       ` Qu Wenruo
  1 sibling, 1 reply; 26+ messages in thread
From: Zygo Blaxell @ 2019-12-02  3:22 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs


On Tue, Nov 19, 2019 at 07:32:26AM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/11/19 上午4:18, David Sterba wrote:
> > On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
> >> This patchset will make btrfs degraded mount more intelligent and
> >> provide more consistent profile keeping function.
> >>
> >> One of the most problematic aspect of degraded mount is, btrfs may
> >> create unwanted profiles.
> >>
> >>  # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
> >>  # wipefs -fa /dev/test/scratch2
> >>  # mount -o degraded /dev/test/scratch1 /mnt/btrfs
> >>  # fallocate -l 1G /mnt/btrfs/foobar
> >>  # btrfs ins dump-tree -t chunk /dev/test/scratch1
> >>         item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
> >>                 length 536870912 owner 2 stripe_len 65536 type DATA
> >>  New data chunk will fallback to SINGLE or DUP.
> >>
> >>
> >> The cause is pretty simple, when mounted degraded, missing devices can't
> >> be used for chunk allocation.
> >> Thus btrfs has to fall back to SINGLE profile.
> >>
> >> This patchset will make btrfs to consider missing devices as last resort if
> >> current rw devices can't fulfil the profile request.
> >>
> >> This should provide a good balance between considering all missing
> >> device as RW and completely ruling out missing devices (current mainline
> >> behavior).
> > 
> > Thanks. This is going to change the behaviour with a missing device, so
> > the question is if we should make this configurable first and then
> > switch the default.
> 
> Configurable then switch makes sense for most cases, but for this
> degraded chunk case, IIRC the new behavior is superior in all cases.
> 
> For 2 devices RAID1 with one missing device (the main concern), old
> behavior will create SINGLE/DUP chunk, which has no tolerance for extra
> missing devices.
> 
> The new behavior will create degraded RAID1, which still lacks tolerance
> for extra missing devices.
> 
> The difference is, for degraded chunk, if we have the device back, and
> do proper scrub, then we're completely back to proper RAID1.
> No need to do extra balance/convert, only scrub is needed.

I think you meant to say "replace" instead of "scrub" above.

> So the new behavior is kinda of a super set of old behavior, using the
> new behavior by default should not cause extra concern.

It sounds OK to me, provided that the missing device is going away
permanently, and a new device replaces it.

If the missing device comes back, we end up relying on scrub and 32-bit
CRCs to figure out which disk has correct data, and it will be wrong
1/2^32 of the time.  For nodatasum files there are no CRCs so the data
will be wrong much more often.  This patch doesn't change that, but
maybe another patch should.
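
(Rough illustrative numbers, not from any measurement: if the returning
disk differs from the good copy in N blocks, about N / 2^32 of them are
expected to pass the CRC32C check with stale data.  For N = one million
divergent 4K blocks -- roughly 4 GiB written while the disk was away --
that's an expected 10^6 / 2^32 ~= 0.0002 silently wrong blocks per such
incident; small, but not zero, and it grows with the amount of
divergence.)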

> > How does this work with scrub? Eg. if there are 2 devices in RAID1, one
> > goes missing and then scrub is started. It makes no sense to try to
> > repair the missing blocks, but given the logic in the patches all the
> > data will be rewritten, right?
> 
> Scrub is unchanged at all.
> 
> Missing device will not go through scrub at all, as scrub is per-device
> based, missing device will be ruled out at very beginning of scrub.
> 
> Thanks,
> Qu
> > 
> 





^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-12-02  3:22     ` Zygo Blaxell
@ 2019-12-02  4:41       ` Qu Wenruo
  2019-12-02 19:27         ` Zygo Blaxell
  0 siblings, 1 reply; 26+ messages in thread
From: Qu Wenruo @ 2019-12-02  4:41 UTC (permalink / raw)
  To: Zygo Blaxell; +Cc: dsterba, Qu Wenruo, linux-btrfs





On 2019/12/2 上午11:22, Zygo Blaxell wrote:
> On Tue, Nov 19, 2019 at 07:32:26AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2019/11/19 上午4:18, David Sterba wrote:
>>> On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
>>>> This patchset will make btrfs degraded mount more intelligent and
>>>> provide more consistent profile keeping function.
>>>>
>>>> One of the most problematic aspect of degraded mount is, btrfs may
>>>> create unwanted profiles.
>>>>
>>>>  # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
>>>>  # wipefs -fa /dev/test/scratch2
>>>>  # mount -o degraded /dev/test/scratch1 /mnt/btrfs
>>>>  # fallocate -l 1G /mnt/btrfs/foobar
>>>>  # btrfs ins dump-tree -t chunk /dev/test/scratch1
>>>>         item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
>>>>                 length 536870912 owner 2 stripe_len 65536 type DATA
>>>>  New data chunk will fallback to SINGLE or DUP.
>>>>
>>>>
>>>> The cause is pretty simple, when mounted degraded, missing devices can't
>>>> be used for chunk allocation.
>>>> Thus btrfs has to fall back to SINGLE profile.
>>>>
>>>> This patchset will make btrfs to consider missing devices as last resort if
>>>> current rw devices can't fulfil the profile request.
>>>>
>>>> This should provide a good balance between considering all missing
>>>> device as RW and completely ruling out missing devices (current mainline
>>>> behavior).
>>>
>>> Thanks. This is going to change the behaviour with a missing device, so
>>> the question is if we should make this configurable first and then
>>> switch the default.
>>
>> Configurable then switch makes sense for most cases, but for this
>> degraded chunk case, IIRC the new behavior is superior in all cases.
>>
>> For 2 devices RAID1 with one missing device (the main concern), old
>> behavior will create SINGLE/DUP chunk, which has no tolerance for extra
>> missing devices.
>>
>> The new behavior will create degraded RAID1, which still lacks tolerance
>> for extra missing devices.
>>
>> The difference is, for degraded chunk, if we have the device back, and
>> do proper scrub, then we're completely back to proper RAID1.
>> No need to do extra balance/convert, only scrub is needed.
> 
> I think you meant to say "replace" instead of "scrub" above.

"scrub" for missing-then-back case.

As at the time of write, I didn't even take the replace case into
consideration...

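To spell out the two recovery paths with stock btrfs-progs commands (the
devid and paths below are made up for illustration):

 (device went missing and came back: rewrite the stale copies in place)
 # btrfs scrub start -B /mnt/btrfs

 (device is gone for good: rebuild onto a new disk, addressed by the old devid)
 # btrfs replace start 2 /dev/new-disk /mnt/btrfs
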
> 
>> So the new behavior is kinda of a super set of old behavior, using the
>> new behavior by default should not cause extra concern.
> 
> It sounds OK to me, provided that the missing device is going away
> permanently, and a new device replaces it.
> 
> If the missing device comes back, we end up relying on scrub and 32-bit
> CRCs to figure out which disk has correct data, and it will be wrong
> 1/2^32 of the time.  For nodatasum files there are no CRCs so the data
> will be wrong much more often.  This patch doesn't change that, but
> maybe another patch should.

Yep, the patchset won't change it.

But this also reminds me: so far we are all talking about the "degraded"
mount option.
In most cases, a user only uses "degraded" when they completely
understand that a device is missing, not as a daily mount option.

So that shouldn't be a big problem so far.

Thanks,
Qu

> 
>>> How does this work with scrub? Eg. if there are 2 devices in RAID1, one
>>> goes missing and then scrub is started. It makes no sense to try to
>>> repair the missing blocks, but given the logic in the patches all the
>>> data will be rewritten, right?
>>
>> Scrub is unchanged at all.
>>
>> Missing device will not go through scrub at all, as scrub is per-device
>> based, missing device will be ruled out at very beginning of scrub.
>>
>> Thanks,
>> Qu
>>>
>>
> 
> 
> 



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 0/3] btrfs: More intelligent degraded chunk allocator
  2019-12-02  4:41       ` Qu Wenruo
@ 2019-12-02 19:27         ` Zygo Blaxell
  0 siblings, 0 replies; 26+ messages in thread
From: Zygo Blaxell @ 2019-12-02 19:27 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs


On Mon, Dec 02, 2019 at 12:41:53PM +0800, Qu Wenruo wrote:
> 
> 
> On 2019/12/2 上午11:22, Zygo Blaxell wrote:
> > On Tue, Nov 19, 2019 at 07:32:26AM +0800, Qu Wenruo wrote:
> >>
> >>
> >> On 2019/11/19 上午4:18, David Sterba wrote:
> >>> On Thu, Nov 07, 2019 at 02:27:07PM +0800, Qu Wenruo wrote:
> >>>> This patchset will make btrfs degraded mount more intelligent and
> >>>> provide more consistent profile keeping function.
> >>>>
> >>>> One of the most problematic aspect of degraded mount is, btrfs may
> >>>> create unwanted profiles.
> >>>>
> >>>>  # mkfs.btrfs -f /dev/test/scratch[12] -m raid1 -d raid1
> >>>>  # wipefs -fa /dev/test/scratch2
> >>>>  # mount -o degraded /dev/test/scratch1 /mnt/btrfs
> >>>>  # fallocate -l 1G /mnt/btrfs/foobar
> >>>>  # btrfs ins dump-tree -t chunk /dev/test/scratch1
> >>>>         item 7 key (FIRST_CHUNK_TREE CHUNK_ITEM 1674575872) itemoff 15511 itemsize 80
> >>>>                 length 536870912 owner 2 stripe_len 65536 type DATA
> >>>>  New data chunk will fallback to SINGLE or DUP.
> >>>>
> >>>>
> >>>> The cause is pretty simple, when mounted degraded, missing devices can't
> >>>> be used for chunk allocation.
> >>>> Thus btrfs has to fall back to SINGLE profile.
> >>>>
> >>>> This patchset will make btrfs to consider missing devices as last resort if
> >>>> current rw devices can't fulfil the profile request.
> >>>>
> >>>> This should provide a good balance between considering all missing
> >>>> device as RW and completely ruling out missing devices (current mainline
> >>>> behavior).
> >>>
> >>> Thanks. This is going to change the behaviour with a missing device, so
> >>> the question is if we should make this configurable first and then
> >>> switch the default.
> >>
> >> Configurable then switch makes sense for most cases, but for this
> >> degraded chunk case, IIRC the new behavior is superior in all cases.
> >>
> >> For 2 devices RAID1 with one missing device (the main concern), old
> >> behavior will create SINGLE/DUP chunk, which has no tolerance for extra
> >> missing devices.
> >>
> >> The new behavior will create degraded RAID1, which still lacks tolerance
> >> for extra missing devices.
> >>
> >> The difference is, for degraded chunk, if we have the device back, and
> >> do proper scrub, then we're completely back to proper RAID1.
> >> No need to do extra balance/convert, only scrub is needed.
> > 
> > I think you meant to say "replace" instead of "scrub" above.
> 
> "scrub" for missing-then-back case.
> 
> As at the time of write, I didn't even take the replace case into
> consideration...

Using scrub for that purpose has some pretty bad failure modes, e.g.

	https://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg78254.html

mdadm raid1 has many problems, but this is not one of them.  Drives that
exit an mdadm raid1 array are marked out of date on the remaining drives'
superblocks (using sequence numbers and time stamps to avoid colliding
sequence numbers from split-brain raid1 cases).  If the drive returns,
it cannot simply reenter the array, it has to be resynced using online
drives as a replacement data source (partial resyncs are possible with
mdadm bitmaps, a similar thing could be done with btrfs block groups).
mdadm will do all this for you automatically.

> >> So the new behavior is kinda of a super set of old behavior, using the
> >> new behavior by default should not cause extra concern.
> > 
> > It sounds OK to me, provided that the missing device is going away
> > permanently, and a new device replaces it.
> > 
> > If the missing device comes back, we end up relying on scrub and 32-bit
> > CRCs to figure out which disk has correct data, and it will be wrong
> > 1/2^32 of the time.  For nodatasum files there are no CRCs so the data
> > will be wrong much more often.  This patch doesn't change that, but
> > maybe another patch should.
> 
> Yep, the patchset won't change it.
> 
> But this also remind me, so far we are all talking about "degraded"
> mount option.
> Under most case, user is only using "degraded" when they completely
> understands that device is missing, not using that option as a daily option.

That makes certain high-availability setups (e.g. when btrfs is /)
difficult in practice.  To do it by the book, you'd need to:

	1.  Experience a total disk failure
	2.  Reconfigure bootloader to mount -o degraded
	3.  Shut system down
	4.  Remove old drive, add new drive
	5.  Boot system
	6.  btrfs replace
	7.  Reconfigure bootloader to mount without degraded

That's hard (or just expensive and risky) to do in headless remote setups,
especially if step 1 causes a crash, step 2 gets skipped, the system
has no console on which to reconfigure the bootloader, and btrfs won't
boot without the bootloader being reconfigured.

(Note:  I'm assuming that for whatever reason you can't just hotswap the
drive and run replace immediately--or maybe you did hotswap the drive,
then there was a crash or power failure and you had to go back to mounting
with -o degraded to finish the replace anyway.)
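
(For concreteness -- illustrative only, the devid and device names are
invented: step 2 usually means booting with rootflags=degraded, e.g.
GRUB_CMDLINE_LINUX="rootflags=degraded" on a GRUB system, and step 6 is
a replace by devid since the old disk is absent, something like
"btrfs replace start 3 /dev/sdX /".)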

In practice, what happens is more like:

	1.  Bootloader configured to mount -o degraded all the time
	2.  Hard disk has a partial failure, drops off bus or firmware crash
	3.  Reboot (due to crash, power failure, adding replacement drive, etc)
	4.  System comes back up with bad disk present
	5.  Lots of csum failures, 0.0000004% of which are not recovered
	    in scrub with CRC32C csums, and nodatacow files damaged
	6.  Repeat from step 2 several times

If users on IRC are representative, most of them do not know that step #2
is a data-losing failure (btrfs does, but btrfs currently doesn't act on
the information as long as only one drive has failed).  Users also don't
know that scrub is not replace (resync in mdadm terms), and will keep
using the drive and filesystem until something more visible goes wrong.
After several loops they show up on IRC asking for help, but by that
point there has been unrecoverable data loss.

> So that shouldn't be a big problem so far.

It isn't a big problem if you follow exactly the single correct procedure,
and know precisely which tools to use in what order.

This is mostly a usability concern:  other RAID systems cope with this
kind of event automatically, without requiring all the explicit operator
intervention.

> Thanks,
> Qu
> 
> > 
> >>> How does this work with scrub? Eg. if there are 2 devices in RAID1, one
> >>> goes missing and then scrub is started. It makes no sense to try to
> >>> repair the missing blocks, but given the logic in the patches all the
> >>> data will be rewritten, right?
> >>
> >> Scrub is unchanged at all.
> >>
> >> Missing device will not go through scrub at all, as scrub is per-device
> >> based, missing device will be ruled out at very beginning of scrub.
> >>
> >> Thanks,
> >> Qu
> >>>
> >>
> > 
> > 
> > 
> 





^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2019-12-02 19:27 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-07  6:27 [PATCH 0/3] btrfs: More intelligent degraded chunk allocator Qu Wenruo
2019-11-07  6:27 ` [PATCH 1/3] btrfs: volumes: Refactor device holes gathering into a separate function Qu Wenruo
2019-11-07  9:20   ` Johannes Thumshirn
2019-11-07  9:33     ` Qu Wenruo
2019-11-07  9:45       ` Johannes Thumshirn
2019-11-07  6:27 ` [PATCH 2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices Qu Wenruo
2019-11-07  9:31   ` Johannes Thumshirn
2019-11-19 10:03   ` Anand Jain
2019-11-19 10:29     ` Qu Wenruo
2019-11-27 19:36       ` David Sterba
2019-11-07  6:27 ` [PATCH 3/3] btrfs: volumes: Allocate degraded chunks if rw devices can't fullfil a chunk Qu Wenruo
2019-11-19 10:05   ` Anand Jain
2019-11-19 10:41     ` Qu Wenruo
2019-11-27 19:23       ` David Sterba
2019-11-27 23:36         ` Qu Wenruo
2019-11-28 11:24           ` David Sterba
2019-11-28 12:29             ` Qu Wenruo
2019-11-28 12:30             ` Qu WenRuo
2019-11-28 12:39               ` Qu Wenruo
2019-11-18 20:18 ` [PATCH 0/3] btrfs: More intelligent degraded chunk allocator David Sterba
2019-11-18 23:32   ` Qu Wenruo
2019-11-19  5:18     ` Alberto Bursi
2019-11-27 19:26       ` David Sterba
2019-12-02  3:22     ` Zygo Blaxell
2019-12-02  4:41       ` Qu Wenruo
2019-12-02 19:27         ` Zygo Blaxell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).