All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
@ 2022-05-16 14:31 Johannes Thumshirn
  2022-05-16 14:31 ` [RFC ONLY 1/8] btrfs: add raid stripe tree definitions Johannes Thumshirn
                   ` (10 more replies)
  0 siblings, 11 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Introduce a raid-stripe-tree to record writes in a RAID environment.

In essence this adds another address translation layer between the logical
and the physical addresses in btrfs and is designed to close two gaps. The
first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
second one is the inability of doing RAID with zoned block devices due to the
constraints we have with REQ_OP_ZONE_APPEND writes.

Thsi is an RFC/PoC only which just shows how the code will look like for a
zoned RAID1. Its sole purpose is to facilitate design reviews and is not
intended to be merged yet. Or if merged to be used on an actual file-system.

Johannes Thumshirn (8):
  btrfs: add raid stripe tree definitions
  btrfs: move btrfs_io_context to volumes.h
  btrfs: read raid-stripe-tree from disk
  btrfs: add boilerplate code to insert raid extent
  btrfs: add code to delete raid extent
  btrfs: add code to read raid extent
  btrfs: zoned: allow zoned RAID1
  btrfs: add raid stripe tree pretty printer

 fs/btrfs/Makefile               |   2 +-
 fs/btrfs/ctree.c                |   1 +
 fs/btrfs/ctree.h                |  29 ++++
 fs/btrfs/disk-io.c              |  12 ++
 fs/btrfs/extent-tree.c          |   9 ++
 fs/btrfs/file.c                 |   1 -
 fs/btrfs/print-tree.c           |  21 +++
 fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
 fs/btrfs/raid-stripe-tree.h     |  39 +++++
 fs/btrfs/volumes.c              |  44 +++++-
 fs/btrfs/volumes.h              |  93 ++++++------
 fs/btrfs/zoned.c                |  39 +++++
 include/uapi/linux/btrfs.h      |   1 +
 include/uapi/linux/btrfs_tree.h |  17 +++
 14 files changed, 509 insertions(+), 50 deletions(-)
 create mode 100644 fs/btrfs/raid-stripe-tree.c
 create mode 100644 fs/btrfs/raid-stripe-tree.h

-- 
2.35.1


^ permalink raw reply	[flat|nested] 88+ messages in thread

* [RFC ONLY 1/8] btrfs: add raid stripe tree definitions
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-17  7:39   ` Qu Wenruo
  2022-05-16 14:31 ` [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h Johannes Thumshirn
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Add definitions for the raid-stripe-tree. This tree will hold informatioin
about the on-disk layout of the stripes in a RAID set.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.h                | 28 ++++++++++++++++++++++++++++
 include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7328fb17b7f5..20aa2ebac7cd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1878,6 +1878,34 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
+BTRFS_SETGET_FUNCS(stripe_extent_devid, struct btrfs_stripe_extent, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_extent_offset, struct btrfs_stripe_extent, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_devid, struct btrfs_stripe_extent, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_offset, struct btrfs_stripe_extent, offset, 64);
+
+static inline struct btrfs_stripe_extent *btrfs_stripe_extent_nr(
+					 struct btrfs_dp_stripe *dps, int nr)
+{
+	unsigned long offset = (unsigned long)dps;
+	offset += offsetof(struct btrfs_dp_stripe, extents);
+	offset += nr * sizeof(struct btrfs_stripe_extent);
+	return (struct btrfs_stripe_extent *)offset;
+}
+
+static inline u64 btrfs_stripe_extent_devid_nr(const struct extent_buffer *eb,
+					       struct btrfs_dp_stripe *dps,
+					       int nr)
+{
+	return btrfs_stripe_extent_devid(eb, btrfs_stripe_extent_nr(dps, nr));
+}
+
+static inline u64 btrfs_stripe_extent_offset_nr(const struct extent_buffer *eb,
+						struct btrfs_dp_stripe *dps,
+						int nr)
+{
+	return btrfs_stripe_extent_offset(eb, btrfs_stripe_extent_nr(dps, nr));
+}
+
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
 		   chunk_tree, 64);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index b069752a8ecf..a2d28d83cc96 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -56,6 +56,9 @@
 /* Holds the block group items for extent tree v2. */
 #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
 
+/* tracks RAID stripes in block groups. */
+#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+
 /* device stats in the device tree */
 #define BTRFS_DEV_STATS_OBJECTID 0ULL
 
@@ -264,6 +267,8 @@
  */
 #define BTRFS_QGROUP_RELATION_KEY       246
 
+#define BTRFS_RAID_STRIPE_KEY 247
+
 /*
  * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
  */
@@ -488,6 +493,18 @@ struct btrfs_free_space_header {
 	__le64 num_bitmaps;
 } __attribute__ ((__packed__));
 
+struct btrfs_stripe_extent {
+	/* btrfs device-id this raid extent  lives on */
+	__le64 devid;
+	/* offset from  the devextent start */
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_dp_stripe {
+	/* array of stripe extents this stripe is comprised of */
+	struct btrfs_stripe_extent extents;
+} __attribute__ ((__packed__));
+
 #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
  2022-05-16 14:31 ` [RFC ONLY 1/8] btrfs: add raid stripe tree definitions Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-17  7:42   ` Qu Wenruo
  2022-05-16 14:31 ` [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

In preparation for upcoming changes, move 'struct btrfs_io_context' to
volumes.h, so we can use it outside of volumes.c

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/volumes.h | 90 +++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bd297f23d19e..894d289a3b50 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -32,6 +32,51 @@ struct btrfs_io_geometry {
 	u64 raid56_stripe_offset;
 };
 
+struct btrfs_io_stripe {
+	struct btrfs_device *dev;
+	u64 physical;
+	u64 length; /* only used for discard mappings */
+};
+
+/*
+ * Context for IO subsmission for device stripe.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ *   Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ *   Used by submit_stripe_bio() for mapping logical bio
+ *   into physical device address.
+ *
+ * - Contain device replace info
+ *   Used by handle_ops_on_dev_replace() to copy logical bios
+ *   into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
+	refcount_t refs;
+	atomic_t stripes_pending;
+	struct btrfs_fs_info *fs_info;
+	u64 map_type; /* get from map_lookup->type */
+	bio_end_io_t *end_io;
+	struct bio *orig_bio;
+	void *private;
+	atomic_t error;
+	int max_errors;
+	int num_stripes;
+	int mirror_num;
+	int num_tgtdevs;
+	int *tgtdev_map;
+	/*
+	 * logical block numbers for the start of each stripe
+	 * The last one or two are p/q.  These are sorted,
+	 * so raid_map[0] is the start of our full stripe
+	 */
+	u64 *raid_map;
+	struct btrfs_io_stripe stripes[];
+};
+
 /*
  * Use sequence counter to get consistent device stat data on
  * 32-bit processors.
@@ -354,51 +399,6 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 	}
 }
 
-struct btrfs_io_stripe {
-	struct btrfs_device *dev;
-	u64 physical;
-	u64 length; /* only used for discard mappings */
-};
-
-/*
- * Context for IO subsmission for device stripe.
- *
- * - Track the unfinished mirrors for mirror based profiles
- *   Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
- *
- * - Contain the logical -> physical mapping info
- *   Used by submit_stripe_bio() for mapping logical bio
- *   into physical device address.
- *
- * - Contain device replace info
- *   Used by handle_ops_on_dev_replace() to copy logical bios
- *   into the new device.
- *
- * - Contain RAID56 full stripe logical bytenrs
- */
-struct btrfs_io_context {
-	refcount_t refs;
-	atomic_t stripes_pending;
-	struct btrfs_fs_info *fs_info;
-	u64 map_type; /* get from map_lookup->type */
-	bio_end_io_t *end_io;
-	struct bio *orig_bio;
-	void *private;
-	atomic_t error;
-	int max_errors;
-	int num_stripes;
-	int mirror_num;
-	int num_tgtdevs;
-	int *tgtdev_map;
-	/*
-	 * logical block numbers for the start of each stripe
-	 * The last one or two are p/q.  These are sorted,
-	 * so raid_map[0] is the start of our full stripe
-	 */
-	u64 *raid_map;
-	struct btrfs_io_stripe stripes[];
-};
-
 struct btrfs_device_info {
 	struct btrfs_device *dev;
 	u64 dev_offset;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
  2022-05-16 14:31 ` [RFC ONLY 1/8] btrfs: add raid stripe tree definitions Johannes Thumshirn
  2022-05-16 14:31 ` [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-17  8:09   ` Qu Wenruo
  2022-05-16 14:31 ` [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent Johannes Thumshirn
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

If we're discovering a raid-stripe-tree on mount, read it from disk.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.h           |  1 +
 fs/btrfs/disk-io.c         | 12 ++++++++++++
 include/uapi/linux/btrfs.h |  1 +
 3 files changed, 14 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 20aa2ebac7cd..1db669662f61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -667,6 +667,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *uuid_root;
 	struct btrfs_root *data_reloc_root;
 	struct btrfs_root *block_group_root;
+	struct btrfs_root *stripe_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d456f426924c..c0f08917465a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
 
 		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
 	}
+	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
+		return btrfs_grab_root(fs_info->stripe_root) ?
+			fs_info->stripe_root : ERR_PTR(-ENOENT);
 	return NULL;
 }
 
@@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->fs_root);
 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_put_root(fs_info->block_group_root);
+	btrfs_put_root(fs_info->stripe_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
 	free_root_extent_buffers(info->fs_root);
 	free_root_extent_buffers(info->data_reloc_root);
 	free_root_extent_buffers(info->block_group_root);
+	free_root_extent_buffers(info->stripe_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 }
@@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		fs_info->uuid_root = root;
 	}
 
+	location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+	root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(root)) {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->stripe_root = root;
+	}
+
 	return 0;
 out:
 	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index d956b2993970..4e0429fc4e87 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -310,6 +310,7 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_INCOMPAT_RAID1C34		(1ULL << 11)
 #define BTRFS_FEATURE_INCOMPAT_ZONED		(1ULL << 12)
 #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
+#define BTRFS_FEATURE_INCOMPAT_STRIPE_TREE	(1ULL << 14)
 
 struct btrfs_ioctl_feature_flags {
 	__u64 compat_flags;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (2 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-17  7:53   ` Qu Wenruo
  2022-05-17  8:00   ` Qu Wenruo
  2022-05-16 14:31 ` [RFC ONLY 5/8] btrfs: add code to delete " Johannes Thumshirn
                   ` (6 subsequent siblings)
  10 siblings, 2 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Add boilerplate code to insert raid extents into the raid-stripe-tree on
each write to a RAID1 block-group.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/Makefile           |  2 +-
 fs/btrfs/raid-stripe-tree.c | 72 +++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid-stripe-tree.h | 28 +++++++++++++++
 fs/btrfs/volumes.c          | 21 +++++++++++
 fs/btrfs/volumes.h          |  3 ++
 5 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/raid-stripe-tree.c
 create mode 100644 fs/btrfs/raid-stripe-tree.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4188ba3fd8c3..6b9a00ad532a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
-	   subpage.o tree-mod-log.o
+	   subpage.o tree-mod-log.o raid-stripe-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
new file mode 100644
index 000000000000..426066bd7c0d
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "raid-stripe-tree.h"
+#include "volumes.h"
+
+static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_io_context *bioc)
+{
+	struct btrfs_fs_info *fs_info = bioc->fs_info;
+	struct btrfs_key stripe_key;
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_dp_stripe *raid_stripe;
+	struct btrfs_stripe_extent *stripe_extent;
+	size_t item_size;
+	int ret;
+	int i;
+
+	item_size = sizeof(struct btrfs_dp_stripe) - sizeof(struct btrfs_stripe_extent) +
+		bioc->num_stripes * sizeof(struct btrfs_stripe_extent);
+
+	raid_stripe = kzalloc(item_size, GFP_NOFS);
+	if (!raid_stripe) {
+		btrfs_abort_transaction(trans, -ENOMEM);
+		return;
+	}
+
+	stripe_extent = &raid_stripe->extents;
+	for (i = 0; i  < bioc->num_stripes; i++) {
+		u64 devid = bioc->stripes[i].dev->devid;
+		u64 physical = bioc->stripes[i].physical;
+
+		btrfs_set_stack_stripe_extent_devid(stripe_extent, devid);
+		btrfs_set_stack_stripe_extent_offset(stripe_extent, physical);
+		stripe_extent++;
+	}
+
+	stripe_key.objectid = bioc->logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = bioc->length;
+
+	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, raid_stripe,
+				item_size);
+	if (ret) {
+		kfree(raid_stripe);
+		btrfs_abort_transaction(trans, ret);
+		return;
+	}
+
+	kfree(raid_stripe);
+}
+
+void btrfs_raid_stripe_tree_fn(struct work_struct *work)
+{
+	struct btrfs_io_context *bioc;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_trans_handle *trans = NULL;
+
+	bioc = container_of(work, struct btrfs_io_context, stripe_update_work);
+	fs_info = bioc->fs_info;
+	root = fs_info->stripe_root;
+
+	trans = btrfs_join_transaction(root);
+
+	btrfs_insert_raid_extent(trans, bioc);
+	btrfs_end_transaction(trans);
+
+	btrfs_put_bioc(bioc);
+}
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
new file mode 100644
index 000000000000..320a110ecc66
--- /dev/null
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_RAID_STRIPE_TREE_H
+#define BTRFS_RAID_STRIPE_TREE_H
+
+#include "volumes.h"
+
+void btrfs_raid_stripe_tree_fn(struct work_struct *work);
+
+static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc)
+{
+	u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
+	u64 profile = bioc->map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+	if (!bioc->fs_info->stripe_root)
+		return false;
+
+	// for now
+	if (type != BTRFS_BLOCK_GROUP_DATA)
+		return false;
+
+	if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
+		return true;
+
+	return false;
+}
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3fd17e87815a..36acef2ae5d8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,6 +33,7 @@
 #include "block-group.h"
 #include "discard.h"
 #include "zoned.h"
+#include "raid-stripe-tree.h"
 
 #define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
 					 BTRFS_BLOCK_GROUP_RAID10 | \
@@ -5917,6 +5918,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 	bioc->fs_info = fs_info;
 	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
 	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
+	INIT_WORK(&bioc->stripe_update_work, btrfs_raid_stripe_tree_fn);
 
 	return bioc;
 }
@@ -6677,6 +6679,17 @@ static void btrfs_end_bio(struct bio *bio)
 		}
 	}
 
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		int i;
+
+		for (i = 0; i < bioc->num_stripes; i++) {
+			if (bioc->stripes[i].dev->bdev != bio->bi_bdev)
+				continue;
+			bioc->stripes[i].physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+		}
+	}
+
+
 	if (bio == bioc->orig_bio)
 		is_orig_bio = 1;
 
@@ -6700,6 +6713,12 @@ static void btrfs_end_bio(struct bio *bio)
 			 * go over the max number of errors
 			 */
 			bio->bi_status = BLK_STS_OK;
+
+			if (btrfs_op(bio) == BTRFS_MAP_WRITE &&
+			    btrfs_need_stripe_tree_update(bioc)) {
+				btrfs_get_bioc(bioc);
+				schedule_work(&bioc->stripe_update_work);
+			}
 		}
 
 		btrfs_end_bioc(bioc, bio);
@@ -6788,6 +6807,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 	bioc->orig_bio = first_bio;
 	bioc->private = first_bio->bi_private;
 	bioc->end_io = first_bio->bi_end_io;
+	bioc->logical = logical;
+	bioc->length = length;
 	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
 
 	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 894d289a3b50..4b4235b4432a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -68,6 +68,9 @@ struct btrfs_io_context {
 	int mirror_num;
 	int num_tgtdevs;
 	int *tgtdev_map;
+	u64 logical;
+	u64 length;
+	struct work_struct stripe_update_work;
 	/*
 	 * logical block numbers for the start of each stripe
 	 * The last one or two are p/q.  These are sorted,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (3 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-17  8:06   ` Qu Wenruo
  2022-05-16 14:31 ` [RFC ONLY 6/8] btrfs: add code to read " Johannes Thumshirn
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Add boilerplate code to delete entries from the raid-stripe-tree if the
corresponding file extent got deleted.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/ctree.c            |   1 +
 fs/btrfs/extent-tree.c      |   9 +++
 fs/btrfs/file.c             |   1 -
 fs/btrfs/raid-stripe-tree.c | 111 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid-stripe-tree.h |   8 +++
 5 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1e24695ede0a..b7b4e421e9b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3623,6 +3623,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
 	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+	       key.type != BTRFS_RAID_STRIPE_KEY &&
 	       key.type != BTRFS_EXTENT_CSUM_KEY);
 
 	if (btrfs_leaf_free_space(leaf) >= ins_len)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f477035a2ac2..00af3e469881 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -36,6 +36,7 @@
 #include "rcu-string.h"
 #include "zoned.h"
 #include "dev-replace.h"
+#include "raid-stripe-tree.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -3199,6 +3200,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
+		if (is_data) {
+			ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
+			if (ret) {
+				btrfs_abort_transaction(trans, ret);
+				return ret;
+			}
+		}
+
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		if (ret) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bd329316945f..6021188dcb9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1009,7 +1009,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 out:
 	args->drop_end = found ? min(args->end, last_end) : args->end;
-
 	return ret;
 }
 
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 426066bd7c0d..370ea68fe343 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -6,6 +6,117 @@
 #include "raid-stripe-tree.h"
 #include "volumes.h"
 
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
+			     u64 length)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_path *path;
+	struct btrfs_key stripe_key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	u64 end = start + length;
+	u64 found_start;
+	u64 found_end;
+	int slot;
+	int ret;
+
+	if (!stripe_root)
+		return 0;
+
+	stripe_key.objectid = start;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = end;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret == 0)
+		goto delete;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
+	found_start = found_key.objectid;
+	found_end = found_start + found_key.offset;
+
+	/*
+	 * | -- range to drop --|
+	 * | ---------- extent ---------- |
+	 */
+front_split:
+	if (start > found_start) {
+		struct btrfs_key front_key;
+		struct btrfs_dp_stripe *raid_stripe;
+		struct extent_buffer *front_leaf;
+		struct btrfs_stripe_extent *stripe_extent;
+		int num_stripes;
+		int i;
+
+		front_key.objectid = found_start + length;
+		front_key.type = BTRFS_RAID_STRIPE_KEY;
+		front_key.offset = found_end - length;
+
+		num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
+
+		ret = btrfs_duplicate_item(trans, stripe_root, path, &front_key);
+		if (ret == -EAGAIN) {
+			btrfs_release_path(path);
+			goto front_split;
+		}
+		if (ret < 0)
+			goto out;
+		front_leaf = path->nodes[0];
+
+		raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe);
+		stripe_extent = &raid_stripe->extents;
+		for (i = 0; i < num_stripes; i++) {
+			u64 physical;
+
+			physical = btrfs_stripe_extent_offset(leaf, stripe_extent);
+			btrfs_set_stripe_extent_offset(front_leaf, stripe_extent,
+							  physical + length);
+			stripe_extent++;
+		}
+
+		btrfs_mark_buffer_dirty(front_leaf);
+	}
+
+	/*
+	 *           | -- range to drop --|
+	 * | ---------- extent ---------- |
+	 */
+tail_split:
+	if (end < found_end) {
+		struct btrfs_key tail_key;
+
+
+		tail_key.objectid = start;
+		tail_key.type = BTRFS_RAID_STRIPE_KEY;
+		tail_key.offset = found_end - end;
+
+		ret = btrfs_duplicate_item(trans, stripe_root, path, &tail_key);
+		if (ret == -EAGAIN) {
+			btrfs_release_path(path);
+			goto tail_split;
+		}
+		if (ret < 0)
+			goto out;
+		btrfs_mark_buffer_dirty(path->nodes[0]);
+	}
+
+delete:
+	ret = btrfs_del_item(trans, stripe_root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+
+}
+
 static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_io_context *bioc)
 {
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 320a110ecc66..766634df8601 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -5,8 +5,16 @@
 
 #include "volumes.h"
 
+int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
+			     u64 length);
 void btrfs_raid_stripe_tree_fn(struct work_struct *work);
 
+static inline int btrfs_num_raid_stripes(u32 item_size)
+{
+	return item_size - offsetof(struct btrfs_dp_stripe, extents) /
+		sizeof(struct btrfs_stripe_extent);
+}
+
 static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc)
 {
 	u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 6/8] btrfs: add code to read raid extent
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (4 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 5/8] btrfs: add code to delete " Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-16 14:55   ` Josef Bacik
  2022-05-16 14:31 ` [RFC ONLY 7/8] btrfs: zoned: allow zoned RAID1 Johannes Thumshirn
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Add boilerplate code to lookup the physical address from the
raid-stripe-tree when a read on an RAID volume formatted with the
raid-stripe-tree was attempted.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/raid-stripe-tree.c | 68 +++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid-stripe-tree.h |  3 ++
 fs/btrfs/volumes.c          | 23 +++++++++++--
 3 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 370ea68fe343..ecc8205be760 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -1,10 +1,78 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/btrfs_tree.h>
+
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "raid-stripe-tree.h"
 #include "volumes.h"
+#include "misc.h"
+
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+				 u64 logical, u64 length, u64 map_type,
+				 u64 devid, u64 *physical)
+{
+	struct btrfs_root *stripe_root = fs_info->stripe_root;
+	struct btrfs_dp_stripe *raid_stripe;
+	struct btrfs_key stripe_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u64 offset;
+	u64 found_logical, found_length;
+	int num_stripes;
+	int slot;
+	int ret;
+	int i;
+
+	stripe_key.objectid = logical;
+	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
+	stripe_key.offset = length;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	num_stripes = btrfs_bg_type_to_factor(map_type);
+
+	ret = btrfs_search_slot_for_read(stripe_root, &stripe_key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	}
+
+	if (ret == 1)
+		ret = 0;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		found_logical = found_key.objectid;
+		found_length = found_key.offset;
+
+		if (!in_range(logical, found_logical, found_length))
+		    goto next;
+		offset = logical - found_logical;
+
+		raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe);
+		for (i = 0; i < num_stripes; i++) {
+			if (btrfs_stripe_extent_devid_nr(leaf, raid_stripe, i) != devid)
+				continue;
+			*physical = btrfs_stripe_extent_offset_nr(leaf, raid_stripe, i) + offset;
+			goto out;
+		}
+next:
+		ret = btrfs_next_item(stripe_root, path);
+		if (ret)
+			break;
+	}
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
 
 int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
 			     u64 length)
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 766634df8601..1bfa6274eef8 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -5,6 +5,9 @@
 
 #include "volumes.h"
 
+int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
+				 u64 logical, u64 length, u64 map_type,
+				 u64 devid, u64 *physical);
 int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
 			     u64 length);
 void btrfs_raid_stripe_tree_fn(struct work_struct *work);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 36acef2ae5d8..38329728425c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6559,11 +6559,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 		ret = -ENOMEM;
 		goto out;
 	}
+	bioc->map_type = map->type;
 
 	for (i = 0; i < num_stripes; i++) {
-		bioc->stripes[i].physical = map->stripes[stripe_index].physical +
-			stripe_offset + stripe_nr * map->stripe_len;
+		u64 physical;
+
 		bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+
+		if (fs_info->stripe_root && op == BTRFS_MAP_READ &&
+		   btrfs_need_stripe_tree_update(bioc)) {
+			ret = btrfs_get_raid_extent_offset(fs_info, logical,
+							   map->stripe_len,
+							   map->type,
+							   bioc->stripes[i].dev->devid,
+							   &physical);
+			if (ret) {
+				btrfs_put_bioc(bioc);
+				goto out;
+			}
+		} else {
+			physical = map->stripes[stripe_index].physical +
+				stripe_offset + stripe_nr * map->stripe_len;
+		}
+		bioc->stripes[i].physical = physical;
 		stripe_index++;
 	}
 
@@ -6600,7 +6618,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 	}
 
 	*bioc_ret = bioc;
-	bioc->map_type = map->type;
 	bioc->num_stripes = num_stripes;
 	bioc->max_errors = max_errors;
 	bioc->mirror_num = mirror_num;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 7/8] btrfs: zoned: allow zoned RAID1
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (5 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 6/8] btrfs: add code to read " Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-16 14:31 ` [RFC ONLY 8/8] btrfs: add raid stripe tree pretty printer Johannes Thumshirn
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

When we have a raid-stripe-tree, we can do RAID1 on zoned devices for data
block-groups. For meta-data block-groups, we don't actually need
anything special, as all meta-data I/O is protected by the
btrfs_zoned_meta_io_lock() already.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/zoned.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1b1b310c3c51..d817a3349595 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1455,6 +1455,45 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 		cache->zone_capacity = min(caps[0], caps[1]);
 		break;
 	case BTRFS_BLOCK_GROUP_RAID1:
+	case BTRFS_BLOCK_GROUP_RAID1C3:
+	case BTRFS_BLOCK_GROUP_RAID1C4:
+		if (map->type & BTRFS_BLOCK_GROUP_DATA &&
+		    !fs_info->stripe_root) {
+			btrfs_err(fs_info,
+				  "zoned: data RAID1 needs stripe_root");
+			ret = -EIO;
+			goto out;
+
+		}
+
+		for (i = 0; i < map->num_stripes; i++) {
+			if (alloc_offsets[i] == WP_MISSING_DEV) {
+				btrfs_err(fs_info,
+					  "zoned: cannot recover write pointer for zone %llu",
+					  physical[0]);
+				ret = -EIO;
+				goto out;
+			}
+			if (i == 0)
+				continue;
+
+			if (alloc_offsets[0] != alloc_offsets[i]) {
+				btrfs_err(fs_info,
+					  "zoned: write pointer offset mismatch of zones in RAID profile");
+				ret = -EIO;
+				goto out;
+			}
+			if (test_bit(0, active) != test_bit(i, active)) {
+				if (!btrfs_zone_activate(cache)) {
+					ret = -EIO;
+					goto out;
+				}
+			}
+			cache->zone_capacity = min(caps[0], caps[i]);
+		}
+		cache->zone_is_active = test_bit(0, active);
+		cache->alloc_offset = alloc_offsets[0];
+		break;
 	case BTRFS_BLOCK_GROUP_RAID0:
 	case BTRFS_BLOCK_GROUP_RAID10:
 	case BTRFS_BLOCK_GROUP_RAID5:
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* [RFC ONLY 8/8] btrfs: add raid stripe tree pretty printer
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (6 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 7/8] btrfs: zoned: allow zoned RAID1 Johannes Thumshirn
@ 2022-05-16 14:31 ` Johannes Thumshirn
  2022-05-16 14:58 ` [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Josef Bacik
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 14:31 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Johannes Thumshirn

Decode raid-stripe-tree entries on btrfs_print_tree().

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/print-tree.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index dd8777872143..1ee06b511951 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -6,6 +6,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "raid-stripe-tree.h"
 
 struct root_name_map {
 	u64 id;
@@ -25,6 +26,7 @@ static const struct root_name_map root_map[] = {
 	{ BTRFS_FREE_SPACE_TREE_OBJECTID,	"FREE_SPACE_TREE"	},
 	{ BTRFS_BLOCK_GROUP_TREE_OBJECTID,	"BLOCK_GROUP_TREE"	},
 	{ BTRFS_DATA_RELOC_TREE_OBJECTID,	"DATA_RELOC_TREE"	},
+	{ BTRFS_RAID_STRIPE_TREE_OBJECTID,	"RAID_STRIPE_TREE"	},
 };
 
 const char *btrfs_root_name(const struct btrfs_key *key, char *buf)
@@ -184,6 +186,20 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
 	}
 }
 
+static void print_raid_stripe_key(struct extent_buffer *eb, u32 item_size,
+				  struct btrfs_dp_stripe *stripe)
+{
+	int num_stripes;
+	int i;
+
+	num_stripes = btrfs_num_raid_stripes(item_size);
+
+	for (i = 0; i < num_stripes; i++)
+		pr_info("\t\t\tstripe %d devid %llu offset %llu\n", i,
+			btrfs_stripe_extent_devid_nr(eb, stripe, i),
+			btrfs_stripe_extent_offset_nr(eb, stripe, i));
+}
+
 /*
  * Helper to output refs and locking status of extent buffer.  Useful to debug
  * race condition related problems.
@@ -348,6 +364,11 @@ void btrfs_print_leaf(struct extent_buffer *l)
 			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
 					btrfs_item_size(l, i));
 			break;
+		case BTRFS_RAID_STRIPE_KEY:
+			print_raid_stripe_key(l, btrfs_item_size(l, i),
+					      btrfs_item_ptr(l, i,
+							     struct btrfs_dp_stripe));
+			break;
 		}
 	}
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 6/8] btrfs: add code to read raid extent
  2022-05-16 14:31 ` [RFC ONLY 6/8] btrfs: add code to read " Johannes Thumshirn
@ 2022-05-16 14:55   ` Josef Bacik
  0 siblings, 0 replies; 88+ messages in thread
From: Josef Bacik @ 2022-05-16 14:55 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: linux-btrfs

On Mon, May 16, 2022 at 07:31:41AM -0700, Johannes Thumshirn wrote:
> Add boilerplate code to lookup the physical address from the
> raid-stripe-tree when a read on an RAID volume formatted with the
> raid-stripe-tree was attempted.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/raid-stripe-tree.c | 68 +++++++++++++++++++++++++++++++++++++
>  fs/btrfs/raid-stripe-tree.h |  3 ++
>  fs/btrfs/volumes.c          | 23 +++++++++++--
>  3 files changed, 91 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index 370ea68fe343..ecc8205be760 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -1,10 +1,78 @@
>  // SPDX-License-Identifier: GPL-2.0
>  
> +#include <linux/btrfs_tree.h>
> +
>  #include "ctree.h"
>  #include "transaction.h"
>  #include "disk-io.h"
>  #include "raid-stripe-tree.h"
>  #include "volumes.h"
> +#include "misc.h"
> +
> +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
> +				 u64 logical, u64 length, u64 map_type,
> +				 u64 devid, u64 *physical)
> +{
> +	struct btrfs_root *stripe_root = fs_info->stripe_root;
> +	struct btrfs_dp_stripe *raid_stripe;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_key found_key;
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	u64 offset;
> +	u64 found_logical, found_length;
> +	int num_stripes;
> +	int slot;
> +	int ret;
> +	int i;
> +
> +	stripe_key.objectid = logical;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = length;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	num_stripes = btrfs_bg_type_to_factor(map_type);
> +
> +	ret = btrfs_search_slot_for_read(stripe_root, &stripe_key, path, 0, 0);
> +	if (ret < 0) {
> +		goto out;
> +	}
> +
> +	if (ret == 1)
> +		ret = 0;
> +
> +	while (1) {
> +		leaf = path->nodes[0];
> +		slot = path->slots[0];
> +
> +		btrfs_item_key_to_cpu(leaf, &found_key, slot);
> +		found_logical = found_key.objectid;
> +		found_length = found_key.offset;
> +
> +		if (!in_range(logical, found_logical, found_length))
> +		    goto next;
> +		offset = logical - found_logical;
> +
> +		raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe);
> +		for (i = 0; i < num_stripes; i++) {
> +			if (btrfs_stripe_extent_devid_nr(leaf, raid_stripe, i) != devid)
> +				continue;
> +			*physical = btrfs_stripe_extent_offset_nr(leaf, raid_stripe, i) + offset;
> +			goto out;
> +		}
> +next:
> +		ret = btrfs_next_item(stripe_root, path);
> +		if (ret)

This will leak 1 if we don't find a stripe, probably should do EUCLEAN if we
don't have a raid stripe for this?  Thanks,

Josef

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (7 preceding siblings ...)
  2022-05-16 14:31 ` [RFC ONLY 8/8] btrfs: add raid stripe tree pretty printer Johannes Thumshirn
@ 2022-05-16 14:58 ` Josef Bacik
  2022-05-16 15:04   ` Johannes Thumshirn
  2022-05-17  7:23 ` Nikolay Borisov
  2022-07-13 10:54 ` RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree") Qu Wenruo
  10 siblings, 1 reply; 88+ messages in thread
From: Josef Bacik @ 2022-05-16 14:58 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: linux-btrfs

On Mon, May 16, 2022 at 07:31:35AM -0700, Johannes Thumshirn wrote:
> Introduce a raid-stripe-tree to record writes in a RAID environment.
> 
> In essence this adds another address translation layer between the logical
> and the physical addresses in btrfs and is designed to close two gaps. The
> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
> second one is the inability of doing RAID with zoned block devices due to the
> constraints we have with REQ_OP_ZONE_APPEND writes.
> 
> Thsi is an RFC/PoC only which just shows how the code will look like for a
> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
> intended to be merged yet. Or if merged to be used on an actual file-system.
>

This is hard to talk about without seeing the code to add the raid extents and
such.  Reading it makes sense, but I don't know how often the stripes are meant
to change.  Are they static once they're allocated, like dev extents?  I can't
quite fit in my head the relationship with the rest of the allocation system.
Are they coupled with the logical extent that gets allocated?  Or are they
coupled with the dev extent?  Are they somewhere in between?

Also I realize this is an RFC, but we're going to need some caching for reads so
we're not having to do a tree search on every IO with the RAID stripe tree in
place.  Thanks,

Josef 

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-16 14:58 ` [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Josef Bacik
@ 2022-05-16 15:04   ` Johannes Thumshirn
  2022-05-16 15:10     ` Josef Bacik
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 15:04 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs

On 16/05/2022 16:58, Josef Bacik wrote:
> On Mon, May 16, 2022 at 07:31:35AM -0700, Johannes Thumshirn wrote:
>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>
>> In essence this adds another address translation layer between the logical
>> and the physical addresses in btrfs and is designed to close two gaps. The
>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>> second one is the inability of doing RAID with zoned block devices due to the
>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>
>> Thsi is an RFC/PoC only which just shows how the code will look like for a
>> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
>> intended to be merged yet. Or if merged to be used on an actual file-system.
>>
> 
> This is hard to talk about without seeing the code to add the raid extents and
> such.  Reading it makes sense, but I don't know how often the stripes are meant
> to change.  Are they static once they're allocated, like dev extents?  I can't
> quite fit in my head the relationship with the rest of the allocation system.
> Are they coupled with the logical extent that gets allocated?  Or are they
> coupled with the dev extent?  Are they somewhere in between?

The stripe extents have a 1:1 relationship file the file-extents, i.e:

stripe_extent_key.objectid = btrfs_file_extent_item.disk_bytenr;
stripe_extent_type = BTRFS_RAID_STRIPE_EXTENT;
stripe_extent_offset = btrfs_file_extent_item.disk_num_bytes;


> Also I realize this is an RFC, but we're going to need some caching for reads so
> we're not having to do a tree search on every IO with the RAID stripe tree in
> place.

Do we really need to do caching of stripe tree entries? They're read, once the
corresponding btrfs_file_extent_item is read from disk, which then gets cached
in the page cache. Every override is cached in the page cache as well.

If we're flushing the cache, we need to re-read both, the file_extent_item and 
the stripe extents.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-16 15:04   ` Johannes Thumshirn
@ 2022-05-16 15:10     ` Josef Bacik
  2022-05-16 15:47       ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Josef Bacik @ 2022-05-16 15:10 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: linux-btrfs

On Mon, May 16, 2022 at 03:04:35PM +0000, Johannes Thumshirn wrote:
> On 16/05/2022 16:58, Josef Bacik wrote:
> > On Mon, May 16, 2022 at 07:31:35AM -0700, Johannes Thumshirn wrote:
> >> Introduce a raid-stripe-tree to record writes in a RAID environment.
> >>
> >> In essence this adds another address translation layer between the logical
> >> and the physical addresses in btrfs and is designed to close two gaps. The
> >> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
> >> second one is the inability of doing RAID with zoned block devices due to the
> >> constraints we have with REQ_OP_ZONE_APPEND writes.
> >>
> >> Thsi is an RFC/PoC only which just shows how the code will look like for a
> >> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
> >> intended to be merged yet. Or if merged to be used on an actual file-system.
> >>
> > 
> > This is hard to talk about without seeing the code to add the raid extents and
> > such.  Reading it makes sense, but I don't know how often the stripes are meant
> > to change.  Are they static once they're allocated, like dev extents?  I can't
> > quite fit in my head the relationship with the rest of the allocation system.
> > Are they coupled with the logical extent that gets allocated?  Or are they
> > coupled with the dev extent?  Are they somewhere in between?
> 
> The stripe extents have a 1:1 relationship file the file-extents, i.e:
> 
> stripe_extent_key.objectid = btrfs_file_extent_item.disk_bytenr;
> stripe_extent_type = BTRFS_RAID_STRIPE_EXTENT;
> stripe_extent_offset = btrfs_file_extent_item.disk_num_bytes;
> 
> 
> > Also I realize this is an RFC, but we're going to need some caching for reads so
> > we're not having to do a tree search on every IO with the RAID stripe tree in
> > place.
> 
> Do we really need to do caching of stripe tree entries? They're read, once the
> corresponding btrfs_file_extent_item is read from disk, which then gets cached
> in the page cache. Every override is cached in the page cache as well.
> 
> If we're flushing the cache, we need to re-read both, the file_extent_item and 
> the stripe extents.

Yup ok if we're 1:1 with the file-extents then we don't want the whole tree
striped.

Since we're 1:1 with the file-extents please make the stripe tree follow the
same convention as the global roots, at least put the load code in the same area
as the csum/fst/extent tree, if your stuff gets merged and turned on before
extnet tree v2 it'll be easier for me to adapt it.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-16 15:10     ` Josef Bacik
@ 2022-05-16 15:47       ` Johannes Thumshirn
  0 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-16 15:47 UTC (permalink / raw)
  To: Josef Bacik; +Cc: linux-btrfs

On 16/05/2022 17:10, Josef Bacik wrote:
> On Mon, May 16, 2022 at 03:04:35PM +0000, Johannes Thumshirn wrote:
>> On 16/05/2022 16:58, Josef Bacik wrote:
>>> On Mon, May 16, 2022 at 07:31:35AM -0700, Johannes Thumshirn wrote:
>>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>>
>>>> In essence this adds another address translation layer between the logical
>>>> and the physical addresses in btrfs and is designed to close two gaps. The
>>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>>> second one is the inability of doing RAID with zoned block devices due to the
>>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>>>
>>>> Thsi is an RFC/PoC only which just shows how the code will look like for a
>>>> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
>>>> intended to be merged yet. Or if merged to be used on an actual file-system.
>>>>
>>>
>>> This is hard to talk about without seeing the code to add the raid extents and
>>> such.  Reading it makes sense, but I don't know how often the stripes are meant
>>> to change.  Are they static once they're allocated, like dev extents?  I can't
>>> quite fit in my head the relationship with the rest of the allocation system.
>>> Are they coupled with the logical extent that gets allocated?  Or are they
>>> coupled with the dev extent?  Are they somewhere in between?
>>
>> The stripe extents have a 1:1 relationship file the file-extents, i.e:
>>
>> stripe_extent_key.objectid = btrfs_file_extent_item.disk_bytenr;
>> stripe_extent_type = BTRFS_RAID_STRIPE_EXTENT;
>> stripe_extent_offset = btrfs_file_extent_item.disk_num_bytes;
>>
>>
>>> Also I realize this is an RFC, but we're going to need some caching for reads so
>>> we're not having to do a tree search on every IO with the RAID stripe tree in
>>> place.
>>
>> Do we really need to do caching of stripe tree entries? They're read, once the
>> corresponding btrfs_file_extent_item is read from disk, which then gets cached
>> in the page cache. Every override is cached in the page cache as well.
>>
>> If we're flushing the cache, we need to re-read both, the file_extent_item and 
>> the stripe extents.
> 
> Yup ok if we're 1:1 with the file-extents then we don't want the whole tree
> striped.
> 
> Since we're 1:1 with the file-extents please make the stripe tree follow the
> same convention as the global roots, at least put the load code in the same area
> as the csum/fst/extent tree, if your stuff gets merged and turned on before
> extnet tree v2 it'll be easier for me to adapt it.  Thanks,

Sure. I know that there once will be the need to have meta-data in the RST, but
then page cache should do the trick for as us well, as it's hanging off the btree
inode, isn't it?

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (8 preceding siblings ...)
  2022-05-16 14:58 ` [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Josef Bacik
@ 2022-05-17  7:23 ` Nikolay Borisov
  2022-05-17  7:31   ` Qu Wenruo
  2022-05-17  7:32   ` Johannes Thumshirn
  2022-07-13 10:54 ` RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree") Qu Wenruo
  10 siblings, 2 replies; 88+ messages in thread
From: Nikolay Borisov @ 2022-05-17  7:23 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 16.05.22 г. 17:31 ч., Johannes Thumshirn wrote:
> Introduce a raid-stripe-tree to record writes in a RAID environment.
> 
> In essence this adds another address translation layer between the logical
> and the physical addresses in btrfs and is designed to close two gaps. The
> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
> second one is the inability of doing RAID with zoned block devices due to the
> constraints we have with REQ_OP_ZONE_APPEND writes.
> 
> Thsi is an RFC/PoC only which just shows how the code will look like for a
> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
> intended to be merged yet. Or if merged to be used on an actual file-system.
> 
> Johannes Thumshirn (8):
>    btrfs: add raid stripe tree definitions
>    btrfs: move btrfs_io_context to volumes.h
>    btrfs: read raid-stripe-tree from disk
>    btrfs: add boilerplate code to insert raid extent
>    btrfs: add code to delete raid extent
>    btrfs: add code to read raid extent
>    btrfs: zoned: allow zoned RAID1
>    btrfs: add raid stripe tree pretty printer
> 
>   fs/btrfs/Makefile               |   2 +-
>   fs/btrfs/ctree.c                |   1 +
>   fs/btrfs/ctree.h                |  29 ++++
>   fs/btrfs/disk-io.c              |  12 ++
>   fs/btrfs/extent-tree.c          |   9 ++
>   fs/btrfs/file.c                 |   1 -
>   fs/btrfs/print-tree.c           |  21 +++
>   fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h     |  39 +++++
>   fs/btrfs/volumes.c              |  44 +++++-
>   fs/btrfs/volumes.h              |  93 ++++++------
>   fs/btrfs/zoned.c                |  39 +++++
>   include/uapi/linux/btrfs.h      |   1 +
>   include/uapi/linux/btrfs_tree.h |  17 +++
>   14 files changed, 509 insertions(+), 50 deletions(-)
>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>   create mode 100644 fs/btrfs/raid-stripe-tree.h
> 


So if we choose to go with raid stripe tree this means we won't need the 
raid56j code that Qu is working on ? So it's important that these two 
work streams are synced so we don't duplicate effort, right?

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-17  7:23 ` Nikolay Borisov
@ 2022-05-17  7:31   ` Qu Wenruo
  2022-05-17  7:41     ` Johannes Thumshirn
  2022-05-17  7:32   ` Johannes Thumshirn
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:31 UTC (permalink / raw)
  To: Nikolay Borisov, Johannes Thumshirn, linux-btrfs



On 2022/5/17 15:23, Nikolay Borisov wrote:
>
>
> On 16.05.22 г. 17:31 ч., Johannes Thumshirn wrote:
>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>
>> In essence this adds another address translation layer between the
>> logical
>> and the physical addresses in btrfs and is designed to close two gaps.
>> The
>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>> second one is the inability of doing RAID with zoned block devices due
>> to the
>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>
>> Thsi is an RFC/PoC only which just shows how the code will look like
>> for a
>> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
>> intended to be merged yet. Or if merged to be used on an actual
>> file-system.
>>
>> Johannes Thumshirn (8):
>>    btrfs: add raid stripe tree definitions
>>    btrfs: move btrfs_io_context to volumes.h
>>    btrfs: read raid-stripe-tree from disk
>>    btrfs: add boilerplate code to insert raid extent
>>    btrfs: add code to delete raid extent
>>    btrfs: add code to read raid extent
>>    btrfs: zoned: allow zoned RAID1
>>    btrfs: add raid stripe tree pretty printer
>>
>>   fs/btrfs/Makefile               |   2 +-
>>   fs/btrfs/ctree.c                |   1 +
>>   fs/btrfs/ctree.h                |  29 ++++
>>   fs/btrfs/disk-io.c              |  12 ++
>>   fs/btrfs/extent-tree.c          |   9 ++
>>   fs/btrfs/file.c                 |   1 -
>>   fs/btrfs/print-tree.c           |  21 +++
>>   fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
>>   fs/btrfs/raid-stripe-tree.h     |  39 +++++
>>   fs/btrfs/volumes.c              |  44 +++++-
>>   fs/btrfs/volumes.h              |  93 ++++++------
>>   fs/btrfs/zoned.c                |  39 +++++
>>   include/uapi/linux/btrfs.h      |   1 +
>>   include/uapi/linux/btrfs_tree.h |  17 +++
>>   14 files changed, 509 insertions(+), 50 deletions(-)
>>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>>
>
>
> So if we choose to go with raid stripe tree this means we won't need the
> raid56j code that Qu is working on ? So it's important that these two
> work streams are synced so we don't duplicate effort, right?

I believe the stripe tree is going to change the definition of RAID56.

It's no longer strict RAID56, as it doesn't contain the fixed device
rotation, thus it's kinda between RAID4 and RAID5.

Personally speaking, I think both features can co-exist, especially the
raid56 stripe tree may need extra development and review, since the
extra translation layer is a completely different monster when comes to
RAID56.

Don't get me wrong, I like stripe-tree too, the only problem is it's
just too new, thus we may want a backup plan.

Thanks,
Qu


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-17  7:23 ` Nikolay Borisov
  2022-05-17  7:31   ` Qu Wenruo
@ 2022-05-17  7:32   ` Johannes Thumshirn
  1 sibling, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  7:32 UTC (permalink / raw)
  To: Nikolay Borisov, linux-btrfs

On 17/05/2022 09:23, Nikolay Borisov wrote:
> 
> 
> On 16.05.22 г. 17:31 ч., Johannes Thumshirn wrote:
>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>
>> In essence this adds another address translation layer between the logical
>> and the physical addresses in btrfs and is designed to close two gaps. The
>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>> second one is the inability of doing RAID with zoned block devices due to the
>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>
>> Thsi is an RFC/PoC only which just shows how the code will look like for a
>> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
>> intended to be merged yet. Or if merged to be used on an actual file-system.
>>
>> Johannes Thumshirn (8):
>>    btrfs: add raid stripe tree definitions
>>    btrfs: move btrfs_io_context to volumes.h
>>    btrfs: read raid-stripe-tree from disk
>>    btrfs: add boilerplate code to insert raid extent
>>    btrfs: add code to delete raid extent
>>    btrfs: add code to read raid extent
>>    btrfs: zoned: allow zoned RAID1
>>    btrfs: add raid stripe tree pretty printer
>>
>>   fs/btrfs/Makefile               |   2 +-
>>   fs/btrfs/ctree.c                |   1 +
>>   fs/btrfs/ctree.h                |  29 ++++
>>   fs/btrfs/disk-io.c              |  12 ++
>>   fs/btrfs/extent-tree.c          |   9 ++
>>   fs/btrfs/file.c                 |   1 -
>>   fs/btrfs/print-tree.c           |  21 +++
>>   fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
>>   fs/btrfs/raid-stripe-tree.h     |  39 +++++
>>   fs/btrfs/volumes.c              |  44 +++++-
>>   fs/btrfs/volumes.h              |  93 ++++++------
>>   fs/btrfs/zoned.c                |  39 +++++
>>   include/uapi/linux/btrfs.h      |   1 +
>>   include/uapi/linux/btrfs_tree.h |  17 +++
>>   14 files changed, 509 insertions(+), 50 deletions(-)
>>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>>
> 
> 
> So if we choose to go with raid stripe tree this means we won't need the 
> raid56j code that Qu is working on ? So it's important that these two 
> work streams are synced so we don't duplicate effort, right?
> 

That's the reason for my early RFC here.

I think both solutions have benefits and drawbacks. 

The stripe tree adds complexity, metadata (though at the moment only 16 
bytes per drive in the stripe per extent) and another address translation /
lookup layer, it adds the benefit of being always able to do CoW and close
the write-hole here. Also it can work with zoned devices and the Zone Append
write command.

The raid56j code will be simpler in the end I suspect, but it still doesn't
do full CoW and isn't Zone Append capable. Two factors that can't work on
zoned filesystems. And given that capacity drives will likely be more and more
zoned drives, even outside of the hyperscale sector, I see this problematic.

Both Qu and I are aware of each others patches and I would really like to get
the work converged here. The raid56j code for sure is a stop gap solution for
the users that already have a raid56 setup and want to get rid of the write
hole.

Thanks,
	Johannes




^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 1/8] btrfs: add raid stripe tree definitions
  2022-05-16 14:31 ` [RFC ONLY 1/8] btrfs: add raid stripe tree definitions Johannes Thumshirn
@ 2022-05-17  7:39   ` Qu Wenruo
  2022-05-17  7:45     ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:39 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> Add definitions for the raid-stripe-tree. This tree will hold informatioin
> about the on-disk layout of the stripes in a RAID set.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/ctree.h                | 28 ++++++++++++++++++++++++++++
>   include/uapi/linux/btrfs_tree.h | 17 +++++++++++++++++
>   2 files changed, 45 insertions(+)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 7328fb17b7f5..20aa2ebac7cd 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1878,6 +1878,34 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
>   BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
>
> +BTRFS_SETGET_FUNCS(stripe_extent_devid, struct btrfs_stripe_extent, devid, 64);
> +BTRFS_SETGET_FUNCS(stripe_extent_offset, struct btrfs_stripe_extent, offset, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_devid, struct btrfs_stripe_extent, devid, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_offset, struct btrfs_stripe_extent, offset, 64);
> +
> +static inline struct btrfs_stripe_extent *btrfs_stripe_extent_nr(
> +					 struct btrfs_dp_stripe *dps, int nr)
> +{
> +	unsigned long offset = (unsigned long)dps;
> +	offset += offsetof(struct btrfs_dp_stripe, extents);
> +	offset += nr * sizeof(struct btrfs_stripe_extent);
> +	return (struct btrfs_stripe_extent *)offset;
> +}
> +
> +static inline u64 btrfs_stripe_extent_devid_nr(const struct extent_buffer *eb,
> +					       struct btrfs_dp_stripe *dps,
> +					       int nr)
> +{
> +	return btrfs_stripe_extent_devid(eb, btrfs_stripe_extent_nr(dps, nr));
> +}
> +
> +static inline u64 btrfs_stripe_extent_offset_nr(const struct extent_buffer *eb,
> +						struct btrfs_dp_stripe *dps,
> +						int nr)
> +{
> +	return btrfs_stripe_extent_offset(eb, btrfs_stripe_extent_nr(dps, nr));
> +}
> +
>   /* struct btrfs_dev_extent */
>   BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
>   		   chunk_tree, 64);
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index b069752a8ecf..a2d28d83cc96 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -56,6 +56,9 @@
>   /* Holds the block group items for extent tree v2. */
>   #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL
>
> +/* tracks RAID stripes in block groups. */
> +#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
> +
>   /* device stats in the device tree */
>   #define BTRFS_DEV_STATS_OBJECTID 0ULL
>
> @@ -264,6 +267,8 @@
>    */
>   #define BTRFS_QGROUP_RELATION_KEY       246
>
> +#define BTRFS_RAID_STRIPE_KEY 247
> +
>   /*
>    * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
>    */
> @@ -488,6 +493,18 @@ struct btrfs_free_space_header {
>   	__le64 num_bitmaps;
>   } __attribute__ ((__packed__));
>
> +struct btrfs_stripe_extent {
> +	/* btrfs device-id this raid extent  lives on */
> +	__le64 devid;
> +	/* offset from  the devextent start */
> +	__le64 offset;

Considering we have 1G stripe length limit (at least for now), u32 may
be large enough?

Although u64 is definitely future proof.

> +} __attribute__ ((__packed__));
> +

Mind to mention the key format?

My guess is, it's (<logical bytenr>, BTRFS_RAID_STRIPE_KEY, <length>)?

Thanks,
Qu

> +struct btrfs_dp_stripe {
> +	/* array of stripe extents this stripe is comprised of */
> +	struct btrfs_stripe_extent extents;
> +} __attribute__ ((__packed__));
> +
>   #define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
>   #define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree
  2022-05-17  7:31   ` Qu Wenruo
@ 2022-05-17  7:41     ` Johannes Thumshirn
  0 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  7:41 UTC (permalink / raw)
  To: Qu Wenruo, Nikolay Borisov, linux-btrfs

On 17/05/2022 09:32, Qu Wenruo wrote:
> 
> 
> On 2022/5/17 15:23, Nikolay Borisov wrote:
>>
>>
>> On 16.05.22 г. 17:31 ч., Johannes Thumshirn wrote:
>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>
>>> In essence this adds another address translation layer between the
>>> logical
>>> and the physical addresses in btrfs and is designed to close two gaps.
>>> The
>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>> second one is the inability of doing RAID with zoned block devices due
>>> to the
>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>>
>>> Thsi is an RFC/PoC only which just shows how the code will look like
>>> for a
>>> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
>>> intended to be merged yet. Or if merged to be used on an actual
>>> file-system.
>>>
>>> Johannes Thumshirn (8):
>>>    btrfs: add raid stripe tree definitions
>>>    btrfs: move btrfs_io_context to volumes.h
>>>    btrfs: read raid-stripe-tree from disk
>>>    btrfs: add boilerplate code to insert raid extent
>>>    btrfs: add code to delete raid extent
>>>    btrfs: add code to read raid extent
>>>    btrfs: zoned: allow zoned RAID1
>>>    btrfs: add raid stripe tree pretty printer
>>>
>>>   fs/btrfs/Makefile               |   2 +-
>>>   fs/btrfs/ctree.c                |   1 +
>>>   fs/btrfs/ctree.h                |  29 ++++
>>>   fs/btrfs/disk-io.c              |  12 ++
>>>   fs/btrfs/extent-tree.c          |   9 ++
>>>   fs/btrfs/file.c                 |   1 -
>>>   fs/btrfs/print-tree.c           |  21 +++
>>>   fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
>>>   fs/btrfs/raid-stripe-tree.h     |  39 +++++
>>>   fs/btrfs/volumes.c              |  44 +++++-
>>>   fs/btrfs/volumes.h              |  93 ++++++------
>>>   fs/btrfs/zoned.c                |  39 +++++
>>>   include/uapi/linux/btrfs.h      |   1 +
>>>   include/uapi/linux/btrfs_tree.h |  17 +++
>>>   14 files changed, 509 insertions(+), 50 deletions(-)
>>>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>>>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>>>
>>
>>
>> So if we choose to go with raid stripe tree this means we won't need the
>> raid56j code that Qu is working on ? So it's important that these two
>> work streams are synced so we don't duplicate effort, right?
> 
> I believe the stripe tree is going to change the definition of RAID56.
> 
> It's no longer strict RAID56, as it doesn't contain the fixed device
> rotation, thus it's kinda between RAID4 and RAID5.

Well I think it can still contain the device rotation. The stripe tree only
records the on-disk location of each sub-stripe, after it has been written.
The data placement itself doesn't get changed at all. But for this to work,
there's still a lot to do. There's also other plans I have. IIUC btrfs raid56
uses all available drives in a raid set, while raid1,10,0 etc permute the
drives the data is placed. Which is a way better solution IMHO as it reduces
rebuild stress in case we need to do rebuild. Given we have two digit TB
drives these days, rebuilds do a lot of IO which can cause more drives failing
while rebuilding.

> Personally speaking, I think both features can co-exist, especially the
> raid56 stripe tree may need extra development and review, since the
> extra translation layer is a completely different monster when comes to
> RAID56.
> 
> Don't get me wrong, I like stripe-tree too, the only problem is it's
> just too new, thus we may want a backup plan.
> 

Exactly, as I already wrote to Nikolay, raid56j is for sure the simpler
solution and some users might even prefer it for this reason.

Byte,
	Johannes

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h
  2022-05-16 14:31 ` [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h Johannes Thumshirn
@ 2022-05-17  7:42   ` Qu Wenruo
  2022-05-17  7:51     ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:42 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> In preparation for upcoming changes, move 'struct btrfs_io_context' to
> volumes.h, so we can use it outside of volumes.c

In fact I don't think the naming itself (from myself) is that good.

It maybe a good idea to also do a rename here.

I have some bad alternatives, but doesn't seem better than the current
generic naming either:

- btrfs_io_mapping
- btrfs_mapping_context

Thus I guess the current name is chosen mostly due to lack of better ones.

Thanks,
Qu

>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/volumes.h | 90 +++++++++++++++++++++++-----------------------
>   1 file changed, 45 insertions(+), 45 deletions(-)
>
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index bd297f23d19e..894d289a3b50 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -32,6 +32,51 @@ struct btrfs_io_geometry {
>   	u64 raid56_stripe_offset;
>   };
>
> +struct btrfs_io_stripe {
> +	struct btrfs_device *dev;
> +	u64 physical;
> +	u64 length; /* only used for discard mappings */
> +};
> +
> +/*
> + * Context for IO subsmission for device stripe.
> + *
> + * - Track the unfinished mirrors for mirror based profiles
> + *   Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
> + *
> + * - Contain the logical -> physical mapping info
> + *   Used by submit_stripe_bio() for mapping logical bio
> + *   into physical device address.
> + *
> + * - Contain device replace info
> + *   Used by handle_ops_on_dev_replace() to copy logical bios
> + *   into the new device.
> + *
> + * - Contain RAID56 full stripe logical bytenrs
> + */
> +struct btrfs_io_context {
> +	refcount_t refs;
> +	atomic_t stripes_pending;
> +	struct btrfs_fs_info *fs_info;
> +	u64 map_type; /* get from map_lookup->type */
> +	bio_end_io_t *end_io;
> +	struct bio *orig_bio;
> +	void *private;
> +	atomic_t error;
> +	int max_errors;
> +	int num_stripes;
> +	int mirror_num;
> +	int num_tgtdevs;
> +	int *tgtdev_map;
> +	/*
> +	 * logical block numbers for the start of each stripe
> +	 * The last one or two are p/q.  These are sorted,
> +	 * so raid_map[0] is the start of our full stripe
> +	 */
> +	u64 *raid_map;
> +	struct btrfs_io_stripe stripes[];
> +};
> +
>   /*
>    * Use sequence counter to get consistent device stat data on
>    * 32-bit processors.
> @@ -354,51 +399,6 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
>   	}
>   }
>
> -struct btrfs_io_stripe {
> -	struct btrfs_device *dev;
> -	u64 physical;
> -	u64 length; /* only used for discard mappings */
> -};
> -
> -/*
> - * Context for IO subsmission for device stripe.
> - *
> - * - Track the unfinished mirrors for mirror based profiles
> - *   Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
> - *
> - * - Contain the logical -> physical mapping info
> - *   Used by submit_stripe_bio() for mapping logical bio
> - *   into physical device address.
> - *
> - * - Contain device replace info
> - *   Used by handle_ops_on_dev_replace() to copy logical bios
> - *   into the new device.
> - *
> - * - Contain RAID56 full stripe logical bytenrs
> - */
> -struct btrfs_io_context {
> -	refcount_t refs;
> -	atomic_t stripes_pending;
> -	struct btrfs_fs_info *fs_info;
> -	u64 map_type; /* get from map_lookup->type */
> -	bio_end_io_t *end_io;
> -	struct bio *orig_bio;
> -	void *private;
> -	atomic_t error;
> -	int max_errors;
> -	int num_stripes;
> -	int mirror_num;
> -	int num_tgtdevs;
> -	int *tgtdev_map;
> -	/*
> -	 * logical block numbers for the start of each stripe
> -	 * The last one or two are p/q.  These are sorted,
> -	 * so raid_map[0] is the start of our full stripe
> -	 */
> -	u64 *raid_map;
> -	struct btrfs_io_stripe stripes[];
> -};
> -
>   struct btrfs_device_info {
>   	struct btrfs_device *dev;
>   	u64 dev_offset;

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 1/8] btrfs: add raid stripe tree definitions
  2022-05-17  7:39   ` Qu Wenruo
@ 2022-05-17  7:45     ` Johannes Thumshirn
  2022-05-17  7:56       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  7:45 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 09:39, Qu Wenruo wrote:
>>
>> +struct btrfs_stripe_extent {
>> +	/* btrfs device-id this raid extent  lives on */
>> +	__le64 devid;
>> +	/* offset from  the devextent start */
>> +	__le64 offset;
> 
> Considering we have 1G stripe length limit (at least for now), u32 may
> be large enough?
> 
> Although u64 is definitely future proof.
> 
>> +} __attribute__ ((__packed__));
>> +
> 
> Mind to mention the key format?
> 
> My guess is, it's (<logical bytenr>, BTRFS_RAID_STRIPE_KEY, <length>)?

Correct. I'll add a comment here.

>> +struct btrfs_dp_stripe {
>> +	/* array of stripe extents this stripe is comprised of */
>> +	struct btrfs_stripe_extent extents;
>> +} __attribute__ ((__packed__));
>> +

Another question, should I add the generation to the 
btrfs_dp_stripe? And does someone have a better name for the struct?

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h
  2022-05-17  7:42   ` Qu Wenruo
@ 2022-05-17  7:51     ` Johannes Thumshirn
  2022-05-17  7:58       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  7:51 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 09:42, Qu Wenruo wrote:
> 
> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>> In preparation for upcoming changes, move 'struct btrfs_io_context' to
>> volumes.h, so we can use it outside of volumes.c
> In fact I don't think the naming itself (from myself) is that good.
> 
> It maybe a good idea to also do a rename here.
> 
> I have some bad alternatives, but doesn't seem better than the current
> generic naming either:
> 
> - btrfs_io_mapping
> - btrfs_mapping_context
> 
> Thus I guess the current name is chosen mostly due to lack of better ones.

Yep but I'm not any better in naming *cough* btrfs_dp_stripe *cough*. Maybe
someone else has an idea.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent
  2022-05-16 14:31 ` [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent Johannes Thumshirn
@ 2022-05-17  7:53   ` Qu Wenruo
  2022-05-17  8:00   ` Qu Wenruo
  1 sibling, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:53 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> Add boilerplate code to insert raid extents into the raid-stripe-tree on
> each write to a RAID1 block-group.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/Makefile           |  2 +-
>   fs/btrfs/raid-stripe-tree.c | 72 +++++++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h | 28 +++++++++++++++
>   fs/btrfs/volumes.c          | 21 +++++++++++
>   fs/btrfs/volumes.h          |  3 ++
>   5 files changed, 125 insertions(+), 1 deletion(-)
>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>
> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
> index 4188ba3fd8c3..6b9a00ad532a 100644
> --- a/fs/btrfs/Makefile
> +++ b/fs/btrfs/Makefile
> @@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>   	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
>   	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>   	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
> -	   subpage.o tree-mod-log.o
> +	   subpage.o tree-mod-log.o raid-stripe-tree.o
>
>   btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>   btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> new file mode 100644
> index 000000000000..426066bd7c0d
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -0,0 +1,72 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "ctree.h"
> +#include "transaction.h"
> +#include "disk-io.h"
> +#include "raid-stripe-tree.h"
> +#include "volumes.h"
> +
> +static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> +				     struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_fs_info *fs_info = bioc->fs_info;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_root *stripe_root = fs_info->stripe_root;
> +	struct btrfs_dp_stripe *raid_stripe;
> +	struct btrfs_stripe_extent *stripe_extent;
> +	size_t item_size;
> +	int ret;
> +	int i;
> +
> +	item_size = sizeof(struct btrfs_dp_stripe) - sizeof(struct btrfs_stripe_extent) +
> +		bioc->num_stripes * sizeof(struct btrfs_stripe_extent);
> +
> +	raid_stripe = kzalloc(item_size, GFP_NOFS);
> +	if (!raid_stripe) {
> +		btrfs_abort_transaction(trans, -ENOMEM);
> +		return;
> +	}
> +
> +	stripe_extent = &raid_stripe->extents;
> +	for (i = 0; i  < bioc->num_stripes; i++) {
> +		u64 devid = bioc->stripes[i].dev->devid;
> +		u64 physical = bioc->stripes[i].physical;
> +
> +		btrfs_set_stack_stripe_extent_devid(stripe_extent, devid);
> +		btrfs_set_stack_stripe_extent_offset(stripe_extent, physical);
> +		stripe_extent++;
> +	}
> +
> +	stripe_key.objectid = bioc->logical;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = bioc->length;
> +
> +	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, raid_stripe,
> +				item_size);
> +	if (ret) {
> +		kfree(raid_stripe);
> +		btrfs_abort_transaction(trans, ret);
> +		return;
> +	}
> +
> +	kfree(raid_stripe);
> +}
> +
> +void btrfs_raid_stripe_tree_fn(struct work_struct *work)
> +{
> +	struct btrfs_io_context *bioc;
> +	struct btrfs_fs_info *fs_info;
> +	struct btrfs_root *root;
> +	struct btrfs_trans_handle *trans = NULL;
> +
> +	bioc = container_of(work, struct btrfs_io_context, stripe_update_work);
> +	fs_info = bioc->fs_info;
> +	root = fs_info->stripe_root;
> +
> +	trans = btrfs_join_transaction(root);
> +
> +	btrfs_insert_raid_extent(trans, bioc);
> +	btrfs_end_transaction(trans);
> +
> +	btrfs_put_bioc(bioc);
> +}
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> new file mode 100644
> index 000000000000..320a110ecc66
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef BTRFS_RAID_STRIPE_TREE_H
> +#define BTRFS_RAID_STRIPE_TREE_H
> +
> +#include "volumes.h"
> +
> +void btrfs_raid_stripe_tree_fn(struct work_struct *work);
> +
> +static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc)
> +{
> +	u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
> +	u64 profile = bioc->map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
> +
> +	if (!bioc->fs_info->stripe_root)
> +		return false;
> +
> +	// for now
> +	if (type != BTRFS_BLOCK_GROUP_DATA)
> +		return false;

OK, for now it's indeed excluding metadata/sys chunks from stripe tree.

That's fine for now.

But this really brings the problem of bootstrap, thus I'm afraid that we
may not support metadata/data for stripe tree mapped chunks forever.


This also brings a new problem to us, if we plan to make stripe tree
work for metadata/sys, despite the bootstrap problem, we also need to
determine if stripe tree is something global, or per-chunk.

a) Global switch for stripe tree

If global, then every data chunk needs to be stripe-mapped, or we build
a complex stripe-tree supported chunk type list.

In fact, currently the btrfs_need_stripe_tree_update() is already doing
that.

Without a proper on-disk indicate, we can never really do stable support
for new stripe-tree support on other profiles.

b) Per-chunk type stripe tree

Then we need an extra type/flag for chunks/block groups to indicate that
any read/write into the chunk needs stripe tree update.

This allows us to support different chunk types with stripe tree, but
needs more complex on-disk change, other than just a simple global flag.

Thanks,
Qu

> +
> +	if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
> +		return true;
> +
> +	return false;
> +}
> +
> +#endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 3fd17e87815a..36acef2ae5d8 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -33,6 +33,7 @@
>   #include "block-group.h"
>   #include "discard.h"
>   #include "zoned.h"
> +#include "raid-stripe-tree.h"
>
>   #define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
>   					 BTRFS_BLOCK_GROUP_RAID10 | \
> @@ -5917,6 +5918,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
>   	bioc->fs_info = fs_info;
>   	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
>   	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
> +	INIT_WORK(&bioc->stripe_update_work, btrfs_raid_stripe_tree_fn);
>
>   	return bioc;
>   }
> @@ -6677,6 +6679,17 @@ static void btrfs_end_bio(struct bio *bio)
>   		}
>   	}
>
> +	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
> +		int i;
> +
> +		for (i = 0; i < bioc->num_stripes; i++) {
> +			if (bioc->stripes[i].dev->bdev != bio->bi_bdev)
> +				continue;
> +			bioc->stripes[i].physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
> +		}
> +	}
> +
> +
>   	if (bio == bioc->orig_bio)
>   		is_orig_bio = 1;
>
> @@ -6700,6 +6713,12 @@ static void btrfs_end_bio(struct bio *bio)
>   			 * go over the max number of errors
>   			 */
>   			bio->bi_status = BLK_STS_OK;
> +
> +			if (btrfs_op(bio) == BTRFS_MAP_WRITE &&
> +			    btrfs_need_stripe_tree_update(bioc)) {
> +				btrfs_get_bioc(bioc);
> +				schedule_work(&bioc->stripe_update_work);
> +			}
>   		}
>
>   		btrfs_end_bioc(bioc, bio);
> @@ -6788,6 +6807,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>   	bioc->orig_bio = first_bio;
>   	bioc->private = first_bio->bi_private;
>   	bioc->end_io = first_bio->bi_end_io;
> +	bioc->logical = logical;
> +	bioc->length = length;
>   	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
>
>   	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 894d289a3b50..4b4235b4432a 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -68,6 +68,9 @@ struct btrfs_io_context {
>   	int mirror_num;
>   	int num_tgtdevs;
>   	int *tgtdev_map;
> +	u64 logical;
> +	u64 length;
> +	struct work_struct stripe_update_work;
>   	/*
>   	 * logical block numbers for the start of each stripe
>   	 * The last one or two are p/q.  These are sorted,

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 1/8] btrfs: add raid stripe tree definitions
  2022-05-17  7:45     ` Johannes Thumshirn
@ 2022-05-17  7:56       ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:56 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 15:45, Johannes Thumshirn wrote:
> On 17/05/2022 09:39, Qu Wenruo wrote:
>>>
>>> +struct btrfs_stripe_extent {
>>> +	/* btrfs device-id this raid extent  lives on */
>>> +	__le64 devid;
>>> +	/* offset from  the devextent start */
>>> +	__le64 offset;
>>
>> Considering we have 1G stripe length limit (at least for now), u32 may
>> be large enough?
>>
>> Although u64 is definitely future proof.
>>
>>> +} __attribute__ ((__packed__));
>>> +
>>
>> Mind to mention the key format?
>>
>> My guess is, it's (<logical bytenr>, BTRFS_RAID_STRIPE_KEY, <length>)?
>
> Correct. I'll add a comment here.
>
>>> +struct btrfs_dp_stripe {
>>> +	/* array of stripe extents this stripe is comprised of */
>>> +	struct btrfs_stripe_extent extents;
>>> +} __attribute__ ((__packed__));
>>> +
>
> Another question, should I add the generation to the
> btrfs_dp_stripe? And does someone have a better name for the struct?

Why you need a new generation member?
To address the problem of possible RAID56 device mismatch?

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h
  2022-05-17  7:51     ` Johannes Thumshirn
@ 2022-05-17  7:58       ` Qu Wenruo
  2022-05-17  8:01         ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  7:58 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 15:51, Johannes Thumshirn wrote:
> On 17/05/2022 09:42, Qu Wenruo wrote:
>>
>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>> In preparation for upcoming changes, move 'struct btrfs_io_context' to
>>> volumes.h, so we can use it outside of volumes.c
>> In fact I don't think the naming itself (from myself) is that good.
>>
>> It maybe a good idea to also do a rename here.
>>
>> I have some bad alternatives, but doesn't seem better than the current
>> generic naming either:
>>
>> - btrfs_io_mapping
>> - btrfs_mapping_context
>>
>> Thus I guess the current name is chosen mostly due to lack of better ones.
>
> Yep but I'm not any better in naming *cough* btrfs_dp_stripe *cough*. Maybe
> someone else has an idea.

Forgot to ask in that thread, what does the "dp" naming mean?

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent
  2022-05-16 14:31 ` [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent Johannes Thumshirn
  2022-05-17  7:53   ` Qu Wenruo
@ 2022-05-17  8:00   ` Qu Wenruo
  2022-05-17  8:05     ` Johannes Thumshirn
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:00 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> Add boilerplate code to insert raid extents into the raid-stripe-tree on
> each write to a RAID1 block-group.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/Makefile           |  2 +-
>   fs/btrfs/raid-stripe-tree.c | 72 +++++++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h | 28 +++++++++++++++
>   fs/btrfs/volumes.c          | 21 +++++++++++
>   fs/btrfs/volumes.h          |  3 ++
>   5 files changed, 125 insertions(+), 1 deletion(-)
>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>
> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
> index 4188ba3fd8c3..6b9a00ad532a 100644
> --- a/fs/btrfs/Makefile
> +++ b/fs/btrfs/Makefile
> @@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
>   	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
>   	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
>   	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
> -	   subpage.o tree-mod-log.o
> +	   subpage.o tree-mod-log.o raid-stripe-tree.o
>
>   btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>   btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> new file mode 100644
> index 000000000000..426066bd7c0d
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -0,0 +1,72 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "ctree.h"
> +#include "transaction.h"
> +#include "disk-io.h"
> +#include "raid-stripe-tree.h"
> +#include "volumes.h"
> +
> +static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
> +				     struct btrfs_io_context *bioc)
> +{
> +	struct btrfs_fs_info *fs_info = bioc->fs_info;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_root *stripe_root = fs_info->stripe_root;
> +	struct btrfs_dp_stripe *raid_stripe;
> +	struct btrfs_stripe_extent *stripe_extent;
> +	size_t item_size;
> +	int ret;
> +	int i;
> +
> +	item_size = sizeof(struct btrfs_dp_stripe) - sizeof(struct btrfs_stripe_extent) +
> +		bioc->num_stripes * sizeof(struct btrfs_stripe_extent);
> +
> +	raid_stripe = kzalloc(item_size, GFP_NOFS);
> +	if (!raid_stripe) {
> +		btrfs_abort_transaction(trans, -ENOMEM);
> +		return;
> +	}
> +
> +	stripe_extent = &raid_stripe->extents;
> +	for (i = 0; i  < bioc->num_stripes; i++) {
> +		u64 devid = bioc->stripes[i].dev->devid;
> +		u64 physical = bioc->stripes[i].physical;
> +
> +		btrfs_set_stack_stripe_extent_devid(stripe_extent, devid);
> +		btrfs_set_stack_stripe_extent_offset(stripe_extent, physical);
> +		stripe_extent++;
> +	}
> +
> +	stripe_key.objectid = bioc->logical;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = bioc->length;
> +
> +	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, raid_stripe,
> +				item_size);
> +	if (ret) {
> +		kfree(raid_stripe);
> +		btrfs_abort_transaction(trans, ret);
> +		return;
> +	}
> +
> +	kfree(raid_stripe);
> +}
> +
> +void btrfs_raid_stripe_tree_fn(struct work_struct *work)
> +{
> +	struct btrfs_io_context *bioc;
> +	struct btrfs_fs_info *fs_info;
> +	struct btrfs_root *root;
> +	struct btrfs_trans_handle *trans = NULL;
> +
> +	bioc = container_of(work, struct btrfs_io_context, stripe_update_work);
> +	fs_info = bioc->fs_info;
> +	root = fs_info->stripe_root;
> +
> +	trans = btrfs_join_transaction(root);
> +
> +	btrfs_insert_raid_extent(trans, bioc);
> +	btrfs_end_transaction(trans);
> +
> +	btrfs_put_bioc(bioc);
> +}
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> new file mode 100644
> index 000000000000..320a110ecc66
> --- /dev/null
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef BTRFS_RAID_STRIPE_TREE_H
> +#define BTRFS_RAID_STRIPE_TREE_H
> +
> +#include "volumes.h"
> +
> +void btrfs_raid_stripe_tree_fn(struct work_struct *work);
> +
> +static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc)
> +{
> +	u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
> +	u64 profile = bioc->map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
> +
> +	if (!bioc->fs_info->stripe_root)
> +		return false;
> +
> +	// for now
> +	if (type != BTRFS_BLOCK_GROUP_DATA)
> +		return false;
> +
> +	if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
> +		return true;
> +
> +	return false;
> +}
> +
> +#endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 3fd17e87815a..36acef2ae5d8 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -33,6 +33,7 @@
>   #include "block-group.h"
>   #include "discard.h"
>   #include "zoned.h"
> +#include "raid-stripe-tree.h"
>
>   #define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
>   					 BTRFS_BLOCK_GROUP_RAID10 | \
> @@ -5917,6 +5918,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
>   	bioc->fs_info = fs_info;
>   	bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
>   	bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
> +	INIT_WORK(&bioc->stripe_update_work, btrfs_raid_stripe_tree_fn);
>
>   	return bioc;
>   }
> @@ -6677,6 +6679,17 @@ static void btrfs_end_bio(struct bio *bio)
>   		}
>   	}
>
> +	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
> +		int i;
> +
> +		for (i = 0; i < bioc->num_stripes; i++) {
> +			if (bioc->stripes[i].dev->bdev != bio->bi_bdev)
> +				continue;
> +			bioc->stripes[i].physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
> +		}
> +	}
> +
> +
>   	if (bio == bioc->orig_bio)
>   		is_orig_bio = 1;
>
> @@ -6700,6 +6713,12 @@ static void btrfs_end_bio(struct bio *bio)
>   			 * go over the max number of errors
>   			 */
>   			bio->bi_status = BLK_STS_OK;
> +
> +			if (btrfs_op(bio) == BTRFS_MAP_WRITE &&
> +			    btrfs_need_stripe_tree_update(bioc)) {
> +				btrfs_get_bioc(bioc);
> +				schedule_work(&bioc->stripe_update_work);

Considering the stripe tree should be a 1:1 map for file extents, can't
we do it in btrfs_finish_ordered_io()?

Thanks,
Qu

> +			}
>   		}
>
>   		btrfs_end_bioc(bioc, bio);
> @@ -6788,6 +6807,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>   	bioc->orig_bio = first_bio;
>   	bioc->private = first_bio->bi_private;
>   	bioc->end_io = first_bio->bi_end_io;
> +	bioc->logical = logical;
> +	bioc->length = length;
>   	atomic_set(&bioc->stripes_pending, bioc->num_stripes);
>
>   	if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 894d289a3b50..4b4235b4432a 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -68,6 +68,9 @@ struct btrfs_io_context {
>   	int mirror_num;
>   	int num_tgtdevs;
>   	int *tgtdev_map;
> +	u64 logical;
> +	u64 length;
> +	struct work_struct stripe_update_work;
>   	/*
>   	 * logical block numbers for the start of each stripe
>   	 * The last one or two are p/q.  These are sorted,

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h
  2022-05-17  7:58       ` Qu Wenruo
@ 2022-05-17  8:01         ` Johannes Thumshirn
  0 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  8:01 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 09:58, Qu Wenruo wrote:
> 
> 
> On 2022/5/17 15:51, Johannes Thumshirn wrote:
>> On 17/05/2022 09:42, Qu Wenruo wrote:
>>>
>>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>>> In preparation for upcoming changes, move 'struct btrfs_io_context' to
>>>> volumes.h, so we can use it outside of volumes.c
>>> In fact I don't think the naming itself (from myself) is that good.
>>>
>>> It maybe a good idea to also do a rename here.
>>>
>>> I have some bad alternatives, but doesn't seem better than the current
>>> generic naming either:
>>>
>>> - btrfs_io_mapping
>>> - btrfs_mapping_context
>>>
>>> Thus I guess the current name is chosen mostly due to lack of better ones.
>>
>> Yep but I'm not any better in naming *cough* btrfs_dp_stripe *cough*. Maybe
>> someone else has an idea.
> 
> Forgot to ask in that thread, what does the "dp" naming mean?

Declustered Parity, but that's misleading actually, as RAID1 doesn't do parity
at all (although BTRFS RAID1 is a declustered RAID already).


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent
  2022-05-17  8:00   ` Qu Wenruo
@ 2022-05-17  8:05     ` Johannes Thumshirn
  2022-05-17  8:09       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  8:05 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 10:01, Qu Wenruo wrote:
>> @@ -6700,6 +6713,12 @@ static void btrfs_end_bio(struct bio *bio)
>>   			 * go over the max number of errors
>>   			 */
>>   			bio->bi_status = BLK_STS_OK;
>> +
>> +			if (btrfs_op(bio) == BTRFS_MAP_WRITE &&
>> +			    btrfs_need_stripe_tree_update(bioc)) {
>> +				btrfs_get_bioc(bioc);
>> +				schedule_work(&bioc->stripe_update_work);
> Considering the stripe tree should be a 1:1 map for file extents, can't
> we do it in btrfs_finish_ordered_io()?

Unfortunately not at the moment. I need the stripes[] array from
btrfs_io_context to record the per-disk physical locations. Another
possibility would be to lift this array into btrfs_ordered_extent,
then it can be done in btrfs_finish_ordered_io().

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-16 14:31 ` [RFC ONLY 5/8] btrfs: add code to delete " Johannes Thumshirn
@ 2022-05-17  8:06   ` Qu Wenruo
  2022-05-17  8:10     ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:06 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> Add boilerplate code to delete entries from the raid-stripe-tree if the
> corresponding file extent got deleted.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/ctree.c            |   1 +
>   fs/btrfs/extent-tree.c      |   9 +++
>   fs/btrfs/file.c             |   1 -
>   fs/btrfs/raid-stripe-tree.c | 111 ++++++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h |   8 +++
>   5 files changed, 129 insertions(+), 1 deletion(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 1e24695ede0a..b7b4e421e9b8 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -3623,6 +3623,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
>   	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>
>   	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
> +	       key.type != BTRFS_RAID_STRIPE_KEY &&
>   	       key.type != BTRFS_EXTENT_CSUM_KEY);
>
>   	if (btrfs_leaf_free_space(leaf) >= ins_len)
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index f477035a2ac2..00af3e469881 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -36,6 +36,7 @@
>   #include "rcu-string.h"
>   #include "zoned.h"
>   #include "dev-replace.h"
> +#include "raid-stripe-tree.h"
>
>   #undef SCRAMBLE_DELAYED_REFS
>
> @@ -3199,6 +3200,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>   			}
>   		}

Considering we're already in __btrfs_free_extents(), and the branch
we're in is already for refs == 1 case, which means we're already the
last one owning the file extent (and its stripe tree entry).
>
> +		if (is_data) {
> +			ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes);
> +			if (ret) {
> +				btrfs_abort_transaction(trans, ret);
> +				return ret;
> +			}
> +		}
> +
>   		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
>   				      num_to_del);
>   		if (ret) {
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index bd329316945f..6021188dcb9a 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1009,7 +1009,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
>   		btrfs_release_path(path);
>   out:
>   	args->drop_end = found ? min(args->end, last_end) : args->end;
> -
>   	return ret;
>   }
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index 426066bd7c0d..370ea68fe343 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -6,6 +6,117 @@
>   #include "raid-stripe-tree.h"
>   #include "volumes.h"
>
> +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
> +			     u64 length)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_root *stripe_root = fs_info->stripe_root;
> +	struct btrfs_path *path;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_key found_key;
> +	struct extent_buffer *leaf;
> +	u64 end = start + length;
> +	u64 found_start;
> +	u64 found_end;
> +	int slot;
> +	int ret;
> +
> +	if (!stripe_root)
> +		return 0;
> +
> +	stripe_key.objectid = start;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = end;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1);
> +	if (ret < 0)
> +		goto out;
> +	if (ret == 0)
> +		goto delete;
> +
> +	leaf = path->nodes[0];
> +	slot = path->slots[0];
> +	btrfs_item_key_to_cpu(leaf, &found_key, slot);
> +	found_start = found_key.objectid;
> +	found_end = found_start + found_key.offset;
> +
> +	/*
> +	 * | -- range to drop --|
> +	 * | ---------- extent ---------- |
> +	 */

Thus I believe we don't need those complex checking.

The call site has make sure we're the last one owning the file extent,
and since raid stripe is 1:1 mapped to the full extent (not just part of
a data extent, like btrfs_file_extent_item can do), we should be safe to
just do an ASSERT() without the complex split.


Thus, I guess to be extra accurate, the 1:1 mapping is between an (data)
EXTENT_ITEM and a raid stripe?

Thanks,
Qu
> +front_split:
> +	if (start > found_start) {
> +		struct btrfs_key front_key;
> +		struct btrfs_dp_stripe *raid_stripe;
> +		struct extent_buffer *front_leaf;
> +		struct btrfs_stripe_extent *stripe_extent;
> +		int num_stripes;
> +		int i;
> +
> +		front_key.objectid = found_start + length;
> +		front_key.type = BTRFS_RAID_STRIPE_KEY;
> +		front_key.offset = found_end - length;
> +
> +		num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +
> +		ret = btrfs_duplicate_item(trans, stripe_root, path, &front_key);
> +		if (ret == -EAGAIN) {
> +			btrfs_release_path(path);
> +			goto front_split;
> +		}
> +		if (ret < 0)
> +			goto out;
> +		front_leaf = path->nodes[0];
> +
> +		raid_stripe = btrfs_item_ptr(leaf, slot, struct btrfs_dp_stripe);
> +		stripe_extent = &raid_stripe->extents;
> +		for (i = 0; i < num_stripes; i++) {
> +			u64 physical;
> +
> +			physical = btrfs_stripe_extent_offset(leaf, stripe_extent);
> +			btrfs_set_stripe_extent_offset(front_leaf, stripe_extent,
> +							  physical + length);
> +			stripe_extent++;
> +		}
> +
> +		btrfs_mark_buffer_dirty(front_leaf);
> +	}
> +
> +	/*
> +	 *           | -- range to drop --|
> +	 * | ---------- extent ---------- |
> +	 */
> +tail_split:
> +	if (end < found_end) {
> +		struct btrfs_key tail_key;
> +
> +
> +		tail_key.objectid = start;
> +		tail_key.type = BTRFS_RAID_STRIPE_KEY;
> +		tail_key.offset = found_end - end;
> +
> +		ret = btrfs_duplicate_item(trans, stripe_root, path, &tail_key);
> +		if (ret == -EAGAIN) {
> +			btrfs_release_path(path);
> +			goto tail_split;
> +		}
> +		if (ret < 0)
> +			goto out;
> +		btrfs_mark_buffer_dirty(path->nodes[0]);
> +	}
> +
> +delete:
> +	ret = btrfs_del_item(trans, stripe_root, path);
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +
> +}
> +
>   static void btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
>   				     struct btrfs_io_context *bioc)
>   {
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> index 320a110ecc66..766634df8601 100644
> --- a/fs/btrfs/raid-stripe-tree.h
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -5,8 +5,16 @@
>
>   #include "volumes.h"
>
> +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
> +			     u64 length);
>   void btrfs_raid_stripe_tree_fn(struct work_struct *work);
>
> +static inline int btrfs_num_raid_stripes(u32 item_size)
> +{
> +	return item_size - offsetof(struct btrfs_dp_stripe, extents) /
> +		sizeof(struct btrfs_stripe_extent);
> +}
> +
>   static inline bool btrfs_need_stripe_tree_update(struct btrfs_io_context *bioc)
>   {
>   	u64 type = bioc->map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent
  2022-05-17  8:05     ` Johannes Thumshirn
@ 2022-05-17  8:09       ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:09 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 16:05, Johannes Thumshirn wrote:
> On 17/05/2022 10:01, Qu Wenruo wrote:
>>> @@ -6700,6 +6713,12 @@ static void btrfs_end_bio(struct bio *bio)
>>>    			 * go over the max number of errors
>>>    			 */
>>>    			bio->bi_status = BLK_STS_OK;
>>> +
>>> +			if (btrfs_op(bio) == BTRFS_MAP_WRITE &&
>>> +			    btrfs_need_stripe_tree_update(bioc)) {
>>> +				btrfs_get_bioc(bioc);
>>> +				schedule_work(&bioc->stripe_update_work);
>> Considering the stripe tree should be a 1:1 map for file extents, can't
>> we do it in btrfs_finish_ordered_io()?
>
> Unfortunately not at the moment. I need the stripes[] array from
> btrfs_io_context to record the per-disk physical locations. Another
> possibility would be to lift this array into btrfs_ordered_extent,
> then it can be done in btrfs_finish_ordered_io().

At least to me, lifting it to btrfs_ordered_extent() seems more reasonable.

One problem is, if we write the stripe to stripe tree, and a trans
committed, but power loss happened before btrfs_finish_ordered_io().

Then we would have an orphan stripe item in stripe tree I guess.

Then we may later hit EEXIST doing other stripe tree operations.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-16 14:31 ` [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
@ 2022-05-17  8:09   ` Qu Wenruo
  2022-05-17  8:13     ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:09 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> If we're discovering a raid-stripe-tree on mount, read it from disk.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>   fs/btrfs/ctree.h           |  1 +
>   fs/btrfs/disk-io.c         | 12 ++++++++++++
>   include/uapi/linux/btrfs.h |  1 +
>   3 files changed, 14 insertions(+)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 20aa2ebac7cd..1db669662f61 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>   	struct btrfs_root *uuid_root;
>   	struct btrfs_root *data_reloc_root;
>   	struct btrfs_root *block_group_root;
> +	struct btrfs_root *stripe_root;
>
>   	/* the log root tree is a directory of all the other log roots */
>   	struct btrfs_root *log_root_tree;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index d456f426924c..c0f08917465a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>
>   		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>   	}
> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
> +		return btrfs_grab_root(fs_info->stripe_root) ?
> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>   	return NULL;
>   }
>
> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>   	btrfs_put_root(fs_info->fs_root);
>   	btrfs_put_root(fs_info->data_reloc_root);
>   	btrfs_put_root(fs_info->block_group_root);
> +	btrfs_put_root(fs_info->stripe_root);
>   	btrfs_check_leaked_roots(fs_info);
>   	btrfs_extent_buffer_leak_debug_check(fs_info);
>   	kfree(fs_info->super_copy);
> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>   	free_root_extent_buffers(info->fs_root);
>   	free_root_extent_buffers(info->data_reloc_root);
>   	free_root_extent_buffers(info->block_group_root);
> +	free_root_extent_buffers(info->stripe_root);
>   	if (free_chunk_root)
>   		free_root_extent_buffers(info->chunk_root);
>   }
> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>   		fs_info->uuid_root = root;
>   	}
>

I guess in the real patch, we need to check the incompatble feature first.

Another problem is, how do we do bootstrap?

If our metadata (especially chunk tree) is also in some chunks which is
stripe-tree mapped, without stripe tree we're even unable to read the
chunk tree.

Or do you plan to not support metadata on stripe-tree mapped chunks?

Thanks,
Qu
> +	location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
> +	root = btrfs_read_tree_root(tree_root, &location);
> +	if (!IS_ERR(root)) {
> +		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> +		fs_info->stripe_root = root;
> +	}
> +
>   	return 0;
>   out:
>   	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index d956b2993970..4e0429fc4e87 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -310,6 +310,7 @@ struct btrfs_ioctl_fs_info_args {
>   #define BTRFS_FEATURE_INCOMPAT_RAID1C34		(1ULL << 11)
>   #define BTRFS_FEATURE_INCOMPAT_ZONED		(1ULL << 12)
>   #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2	(1ULL << 13)
> +#define BTRFS_FEATURE_INCOMPAT_STRIPE_TREE	(1ULL << 14)
>
>   struct btrfs_ioctl_feature_flags {
>   	__u64 compat_flags;

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-17  8:06   ` Qu Wenruo
@ 2022-05-17  8:10     ` Johannes Thumshirn
  2022-05-17  8:14       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  8:10 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 10:06, Qu Wenruo wrote:
>> +	ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1);
>> +	if (ret < 0)
>> +		goto out;
>> +	if (ret == 0)
>> +		goto delete;
>> +
>> +	leaf = path->nodes[0];
>> +	slot = path->slots[0];
>> +	btrfs_item_key_to_cpu(leaf, &found_key, slot);
>> +	found_start = found_key.objectid;
>> +	found_end = found_start + found_key.offset;
>> +
>> +	/*
>> +	 * | -- range to drop --|
>> +	 * | ---------- extent ---------- |
>> +	 */
> Thus I believe we don't need those complex checking.
> 
> The call site has make sure we're the last one owning the file extent,
> and since raid stripe is 1:1 mapped to the full extent (not just part of
> a data extent, like btrfs_file_extent_item can do), we should be safe to
> just do an ASSERT() without the complex split.
> 
> 
> Thus, I guess to be extra accurate, the 1:1 mapping is between an (data)
> EXTENT_ITEM and a raid stripe?

Unfortunately not, as we can split extents. I've found out the hard way that
we need this. See btrfs_drop_extents() for details.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-17  8:09   ` Qu Wenruo
@ 2022-05-17  8:13     ` Johannes Thumshirn
  2022-05-17  8:28       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  8:13 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 10:10, Qu Wenruo wrote:
> 
> 
> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>> If we're discovering a raid-stripe-tree on mount, read it from disk.
>>
>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>> ---
>>   fs/btrfs/ctree.h           |  1 +
>>   fs/btrfs/disk-io.c         | 12 ++++++++++++
>>   include/uapi/linux/btrfs.h |  1 +
>>   3 files changed, 14 insertions(+)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 20aa2ebac7cd..1db669662f61 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>>   	struct btrfs_root *uuid_root;
>>   	struct btrfs_root *data_reloc_root;
>>   	struct btrfs_root *block_group_root;
>> +	struct btrfs_root *stripe_root;
>>
>>   	/* the log root tree is a directory of all the other log roots */
>>   	struct btrfs_root *log_root_tree;
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index d456f426924c..c0f08917465a 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>>
>>   		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>>   	}
>> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
>> +		return btrfs_grab_root(fs_info->stripe_root) ?
>> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>>   	return NULL;
>>   }
>>
>> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>>   	btrfs_put_root(fs_info->fs_root);
>>   	btrfs_put_root(fs_info->data_reloc_root);
>>   	btrfs_put_root(fs_info->block_group_root);
>> +	btrfs_put_root(fs_info->stripe_root);
>>   	btrfs_check_leaked_roots(fs_info);
>>   	btrfs_extent_buffer_leak_debug_check(fs_info);
>>   	kfree(fs_info->super_copy);
>> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>>   	free_root_extent_buffers(info->fs_root);
>>   	free_root_extent_buffers(info->data_reloc_root);
>>   	free_root_extent_buffers(info->block_group_root);
>> +	free_root_extent_buffers(info->stripe_root);
>>   	if (free_chunk_root)
>>   		free_root_extent_buffers(info->chunk_root);
>>   }
>> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>>   		fs_info->uuid_root = root;
>>   	}
>>
> 
> I guess in the real patch, we need to check the incompatble feature first.

Or at least a compatible_ro. For regular drives it should be sufficient, for
zoned drives mounting with raid without a stripe tree will fail.

> 
> Another problem is, how do we do bootstrap?
> 
> If our metadata (especially chunk tree) is also in some chunks which is
> stripe-tree mapped, without stripe tree we're even unable to read the
> chunk tree.
> 
> Or do you plan to not support metadata on stripe-tree mapped chunks?

I do, but I have no clue yet how to attack this problem. I was hoping to get some
insights from Josef's extent-tree v2 series.

Metadata on the stripe tree really is the main blocker right now.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-17  8:10     ` Johannes Thumshirn
@ 2022-05-17  8:14       ` Qu Wenruo
  2022-05-17  8:20         ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:14 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 16:10, Johannes Thumshirn wrote:
> On 17/05/2022 10:06, Qu Wenruo wrote:
>>> +	ret = btrfs_search_slot(trans, stripe_root, &stripe_key, path, -1, 1);
>>> +	if (ret < 0)
>>> +		goto out;
>>> +	if (ret == 0)
>>> +		goto delete;
>>> +
>>> +	leaf = path->nodes[0];
>>> +	slot = path->slots[0];
>>> +	btrfs_item_key_to_cpu(leaf, &found_key, slot);
>>> +	found_start = found_key.objectid;
>>> +	found_end = found_start + found_key.offset;
>>> +
>>> +	/*
>>> +	 * | -- range to drop --|
>>> +	 * | ---------- extent ---------- |
>>> +	 */
>> Thus I believe we don't need those complex checking.
>>
>> The call site has make sure we're the last one owning the file extent,
>> and since raid stripe is 1:1 mapped to the full extent (not just part of
>> a data extent, like btrfs_file_extent_item can do), we should be safe to
>> just do an ASSERT() without the complex split.
>>
>>
>> Thus, I guess to be extra accurate, the 1:1 mapping is between an (data)
>> EXTENT_ITEM and a raid stripe?
>
> Unfortunately not, as we can split extents. I've found out the hard way that
> we need this. See btrfs_drop_extents() for details.

But btrfs extents are immortal, it can only get freed when every bytes
are no longer referred to.

Btrfs_drop_extents() is complex because it's working on file extent
level, but __btrfs_free_extent() is already working on extent level.

Or do you mean, the raid stripe is not really bounded to extent level,
but really bounded to file extent level?

Then I'm not sure if it's a good idea to do such level mapping then.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-17  8:14       ` Qu Wenruo
@ 2022-05-17  8:20         ` Johannes Thumshirn
  2022-05-17  8:31           ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-17  8:20 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 10:15, Qu Wenruo wrote:
> But btrfs extents are immortal, it can only get freed when every bytes
> are no longer referred to.
> 
> Btrfs_drop_extents() is complex because it's working on file extent
> level, but __btrfs_free_extent() is already working on extent level.
> 
> Or do you mean, the raid stripe is not really bounded to extent level,
> but really bounded to file extent level?
> 
> Then I'm not sure if it's a good idea to do such level mapping then.

It is bound to the file_extent_item as the extent_item is lacking the 
(logical, lenght) information I need.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-17  8:13     ` Johannes Thumshirn
@ 2022-05-17  8:28       ` Qu Wenruo
  2022-05-18 11:29         ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:28 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 16:13, Johannes Thumshirn wrote:
> On 17/05/2022 10:10, Qu Wenruo wrote:
>>
>>
>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>> If we're discovering a raid-stripe-tree on mount, read it from disk.
>>>
>>> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>>> ---
>>>    fs/btrfs/ctree.h           |  1 +
>>>    fs/btrfs/disk-io.c         | 12 ++++++++++++
>>>    include/uapi/linux/btrfs.h |  1 +
>>>    3 files changed, 14 insertions(+)
>>>
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index 20aa2ebac7cd..1db669662f61 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -667,6 +667,7 @@ struct btrfs_fs_info {
>>>    	struct btrfs_root *uuid_root;
>>>    	struct btrfs_root *data_reloc_root;
>>>    	struct btrfs_root *block_group_root;
>>> +	struct btrfs_root *stripe_root;
>>>
>>>    	/* the log root tree is a directory of all the other log roots */
>>>    	struct btrfs_root *log_root_tree;
>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> index d456f426924c..c0f08917465a 100644
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -1706,6 +1706,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
>>>
>>>    		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
>>>    	}
>>> +	if (objectid == BTRFS_RAID_STRIPE_TREE_OBJECTID)
>>> +		return btrfs_grab_root(fs_info->stripe_root) ?
>>> +			fs_info->stripe_root : ERR_PTR(-ENOENT);
>>>    	return NULL;
>>>    }
>>>
>>> @@ -1784,6 +1787,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
>>>    	btrfs_put_root(fs_info->fs_root);
>>>    	btrfs_put_root(fs_info->data_reloc_root);
>>>    	btrfs_put_root(fs_info->block_group_root);
>>> +	btrfs_put_root(fs_info->stripe_root);
>>>    	btrfs_check_leaked_roots(fs_info);
>>>    	btrfs_extent_buffer_leak_debug_check(fs_info);
>>>    	kfree(fs_info->super_copy);
>>> @@ -2337,6 +2341,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
>>>    	free_root_extent_buffers(info->fs_root);
>>>    	free_root_extent_buffers(info->data_reloc_root);
>>>    	free_root_extent_buffers(info->block_group_root);
>>> +	free_root_extent_buffers(info->stripe_root);
>>>    	if (free_chunk_root)
>>>    		free_root_extent_buffers(info->chunk_root);
>>>    }
>>> @@ -2773,6 +2778,13 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
>>>    		fs_info->uuid_root = root;
>>>    	}
>>>
>>
>> I guess in the real patch, we need to check the incompatble feature first.
>
> Or at least a compatible_ro. For regular drives it should be sufficient, for
> zoned drives mounting with raid without a stripe tree will fail.
>
>>
>> Another problem is, how do we do bootstrap?
>>
>> If our metadata (especially chunk tree) is also in some chunks which is
>> stripe-tree mapped, without stripe tree we're even unable to read the
>> chunk tree.
>>
>> Or do you plan to not support metadata on stripe-tree mapped chunks?
>
> I do, but I have no clue yet how to attack this problem. I was hoping to get some
> insights from Josef's extent-tree v2 series.

Personally speaking, a per-chunk flag/type allowing us to know if a
chunk has stripe mapped is much better for testing, and can bring you
much needed time for further improvement.

>
> Metadata on the stripe tree really is the main blocker right now.

That's no doubt.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 5/8] btrfs: add code to delete raid extent
  2022-05-17  8:20         ` Johannes Thumshirn
@ 2022-05-17  8:31           ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-05-17  8:31 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/17 16:20, Johannes Thumshirn wrote:
> On 17/05/2022 10:15, Qu Wenruo wrote:
>> But btrfs extents are immortal, it can only get freed when every bytes
>> are no longer referred to.
>>
>> Btrfs_drop_extents() is complex because it's working on file extent
>> level, but __btrfs_free_extent() is already working on extent level.
>>
>> Or do you mean, the raid stripe is not really bounded to extent level,
>> but really bounded to file extent level?
>>
>> Then I'm not sure if it's a good idea to do such level mapping then.
>
> It is bound to the file_extent_item as the extent_item is lacking the
> (logical, lenght) information I need.

Then I guess the it would be way more complex than we though.

For file extent we can split it for things like CoW, and it's really too
flex and too complex to me.

The info of logical and length can still be extracted from from extent
item (of course, or we will have way bigger problems).

Furthermore, if we are bounded to extent item, then we even have the
help from delayed refs update code, to help reduce the IO on stripe tree.

Although all those benefits come from the cost of new creative ways to
pass the stripe mapping info to delayed refs...

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-17  8:28       ` Qu Wenruo
@ 2022-05-18 11:29         ` Johannes Thumshirn
  2022-05-19  8:36           ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-18 11:29 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 17/05/2022 10:28, Qu Wenruo wrote:
>>
>> Metadata on the stripe tree really is the main blocker right now.
> 
> That's no doubt.

What could be done and I think this is the only way forward, is to have
the stripe tree in the system block group and force system to be RAID1
on a fs with stripe tree.

Thoughts?

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-18 11:29         ` Johannes Thumshirn
@ 2022-05-19  8:36           ` Qu Wenruo
  2022-05-19  8:39             ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-19  8:36 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/18 19:29, Johannes Thumshirn wrote:
> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>
>>> Metadata on the stripe tree really is the main blocker right now.
>>
>> That's no doubt.
>
> What could be done and I think this is the only way forward, is to have
> the stripe tree in the system block group

This behavior itself has its problems, unfortunately.

Currently the system chunks are pretty small, in fact system chunks has
the minimal stripe size for current code base.
(Data: 1G, Meta: 1G/256M, sys: 32M)

This means, if we put stripe tree (which can be as large as extent tree
afaik), we need way much larger system chunks.

And this can further increase the possibility on ENOSPC due to
unbalanced data/metadata/sys usage.

Although this is really the last problem we need to bother.

> and force system to be RAID1 on a fs with stripe tree.

Then the system RAID1 chunks also need stripe tree for zoned devices.

This means we're unable to bootstrap at all.

Or did you mean, make system chunks RAID1 but not using stripe tree?
That can solve the boot strap problem, but it doesn't really look
elegant to me...

Thanks,
Qu

>
> Thoughts?

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19  8:36           ` Qu Wenruo
@ 2022-05-19  8:39             ` Johannes Thumshirn
  2022-05-19 10:37               ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-19  8:39 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 19/05/2022 10:36, Qu Wenruo wrote:
> 
> 
> On 2022/5/18 19:29, Johannes Thumshirn wrote:
>> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>>
>>>> Metadata on the stripe tree really is the main blocker right now.
>>>
>>> That's no doubt.
>>
>> What could be done and I think this is the only way forward, is to have
>> the stripe tree in the system block group
> 
> This behavior itself has its problems, unfortunately.
> 
> Currently the system chunks are pretty small, in fact system chunks has
> the minimal stripe size for current code base.
> (Data: 1G, Meta: 1G/256M, sys: 32M)
> 
> This means, if we put stripe tree (which can be as large as extent tree
> afaik), we need way much larger system chunks.

I know, but IIRC (need to look this up again) Josef increased the max size
of sys chunks to 2G

> 
> And this can further increase the possibility on ENOSPC due to
> unbalanced data/metadata/sys usage.
> 
> Although this is really the last problem we need to bother.
> 
>> and force system to be RAID1 on a fs with stripe tree.
> 
> Then the system RAID1 chunks also need stripe tree for zoned devices.
> 
> This means we're unable to bootstrap at all.
> 
> Or did you mean, make system chunks RAID1 but not using stripe tree?
> That can solve the boot strap problem, but it doesn't really look
> elegant to me...

RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
so it will work and we can bootstrap from it.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19  8:39             ` Johannes Thumshirn
@ 2022-05-19 10:37               ` Qu Wenruo
  2022-05-19 11:44                 ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-19 10:37 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/5/19 16:39, Johannes Thumshirn wrote:
> On 19/05/2022 10:36, Qu Wenruo wrote:
>>
>>
>> On 2022/5/18 19:29, Johannes Thumshirn wrote:
>>> On 17/05/2022 10:28, Qu Wenruo wrote:
>>>>>
>>>>> Metadata on the stripe tree really is the main blocker right now.
>>>>
>>>> That's no doubt.
>>>
>>> What could be done and I think this is the only way forward, is to have
>>> the stripe tree in the system block group
>>
>> This behavior itself has its problems, unfortunately.
>>
>> Currently the system chunks are pretty small, in fact system chunks has
>> the minimal stripe size for current code base.
>> (Data: 1G, Meta: 1G/256M, sys: 32M)
>>
>> This means, if we put stripe tree (which can be as large as extent tree
>> afaik), we need way much larger system chunks.
> 
> I know, but IIRC (need to look this up again) Josef increased the max size
> of sys chunks to 2G
> 
>>
>> And this can further increase the possibility on ENOSPC due to
>> unbalanced data/metadata/sys usage.
>>
>> Although this is really the last problem we need to bother.
>>
>>> and force system to be RAID1 on a fs with stripe tree.
>>
>> Then the system RAID1 chunks also need stripe tree for zoned devices.
>>
>> This means we're unable to bootstrap at all.
>>
>> Or did you mean, make system chunks RAID1 but not using stripe tree?
>> That can solve the boot strap problem, but it doesn't really look
>> elegant to me...
> 
> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
> so it will work and we can bootstrap from it.
> 
That sounds good.

And in that case, we don't need to put stripe tree into system chunks at 
all.

So this method means, stripe tree is only useful for data.
Although it's less elegant, it's much saner.

Thanks,
Qu


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 10:37               ` Qu Wenruo
@ 2022-05-19 11:44                 ` Johannes Thumshirn
  2022-05-19 11:48                   ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-19 11:44 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 19/05/2022 12:37, Qu Wenruo wrote:
>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>> so it will work and we can bootstrap from it.
>>
> That sounds good.
> 
> And in that case, we don't need to put stripe tree into system chunks at 
> all.
> 
> So this method means, stripe tree is only useful for data.
> Although it's less elegant, it's much saner.

Yes and no. People still might want to use different metadata profiles than
RAID1. I'd prefer to have system on RAID1 (forced) with stripe trees and 
data/meta-data can be whatever. Of cause only RAID5/6 or higher level encodings
which might need a stripe-tree should be accpeted with a stripe tree.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 11:44                 ` Johannes Thumshirn
@ 2022-05-19 11:48                   ` Qu Wenruo
  2022-05-19 11:53                     ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-19 11:48 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/5/19 19:44, Johannes Thumshirn wrote:
> On 19/05/2022 12:37, Qu Wenruo wrote:
>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>> so it will work and we can bootstrap from it.
>>>
>> That sounds good.
>>
>> And in that case, we don't need to put stripe tree into system chunks at
>> all.
>>
>> So this method means, stripe tree is only useful for data.
>> Although it's less elegant, it's much saner.
>
> Yes and no. People still might want to use different metadata profiles than
> RAID1.

For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?

What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
believe that's already a pretty good profile set for most zoned device
users.

Personally speaking, it would be much simpler to avoid bothering the
stripe tree for metadata.

Thanks,
Qu

> I'd prefer to have system on RAID1 (forced) with stripe trees and
> data/meta-data can be whatever. Of cause only RAID5/6 or higher level encodings
> which might need a stripe-tree should be accpeted with a stripe tree.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 11:48                   ` Qu Wenruo
@ 2022-05-19 11:53                     ` Johannes Thumshirn
  2022-05-19 13:26                       ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-19 11:53 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 19/05/2022 13:48, Qu Wenruo wrote:
> 
> On 2022/5/19 19:44, Johannes Thumshirn wrote:
>> On 19/05/2022 12:37, Qu Wenruo wrote:
>>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>>> so it will work and we can bootstrap from it.
>>>>
>>> That sounds good.
>>>
>>> And in that case, we don't need to put stripe tree into system chunks at
>>> all.
>>>
>>> So this method means, stripe tree is only useful for data.
>>> Although it's less elegant, it's much saner.
>> Yes and no. People still might want to use different metadata profiles than
>> RAID1.
> For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?
> 
> What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
> believe that's already a pretty good profile set for most zoned device
> users.
> 
> Personally speaking, it would be much simpler to avoid bothering the
> stripe tree for metadata.

I totally agree, but once you get past say 10 drives you might want to have
different encoding schemes and also have a higher level of redundancy for your 
metadata than just 4 copies.

The stripe tree will also hold any l2p information for erasure coded RAID 
arrays once that's done.

So this definitively should be considered.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 11:53                     ` Johannes Thumshirn
@ 2022-05-19 13:26                       ` Qu Wenruo
  2022-05-19 13:49                         ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-19 13:26 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/5/19 19:53, Johannes Thumshirn wrote:
> On 19/05/2022 13:48, Qu Wenruo wrote:
>>
>> On 2022/5/19 19:44, Johannes Thumshirn wrote:
>>> On 19/05/2022 12:37, Qu Wenruo wrote:
>>>>> RAID1 on zoned only needs a stripe tree for data, not for meta-data/system,
>>>>> so it will work and we can bootstrap from it.
>>>>>
>>>> That sounds good.
>>>>
>>>> And in that case, we don't need to put stripe tree into system chunks at
>>>> all.
>>>>
>>>> So this method means, stripe tree is only useful for data.
>>>> Although it's less elegant, it's much saner.
>>> Yes and no. People still might want to use different metadata profiles than
>>> RAID1.
>> For RAID1 variants like RAID1C3/4, I guess we don't need stripe tree either?
>>
>> What about DUP? If RAID1*/DUP/SINGLE all doesn't need stripe tree, I
>> believe that's already a pretty good profile set for most zoned device
>> users.
>>
>> Personally speaking, it would be much simpler to avoid bothering the
>> stripe tree for metadata.
>
> I totally agree, but once you get past say 10 drives you might want to have
> different encoding schemes and also have a higher level of redundancy for your
> metadata than just 4 copies.
>
> The stripe tree will also hold any l2p information for erasure coded RAID
> arrays once that's done.
>
> So this definitively should be considered.


Then let us consider the extra chunk type flag, like
BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
the initial RAID1*|HAS_STRIPE_TREE to other profiles.

But for over 10 devices, I doubt we really need to bother metadata that
much. Consider we go RAID1C4, we have the ability to lose 3 devices
already, that's way stronger than RAID6. For metadata I believe it's
completely fine already.

Normally it's data requiring more balance between cost and redundancy as
they are the main part of a fs.

Thus even for 10 disk, metadata RAID1C4, data RAID6 (with stripe tree
for zoned), it still looks very reasonable to me at least.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 13:26                       ` Qu Wenruo
@ 2022-05-19 13:49                         ` Johannes Thumshirn
  2022-05-19 22:56                           ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-19 13:49 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 19/05/2022 15:27, Qu Wenruo wrote:
> 
> 
> Then let us consider the extra chunk type flag, like
> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
> the initial RAID1*|HAS_STRIPE_TREE to other profiles.


That would definitively work for me.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 13:49                         ` Johannes Thumshirn
@ 2022-05-19 22:56                           ` Qu Wenruo
  2022-05-20  8:27                             ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-05-19 22:56 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/5/19 21:49, Johannes Thumshirn wrote:
> On 19/05/2022 15:27, Qu Wenruo wrote:
>>
>>
>> Then let us consider the extra chunk type flag, like
>> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
>> the initial RAID1*|HAS_STRIPE_TREE to other profiles.
>
>
> That would definitively work for me.

Just one thing to mention, does RAID10 also need stripe tree for
metadata? Or since we're doing depth = 1 IO for metadata anyway, RAID10
is also safe for metadata without using a stripe tree?

If so, I really believe the metadata has already a super good profile
set already.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk
  2022-05-19 22:56                           ` Qu Wenruo
@ 2022-05-20  8:27                             ` Johannes Thumshirn
  0 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-05-20  8:27 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 20/05/2022 00:56, Qu Wenruo wrote:
> 
> 
> On 2022/5/19 21:49, Johannes Thumshirn wrote:
>> On 19/05/2022 15:27, Qu Wenruo wrote:
>>>
>>>
>>> Then let us consider the extra chunk type flag, like
>>> BTRFS_BLOCK_GROUP_HAS_STRIPE_TREE, and then expand the combination from
>>> the initial RAID1*|HAS_STRIPE_TREE to other profiles.
>>
>>
>> That would definitively work for me.
> 
> Just one thing to mention, does RAID10 also need stripe tree for
> metadata? Or since we're doing depth = 1 IO for metadata anyway, RAID10
> is also safe for metadata without using a stripe tree?
> 
> If so, I really believe the metadata has already a super good profile
> set already.

Yep I think so, as no meta-data is written with zone-append.

I just think for meta-data on raid56 we need something.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
                   ` (9 preceding siblings ...)
  2022-05-17  7:23 ` Nikolay Borisov
@ 2022-07-13 10:54 ` Qu Wenruo
  2022-07-13 11:43   ` Johannes Thumshirn
  10 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-13 10:54 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/5/16 22:31, Johannes Thumshirn wrote:
> Introduce a raid-stripe-tree to record writes in a RAID environment.
>
> In essence this adds another address translation layer between the logical
> and the physical addresses in btrfs and is designed to close two gaps. The
> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
> second one is the inability of doing RAID with zoned block devices due to the
> constraints we have with REQ_OP_ZONE_APPEND writes.

Here I want to discuss about something related to RAID56 and RST.

One of my long existing concern is, P/Q stripes have a higher update
frequency, thus with certain transaction commit/data writeback timing,
wouldn't it cause the device storing P/Q stripes go out of space before
the data stripe devices?

One example is like this, we have 3 disks RAID5, with RST and zoned
allocator (allocated logical bytenr can only go forward):

	0		32K		64K
Disk 1	|                               | (data stripe)
Disk 2	|                               | (data stripe)
Disk 3	|                               | (parity stripe)

And initially, all the zones in those disks are empty, and their write
pointer are all at the beginning of the zone. (all data)

Then we write 0~4K in the range, and write back happens immediate (can
be DIO or sync).

We need to write the 0~4K back to disk 1, and update P for that vertical
stripe, right? So we got:

	0		32K		64K
Disk 1	|X                              | (data stripe)
Disk 2	|                               | (data stripe)
Disk 3	|X                              | (parity stripe)

Then we write into 4~8K range, and sync immedately.

If we go C0W for the P (we have to anyway), so what we got is:

	0		32K		64K
Disk 1	|X                              | (data stripe)
Disk 2	|X                              | (data stripe)
Disk 3	|XX                             | (parity stripe)

So now, you can see disk3 (the zone handling parity) has its writer
pointer moved 8K forward, but both data stripe zone only has its writer
pointer moved 4K forward.

If we go forward like this, always 4K write and sync, we will hit the
following case eventually:

	0		32K		64K
Disk 1	|XXXXXXXXXXXXXXX                | (data stripe)
Disk 2	|XXXXXXXXXXXXXXX                | (data stripe)
Disk 3	|XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| (parity stripe)

The extent allocator should still think we have 64K free space to write,
as we only really have written 64K.

But the zone for parity stripe is already exhausted.

How could we handle such case?
As RAID0/1 shouldn't have such problem at all, the imbalance is purely
caused by the fact that CoWing P/Q will cause higher write frequency.

Thanks,
Qu

>
> Thsi is an RFC/PoC only which just shows how the code will look like for a
> zoned RAID1. Its sole purpose is to facilitate design reviews and is not
> intended to be merged yet. Or if merged to be used on an actual file-system.
>
> Johannes Thumshirn (8):
>    btrfs: add raid stripe tree definitions
>    btrfs: move btrfs_io_context to volumes.h
>    btrfs: read raid-stripe-tree from disk
>    btrfs: add boilerplate code to insert raid extent
>    btrfs: add code to delete raid extent
>    btrfs: add code to read raid extent
>    btrfs: zoned: allow zoned RAID1
>    btrfs: add raid stripe tree pretty printer
>
>   fs/btrfs/Makefile               |   2 +-
>   fs/btrfs/ctree.c                |   1 +
>   fs/btrfs/ctree.h                |  29 ++++
>   fs/btrfs/disk-io.c              |  12 ++
>   fs/btrfs/extent-tree.c          |   9 ++
>   fs/btrfs/file.c                 |   1 -
>   fs/btrfs/print-tree.c           |  21 +++
>   fs/btrfs/raid-stripe-tree.c     | 251 ++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h     |  39 +++++
>   fs/btrfs/volumes.c              |  44 +++++-
>   fs/btrfs/volumes.h              |  93 ++++++------
>   fs/btrfs/zoned.c                |  39 +++++
>   include/uapi/linux/btrfs.h      |   1 +
>   include/uapi/linux/btrfs_tree.h |  17 +++
>   14 files changed, 509 insertions(+), 50 deletions(-)
>   create mode 100644 fs/btrfs/raid-stripe-tree.c
>   create mode 100644 fs/btrfs/raid-stripe-tree.h
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 10:54 ` RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree") Qu Wenruo
@ 2022-07-13 11:43   ` Johannes Thumshirn
  2022-07-13 12:01     ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-13 11:43 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 13.07.22 12:54, Qu Wenruo wrote:
> 
> 
> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>
>> In essence this adds another address translation layer between the logical
>> and the physical addresses in btrfs and is designed to close two gaps. The
>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>> second one is the inability of doing RAID with zoned block devices due to the
>> constraints we have with REQ_OP_ZONE_APPEND writes.
> 
> Here I want to discuss about something related to RAID56 and RST.
> 
> One of my long existing concern is, P/Q stripes have a higher update
> frequency, thus with certain transaction commit/data writeback timing,
> wouldn't it cause the device storing P/Q stripes go out of space before
> the data stripe devices?

P/Q stripes on a dedicated drive would be RAID4, which we don't have.

> 
> One example is like this, we have 3 disks RAID5, with RST and zoned
> allocator (allocated logical bytenr can only go forward):
> 
> 	0		32K		64K
> Disk 1	|                               | (data stripe)
> Disk 2	|                               | (data stripe)
> Disk 3	|                               | (parity stripe)
> 
> And initially, all the zones in those disks are empty, and their write
> pointer are all at the beginning of the zone. (all data)
> 
> Then we write 0~4K in the range, and write back happens immediate (can
> be DIO or sync).
> 
> We need to write the 0~4K back to disk 1, and update P for that vertical
> stripe, right? So we got:
> 
> 	0		32K		64K
> Disk 1	|X                              | (data stripe)
> Disk 2	|                               | (data stripe)
> Disk 3	|X                              | (parity stripe)
> 
> Then we write into 4~8K range, and sync immedately.
> 
> If we go C0W for the P (we have to anyway), so what we got is:
> 
> 	0		32K		64K
> Disk 1	|X                              | (data stripe)
> Disk 2	|X                              | (data stripe)
> Disk 3	|XX                             | (parity stripe)
> 
> So now, you can see disk3 (the zone handling parity) has its writer
> pointer moved 8K forward, but both data stripe zone only has its writer
> pointer moved 4K forward.
> 
> If we go forward like this, always 4K write and sync, we will hit the
> following case eventually:
> 
> 	0		32K		64K
> Disk 1	|XXXXXXXXXXXXXXX                | (data stripe)
> Disk 2	|XXXXXXXXXXXXXXX                | (data stripe)
> Disk 3	|XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| (parity stripe)
> 
> The extent allocator should still think we have 64K free space to write,
> as we only really have written 64K.
> 
> But the zone for parity stripe is already exhausted.
> 
> How could we handle such case?
> As RAID0/1 shouldn't have such problem at all, the imbalance is purely
> caused by the fact that CoWing P/Q will cause higher write frequency.
> 

Then the a new zone for the parity stripe has to be allocated, and the old one
gets reclaimed. That's nothing new. Of cause there's some gotchas in the extent
allocator and the active zone management we need to consider, but over all I do
not see where the blocker is here.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 11:43   ` Johannes Thumshirn
@ 2022-07-13 12:01     ` Qu Wenruo
  2022-07-13 12:42       ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-13 12:01 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/7/13 19:43, Johannes Thumshirn wrote:
> On 13.07.22 12:54, Qu Wenruo wrote:
>>
>>
>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>
>>> In essence this adds another address translation layer between the logical
>>> and the physical addresses in btrfs and is designed to close two gaps. The
>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>> second one is the inability of doing RAID with zoned block devices due to the
>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>
>> Here I want to discuss about something related to RAID56 and RST.
>>
>> One of my long existing concern is, P/Q stripes have a higher update
>> frequency, thus with certain transaction commit/data writeback timing,
>> wouldn't it cause the device storing P/Q stripes go out of space before
>> the data stripe devices?
>
> P/Q stripes on a dedicated drive would be RAID4, which we don't have.

I'm just using one block group as an example.

Sure, the next bg can definitely go somewhere else.

But inside one bg, we are still using one zone for the bg, right?
>
>>
>> One example is like this, we have 3 disks RAID5, with RST and zoned
>> allocator (allocated logical bytenr can only go forward):
>>
>> 	0		32K		64K
>> Disk 1	|                               | (data stripe)
>> Disk 2	|                               | (data stripe)
>> Disk 3	|                               | (parity stripe)
>>
>> And initially, all the zones in those disks are empty, and their write
>> pointer are all at the beginning of the zone. (all data)
>>
>> Then we write 0~4K in the range, and write back happens immediate (can
>> be DIO or sync).
>>
>> We need to write the 0~4K back to disk 1, and update P for that vertical
>> stripe, right? So we got:
>>
>> 	0		32K		64K
>> Disk 1	|X                              | (data stripe)
>> Disk 2	|                               | (data stripe)
>> Disk 3	|X                              | (parity stripe)
>>
>> Then we write into 4~8K range, and sync immedately.
>>
>> If we go C0W for the P (we have to anyway), so what we got is:
>>
>> 	0		32K		64K
>> Disk 1	|X                              | (data stripe)
>> Disk 2	|X                              | (data stripe)
>> Disk 3	|XX                             | (parity stripe)
>>
>> So now, you can see disk3 (the zone handling parity) has its writer
>> pointer moved 8K forward, but both data stripe zone only has its writer
>> pointer moved 4K forward.
>>
>> If we go forward like this, always 4K write and sync, we will hit the
>> following case eventually:
>>
>> 	0		32K		64K
>> Disk 1	|XXXXXXXXXXXXXXX                | (data stripe)
>> Disk 2	|XXXXXXXXXXXXXXX                | (data stripe)
>> Disk 3	|XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| (parity stripe)
>>
>> The extent allocator should still think we have 64K free space to write,
>> as we only really have written 64K.
>>
>> But the zone for parity stripe is already exhausted.
>>
>> How could we handle such case?
>> As RAID0/1 shouldn't have such problem at all, the imbalance is purely
>> caused by the fact that CoWing P/Q will cause higher write frequency.
>>
>
> Then the a new zone for the parity stripe has to be allocated, and the old one
> gets reclaimed. That's nothing new. Of cause there's some gotchas in the extent
> allocator and the active zone management we need to consider, but over all I do
> not see where the blocker is here.

The problem is, we can not reclaim the existing full parity zone yet.

We still have parity for the above 32K in that zone.

So that zone can not be reclaimed, until both data stripe zoned are claimed.

This means, we can have a case, that all data stripes are in the above
cases, and need twice the amount of parity zones.

And in that case, I'm not sure if our chunk allocator can handle it
properly, but at least our free space estimation is not accurate.

Thanks,
Qu


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 12:01     ` Qu Wenruo
@ 2022-07-13 12:42       ` Johannes Thumshirn
  2022-07-13 13:47         ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-13 12:42 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 13.07.22 14:01, Qu Wenruo wrote:
> 
> 
> On 2022/7/13 19:43, Johannes Thumshirn wrote:
>> On 13.07.22 12:54, Qu Wenruo wrote:
>>>
>>>
>>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>>
>>>> In essence this adds another address translation layer between the logical
>>>> and the physical addresses in btrfs and is designed to close two gaps. The
>>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>>> second one is the inability of doing RAID with zoned block devices due to the
>>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>>
>>> Here I want to discuss about something related to RAID56 and RST.
>>>
>>> One of my long existing concern is, P/Q stripes have a higher update
>>> frequency, thus with certain transaction commit/data writeback timing,
>>> wouldn't it cause the device storing P/Q stripes go out of space before
>>> the data stripe devices?
>>
>> P/Q stripes on a dedicated drive would be RAID4, which we don't have.
> 
> I'm just using one block group as an example.
> 
> Sure, the next bg can definitely go somewhere else.
> 
> But inside one bg, we are still using one zone for the bg, right?

Ok maybe I'm not understanding the code in volumes.c correctly, but
doesn't __btrfs_map_block() calculate a rotation per stripe-set?

I'm looking at this code:

	/* Build raid_map */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
	    (need_full_stripe(op) || mirror_num > 1)) {
		u64 tmp;
		unsigned rot;

		/* Work out the disk rotation on this stripe-set */
		div_u64_rem(stripe_nr, num_stripes, &rot);

		/* Fill in the logical address of each stripe */
		tmp = stripe_nr * data_stripes;
		for (i = 0; i < data_stripes; i++)
			bioc->raid_map[(i + rot) % num_stripes] =
				em->start + (tmp + i) * map->stripe_len;

		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
			bioc->raid_map[(i + rot + 1) % num_stripes] =
				RAID6_Q_STRIPE;

		sort_parity_stripes(bioc, num_stripes);
	}


So then in your example we have something like this:

Write of 4k D1:

	0		32K		64K
Disk 1	|D1                             | 
Disk 2	|                               | 
Disk 3	|P1                             | 


Write of 4k D2, the new parity is P2 the old P1 parity is obsolete

	0		32K		64K
Disk 1	|D1                             | 
Disk 2	|P2                             | 
Disk 3	|P1D2                           | 

Write of new 4k D1 with P3 

	0		32K		64K
Disk 1	|D1P3                           | 
Disk 2	|P2D1                           | 
Disk 3	|P1D2                           | 

and so on.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 12:42       ` Johannes Thumshirn
@ 2022-07-13 13:47         ` Qu Wenruo
  2022-07-13 14:01           ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-13 13:47 UTC (permalink / raw)
  To: Johannes Thumshirn, linux-btrfs



On 2022/7/13 20:42, Johannes Thumshirn wrote:
> On 13.07.22 14:01, Qu Wenruo wrote:
>>
>>
>> On 2022/7/13 19:43, Johannes Thumshirn wrote:
>>> On 13.07.22 12:54, Qu Wenruo wrote:
>>>>
>>>>
>>>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>>>
>>>>> In essence this adds another address translation layer between the logical
>>>>> and the physical addresses in btrfs and is designed to close two gaps. The
>>>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>>>> second one is the inability of doing RAID with zoned block devices due to the
>>>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>>>
>>>> Here I want to discuss about something related to RAID56 and RST.
>>>>
>>>> One of my long existing concern is, P/Q stripes have a higher update
>>>> frequency, thus with certain transaction commit/data writeback timing,
>>>> wouldn't it cause the device storing P/Q stripes go out of space before
>>>> the data stripe devices?
>>>
>>> P/Q stripes on a dedicated drive would be RAID4, which we don't have.
>>
>> I'm just using one block group as an example.
>>
>> Sure, the next bg can definitely go somewhere else.
>>
>> But inside one bg, we are still using one zone for the bg, right?
>
> Ok maybe I'm not understanding the code in volumes.c correctly, but
> doesn't __btrfs_map_block() calculate a rotation per stripe-set?
>
> I'm looking at this code:
>
> 	/* Build raid_map */
> 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
> 	    (need_full_stripe(op) || mirror_num > 1)) {
> 		u64 tmp;
> 		unsigned rot;
>
> 		/* Work out the disk rotation on this stripe-set */
> 		div_u64_rem(stripe_nr, num_stripes, &rot);
>
> 		/* Fill in the logical address of each stripe */
> 		tmp = stripe_nr * data_stripes;
> 		for (i = 0; i < data_stripes; i++)
> 			bioc->raid_map[(i + rot) % num_stripes] =
> 				em->start + (tmp + i) * map->stripe_len;
>
> 		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
> 		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
> 			bioc->raid_map[(i + rot + 1) % num_stripes] =
> 				RAID6_Q_STRIPE;
>
> 		sort_parity_stripes(bioc, num_stripes);
> 	}

That's per full-stripe. AKA, the rotation only kicks in after a full stripe.

In my example, we're inside one full stripe, no rotation, until next
full stripe.

>
>
> So then in your example we have something like this:
>
> Write of 4k D1:
>
> 	0		32K		64K
> Disk 1	|D1                             |
> Disk 2	|                               |
> Disk 3	|P1                             |
>
>
> Write of 4k D2, the new parity is P2 the old P1 parity is obsolete
>
> 	0		32K		64K
> Disk 1	|D1                             |
> Disk 2	|P2                             |
> Disk 3	|P1D2                           |
>
> Write of new 4k D1 with P3
>
> 	0		32K		64K
> Disk 1	|D1P3                           |
> Disk 2	|P2D1                           |
> Disk 3	|P1D2                           |
>
> and so on.

So, not the case, at least not in the full stripe.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 13:47         ` Qu Wenruo
@ 2022-07-13 14:01           ` Johannes Thumshirn
  2022-07-13 15:24             ` Lukas Straub
  2022-07-14  1:08             ` Qu Wenruo
  0 siblings, 2 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-13 14:01 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs

On 13.07.22 15:47, Qu Wenruo wrote:
> 
> 
> On 2022/7/13 20:42, Johannes Thumshirn wrote:
>> On 13.07.22 14:01, Qu Wenruo wrote:
>>>
>>>
>>> On 2022/7/13 19:43, Johannes Thumshirn wrote:
>>>> On 13.07.22 12:54, Qu Wenruo wrote:
>>>>>
>>>>>
>>>>> On 2022/5/16 22:31, Johannes Thumshirn wrote:
>>>>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
>>>>>>
>>>>>> In essence this adds another address translation layer between the logical
>>>>>> and the physical addresses in btrfs and is designed to close two gaps. The
>>>>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
>>>>>> second one is the inability of doing RAID with zoned block devices due to the
>>>>>> constraints we have with REQ_OP_ZONE_APPEND writes.
>>>>>
>>>>> Here I want to discuss about something related to RAID56 and RST.
>>>>>
>>>>> One of my long existing concern is, P/Q stripes have a higher update
>>>>> frequency, thus with certain transaction commit/data writeback timing,
>>>>> wouldn't it cause the device storing P/Q stripes go out of space before
>>>>> the data stripe devices?
>>>>
>>>> P/Q stripes on a dedicated drive would be RAID4, which we don't have.
>>>
>>> I'm just using one block group as an example.
>>>
>>> Sure, the next bg can definitely go somewhere else.
>>>
>>> But inside one bg, we are still using one zone for the bg, right?
>>
>> Ok maybe I'm not understanding the code in volumes.c correctly, but
>> doesn't __btrfs_map_block() calculate a rotation per stripe-set?
>>
>> I'm looking at this code:
>>
>> 	/* Build raid_map */
>> 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
>> 	    (need_full_stripe(op) || mirror_num > 1)) {
>> 		u64 tmp;
>> 		unsigned rot;
>>
>> 		/* Work out the disk rotation on this stripe-set */
>> 		div_u64_rem(stripe_nr, num_stripes, &rot);
>>
>> 		/* Fill in the logical address of each stripe */
>> 		tmp = stripe_nr * data_stripes;
>> 		for (i = 0; i < data_stripes; i++)
>> 			bioc->raid_map[(i + rot) % num_stripes] =
>> 				em->start + (tmp + i) * map->stripe_len;
>>
>> 		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
>> 		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
>> 			bioc->raid_map[(i + rot + 1) % num_stripes] =
>> 				RAID6_Q_STRIPE;
>>
>> 		sort_parity_stripes(bioc, num_stripes);
>> 	}
> 
> That's per full-stripe. AKA, the rotation only kicks in after a full stripe.
> 
> In my example, we're inside one full stripe, no rotation, until next
> full stripe.
> 


Ah ok, my apologies. For sub-stripe size writes My idea was to 0-pad up to  
stripe size. Then we can do full CoW of stripes. If we have an older generation
of a stripe, we can just override it on regular btrfs. On zoned btrfs this
just accounts for more zone_unusable bytes and waits for the GC to kick in.


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 14:01           ` Johannes Thumshirn
@ 2022-07-13 15:24             ` Lukas Straub
  2022-07-13 15:28               ` Johannes Thumshirn
  2022-07-14  1:08             ` Qu Wenruo
  1 sibling, 1 reply; 88+ messages in thread
From: Lukas Straub @ 2022-07-13 15:24 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: Qu Wenruo, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 3428 bytes --]

On Wed, 13 Jul 2022 14:01:32 +0000
Johannes Thumshirn <Johannes.Thumshirn@wdc.com> wrote:

> On 13.07.22 15:47, Qu Wenruo wrote:
> > 
> > 
> > On 2022/7/13 20:42, Johannes Thumshirn wrote:  
> >> On 13.07.22 14:01, Qu Wenruo wrote:  
> >>>
> >>>
> >>> On 2022/7/13 19:43, Johannes Thumshirn wrote:  
> >>>> On 13.07.22 12:54, Qu Wenruo wrote:  
> >>>>>
> >>>>>
> >>>>> On 2022/5/16 22:31, Johannes Thumshirn wrote:  
> >>>>>> Introduce a raid-stripe-tree to record writes in a RAID environment.
> >>>>>>
> >>>>>> In essence this adds another address translation layer between the logical
> >>>>>> and the physical addresses in btrfs and is designed to close two gaps. The
> >>>>>> first is the ominous RAID-write-hole we suffer from with RAID5/6 and the
> >>>>>> second one is the inability of doing RAID with zoned block devices due to the
> >>>>>> constraints we have with REQ_OP_ZONE_APPEND writes.  
> >>>>>
> >>>>> Here I want to discuss about something related to RAID56 and RST.
> >>>>>
> >>>>> One of my long existing concern is, P/Q stripes have a higher update
> >>>>> frequency, thus with certain transaction commit/data writeback timing,
> >>>>> wouldn't it cause the device storing P/Q stripes go out of space before
> >>>>> the data stripe devices?  
> >>>>
> >>>> P/Q stripes on a dedicated drive would be RAID4, which we don't have.  
> >>>
> >>> I'm just using one block group as an example.
> >>>
> >>> Sure, the next bg can definitely go somewhere else.
> >>>
> >>> But inside one bg, we are still using one zone for the bg, right?  
> >>
> >> Ok maybe I'm not understanding the code in volumes.c correctly, but
> >> doesn't __btrfs_map_block() calculate a rotation per stripe-set?
> >>
> >> I'm looking at this code:
> >>
> >> 	/* Build raid_map */
> >> 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
> >> 	    (need_full_stripe(op) || mirror_num > 1)) {
> >> 		u64 tmp;
> >> 		unsigned rot;
> >>
> >> 		/* Work out the disk rotation on this stripe-set */
> >> 		div_u64_rem(stripe_nr, num_stripes, &rot);
> >>
> >> 		/* Fill in the logical address of each stripe */
> >> 		tmp = stripe_nr * data_stripes;
> >> 		for (i = 0; i < data_stripes; i++)
> >> 			bioc->raid_map[(i + rot) % num_stripes] =
> >> 				em->start + (tmp + i) * map->stripe_len;
> >>
> >> 		bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
> >> 		if (map->type & BTRFS_BLOCK_GROUP_RAID6)
> >> 			bioc->raid_map[(i + rot + 1) % num_stripes] =
> >> 				RAID6_Q_STRIPE;
> >>
> >> 		sort_parity_stripes(bioc, num_stripes);
> >> 	}  
> > 
> > That's per full-stripe. AKA, the rotation only kicks in after a full stripe.
> > 
> > In my example, we're inside one full stripe, no rotation, until next
> > full stripe.
> >   
> 
> 
> Ah ok, my apologies. For sub-stripe size writes My idea was to 0-pad up to  
> stripe size. Then we can do full CoW of stripes. If we have an older generation
> of a stripe, we can just override it on regular btrfs. On zoned btrfs this
> just accounts for more zone_unusable bytes and waits for the GC to kick in.
> 

Have you considered variable stripe size? I believe ZFS does this.
Should be easy for raid5 since it's just xor, not sure for raid6.

PS: ZFS seems to do variable-_width_ stripes
https://pthree.org/2012/12/05/zfs-administration-part-ii-raidz/

Regards,
Lukas Straub

-- 


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 15:24             ` Lukas Straub
@ 2022-07-13 15:28               ` Johannes Thumshirn
  0 siblings, 0 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-13 15:28 UTC (permalink / raw)
  To: Lukas Straub; +Cc: Qu Wenruo, linux-btrfs

On 13.07.22 17:25, Lukas Straub wrote:
> 
> Have you considered variable stripe size? I believe ZFS does this.
> Should be easy for raid5 since it's just xor, not sure for raid6.
> 
> PS: ZFS seems to do variable-_width_ stripes
> https://pthree.org/2012/12/05/zfs-administration-part-ii-raidz/

I did and coincidentally we have been talking about it just 5 minutes
ago and both David and Chris aren't very fond of the idea.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-13 14:01           ` Johannes Thumshirn
  2022-07-13 15:24             ` Lukas Straub
@ 2022-07-14  1:08             ` Qu Wenruo
  2022-07-14  7:08               ` Johannes Thumshirn
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-14  1:08 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/13 22:01, Johannes Thumshirn wrote:
> On 13.07.22 15:47, Qu Wenruo wrote:
> 
> 
> Ah ok, my apologies. For sub-stripe size writes My idea was to 0-pad up to
> stripe size. Then we can do full CoW of stripes. If we have an older generation
> of a stripe, we can just override it on regular btrfs. On zoned btrfs this
> just accounts for more zone_unusable bytes and waits for the GC to kick in.
> 

Sorry, I guess you still didn't get my point here.

What I'm talking about is, how many bytes you can really write into a 
full stripe when CoWing P/Q stripes.

[TL;DR]

If we CoW P/Q, for the worst cases (always 4K write and sync), the space 
efficiency is no better than RAID1.

For a lot of write order, we can only write 64K (STRIPE_LEN) no matter what.


!NOTE!
All following examples are using 8KiB sector size, to make the graph 
shorter.

[CASE 1 CURRENT WRITE ORDER, NO PADDING]
        0                               64K
Disk 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | (Data stripe)
Disk 2 | 8 | 9 | a | b | c | d | e | f | (Data stripe)
Disk 3 | P | P | P | P | P | P | P | P | (Parity stripe).

For zoned RST, we can only write 8 sectors, then Disk 3 exhaust its 
zone. As every time we write a sector in data stripe, we have to write a P.

Total written bytes: 64K
Expected written bytes: 128K (nr_data * 64K)
Efficiency:	1 / nr_data.

The worst.

[CASE 2 CURRENT WRITE ORDER, PADDING]
No difference than case 1, just when we have finished sector 7, all 
zones are exhausted.

Total written bytes: 64K
Expected written bytes: 128K (nr_data * 64K)
Efficiency:	1 / nr_data.

[CASE 3 FULLY UNORDERED, NO PADDING]
This should have the best efficiency, but no better than RAID1.

        0                               64K
Disk 1 | 0 | P | 3 | P | 6 | P | 9 | P |
Disk 2 | P | 2 | P | 5 | P | 8 | P | b |
Disk 3 | 1 | P | 4 | P | 7 | P | a | P |

Total written bytes: 96K
Expected written bytes: 128K (nr_data * 64K)
Efficiency:	1 / 2

This can not even beat RAID1/RAID10, but cause way more metadata just 
for the RST.


Whatever the case, we can no longer ensure we can write (nr_data * 64K) 
bytes of data into a full stripe.
And for worst cases, it can be way bad than RAID1, I don't really think 
it's any good to our extent allocator or the space efficiency (that's 
exactly why users choose to go RAID56).

[ROOT CAUSE]
If we just check how many write we really need submit to each device, it 
should be obvious:

When data stripe in disk1 is filled:
        0                               64K
Disk 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 64K written
Disk 2 |   |   |   |   |   |   |   |   | 0 written
Disk 3 | P | P | P | P | P | P | P | P | 64K written

When data stripe in disk2 is filled:

        0                               64K
Disk 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 64K written
Disk 2 | 8 | 9 | a | b | c | d | e | f | 64K written
Disk 3 | P'| P'| P'| P'| P'| P'| P'| P'| 128K written

For RAID56 partial write, the total write is always 2 * data written.
Thus for zoned device, since they can not do any overwrite, their worst 
case space efficiency can never exceed RAID1.

Thus I have repeated times and times, against this problem for RST.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-14  1:08             ` Qu Wenruo
@ 2022-07-14  7:08               ` Johannes Thumshirn
  2022-07-14  7:32                 ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-14  7:08 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 14.07.22 03:08, Qu Wenruo wrote:> [CASE 2 CURRENT WRITE ORDER, PADDING> No difference than case 1, just when we have finished sector 7, all > zones are exhausted.>> Total written bytes: 64K> Expected written bytes: 128K (nr_data * 64K)> Efficiency:	1 / nr_data.> 
I'm sorry but I have to disagree.
If we're writing less than 64k, everything beyond these 64k will get filled up with 0

       0                               64K
Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 3 | P | P | P | P | P | P | P | P | (Parity stripe)

So the next write (the CoW) will then be:

      64k                              128K
Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 2 | D2| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 3 | P'| P'| P'| P'| P'| P'| P'| P'| (Parity stripe)

For zoned we can play this game zone_size/stripe_size times, which on a typical
SMR HDD would be:

126M/64k = 4096 times until you fill up a zone.

I.e. if you do stupid things you get stupid results. C'est la vie.


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-14  7:08               ` Johannes Thumshirn
@ 2022-07-14  7:32                 ` Qu Wenruo
  2022-07-14  7:46                   ` Johannes Thumshirn
  0 siblings, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-14  7:32 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/14 15:08, Johannes Thumshirn wrote:
> On 14.07.22 03:08, Qu Wenruo wrote:> [CASE 2 CURRENT WRITE ORDER, PADDING> No difference than case 1, just when we have finished sector 7, all > zones are exhausted.>> Total written bytes: 64K> Expected written bytes: 128K (nr_data * 64K)> Efficiency:	1 / nr_data.>
> I'm sorry but I have to disagree.
> If we're writing less than 64k, everything beyond these 64k will get filled up with 0
>
>         0                               64K
> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 3 | P | P | P | P | P | P | P | P | (Parity stripe)
>
> So the next write (the CoW) will then be:
>
>        64k                              128K
> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 2 | D2| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 3 | P'| P'| P'| P'| P'| P'| P'| P'| (Parity stripe)

Nope, currently full stripe write should still go into disk1, not disk 2.
Sorry I did use a bad example from the very beginning.

In that case, what we should have is:

        0                               64K
Disk 1 | D1| D2| 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
Disk 3 | P | P | 0 | 0 | 0 | 0 | 0 | 0 | (Parity stripe)

In that case, Parity should still needs two blocks.

And when Disk 1 get filled up, we have no way to write into Disk 2.

>
> For zoned we can play this game zone_size/stripe_size times, which on a typical
> SMR HDD would be:
>
> 126M/64k = 4096 times until you fill up a zone.

No difference.

You have extra zone to use, but the result is, the space efficiency will
not be better than RAID1 for the worst case.

>
> I.e. if you do stupid things you get stupid results. C'est la vie.
>

You still didn't answer the space efficient problem.

RAID56 really rely on overwrite on its P/Q stripes.
The total write amount is really twice the data writes, that's something
you can not avoid.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-14  7:32                 ` Qu Wenruo
@ 2022-07-14  7:46                   ` Johannes Thumshirn
  2022-07-14  7:53                     ` Qu Wenruo
  2022-07-15 17:54                     ` Goffredo Baroncelli
  0 siblings, 2 replies; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-14  7:46 UTC (permalink / raw)
  To: Qu Wenruo, Qu Wenruo, linux-btrfs

On 14.07.22 09:32, Qu Wenruo wrote:
> 
> 
> On 2022/7/14 15:08, Johannes Thumshirn wrote:
>> On 14.07.22 03:08, Qu Wenruo wrote:> [CASE 2 CURRENT WRITE ORDER, PADDING> No difference than case 1, just when we have finished sector 7, all > zones are exhausted.>> Total written bytes: 64K> Expected written bytes: 128K (nr_data * 64K)> Efficiency:	1 / nr_data.>
>> I'm sorry but I have to disagree.
>> If we're writing less than 64k, everything beyond these 64k will get filled up with 0
>>
>>         0                               64K
>> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 3 | P | P | P | P | P | P | P | P | (Parity stripe)
>>
>> So the next write (the CoW) will then be:
>>
>>        64k                              128K
>> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 2 | D2| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 3 | P'| P'| P'| P'| P'| P'| P'| P'| (Parity stripe)
> 
> Nope, currently full stripe write should still go into disk1, not disk 2.
> Sorry I did use a bad example from the very beginning.
> 
> In that case, what we should have is:
> 
>         0                               64K
> Disk 1 | D1| D2| 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
> Disk 3 | P | P | 0 | 0 | 0 | 0 | 0 | 0 | (Parity stripe)
> 
> In that case, Parity should still needs two blocks.
> 
> And when Disk 1 get filled up, we have no way to write into Disk 2.
> 
>>
>> For zoned we can play this game zone_size/stripe_size times, which on a typical
>> SMR HDD would be:
>>
>> 126M/64k = 4096 times until you fill up a zone.
> 
> No difference.
> 
> You have extra zone to use, but the result is, the space efficiency will
> not be better than RAID1 for the worst case.
> 
>>
>> I.e. if you do stupid things you get stupid results. C'est la vie.
>>
> 
> You still didn't answer the space efficient problem.
> 
> RAID56 really rely on overwrite on its P/Q stripes.

Nope, btrfs raid56 does this. Another implementation could for instance
buffer each stripe in an NVRAM (like described in [1]), or like Chris 
suggested in a RAID1 area on the drives, or doing variable stripe length
like ZFS' RAID-Z, and so on.

> The total write amount is really twice the data writes, that's something
> you can not avoid.
>

Again if you're doing sub-stripe size writes, you're asking stupid things and
then there's no reason to not give the user stupid answers.

If a user is concerned about the write or space amplicfication of sub-stripe
writes on RAID56 he/she really needs to rethink the architecture.



[1]
S. K. Mishra and P. Mohapatra, 
"Performance study of RAID-5 disk arrays with data and parity cache," 
Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-14  7:46                   ` Johannes Thumshirn
@ 2022-07-14  7:53                     ` Qu Wenruo
  2022-07-15 17:54                     ` Goffredo Baroncelli
  1 sibling, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-14  7:53 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/14 15:46, Johannes Thumshirn wrote:
> On 14.07.22 09:32, Qu Wenruo wrote:
>>
>>
>> On 2022/7/14 15:08, Johannes Thumshirn wrote:
>>> On 14.07.22 03:08, Qu Wenruo wrote:> [CASE 2 CURRENT WRITE ORDER, PADDING> No difference than case 1, just when we have finished sector 7, all > zones are exhausted.>> Total written bytes: 64K> Expected written bytes: 128K (nr_data * 64K)> Efficiency:	1 / nr_data.>
>>> I'm sorry but I have to disagree.
>>> If we're writing less than 64k, everything beyond these 64k will get filled up with 0
>>>
>>>          0                               64K
>>> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>>> Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>>> Disk 3 | P | P | P | P | P | P | P | P | (Parity stripe)
>>>
>>> So the next write (the CoW) will then be:
>>>
>>>         64k                              128K
>>> Disk 1 | D1| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>>> Disk 2 | D2| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>>> Disk 3 | P'| P'| P'| P'| P'| P'| P'| P'| (Parity stripe)
>>
>> Nope, currently full stripe write should still go into disk1, not disk 2.
>> Sorry I did use a bad example from the very beginning.
>>
>> In that case, what we should have is:
>>
>>          0                               64K
>> Disk 1 | D1| D2| 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (Data stripe)
>> Disk 3 | P | P | 0 | 0 | 0 | 0 | 0 | 0 | (Parity stripe)
>>
>> In that case, Parity should still needs two blocks.
>>
>> And when Disk 1 get filled up, we have no way to write into Disk 2.
>>
>>>
>>> For zoned we can play this game zone_size/stripe_size times, which on a typical
>>> SMR HDD would be:
>>>
>>> 126M/64k = 4096 times until you fill up a zone.
>>
>> No difference.
>>
>> You have extra zone to use, but the result is, the space efficiency will
>> not be better than RAID1 for the worst case.
>>
>>>
>>> I.e. if you do stupid things you get stupid results. C'est la vie.
>>>
>>
>> You still didn't answer the space efficient problem.
>>
>> RAID56 really rely on overwrite on its P/Q stripes.
>
> Nope, btrfs raid56 does this. Another implementation could for instance
> buffer each stripe in an NVRAM (like described in [1]), or like Chris
> suggested in a RAID1 area on the drives, or doing variable stripe length
> like ZFS' RAID-Z, and so on.

Not only btrfs raid56, but also dm-raid56 also do this.

And what you mention is just an variant of journal, delay the write
until got a full stripe.

>
>> The total write amount is really twice the data writes, that's something
>> you can not avoid.
>>
>
> Again if you're doing sub-stripe size writes, you're asking stupid things and
> then there's no reason to not give the user stupid answers.

No, you can not limit what users do.

As long as btrfs itself support writes in sectorsize (4K), you can not
stop user doing that.

In your argument, I can also say, write-intent is a problem of end
users, and no need to fix at all.

That's definitely not the correct way to do, let user to adapt the
limitation? No, just big no.

Thanks,
Qu

>
> If a user is concerned about the write or space amplicfication of sub-stripe
> writes on RAID56 he/she really needs to rethink the architecture.
>
>
>
> [1]
> S. K. Mishra and P. Mohapatra,
> "Performance study of RAID-5 disk arrays with data and parity cache,"
> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-14  7:46                   ` Johannes Thumshirn
  2022-07-14  7:53                     ` Qu Wenruo
@ 2022-07-15 17:54                     ` Goffredo Baroncelli
  2022-07-15 19:08                       ` Thiago Ramon
                                         ` (2 more replies)
  1 sibling, 3 replies; 88+ messages in thread
From: Goffredo Baroncelli @ 2022-07-15 17:54 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, Qu Wenruo, linux-btrfs

On 14/07/2022 09.46, Johannes Thumshirn wrote:
> On 14.07.22 09:32, Qu Wenruo wrote:
>>[...]
> 
> Again if you're doing sub-stripe size writes, you're asking stupid things and
> then there's no reason to not give the user stupid answers.
> 

Qu is right, if we consider only full stripe write the "raid hole" problem
disappear, because if a "full stripe" is not fully written it is not
referenced either.


Personally I think that the ZFS variable stripe size, may be interesting
to evaluate. Moreover, because the BTRFS disk format is quite flexible,
we can store different BG with different number of disks. Let me to make an
example: if we have 10 disks, we could allocate:
1 BG RAID1
1 BG RAID5, spread over 4 disks only
1 BG RAID5, spread over 8 disks only
1 BG RAID5, spread over 10 disks

So if we have short writes, we could put the extents in the RAID1 BG; for longer
writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
of the data.

Yes this would require a sort of garbage collector to move the data to the biggest
raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
variable stripe size.

Doing so we don't need any disk format change and it would be backward compatible.


Moreover, if we could put the smaller BG in the faster disks, we could have a
decent tiering....


> If a user is concerned about the write or space amplicfication of sub-stripe
> writes on RAID56 he/she really needs to rethink the architecture.
> 
> 
> 
> [1]
> S. K. Mishra and P. Mohapatra,
> "Performance study of RAID-5 disk arrays with data and parity cache,"
> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 17:54                     ` Goffredo Baroncelli
@ 2022-07-15 19:08                       ` Thiago Ramon
  2022-07-16  0:34                         ` Qu Wenruo
  2022-07-15 20:14                       ` Chris Murphy
  2022-07-18  7:30                       ` Johannes Thumshirn
  2 siblings, 1 reply; 88+ messages in thread
From: Thiago Ramon @ 2022-07-15 19:08 UTC (permalink / raw)
  To: kreijack; +Cc: Johannes Thumshirn, Qu Wenruo, Qu Wenruo, linux-btrfs

As a user of RAID6 here, let me jump in because I think this
suggestion is actually a very good compromise.

With stripes written only once, we completely eliminate any possible
write-hole, and even without any changes on the current disk layout
and allocation, there shouldn't be much wasted space (in my case, I
have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
single-sector writes that should go into metadata space, any
reasonable write should fill that buffer in a few seconds).

The additional suggestion of using smaller stripe widths in case there
isn't enough data to fill a whole stripe would make it very easy to
reclaim the wasted space by rebalancing with a stripe count filter,
which can be easily automated and run very frequently.

On-disk format also wouldn't change and be fully usable by older
kernels, and it should "only" require changes on the allocator to
implement.

On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>
> On 14/07/2022 09.46, Johannes Thumshirn wrote:
> > On 14.07.22 09:32, Qu Wenruo wrote:
> >>[...]
> >
> > Again if you're doing sub-stripe size writes, you're asking stupid things and
> > then there's no reason to not give the user stupid answers.
> >
>
> Qu is right, if we consider only full stripe write the "raid hole" problem
> disappear, because if a "full stripe" is not fully written it is not
> referenced either.
>
>
> Personally I think that the ZFS variable stripe size, may be interesting
> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> we can store different BG with different number of disks. Let me to make an
> example: if we have 10 disks, we could allocate:
> 1 BG RAID1
> 1 BG RAID5, spread over 4 disks only
> 1 BG RAID5, spread over 8 disks only
> 1 BG RAID5, spread over 10 disks
>
> So if we have short writes, we could put the extents in the RAID1 BG; for longer
> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
> of the data.
>
> Yes this would require a sort of garbage collector to move the data to the biggest
> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
> variable stripe size.
>
> Doing so we don't need any disk format change and it would be backward compatible.
>
>
> Moreover, if we could put the smaller BG in the faster disks, we could have a
> decent tiering....
>
>
> > If a user is concerned about the write or space amplicfication of sub-stripe
> > writes on RAID56 he/she really needs to rethink the architecture.
> >
> >
> >
> > [1]
> > S. K. Mishra and P. Mohapatra,
> > "Performance study of RAID-5 disk arrays with data and parity cache,"
> > Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
> > 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>
> --
> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 17:54                     ` Goffredo Baroncelli
  2022-07-15 19:08                       ` Thiago Ramon
@ 2022-07-15 20:14                       ` Chris Murphy
  2022-07-18  7:33                         ` Johannes Thumshirn
  2022-07-18 21:49                         ` Forza
  2022-07-18  7:30                       ` Johannes Thumshirn
  2 siblings, 2 replies; 88+ messages in thread
From: Chris Murphy @ 2022-07-15 20:14 UTC (permalink / raw)
  To: Goffredo Baroncelli; +Cc: Johannes Thumshirn, Qu Wenruo, Qu Wenruo, linux-btrfs

On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>
> On 14/07/2022 09.46, Johannes Thumshirn wrote:
> > On 14.07.22 09:32, Qu Wenruo wrote:
> >>[...]
> >
> > Again if you're doing sub-stripe size writes, you're asking stupid things and
> > then there's no reason to not give the user stupid answers.
> >
>
> Qu is right, if we consider only full stripe write the "raid hole" problem
> disappear, because if a "full stripe" is not fully written it is not
> referenced either.
>
>
> Personally I think that the ZFS variable stripe size, may be interesting
> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> we can store different BG with different number of disks. Let me to make an
> example: if we have 10 disks, we could allocate:
> 1 BG RAID1
> 1 BG RAID5, spread over 4 disks only
> 1 BG RAID5, spread over 8 disks only
> 1 BG RAID5, spread over 10 disks
>
> So if we have short writes, we could put the extents in the RAID1 BG; for longer
> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
> of the data.
>
> Yes this would require a sort of garbage collector to move the data to the biggest
> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
> variable stripe size.
>
> Doing so we don't need any disk format change and it would be backward compatible.

My 2 cents...

Regarding the current raid56 support, in order of preference:

a. Fix the current bugs, without changing format. Zygo has an extensive list.
b. Mostly fix the write hole, also without changing the format, by
only doing COW with full stripe writes. Yes you could somehow get
corrupt parity still and not know it until degraded operation produces
a bad reconstruction of data - but checksum will still catch that.
This kind of "unreplicated corruption" is not quite the same thing as
the write hole, because it isn't pernicious like the write hole.
c. A new de-clustered parity raid56 implementation that is not
backwards compatible.

Ergo, I think it's best to not break the format twice. Even if a new
raid implementation is years off.

Metadata centric workloads suck on parity raid anyway. If Btrfs always
does full stripe COW won't matter even if the performance is worse
because no one should use parity raid for this workload anyway.


--
Chris Murphy

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 19:08                       ` Thiago Ramon
@ 2022-07-16  0:34                         ` Qu Wenruo
  2022-07-16 11:11                           ` Qu Wenruo
  2022-07-17 23:00                           ` Zygo Blaxell
  0 siblings, 2 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-16  0:34 UTC (permalink / raw)
  To: Thiago Ramon, kreijack; +Cc: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/16 03:08, Thiago Ramon wrote:
> As a user of RAID6 here, let me jump in because I think this
> suggestion is actually a very good compromise.
>
> With stripes written only once, we completely eliminate any possible
> write-hole, and even without any changes on the current disk layout
> and allocation,

Unfortunately current extent allocator won't understand the requirement
at all.

Currently the extent allocator although tends to use clustered free
space, when it can not find a clustered space, it goes where it can find
a free space. No matter if it's a substripe write.


Thus to full stripe only write, it's really the old idea about a new
extent allocator to avoid sub-stripe writes.

Nowadays with the zoned code, I guess it is now more feasible than previous.

Now I think it's time to revive the extent allcator idea, and explore
the extent allocator based idea, at least it requires no on-disk format
change, which even write-intent still needs a on-disk format change (at
least needs a compat ro flag)

Thanks,
Qu

> there shouldn't be much wasted space (in my case, I
> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
> single-sector writes that should go into metadata space, any
> reasonable write should fill that buffer in a few seconds).
>
> The additional suggestion of using smaller stripe widths in case there
> isn't enough data to fill a whole stripe would make it very easy to
> reclaim the wasted space by rebalancing with a stripe count filter,
> which can be easily automated and run very frequently.
>
> On-disk format also wouldn't change and be fully usable by older
> kernels, and it should "only" require changes on the allocator to
> implement.
>
> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>
>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>> [...]
>>>
>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>> then there's no reason to not give the user stupid answers.
>>>
>>
>> Qu is right, if we consider only full stripe write the "raid hole" problem
>> disappear, because if a "full stripe" is not fully written it is not
>> referenced either.
>>
>>
>> Personally I think that the ZFS variable stripe size, may be interesting
>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>> we can store different BG with different number of disks. Let me to make an
>> example: if we have 10 disks, we could allocate:
>> 1 BG RAID1
>> 1 BG RAID5, spread over 4 disks only
>> 1 BG RAID5, spread over 8 disks only
>> 1 BG RAID5, spread over 10 disks
>>
>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>> of the data.
>>
>> Yes this would require a sort of garbage collector to move the data to the biggest
>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>> variable stripe size.
>>
>> Doing so we don't need any disk format change and it would be backward compatible.
>>
>>
>> Moreover, if we could put the smaller BG in the faster disks, we could have a
>> decent tiering....
>>
>>
>>> If a user is concerned about the write or space amplicfication of sub-stripe
>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>
>>>
>>>
>>> [1]
>>> S. K. Mishra and P. Mohapatra,
>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>
>> --
>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16  0:34                         ` Qu Wenruo
@ 2022-07-16 11:11                           ` Qu Wenruo
  2022-07-16 13:52                             ` Thiago Ramon
  2022-07-17 23:00                           ` Zygo Blaxell
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-16 11:11 UTC (permalink / raw)
  To: Thiago Ramon, kreijack; +Cc: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/16 08:34, Qu Wenruo wrote:
>
>
> On 2022/7/16 03:08, Thiago Ramon wrote:
>> As a user of RAID6 here, let me jump in because I think this
>> suggestion is actually a very good compromise.
>>
>> With stripes written only once, we completely eliminate any possible
>> write-hole, and even without any changes on the current disk layout
>> and allocation,
>
> Unfortunately current extent allocator won't understand the requirement
> at all.
>
> Currently the extent allocator although tends to use clustered free
> space, when it can not find a clustered space, it goes where it can find
> a free space. No matter if it's a substripe write.
>
>
> Thus to full stripe only write, it's really the old idea about a new
> extent allocator to avoid sub-stripe writes.
>
> Nowadays with the zoned code, I guess it is now more feasible than
> previous.
>
> Now I think it's time to revive the extent allcator idea, and explore
> the extent allocator based idea, at least it requires no on-disk format
> change, which even write-intent still needs a on-disk format change (at
> least needs a compat ro flag)

After more consideration, I am still not confident of above extent
allocator avoid sub-stripe write.

Especially for the following ENOSPC case (I'll later try submit it as an
future proof test case for fstests).

---
   mkfs.btrfs -f -m raid1c3 -d raid5 $dev1 $dev2 $dev3
   mount $dev1 $mnt
   for (( i=0;; i+=2 )) do
	xfs_io -f -c "pwrite 0 64k" $mnt/file.$i &> /dev/null
	if [ $? -ne 0 ]; then
		break
	fi
	xfs_io -f -c "pwrite 0 64k" $mnt/file.$(($i + 1)) &> /dev/null
	if [ $? -ne 0 ]; then
		break
	fi
	sync
   done
   rm -rf -- $mnt/file.*[02468]
   sync
   xfs_io -f -c "pwrite 0 4m" $mnt/new_file
---

The core idea of above script it, fill the fs using 64K extents.
Then delete half of them interleavely.

This will make all the full stripes to have one data stripe fully
utilize, one free, and all parity utilized.

If you go extent allocator that avoid sub-stripe write, then the last
write will fail.

If you RST with regular devices and COWing P/Q, then the last write will
also fail.

To me, I don't care about performance or latency, but at least, what we
can do before, but now if a new proposed RAID56 can not do, then to me
it's a regression.

I'm not against RST, but for RST on regular devices, we still need GC
and reserved block groups to avoid above problems.

And that's why I still prefer write-intent, it brings no possible
regression.

>
>> there shouldn't be much wasted space (in my case, I
>> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
>> single-sector writes that should go into metadata space, any
>> reasonable write should fill that buffer in a few seconds).

Nope, the problem is not that simple.

Consider this, you have an application doing an 64K write DIO.

Then with allocator prohibiting sub-stripe write, it will take a full
640K stripe, wasting 90% of your space.


Furthermore, even if you have some buffered write, merged into an 640KiB
full stripe, but later 9 * 64K of data extents in that full stripe get
freed.
Then you can not use that 9 * 64K space anyway.

That's why zoned device has GC and reserved zones.

If we go allocator way, then we also need a non-zoned GC and reserved
block groups.

Good luck implementing that feature just for RAID56 on non-zoned devices.

Thanks,
Qu

>>
>> The additional suggestion of using smaller stripe widths in case there
>> isn't enough data to fill a whole stripe would make it very easy to
>> reclaim the wasted space by rebalancing with a stripe count filter,
>> which can be easily automated and run very frequently.
>>
>> On-disk format also wouldn't change and be fully usable by older
>> kernels, and it should "only" require changes on the allocator to
>> implement.
>>
>> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli
>> <kreijack@libero.it> wrote:
>>>
>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>> [...]
>>>>
>>>> Again if you're doing sub-stripe size writes, you're asking stupid
>>>> things and
>>>> then there's no reason to not give the user stupid answers.
>>>>
>>>
>>> Qu is right, if we consider only full stripe write the "raid hole"
>>> problem
>>> disappear, because if a "full stripe" is not fully written it is not
>>> referenced either.
>>>
>>>
>>> Personally I think that the ZFS variable stripe size, may be interesting
>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>> we can store different BG with different number of disks. Let me to
>>> make an
>>> example: if we have 10 disks, we could allocate:
>>> 1 BG RAID1
>>> 1 BG RAID5, spread over 4 disks only
>>> 1 BG RAID5, spread over 8 disks only
>>> 1 BG RAID5, spread over 10 disks
>>>
>>> So if we have short writes, we could put the extents in the RAID1 BG;
>>> for longer
>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
>>> length
>>> of the data.
>>>
>>> Yes this would require a sort of garbage collector to move the data
>>> to the biggest
>>> raid5 BG, but this would avoid (or reduce) the fragmentation which
>>> affect the
>>> variable stripe size.
>>>
>>> Doing so we don't need any disk format change and it would be
>>> backward compatible.
>>>
>>>
>>> Moreover, if we could put the smaller BG in the faster disks, we
>>> could have a
>>> decent tiering....
>>>
>>>
>>>> If a user is concerned about the write or space amplicfication of
>>>> sub-stripe
>>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>>
>>>>
>>>>
>>>> [1]
>>>> S. K. Mishra and P. Mohapatra,
>>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel
>>>> Processing,
>>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>>
>>> --
>>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
>>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16 11:11                           ` Qu Wenruo
@ 2022-07-16 13:52                             ` Thiago Ramon
  2022-07-16 14:26                               ` Goffredo Baroncelli
  2022-07-17  0:30                               ` Qu Wenruo
  0 siblings, 2 replies; 88+ messages in thread
From: Thiago Ramon @ 2022-07-16 13:52 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs

On Sat, Jul 16, 2022 at 8:12 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> On 2022/7/16 08:34, Qu Wenruo wrote:
> >
> >
> > On 2022/7/16 03:08, Thiago Ramon wrote:
> >> As a user of RAID6 here, let me jump in because I think this
> >> suggestion is actually a very good compromise.
> >>
> >> With stripes written only once, we completely eliminate any possible
> >> write-hole, and even without any changes on the current disk layout
> >> and allocation,
> >
> > Unfortunately current extent allocator won't understand the requirement
> > at all.
> >
> > Currently the extent allocator although tends to use clustered free
> > space, when it can not find a clustered space, it goes where it can find
> > a free space. No matter if it's a substripe write.
> >
> >
> > Thus to full stripe only write, it's really the old idea about a new
> > extent allocator to avoid sub-stripe writes.
> >
> > Nowadays with the zoned code, I guess it is now more feasible than
> > previous.
> >
> > Now I think it's time to revive the extent allcator idea, and explore
> > the extent allocator based idea, at least it requires no on-disk format
> > change, which even write-intent still needs a on-disk format change (at
> > least needs a compat ro flag)
>
> After more consideration, I am still not confident of above extent
> allocator avoid sub-stripe write.
>
> Especially for the following ENOSPC case (I'll later try submit it as an
> future proof test case for fstests).
>
> ---
>    mkfs.btrfs -f -m raid1c3 -d raid5 $dev1 $dev2 $dev3
>    mount $dev1 $mnt
>    for (( i=0;; i+=2 )) do
>         xfs_io -f -c "pwrite 0 64k" $mnt/file.$i &> /dev/null
>         if [ $? -ne 0 ]; then
>                 break
>         fi
>         xfs_io -f -c "pwrite 0 64k" $mnt/file.$(($i + 1)) &> /dev/null
>         if [ $? -ne 0 ]; then
>                 break
>         fi
>         sync
>    done
>    rm -rf -- $mnt/file.*[02468]
>    sync
>    xfs_io -f -c "pwrite 0 4m" $mnt/new_file
> ---
>
> The core idea of above script it, fill the fs using 64K extents.
> Then delete half of them interleavely.
>
> This will make all the full stripes to have one data stripe fully
> utilize, one free, and all parity utilized.
>
> If you go extent allocator that avoid sub-stripe write, then the last
> write will fail.
>
> If you RST with regular devices and COWing P/Q, then the last write will
> also fail.
>
> To me, I don't care about performance or latency, but at least, what we
> can do before, but now if a new proposed RAID56 can not do, then to me
> it's a regression.
>
> I'm not against RST, but for RST on regular devices, we still need GC
> and reserved block groups to avoid above problems.
>
> And that's why I still prefer write-intent, it brings no possible
> regression.
While the test does fail as-is, rebalancing will recover all the
wasted space. It's a new gotcha for RAID56, but I think it's still
preferable than the write-hole, and is proper CoW.
Narrowing the stripes to 4k would waste a lot less space overall, but
there's probably code around that depends on the current 64k-tall
stripes.

>
> >
> >> there shouldn't be much wasted space (in my case, I
> >> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
> >> single-sector writes that should go into metadata space, any
> >> reasonable write should fill that buffer in a few seconds).
>
> Nope, the problem is not that simple.
>
> Consider this, you have an application doing an 64K write DIO.
>
> Then with allocator prohibiting sub-stripe write, it will take a full
> 640K stripe, wasting 90% of your space.
>
>
> Furthermore, even if you have some buffered write, merged into an 640KiB
> full stripe, but later 9 * 64K of data extents in that full stripe get
> freed.
> Then you can not use that 9 * 64K space anyway.
>
> That's why zoned device has GC and reserved zones.
>
> If we go allocator way, then we also need a non-zoned GC and reserved
> block groups.
>
> Good luck implementing that feature just for RAID56 on non-zoned devices.
DIO definitely would be a problem this way. As you mention, a separate
zone for high;y modified data would make things a lot easier (maybe a
RAID1Cx zone), but that definitely would be a huge change on the way
things are handled.
Another, easier solution would be disabling DIO altogether for RAID56,
and I'd prefer that if that's the cost of having RAID56 finally
respecting CoW and stopping modifying data shared with other files.
But as you say, it's definitely a regression if we change things this
way, and we'd need to hear from other people using RAID56 what they'd
prefer.

>
> Thanks,
> Qu
>
> >>
> >> The additional suggestion of using smaller stripe widths in case there
> >> isn't enough data to fill a whole stripe would make it very easy to
> >> reclaim the wasted space by rebalancing with a stripe count filter,
> >> which can be easily automated and run very frequently.
> >>
> >> On-disk format also wouldn't change and be fully usable by older
> >> kernels, and it should "only" require changes on the allocator to
> >> implement.
> >>
> >> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli
> >> <kreijack@libero.it> wrote:
> >>>
> >>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
> >>>> On 14.07.22 09:32, Qu Wenruo wrote:
> >>>>> [...]
> >>>>
> >>>> Again if you're doing sub-stripe size writes, you're asking stupid
> >>>> things and
> >>>> then there's no reason to not give the user stupid answers.
> >>>>
> >>>
> >>> Qu is right, if we consider only full stripe write the "raid hole"
> >>> problem
> >>> disappear, because if a "full stripe" is not fully written it is not
> >>> referenced either.
> >>>
> >>>
> >>> Personally I think that the ZFS variable stripe size, may be interesting
> >>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> >>> we can store different BG with different number of disks. Let me to
> >>> make an
> >>> example: if we have 10 disks, we could allocate:
> >>> 1 BG RAID1
> >>> 1 BG RAID5, spread over 4 disks only
> >>> 1 BG RAID5, spread over 8 disks only
> >>> 1 BG RAID5, spread over 10 disks
> >>>
> >>> So if we have short writes, we could put the extents in the RAID1 BG;
> >>> for longer
> >>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
> >>> length
> >>> of the data.
> >>>
> >>> Yes this would require a sort of garbage collector to move the data
> >>> to the biggest
> >>> raid5 BG, but this would avoid (or reduce) the fragmentation which
> >>> affect the
> >>> variable stripe size.
> >>>
> >>> Doing so we don't need any disk format change and it would be
> >>> backward compatible.
> >>>
> >>>
> >>> Moreover, if we could put the smaller BG in the faster disks, we
> >>> could have a
> >>> decent tiering....
> >>>
> >>>
> >>>> If a user is concerned about the write or space amplicfication of
> >>>> sub-stripe
> >>>> writes on RAID56 he/she really needs to rethink the architecture.
> >>>>
> >>>>
> >>>>
> >>>> [1]
> >>>> S. K. Mishra and P. Mohapatra,
> >>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
> >>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel
> >>>> Processing,
> >>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
> >>>
> >>> --
> >>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
> >>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
> >>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16 13:52                             ` Thiago Ramon
@ 2022-07-16 14:26                               ` Goffredo Baroncelli
  2022-07-17 17:58                                 ` Goffredo Baroncelli
  2022-07-17  0:30                               ` Qu Wenruo
  1 sibling, 1 reply; 88+ messages in thread
From: Goffredo Baroncelli @ 2022-07-16 14:26 UTC (permalink / raw)
  To: Thiago Ramon, Qu Wenruo; +Cc: Johannes Thumshirn, Qu Wenruo, linux-btrfs

On 16/07/2022 15.52, Thiago Ramon wrote:
>> Good luck implementing that feature just for RAID56 on non-zoned devices.
> DIO definitely would be a problem this way. As you mention, a separate
> zone for high;y modified data would make things a lot easier (maybe a
> RAID1Cx zone), but that definitely would be a huge change on the way
> things are handled.


When you talk about DIO, do you mean O_DIRECT ? Because this is full reliable
even without RAID56...
See my email

"BUG: BTRFS and O_DIRECT could lead to wrong checksum and wrong data", sent 15/09/2017



-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16 13:52                             ` Thiago Ramon
  2022-07-16 14:26                               ` Goffredo Baroncelli
@ 2022-07-17  0:30                               ` Qu Wenruo
  2022-07-17 15:18                                 ` Thiago Ramon
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-17  0:30 UTC (permalink / raw)
  To: Thiago Ramon; +Cc: kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/16 21:52, Thiago Ramon wrote:
> On Sat, Jul 16, 2022 at 8:12 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>
>>
>>
>> On 2022/7/16 08:34, Qu Wenruo wrote:
>>>
>>>
>>> On 2022/7/16 03:08, Thiago Ramon wrote:
>>>> As a user of RAID6 here, let me jump in because I think this
>>>> suggestion is actually a very good compromise.
>>>>
>>>> With stripes written only once, we completely eliminate any possible
>>>> write-hole, and even without any changes on the current disk layout
>>>> and allocation,
>>>
>>> Unfortunately current extent allocator won't understand the requirement
>>> at all.
>>>
>>> Currently the extent allocator although tends to use clustered free
>>> space, when it can not find a clustered space, it goes where it can find
>>> a free space. No matter if it's a substripe write.
>>>
>>>
>>> Thus to full stripe only write, it's really the old idea about a new
>>> extent allocator to avoid sub-stripe writes.
>>>
>>> Nowadays with the zoned code, I guess it is now more feasible than
>>> previous.
>>>
>>> Now I think it's time to revive the extent allcator idea, and explore
>>> the extent allocator based idea, at least it requires no on-disk format
>>> change, which even write-intent still needs a on-disk format change (at
>>> least needs a compat ro flag)
>>
>> After more consideration, I am still not confident of above extent
>> allocator avoid sub-stripe write.
>>
>> Especially for the following ENOSPC case (I'll later try submit it as an
>> future proof test case for fstests).
>>
>> ---
>>     mkfs.btrfs -f -m raid1c3 -d raid5 $dev1 $dev2 $dev3
>>     mount $dev1 $mnt
>>     for (( i=0;; i+=2 )) do
>>          xfs_io -f -c "pwrite 0 64k" $mnt/file.$i &> /dev/null
>>          if [ $? -ne 0 ]; then
>>                  break
>>          fi
>>          xfs_io -f -c "pwrite 0 64k" $mnt/file.$(($i + 1)) &> /dev/null
>>          if [ $? -ne 0 ]; then
>>                  break
>>          fi
>>          sync
>>     done
>>     rm -rf -- $mnt/file.*[02468]
>>     sync
>>     xfs_io -f -c "pwrite 0 4m" $mnt/new_file
>> ---
>>
>> The core idea of above script it, fill the fs using 64K extents.
>> Then delete half of them interleavely.
>>
>> This will make all the full stripes to have one data stripe fully
>> utilize, one free, and all parity utilized.
>>
>> If you go extent allocator that avoid sub-stripe write, then the last
>> write will fail.
>>
>> If you RST with regular devices and COWing P/Q, then the last write will
>> also fail.
>>
>> To me, I don't care about performance or latency, but at least, what we
>> can do before, but now if a new proposed RAID56 can not do, then to me
>> it's a regression.
>>
>> I'm not against RST, but for RST on regular devices, we still need GC
>> and reserved block groups to avoid above problems.
>>
>> And that's why I still prefer write-intent, it brings no possible
>> regression.
> While the test does fail as-is, rebalancing will recover all the
> wasted space.

Nope, the fs is already filled, you have no unallocated space to do balance.

That's exactly why zoned btrfs have reserved zones to handle such
problem for GC.

> It's a new gotcha for RAID56, but I think it's still
> preferable than the write-hole, and is proper CoW.
> Narrowing the stripes to 4k would waste a lot less space overall, but
> there's probably code around that depends on the current 64k-tall
> stripes.

Yes, limiting stripe size to 4K will cause way less wasted space, but
the result is still the same for the worst case script, thus still need
garbage collecting and reserved space for GC.

Thanks,
Qu

>
>>
>>>
>>>> there shouldn't be much wasted space (in my case, I
>>>> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
>>>> single-sector writes that should go into metadata space, any
>>>> reasonable write should fill that buffer in a few seconds).
>>
>> Nope, the problem is not that simple.
>>
>> Consider this, you have an application doing an 64K write DIO.
>>
>> Then with allocator prohibiting sub-stripe write, it will take a full
>> 640K stripe, wasting 90% of your space.
>>
>>
>> Furthermore, even if you have some buffered write, merged into an 640KiB
>> full stripe, but later 9 * 64K of data extents in that full stripe get
>> freed.
>> Then you can not use that 9 * 64K space anyway.
>>
>> That's why zoned device has GC and reserved zones.
>>
>> If we go allocator way, then we also need a non-zoned GC and reserved
>> block groups.
>>
>> Good luck implementing that feature just for RAID56 on non-zoned devices.
> DIO definitely would be a problem this way. As you mention, a separate
> zone for high;y modified data would make things a lot easier (maybe a
> RAID1Cx zone), but that definitely would be a huge change on the way
> things are handled.
> Another, easier solution would be disabling DIO altogether for RAID56,
> and I'd prefer that if that's the cost of having RAID56 finally
> respecting CoW and stopping modifying data shared with other files.
> But as you say, it's definitely a regression if we change things this
> way, and we'd need to hear from other people using RAID56 what they'd
> prefer.
>
>>
>> Thanks,
>> Qu
>>
>>>>
>>>> The additional suggestion of using smaller stripe widths in case there
>>>> isn't enough data to fill a whole stripe would make it very easy to
>>>> reclaim the wasted space by rebalancing with a stripe count filter,
>>>> which can be easily automated and run very frequently.
>>>>
>>>> On-disk format also wouldn't change and be fully usable by older
>>>> kernels, and it should "only" require changes on the allocator to
>>>> implement.
>>>>
>>>> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli
>>>> <kreijack@libero.it> wrote:
>>>>>
>>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>>>> [...]
>>>>>>
>>>>>> Again if you're doing sub-stripe size writes, you're asking stupid
>>>>>> things and
>>>>>> then there's no reason to not give the user stupid answers.
>>>>>>
>>>>>
>>>>> Qu is right, if we consider only full stripe write the "raid hole"
>>>>> problem
>>>>> disappear, because if a "full stripe" is not fully written it is not
>>>>> referenced either.
>>>>>
>>>>>
>>>>> Personally I think that the ZFS variable stripe size, may be interesting
>>>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>>>> we can store different BG with different number of disks. Let me to
>>>>> make an
>>>>> example: if we have 10 disks, we could allocate:
>>>>> 1 BG RAID1
>>>>> 1 BG RAID5, spread over 4 disks only
>>>>> 1 BG RAID5, spread over 8 disks only
>>>>> 1 BG RAID5, spread over 10 disks
>>>>>
>>>>> So if we have short writes, we could put the extents in the RAID1 BG;
>>>>> for longer
>>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
>>>>> length
>>>>> of the data.
>>>>>
>>>>> Yes this would require a sort of garbage collector to move the data
>>>>> to the biggest
>>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which
>>>>> affect the
>>>>> variable stripe size.
>>>>>
>>>>> Doing so we don't need any disk format change and it would be
>>>>> backward compatible.
>>>>>
>>>>>
>>>>> Moreover, if we could put the smaller BG in the faster disks, we
>>>>> could have a
>>>>> decent tiering....
>>>>>
>>>>>
>>>>>> If a user is concerned about the write or space amplicfication of
>>>>>> sub-stripe
>>>>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>>>>
>>>>>>
>>>>>>
>>>>>> [1]
>>>>>> S. K. Mishra and P. Mohapatra,
>>>>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>>>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel
>>>>>> Processing,
>>>>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>>>>
>>>>> --
>>>>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
>>>>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>>>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-17  0:30                               ` Qu Wenruo
@ 2022-07-17 15:18                                 ` Thiago Ramon
  2022-07-17 22:01                                   ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Thiago Ramon @ 2022-07-17 15:18 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs

On Sat, Jul 16, 2022 at 9:30 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> On 2022/7/16 21:52, Thiago Ramon wrote:
> > On Sat, Jul 16, 2022 at 8:12 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
> >>
> >>
> >>
> >> On 2022/7/16 08:34, Qu Wenruo wrote:
> >>>
> >>>
> >>> On 2022/7/16 03:08, Thiago Ramon wrote:
> >>>> As a user of RAID6 here, let me jump in because I think this
> >>>> suggestion is actually a very good compromise.
> >>>>
> >>>> With stripes written only once, we completely eliminate any possible
> >>>> write-hole, and even without any changes on the current disk layout
> >>>> and allocation,
> >>>
> >>> Unfortunately current extent allocator won't understand the requirement
> >>> at all.
> >>>
> >>> Currently the extent allocator although tends to use clustered free
> >>> space, when it can not find a clustered space, it goes where it can find
> >>> a free space. No matter if it's a substripe write.
> >>>
> >>>
> >>> Thus to full stripe only write, it's really the old idea about a new
> >>> extent allocator to avoid sub-stripe writes.
> >>>
> >>> Nowadays with the zoned code, I guess it is now more feasible than
> >>> previous.
> >>>
> >>> Now I think it's time to revive the extent allcator idea, and explore
> >>> the extent allocator based idea, at least it requires no on-disk format
> >>> change, which even write-intent still needs a on-disk format change (at
> >>> least needs a compat ro flag)
> >>
> >> After more consideration, I am still not confident of above extent
> >> allocator avoid sub-stripe write.
> >>
> >> Especially for the following ENOSPC case (I'll later try submit it as an
> >> future proof test case for fstests).
> >>
> >> ---
> >>     mkfs.btrfs -f -m raid1c3 -d raid5 $dev1 $dev2 $dev3
> >>     mount $dev1 $mnt
> >>     for (( i=0;; i+=2 )) do
> >>          xfs_io -f -c "pwrite 0 64k" $mnt/file.$i &> /dev/null
> >>          if [ $? -ne 0 ]; then
> >>                  break
> >>          fi
> >>          xfs_io -f -c "pwrite 0 64k" $mnt/file.$(($i + 1)) &> /dev/null
> >>          if [ $? -ne 0 ]; then
> >>                  break
> >>          fi
> >>          sync
> >>     done
> >>     rm -rf -- $mnt/file.*[02468]
> >>     sync
> >>     xfs_io -f -c "pwrite 0 4m" $mnt/new_file
> >> ---
> >>
> >> The core idea of above script it, fill the fs using 64K extents.
> >> Then delete half of them interleavely.
> >>
> >> This will make all the full stripes to have one data stripe fully
> >> utilize, one free, and all parity utilized.
> >>
> >> If you go extent allocator that avoid sub-stripe write, then the last
> >> write will fail.
> >>
> >> If you RST with regular devices and COWing P/Q, then the last write will
> >> also fail.
> >>
> >> To me, I don't care about performance or latency, but at least, what we
> >> can do before, but now if a new proposed RAID56 can not do, then to me
> >> it's a regression.
> >>
> >> I'm not against RST, but for RST on regular devices, we still need GC
> >> and reserved block groups to avoid above problems.
> >>
> >> And that's why I still prefer write-intent, it brings no possible
> >> regression.
> > While the test does fail as-is, rebalancing will recover all the
> > wasted space.
>
> Nope, the fs is already filled, you have no unallocated space to do balance.
>
> That's exactly why zoned btrfs have reserved zones to handle such
> problem for GC.

Very good point. What would be the implementation difficulty and
overall impact of ALWAYS reserving space, for exclusive balance usage,
for at least 1 metadata or data block group, whichever is larger?
This would obviously create some unusable space on the FS, but I think
this would solve the majority of ENOSPC problems with all profiles. Of
course an option to disable this would also be needed for advanced
usage, but it sounds like a decent default.

>
> > It's a new gotcha for RAID56, but I think it's still
> > preferable than the write-hole, and is proper CoW.
> > Narrowing the stripes to 4k would waste a lot less space overall, but
> > there's probably code around that depends on the current 64k-tall
> > stripes.
>
> Yes, limiting stripe size to 4K will cause way less wasted space, but
> the result is still the same for the worst case script, thus still need
> garbage collecting and reserved space for GC.
>
> Thanks,
> Qu
>
> >
> >>
> >>>
> >>>> there shouldn't be much wasted space (in my case, I
> >>>> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
> >>>> single-sector writes that should go into metadata space, any
> >>>> reasonable write should fill that buffer in a few seconds).
> >>
> >> Nope, the problem is not that simple.
> >>
> >> Consider this, you have an application doing an 64K write DIO.
> >>
> >> Then with allocator prohibiting sub-stripe write, it will take a full
> >> 640K stripe, wasting 90% of your space.
> >>
> >>
> >> Furthermore, even if you have some buffered write, merged into an 640KiB
> >> full stripe, but later 9 * 64K of data extents in that full stripe get
> >> freed.
> >> Then you can not use that 9 * 64K space anyway.
> >>
> >> That's why zoned device has GC and reserved zones.
> >>
> >> If we go allocator way, then we also need a non-zoned GC and reserved
> >> block groups.
> >>
> >> Good luck implementing that feature just for RAID56 on non-zoned devices.
> > DIO definitely would be a problem this way. As you mention, a separate
> > zone for high;y modified data would make things a lot easier (maybe a
> > RAID1Cx zone), but that definitely would be a huge change on the way
> > things are handled.
> > Another, easier solution would be disabling DIO altogether for RAID56,
> > and I'd prefer that if that's the cost of having RAID56 finally
> > respecting CoW and stopping modifying data shared with other files.
> > But as you say, it's definitely a regression if we change things this
> > way, and we'd need to hear from other people using RAID56 what they'd
> > prefer.
> >
> >>
> >> Thanks,
> >> Qu
> >>
> >>>>
> >>>> The additional suggestion of using smaller stripe widths in case there
> >>>> isn't enough data to fill a whole stripe would make it very easy to
> >>>> reclaim the wasted space by rebalancing with a stripe count filter,
> >>>> which can be easily automated and run very frequently.
> >>>>
> >>>> On-disk format also wouldn't change and be fully usable by older
> >>>> kernels, and it should "only" require changes on the allocator to
> >>>> implement.
> >>>>
> >>>> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli
> >>>> <kreijack@libero.it> wrote:
> >>>>>
> >>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
> >>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
> >>>>>>> [...]
> >>>>>>
> >>>>>> Again if you're doing sub-stripe size writes, you're asking stupid
> >>>>>> things and
> >>>>>> then there's no reason to not give the user stupid answers.
> >>>>>>
> >>>>>
> >>>>> Qu is right, if we consider only full stripe write the "raid hole"
> >>>>> problem
> >>>>> disappear, because if a "full stripe" is not fully written it is not
> >>>>> referenced either.
> >>>>>
> >>>>>
> >>>>> Personally I think that the ZFS variable stripe size, may be interesting
> >>>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> >>>>> we can store different BG with different number of disks. Let me to
> >>>>> make an
> >>>>> example: if we have 10 disks, we could allocate:
> >>>>> 1 BG RAID1
> >>>>> 1 BG RAID5, spread over 4 disks only
> >>>>> 1 BG RAID5, spread over 8 disks only
> >>>>> 1 BG RAID5, spread over 10 disks
> >>>>>
> >>>>> So if we have short writes, we could put the extents in the RAID1 BG;
> >>>>> for longer
> >>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
> >>>>> length
> >>>>> of the data.
> >>>>>
> >>>>> Yes this would require a sort of garbage collector to move the data
> >>>>> to the biggest
> >>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which
> >>>>> affect the
> >>>>> variable stripe size.
> >>>>>
> >>>>> Doing so we don't need any disk format change and it would be
> >>>>> backward compatible.
> >>>>>
> >>>>>
> >>>>> Moreover, if we could put the smaller BG in the faster disks, we
> >>>>> could have a
> >>>>> decent tiering....
> >>>>>
> >>>>>
> >>>>>> If a user is concerned about the write or space amplicfication of
> >>>>>> sub-stripe
> >>>>>> writes on RAID56 he/she really needs to rethink the architecture.
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> [1]
> >>>>>> S. K. Mishra and P. Mohapatra,
> >>>>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
> >>>>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel
> >>>>>> Processing,
> >>>>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
> >>>>>
> >>>>> --
> >>>>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
> >>>>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
> >>>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16 14:26                               ` Goffredo Baroncelli
@ 2022-07-17 17:58                                 ` Goffredo Baroncelli
  0 siblings, 0 replies; 88+ messages in thread
From: Goffredo Baroncelli @ 2022-07-17 17:58 UTC (permalink / raw)
  To: Thiago Ramon, Qu Wenruo; +Cc: Johannes Thumshirn, Qu Wenruo, linux-btrfs

On 16/07/2022 16.26, Goffredo Baroncelli wrote:
> On 16/07/2022 15.52, Thiago Ramon wrote:
>>> Good luck implementing that feature just for RAID56 on non-zoned devices.
>> DIO definitely would be a problem this way. As you mention, a separate
>> zone for high;y modified data would make things a lot easier (maybe a
>> RAID1Cx zone), but that definitely would be a huge change on the way
>> things are handled.
> 
> 
> When you talk about DIO, do you mean O_DIRECT ? Because this is full reliable
> even without RAID56...
ehmm... I forgot a "not". So my last sentence is

          Because this is NOT fully reliable even without RAID56...

> See my email
> 
> "BUG: BTRFS and O_DIRECT could lead to wrong checksum and wrong data", sent 15/09/2017
> 
> 
> 

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-17 15:18                                 ` Thiago Ramon
@ 2022-07-17 22:01                                   ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-17 22:01 UTC (permalink / raw)
  To: Thiago Ramon; +Cc: kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/17 23:18, Thiago Ramon wrote:
> On Sat, Jul 16, 2022 at 9:30 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>
>>
>>
>> On 2022/7/16 21:52, Thiago Ramon wrote:
>>> On Sat, Jul 16, 2022 at 8:12 AM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2022/7/16 08:34, Qu Wenruo wrote:
>>>>>
>>>>>
>>>>> On 2022/7/16 03:08, Thiago Ramon wrote:
>>>>>> As a user of RAID6 here, let me jump in because I think this
>>>>>> suggestion is actually a very good compromise.
>>>>>>
>>>>>> With stripes written only once, we completely eliminate any possible
>>>>>> write-hole, and even without any changes on the current disk layout
>>>>>> and allocation,
>>>>>
>>>>> Unfortunately current extent allocator won't understand the requirement
>>>>> at all.
>>>>>
>>>>> Currently the extent allocator although tends to use clustered free
>>>>> space, when it can not find a clustered space, it goes where it can find
>>>>> a free space. No matter if it's a substripe write.
>>>>>
>>>>>
>>>>> Thus to full stripe only write, it's really the old idea about a new
>>>>> extent allocator to avoid sub-stripe writes.
>>>>>
>>>>> Nowadays with the zoned code, I guess it is now more feasible than
>>>>> previous.
>>>>>
>>>>> Now I think it's time to revive the extent allcator idea, and explore
>>>>> the extent allocator based idea, at least it requires no on-disk format
>>>>> change, which even write-intent still needs a on-disk format change (at
>>>>> least needs a compat ro flag)
>>>>
>>>> After more consideration, I am still not confident of above extent
>>>> allocator avoid sub-stripe write.
>>>>
>>>> Especially for the following ENOSPC case (I'll later try submit it as an
>>>> future proof test case for fstests).
>>>>
>>>> ---
>>>>      mkfs.btrfs -f -m raid1c3 -d raid5 $dev1 $dev2 $dev3
>>>>      mount $dev1 $mnt
>>>>      for (( i=0;; i+=2 )) do
>>>>           xfs_io -f -c "pwrite 0 64k" $mnt/file.$i &> /dev/null
>>>>           if [ $? -ne 0 ]; then
>>>>                   break
>>>>           fi
>>>>           xfs_io -f -c "pwrite 0 64k" $mnt/file.$(($i + 1)) &> /dev/null
>>>>           if [ $? -ne 0 ]; then
>>>>                   break
>>>>           fi
>>>>           sync
>>>>      done
>>>>      rm -rf -- $mnt/file.*[02468]
>>>>      sync
>>>>      xfs_io -f -c "pwrite 0 4m" $mnt/new_file
>>>> ---
>>>>
>>>> The core idea of above script it, fill the fs using 64K extents.
>>>> Then delete half of them interleavely.
>>>>
>>>> This will make all the full stripes to have one data stripe fully
>>>> utilize, one free, and all parity utilized.
>>>>
>>>> If you go extent allocator that avoid sub-stripe write, then the last
>>>> write will fail.
>>>>
>>>> If you RST with regular devices and COWing P/Q, then the last write will
>>>> also fail.
>>>>
>>>> To me, I don't care about performance or latency, but at least, what we
>>>> can do before, but now if a new proposed RAID56 can not do, then to me
>>>> it's a regression.
>>>>
>>>> I'm not against RST, but for RST on regular devices, we still need GC
>>>> and reserved block groups to avoid above problems.
>>>>
>>>> And that's why I still prefer write-intent, it brings no possible
>>>> regression.
>>> While the test does fail as-is, rebalancing will recover all the
>>> wasted space.
>>
>> Nope, the fs is already filled, you have no unallocated space to do balance.
>>
>> That's exactly why zoned btrfs have reserved zones to handle such
>> problem for GC.
>
> Very good point. What would be the implementation difficulty and
> overall impact of ALWAYS reserving space,

To me, it's not simple, especially for non-zoned devices, we don't have
any existing unit like zones to do reservation.

And since we support device with uneven size, it can be pretty tricky to
calculate what size we should really reserve.


At least for now for non-zoned device, we don't have extra reservation
of unallocated space, nor the auto reclaim mechanism.

We may learn from zoned code, but I'm not confident if we should jump
into the rabbit hole at all, especially we already have a write-intent
implementation to avoid all the problem at least for non-zoned devices.


Another (smaller) problem is latency, if we ran of out space, we need to
kick in GC to reclaim space.
IIRC for zoned device it's mostly balancing near-empty zones into a new
zone, which can definitely introduce latency.

Thanks,
Qu
> for exclusive balance usage,
> for at least 1 metadata or data block group, whichever is larger?
> This would obviously create some unusable space on the FS, but I think
> this would solve the majority of ENOSPC problems with all profiles. Of
> course an option to disable this would also be needed for advanced
> usage, but it sounds like a decent default.
>
>>
>>> It's a new gotcha for RAID56, but I think it's still
>>> preferable than the write-hole, and is proper CoW.
>>> Narrowing the stripes to 4k would waste a lot less space overall, but
>>> there's probably code around that depends on the current 64k-tall
>>> stripes.
>>
>> Yes, limiting stripe size to 4K will cause way less wasted space, but
>> the result is still the same for the worst case script, thus still need
>> garbage collecting and reserved space for GC.
>>
>> Thanks,
>> Qu
>>
>>>
>>>>
>>>>>
>>>>>> there shouldn't be much wasted space (in my case, I
>>>>>> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
>>>>>> single-sector writes that should go into metadata space, any
>>>>>> reasonable write should fill that buffer in a few seconds).
>>>>
>>>> Nope, the problem is not that simple.
>>>>
>>>> Consider this, you have an application doing an 64K write DIO.
>>>>
>>>> Then with allocator prohibiting sub-stripe write, it will take a full
>>>> 640K stripe, wasting 90% of your space.
>>>>
>>>>
>>>> Furthermore, even if you have some buffered write, merged into an 640KiB
>>>> full stripe, but later 9 * 64K of data extents in that full stripe get
>>>> freed.
>>>> Then you can not use that 9 * 64K space anyway.
>>>>
>>>> That's why zoned device has GC and reserved zones.
>>>>
>>>> If we go allocator way, then we also need a non-zoned GC and reserved
>>>> block groups.
>>>>
>>>> Good luck implementing that feature just for RAID56 on non-zoned devices.
>>> DIO definitely would be a problem this way. As you mention, a separate
>>> zone for high;y modified data would make things a lot easier (maybe a
>>> RAID1Cx zone), but that definitely would be a huge change on the way
>>> things are handled.
>>> Another, easier solution would be disabling DIO altogether for RAID56,
>>> and I'd prefer that if that's the cost of having RAID56 finally
>>> respecting CoW and stopping modifying data shared with other files.
>>> But as you say, it's definitely a regression if we change things this
>>> way, and we'd need to hear from other people using RAID56 what they'd
>>> prefer.
>>>
>>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>>
>>>>>> The additional suggestion of using smaller stripe widths in case there
>>>>>> isn't enough data to fill a whole stripe would make it very easy to
>>>>>> reclaim the wasted space by rebalancing with a stripe count filter,
>>>>>> which can be easily automated and run very frequently.
>>>>>>
>>>>>> On-disk format also wouldn't change and be fully usable by older
>>>>>> kernels, and it should "only" require changes on the allocator to
>>>>>> implement.
>>>>>>
>>>>>> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli
>>>>>> <kreijack@libero.it> wrote:
>>>>>>>
>>>>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>>>>>> [...]
>>>>>>>>
>>>>>>>> Again if you're doing sub-stripe size writes, you're asking stupid
>>>>>>>> things and
>>>>>>>> then there's no reason to not give the user stupid answers.
>>>>>>>>
>>>>>>>
>>>>>>> Qu is right, if we consider only full stripe write the "raid hole"
>>>>>>> problem
>>>>>>> disappear, because if a "full stripe" is not fully written it is not
>>>>>>> referenced either.
>>>>>>>
>>>>>>>
>>>>>>> Personally I think that the ZFS variable stripe size, may be interesting
>>>>>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>>>>>> we can store different BG with different number of disks. Let me to
>>>>>>> make an
>>>>>>> example: if we have 10 disks, we could allocate:
>>>>>>> 1 BG RAID1
>>>>>>> 1 BG RAID5, spread over 4 disks only
>>>>>>> 1 BG RAID5, spread over 8 disks only
>>>>>>> 1 BG RAID5, spread over 10 disks
>>>>>>>
>>>>>>> So if we have short writes, we could put the extents in the RAID1 BG;
>>>>>>> for longer
>>>>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
>>>>>>> length
>>>>>>> of the data.
>>>>>>>
>>>>>>> Yes this would require a sort of garbage collector to move the data
>>>>>>> to the biggest
>>>>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which
>>>>>>> affect the
>>>>>>> variable stripe size.
>>>>>>>
>>>>>>> Doing so we don't need any disk format change and it would be
>>>>>>> backward compatible.
>>>>>>>
>>>>>>>
>>>>>>> Moreover, if we could put the smaller BG in the faster disks, we
>>>>>>> could have a
>>>>>>> decent tiering....
>>>>>>>
>>>>>>>
>>>>>>>> If a user is concerned about the write or space amplicfication of
>>>>>>>> sub-stripe
>>>>>>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> [1]
>>>>>>>> S. K. Mishra and P. Mohapatra,
>>>>>>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>>>>>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel
>>>>>>>> Processing,
>>>>>>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>>>>>>
>>>>>>> --
>>>>>>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
>>>>>>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>>>>>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-16  0:34                         ` Qu Wenruo
  2022-07-16 11:11                           ` Qu Wenruo
@ 2022-07-17 23:00                           ` Zygo Blaxell
  2022-07-18  1:04                             ` Qu Wenruo
  1 sibling, 1 reply; 88+ messages in thread
From: Zygo Blaxell @ 2022-07-17 23:00 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Thiago Ramon, kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs

On Sat, Jul 16, 2022 at 08:34:30AM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/7/16 03:08, Thiago Ramon wrote:
> > As a user of RAID6 here, let me jump in because I think this
> > suggestion is actually a very good compromise.
> > 
> > With stripes written only once, we completely eliminate any possible
> > write-hole, and even without any changes on the current disk layout
> > and allocation,
> 
> Unfortunately current extent allocator won't understand the requirement
> at all.
> 
> Currently the extent allocator although tends to use clustered free
> space, when it can not find a clustered space, it goes where it can find
> a free space. No matter if it's a substripe write.

> Thus to full stripe only write, it's really the old idea about a new
> extent allocator to avoid sub-stripe writes.

> Nowadays with the zoned code, I guess it is now more feasible than previous.

It's certainly easier, but the gotcha at the bottom of the pile for
stripe-level GC on raid5 for btrfs is that raid5 stripe boundaries
don't match btrfs extent boundaries.  If I write some extents of various
sizes:

        Extents:  [4k][24k][64k][160k][--512k--][200k][100k]
        Stripes:  [---384k--------------][---384k-][---384k---]

If that 64K extent is freed, and I later write new data to it, then
in theory I have to CoW the 4k, 24k, 160k extents, and _parts of_
the 512k extent, or GC needs to be able to split extents (with an
explosion of fragmentation as all the existing extents are sliced up
in fractions-of-384k sized pieces).  Both options involve significant
IO amplification (reads and writes) at write time, with the worst case
being something like:

        Extents:  ...--][128M][-8K-][128M][...
        Stripes:  ...][384k][384k][384k][...

where there's a 384k raid stripe that contains parts of two 128M extents
and a 4K free space, and CoW does 256MB of IO on data blocks alone.
All of the above seems like an insane path we don't want to follow.

The main points of WIL (and as far as I can tell, also of RST) are:

	- it's a tree that translates logical bytenrs to new physical
	bytenrs so you can do CoW (RST) or journalling (WIL) on raid56
	stripes

	- it's persistent on disk in mirrored (non-parity) metadata,
	so the write hole is closed and no committed data is lost on
	crash (note we don't need to ever make parity metadata work
	because mirrored metadata will suffice, so this solution does
	not have to be adapted to work for metadata)

	- the tree is used to perform CoW on the raid stripe level,
	not the btrfs extent level, i.e. everything this tree does is
	invisible to btrfs extent, csum, and subvol trees.

It basically behaves like a writeback cache for RMW stripe updates.

On non-zoned devices, write intent log could write a complete stripe in
a new location, record the new location in a WIL tree, commit, overwrite
the stripe in the original location, delete the new location from the
WIL tree, and commit again.  This effectively makes raid5 stripes CoW
instead of RMW, and closes the write hole.  There's no need to modify
any other btrfs trees, which is good because relocation is expensive
compared to the overhead of overwriting the unmodified blocks in a stripe
for non-zoned devices.  Full-stripe writes don't require any of this,
so they go straight to the disk and leave no trace in WIL.  A writeback
thread can handle flushing WIL entries back to original stripe locations
in the background, and a small amount of extra space will be used while
that thread catches up to writes from previous transactions.  There's no
need to do anything new with the allocator for this, because everything
is hidden in the btrfs raid5/6 profile layer and the WIL tree, so the
existing clustered allocator is fine (though a RMW-avoiding allocator
would be faster).

On zoned devices, none of this seems necessary or useful, and some of
it is actively harmful.  We can't overwrite data in place, so we get no
benefit from a shortcut that might allow us to.  Once a stripe is written,
it's stuck in a read-only state until every extent that references the
stripe is deleted (by deleting the containing block group).  There's no
requirement to copy a stripe at any time, since any new writes could
simply get allocated to extents in a new raid stripe.  When we are
reclaiming space from a zone in GC, we want to copy only the data that
remains in existing extents, not the leftover unused blocks in the raid
stripes that contain the extents, so we simply perform exactly that copy
in reclaim.  For zoned device reclaim we _want_ all of the btrfs trees
(csum, extent, and subvol) to have extent-level visibility so that we can
avoid copying data from stripes that contain extents we didn't modify
or that were later deleted.

ISTM that zoned devices naturally fix the btrfs raid5/6 write hole issues
without any special effort because their limitations wrt overwrites
simply don't allow the write hole to be implemented.

> Now I think it's time to revive the extent allcator idea, and explore
> the extent allocator based idea, at least it requires no on-disk format
> change, which even write-intent still needs a on-disk format change (at
> least needs a compat ro flag)

This is the attractive feature about getting the allocator disciplined
so that RMW isn't needed any more.  It can reuse all the work of the
zoned implementation, except with the ability to allocate a full raid
stripe in any block group, not just the few that are opened for appending.

This would introduce a new requirement for existing raid5 filesystems that
several BGs are reserved for reclaim; however, this is not a particularly
onerous requirement since several BGs have to be reserved for metadata
expansion to avoid ENOSPC already, and there's no automation for this
in the filesystem.  Also raid5 filesystems are typically larger than
average and can afford a few hundred spare GB.  btrfs-cleaner only has
to be taught to not delete every single empty block group, but leave a
few spares allocated for GC.

> Thanks,
> Qu
> 
> > there shouldn't be much wasted space (in my case, I
> > have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
> > single-sector writes that should go into metadata space, any
> > reasonable write should fill that buffer in a few seconds).
> > 
> > The additional suggestion of using smaller stripe widths in case there
> > isn't enough data to fill a whole stripe would make it very easy to
> > reclaim the wasted space by rebalancing with a stripe count filter,
> > which can be easily automated and run very frequently.
> > 
> > On-disk format also wouldn't change and be fully usable by older
> > kernels, and it should "only" require changes on the allocator to
> > implement.
> > 
> > On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
> > > 
> > > On 14/07/2022 09.46, Johannes Thumshirn wrote:
> > > > On 14.07.22 09:32, Qu Wenruo wrote:
> > > > > [...]
> > > > 
> > > > Again if you're doing sub-stripe size writes, you're asking stupid things and
> > > > then there's no reason to not give the user stupid answers.
> > > > 
> > > 
> > > Qu is right, if we consider only full stripe write the "raid hole" problem
> > > disappear, because if a "full stripe" is not fully written it is not
> > > referenced either.
> > > 
> > > 
> > > Personally I think that the ZFS variable stripe size, may be interesting
> > > to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> > > we can store different BG with different number of disks. Let me to make an
> > > example: if we have 10 disks, we could allocate:
> > > 1 BG RAID1
> > > 1 BG RAID5, spread over 4 disks only
> > > 1 BG RAID5, spread over 8 disks only
> > > 1 BG RAID5, spread over 10 disks
> > > 
> > > So if we have short writes, we could put the extents in the RAID1 BG; for longer
> > > writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
> > > of the data.
> > > 
> > > Yes this would require a sort of garbage collector to move the data to the biggest
> > > raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
> > > variable stripe size.
> > > 
> > > Doing so we don't need any disk format change and it would be backward compatible.
> > > 
> > > 
> > > Moreover, if we could put the smaller BG in the faster disks, we could have a
> > > decent tiering....
> > > 
> > > 
> > > > If a user is concerned about the write or space amplicfication of sub-stripe
> > > > writes on RAID56 he/she really needs to rethink the architecture.
> > > > 
> > > > 
> > > > 
> > > > [1]
> > > > S. K. Mishra and P. Mohapatra,
> > > > "Performance study of RAID-5 disk arrays with data and parity cache,"
> > > > Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
> > > > 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
> > > 
> > > --
> > > gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
> > > Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
> > > 

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-17 23:00                           ` Zygo Blaxell
@ 2022-07-18  1:04                             ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-18  1:04 UTC (permalink / raw)
  To: Zygo Blaxell
  Cc: Thiago Ramon, kreijack, Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/18 07:00, Zygo Blaxell wrote:
> On Sat, Jul 16, 2022 at 08:34:30AM +0800, Qu Wenruo wrote:
>>
>>
>> On 2022/7/16 03:08, Thiago Ramon wrote:
>>> As a user of RAID6 here, let me jump in because I think this
>>> suggestion is actually a very good compromise.
>>>
>>> With stripes written only once, we completely eliminate any possible
>>> write-hole, and even without any changes on the current disk layout
>>> and allocation,
>>
>> Unfortunately current extent allocator won't understand the requirement
>> at all.
>>
>> Currently the extent allocator although tends to use clustered free
>> space, when it can not find a clustered space, it goes where it can find
>> a free space. No matter if it's a substripe write.
>
>> Thus to full stripe only write, it's really the old idea about a new
>> extent allocator to avoid sub-stripe writes.
>
>> Nowadays with the zoned code, I guess it is now more feasible than previous.
>
> It's certainly easier, but the gotcha at the bottom of the pile for
> stripe-level GC on raid5 for btrfs is that raid5 stripe boundaries
> don't match btrfs extent boundaries.  If I write some extents of various
> sizes:
>
>          Extents:  [4k][24k][64k][160k][--512k--][200k][100k]
>          Stripes:  [---384k--------------][---384k-][---384k---]

That won't be a problem for RST, at least for RST on zoned device.

On RST with zoned device, we split extents to match the stripe boundary,
thus no problem.
(At least that's my educated guess).

But this will become a problem for RST on non-zoned device.


>
> If that 64K extent is freed, and I later write new data to it, then
> in theory I have to CoW the 4k, 24k, 160k extents, and _parts of_
> the 512k extent, or GC needs to be able to split extents (with an
> explosion of fragmentation as all the existing extents are sliced up
> in fractions-of-384k sized pieces).  Both options involve significant
> IO amplification (reads and writes) at write time, with the worst case
> being something like:
>
>          Extents:  ...--][128M][-8K-][128M][...
>          Stripes:  ...][384k][384k][384k][...
>
> where there's a 384k raid stripe that contains parts of two 128M extents
> and a 4K free space, and CoW does 256MB of IO on data blocks alone.
> All of the above seems like an insane path we don't want to follow.
>
> The main points of WIL (and as far as I can tell, also of RST) are:
>
> 	- it's a tree that translates logical bytenrs to new physical
> 	bytenrs so you can do CoW (RST) or journalling (WIL) on raid56
> 	stripes
>
> 	- it's persistent on disk in mirrored (non-parity) metadata,
> 	so the write hole is closed and no committed data is lost on
> 	crash (note we don't need to ever make parity metadata work
> 	because mirrored metadata will suffice, so this solution does
> 	not have to be adapted to work for metadata)
>
> 	- the tree is used to perform CoW on the raid stripe level,
> 	not the btrfs extent level, i.e. everything this tree does is
> 	invisible to btrfs extent, csum, and subvol trees.

I'm afraid we can not really do pure RAID level COW, without really
touching the extent layer.

At least for now, zoned code has already changed extent allocator to
always go forward to compensate for the zoned limitations.

I know this is not a good idea for layer separation we want, but
unfortunately zoned support really depends on that.

And if we really go the RST with transparent COW without touching extent
allocator, the complexity would go sky rocket.

>
> It basically behaves like a writeback cache for RMW stripe updates.
>
> On non-zoned devices, write intent log could write a complete stripe in
> a new location, record the new location in a WIL tree, commit, overwrite
> the stripe in the original location, delete the new location from the
> WIL tree, and commit again.  This effectively makes raid5 stripes CoW
> instead of RMW, and closes the write hole.  There's no need to modify
> any other btrfs trees, which is good because relocation is expensive
> compared to the overhead of overwriting the unmodified blocks in a stripe
> for non-zoned devices.

Yep, that's why I'm so strongly pushing for write-intent bitmaps.

It really is the least astonishment way to go, but at the cost of no
zoned support.

But considering how bad reputation nowadays SMR devices receive, I doubt
normal end users would even consider zoned device for RAID56.

>  Full-stripe writes don't require any of this,
> so they go straight to the disk and leave no trace in WIL.

Exactly the optimization I want to go, although not in current
write-intent code, it only needs less than 10 lines to implement.

>  A writeback
> thread can handle flushing WIL entries back to original stripe locations
> in the background, and a small amount of extra space will be used while
> that thread catches up to writes from previous transactions.

I don't think we need a dedicated flushing thread.

Currently for write-intent bitmaps, it's too small to really need to wait.
(and dm-bitmap also goes this way, if the bitmap overflows, we just wait
and retry).

For full journal, we can just wait for the endio function to free up
some space before we need to add new journal.

>  There's no
> need to do anything new with the allocator for this, because everything
> is hidden in the btrfs raid5/6 profile layer and the WIL tree, so the
> existing clustered allocator is fine (though a RMW-avoiding allocator
> would be faster).
>
> On zoned devices, none of this seems necessary or useful, and some of
> it is actively harmful.  We can't overwrite data in place, so we get no
> benefit from a shortcut that might allow us to.  Once a stripe is written,
> it's stuck in a read-only state until every extent that references the
> stripe is deleted (by deleting the containing block group).  There's no
> requirement to copy a stripe at any time, since any new writes could
> simply get allocated to extents in a new raid stripe.  When we are
> reclaiming space from a zone in GC, we want to copy only the data that
> remains in existing extents, not the leftover unused blocks in the raid
> stripes that contain the extents, so we simply perform exactly that copy
> in reclaim.  For zoned device reclaim we _want_ all of the btrfs trees
> (csum, extent, and subvol) to have extent-level visibility so that we can
> avoid copying data from stripes that contain extents we didn't modify
> or that were later deleted.
>
> ISTM that zoned devices naturally fix the btrfs raid5/6 write hole issues
> without any special effort because their limitations wrt overwrites
> simply don't allow the write hole to be implemented.

Not that simple, but mostly correct.

For zoned device, they need RST anyway for data RAID support (not only
RAID56).

And since their RST is also protected by transaction, we won't have the
sub-stripe overwrite problem, as if we lost power, we still see the old
RST tree.

But we will face the new challenges like GC and reserved zones just to
handle the script I mentioned above.

>
>> Now I think it's time to revive the extent allcator idea, and explore
>> the extent allocator based idea, at least it requires no on-disk format
>> change, which even write-intent still needs a on-disk format change (at
>> least needs a compat ro flag)
>
> This is the attractive feature about getting the allocator disciplined
> so that RMW isn't needed any more.  It can reuse all the work of the
> zoned implementation, except with the ability to allocate a full raid
> stripe in any block group, not just the few that are opened for appending.
>
> This would introduce a new requirement for existing raid5 filesystems that
> several BGs are reserved for reclaim; however, this is not a particularly
> onerous requirement since several BGs have to be reserved for metadata
> expansion to avoid ENOSPC already,

Sorry, we don't reserved BGs at all for non-zoned devices.

In fact, zoned devices can do reserved zones because they have the unit
of zones.

If for non-zoned devices, how to calculate the space? Using the similar
code like chunk allocator?
How to handle added devices or uneven devices?

That hardware zones really solves a lot of problems for reserved space.


In short, I'm not against RST + cow parity, but not comfort at all for
the following things:

- Zoned GC mechanism
- No support for non-zoned devices at all
   We still need GC, as long as we go Parity COW, even we can overwrite.

Unless there are some prototypes that can pass the script I mentioned
above, I'm not in.

Thanks,
Qu

> and there's no automation for this
> in the filesystem.  Also raid5 filesystems are typically larger than
> average and can afford a few hundred spare GB.  btrfs-cleaner only has
> to be taught to not delete every single empty block group, but leave a
> few spares allocated for GC.
>
>> Thanks,
>> Qu
>>
>>> there shouldn't be much wasted space (in my case, I
>>> have a 12-disk RAID6, so each full stripe holds 640kb, and discounting
>>> single-sector writes that should go into metadata space, any
>>> reasonable write should fill that buffer in a few seconds).
>>>
>>> The additional suggestion of using smaller stripe widths in case there
>>> isn't enough data to fill a whole stripe would make it very easy to
>>> reclaim the wasted space by rebalancing with a stripe count filter,
>>> which can be easily automated and run very frequently.
>>>
>>> On-disk format also wouldn't change and be fully usable by older
>>> kernels, and it should "only" require changes on the allocator to
>>> implement.
>>>
>>> On Fri, Jul 15, 2022 at 2:58 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>>>
>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>>> [...]
>>>>>
>>>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>>>> then there's no reason to not give the user stupid answers.
>>>>>
>>>>
>>>> Qu is right, if we consider only full stripe write the "raid hole" problem
>>>> disappear, because if a "full stripe" is not fully written it is not
>>>> referenced either.
>>>>
>>>>
>>>> Personally I think that the ZFS variable stripe size, may be interesting
>>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>>> we can store different BG with different number of disks. Let me to make an
>>>> example: if we have 10 disks, we could allocate:
>>>> 1 BG RAID1
>>>> 1 BG RAID5, spread over 4 disks only
>>>> 1 BG RAID5, spread over 8 disks only
>>>> 1 BG RAID5, spread over 10 disks
>>>>
>>>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>>>> of the data.
>>>>
>>>> Yes this would require a sort of garbage collector to move the data to the biggest
>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>>>> variable stripe size.
>>>>
>>>> Doing so we don't need any disk format change and it would be backward compatible.
>>>>
>>>>
>>>> Moreover, if we could put the smaller BG in the faster disks, we could have a
>>>> decent tiering....
>>>>
>>>>
>>>>> If a user is concerned about the write or space amplicfication of sub-stripe
>>>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>>>
>>>>>
>>>>>
>>>>> [1]
>>>>> S. K. Mishra and P. Mohapatra,
>>>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
>>>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>>>
>>>> --
>>>> gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
>>>> Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
>>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 17:54                     ` Goffredo Baroncelli
  2022-07-15 19:08                       ` Thiago Ramon
  2022-07-15 20:14                       ` Chris Murphy
@ 2022-07-18  7:30                       ` Johannes Thumshirn
  2022-07-19 18:58                         ` Goffredo Baroncelli
  2 siblings, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-18  7:30 UTC (permalink / raw)
  To: kreijack, Qu Wenruo, Qu Wenruo, linux-btrfs

On 15.07.22 19:54, Goffredo Baroncelli wrote:
> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>> On 14.07.22 09:32, Qu Wenruo wrote:
>>> [...]
>>
>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>> then there's no reason to not give the user stupid answers.
>>
> 
> Qu is right, if we consider only full stripe write the "raid hole" problem
> disappear, because if a "full stripe" is not fully written it is not
> referenced either.

It's not that there wil lbe a new write hole, it's just that there is sub-optimal
space consumption until we can either re-write or garbage collect the blocks.

> 
> Personally I think that the ZFS variable stripe size, may be interesting

But then we would need extra meta-data to describe the size of each stripe.

> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
> we can store different BG with different number of disks. Let me to make an
> example: if we have 10 disks, we could allocate:
> 1 BG RAID1
> 1 BG RAID5, spread over 4 disks only
> 1 BG RAID5, spread over 8 disks only
> 1 BG RAID5, spread over 10 disks
> 
> So if we have short writes, we could put the extents in the RAID1 BG; for longer
> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
> of the data.
> 
> Yes this would require a sort of garbage collector to move the data to the biggest
> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
> variable stripe size.
> 
> Doing so we don't need any disk format change and it would be backward compatible.
> 
> 
> Moreover, if we could put the smaller BG in the faster disks, we could have a
> decent tiering....
> 
> 
>> If a user is concerned about the write or space amplicfication of sub-stripe
>> writes on RAID56 he/she really needs to rethink the architecture.
>>
>>
>>
>> [1]
>> S. K. Mishra and P. Mohapatra,
>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
> 


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 20:14                       ` Chris Murphy
@ 2022-07-18  7:33                         ` Johannes Thumshirn
  2022-07-18  8:03                           ` Qu Wenruo
  2022-07-18 21:49                         ` Forza
  1 sibling, 1 reply; 88+ messages in thread
From: Johannes Thumshirn @ 2022-07-18  7:33 UTC (permalink / raw)
  To: Chris Murphy, Goffredo Baroncelli; +Cc: Qu Wenruo, Qu Wenruo, linux-btrfs

On 15.07.22 22:15, Chris Murphy wrote:
> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>
>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>> [...]
>>>
>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>> then there's no reason to not give the user stupid answers.
>>>
>>
>> Qu is right, if we consider only full stripe write the "raid hole" problem
>> disappear, because if a "full stripe" is not fully written it is not
>> referenced either.
>>
>>
>> Personally I think that the ZFS variable stripe size, may be interesting
>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>> we can store different BG with different number of disks. Let me to make an
>> example: if we have 10 disks, we could allocate:
>> 1 BG RAID1
>> 1 BG RAID5, spread over 4 disks only
>> 1 BG RAID5, spread over 8 disks only
>> 1 BG RAID5, spread over 10 disks
>>
>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>> of the data.
>>
>> Yes this would require a sort of garbage collector to move the data to the biggest
>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>> variable stripe size.
>>
>> Doing so we don't need any disk format change and it would be backward compatible.
> 
> My 2 cents...
> 
> Regarding the current raid56 support, in order of preference:
> 
> a. Fix the current bugs, without changing format. Zygo has an extensive list.
> b. Mostly fix the write hole, also without changing the format, by
> only doing COW with full stripe writes. Yes you could somehow get
> corrupt parity still and not know it until degraded operation produces
> a bad reconstruction of data - but checksum will still catch that.
> This kind of "unreplicated corruption" is not quite the same thing as
> the write hole, because it isn't pernicious like the write hole.
> c. A new de-clustered parity raid56 implementation that is not
> backwards compatible.

c) is what I'm leaning to/working on, simply for the fact, that it is
the the only solution (I can think of at least) to make raid56 working
on zoned drives. And given that zoned drives tend to have a higher 
capacity than regular drives, they are appealing for raid arrays.
 
> Ergo, I think it's best to not break the format twice. Even if a new
> raid implementation is years off.

Agreed.

> Metadata centric workloads suck on parity raid anyway. If Btrfs always
> does full stripe COW won't matter even if the performance is worse
> because no one should use parity raid for this workload anyway.
> 

Yup.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-18  7:33                         ` Johannes Thumshirn
@ 2022-07-18  8:03                           ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-18  8:03 UTC (permalink / raw)
  To: Johannes Thumshirn, Chris Murphy, Goffredo Baroncelli
  Cc: Qu Wenruo, linux-btrfs



On 2022/7/18 15:33, Johannes Thumshirn wrote:
> On 15.07.22 22:15, Chris Murphy wrote:
>> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>>
>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>> [...]
>>>>
>>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>>> then there's no reason to not give the user stupid answers.
>>>>
>>>
>>> Qu is right, if we consider only full stripe write the "raid hole" problem
>>> disappear, because if a "full stripe" is not fully written it is not
>>> referenced either.
>>>
>>>
>>> Personally I think that the ZFS variable stripe size, may be interesting
>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>> we can store different BG with different number of disks. Let me to make an
>>> example: if we have 10 disks, we could allocate:
>>> 1 BG RAID1
>>> 1 BG RAID5, spread over 4 disks only
>>> 1 BG RAID5, spread over 8 disks only
>>> 1 BG RAID5, spread over 10 disks
>>>
>>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>>> of the data.
>>>
>>> Yes this would require a sort of garbage collector to move the data to the biggest
>>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>>> variable stripe size.
>>>
>>> Doing so we don't need any disk format change and it would be backward compatible.
>>
>> My 2 cents...
>>
>> Regarding the current raid56 support, in order of preference:
>>
>> a. Fix the current bugs, without changing format. Zygo has an extensive list.
>> b. Mostly fix the write hole, also without changing the format, by
>> only doing COW with full stripe writes. Yes you could somehow get
>> corrupt parity still and not know it until degraded operation produces
>> a bad reconstruction of data - but checksum will still catch that.
>> This kind of "unreplicated corruption" is not quite the same thing as
>> the write hole, because it isn't pernicious like the write hole.
>> c. A new de-clustered parity raid56 implementation that is not
>> backwards compatible.
>
> c) is what I'm leaning to/working on, simply for the fact, that it is
> the the only solution (I can think of at least) to make raid56 working
> on zoned drives. And given that zoned drives tend to have a higher
> capacity than regular drives, they are appealing for raid arrays.


That's what I can totally agree on.

RST is not an optional, but an essential thing to support RAID profiles
for data.

Thus I'm not against RST on zoned device at all, no matter if it's
RAID56 or not.

Thanks,
Qu

>
>> Ergo, I think it's best to not break the format twice. Even if a new
>> raid implementation is years off.
>
> Agreed.
>
>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>> does full stripe COW won't matter even if the performance is worse
>> because no one should use parity raid for this workload anyway.
>>
>
> Yup.

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-15 20:14                       ` Chris Murphy
  2022-07-18  7:33                         ` Johannes Thumshirn
@ 2022-07-18 21:49                         ` Forza
  2022-07-19  1:19                           ` Qu Wenruo
  1 sibling, 1 reply; 88+ messages in thread
From: Forza @ 2022-07-18 21:49 UTC (permalink / raw)
  To: Chris Murphy, Goffredo Baroncelli
  Cc: Johannes Thumshirn, Qu Wenruo, Qu Wenruo, linux-btrfs



---- From: Chris Murphy <lists@colorremedies.com> -- Sent: 2022-07-15 - 22:14 ----

> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>
>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>> > On 14.07.22 09:32, Qu Wenruo wrote:
>> >>[...]
>> >
>> > Again if you're doing sub-stripe size writes, you're asking stupid things and
>> > then there's no reason to not give the user stupid answers.
>> >
>>
>> Qu is right, if we consider only full stripe write the "raid hole" problem
>> disappear, because if a "full stripe" is not fully written it is not
>> referenced either.
>>
>>
>> Personally I think that the ZFS variable stripe size, may be interesting
>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>> we can store different BG with different number of disks. 

We can create new types of BGs too. For example parity BGs. 

>>Let me to make an
>> example: if we have 10 disks, we could allocate:
>> 1 BG RAID1
>> 1 BG RAID5, spread over 4 disks only
>> 1 BG RAID5, spread over 8 disks only
>> 1 BG RAID5, spread over 10 disks
>>
>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>> of the data.
>>
>> Yes this would require a sort of garbage collector to move the data to the biggest
>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>> variable stripe size.
>>
>> Doing so we don't need any disk format change and it would be backward compatible.

Do we need to implement RAID56 in the traditional sense? As the user/sysadmin I care about redundancy and performance and cost. The option to create redundancy for any 'n drives is appealing from a cost perspective, otherwise I'd use RAID1/10.

Since the current RAID56 mode have several important drawbacks - and that it's officially not recommended for production use - it is a good idea to reconstruct new btrfs 'redundant-n' profiles that doesn't have the inherent issues of traditional RAID. For example a non-striped redundant-n profile as well as a striped redundant-n profile. 

> 
> My 2 cents...
> 
> Regarding the current raid56 support, in order of preference:
> 
> a. Fix the current bugs, without changing format. Zygo has an extensive list.

I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here? 

> b. Mostly fix the write hole, also without changing the format, by
> only doing COW with full stripe writes. Yes you could somehow get
> corrupt parity still and not know it until degraded operation produces
> a bad reconstruction of data - but checksum will still catch that.
> This kind of "unreplicated corruption" is not quite the same thing as
> the write hole, because it isn't pernicious like the write hole.

What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode. 

> c. A new de-clustered parity raid56 implementation that is not
> backwards compatible.

Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea. 

Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found? 


> 
> Ergo, I think it's best to not break the format twice. Even if a new
> raid implementation is years off.

I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help. 

I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise? 

Thanks
Forza

> 
> Metadata centric workloads suck on parity raid anyway. If Btrfs always
> does full stripe COW won't matter even if the performance is worse
> because no one should use parity raid for this workload anyway.
> 
> 
> --
> Chris Murphy



^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-18 21:49                         ` Forza
@ 2022-07-19  1:19                           ` Qu Wenruo
  2022-07-21 14:51                             ` Forza
  2022-07-25  0:00                             ` Zygo Blaxell
  0 siblings, 2 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-19  1:19 UTC (permalink / raw)
  To: Forza, Chris Murphy, Goffredo Baroncelli
  Cc: Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/19 05:49, Forza wrote:
>
>
> ---- From: Chris Murphy <lists@colorremedies.com> -- Sent: 2022-07-15 - 22:14 ----
>
>> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli <kreijack@libero.it> wrote:
>>>
>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>> [...]
>>>>
>>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>>> then there's no reason to not give the user stupid answers.
>>>>
>>>
>>> Qu is right, if we consider only full stripe write the "raid hole" problem
>>> disappear, because if a "full stripe" is not fully written it is not
>>> referenced either.
>>>
>>>
>>> Personally I think that the ZFS variable stripe size, may be interesting
>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>> we can store different BG with different number of disks.
>
> We can create new types of BGs too. For example parity BGs.
>
>>> Let me to make an
>>> example: if we have 10 disks, we could allocate:
>>> 1 BG RAID1
>>> 1 BG RAID5, spread over 4 disks only
>>> 1 BG RAID5, spread over 8 disks only
>>> 1 BG RAID5, spread over 10 disks
>>>
>>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>>> of the data.
>>>
>>> Yes this would require a sort of garbage collector to move the data to the biggest
>>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>>> variable stripe size.
>>>
>>> Doing so we don't need any disk format change and it would be backward compatible.
>
> Do we need to implement RAID56 in the traditional sense? As the user/sysadmin I care about redundancy and performance and cost. The option to create redundancy for any 'n drives is appealing from a cost perspective, otherwise I'd use RAID1/10.

Have you heard any recent problems related to dm-raid56?

If your answer is no, then I guess we already have an  answer to your
question.

>
> Since the current RAID56 mode have several important drawbacks

Let me to be clear:

If you can ensure you didn't hit power loss, or after a power loss do a
scrub immediately before any new write, then current RAID56 is fine, at
least not obviously worse than dm-raid56.

(There are still common problems shared between both btrfs raid56 and
dm-raid56, like destructive-RMW)

> - and that it's officially not recommended for production use - it is a good idea to reconstruct new btrfs 'redundant-n' profiles that doesn't have the inherent issues of traditional RAID.

I'd say the complexity is hugely underestimated.

> For example a non-striped redundant-n profile as well as a striped redundant-n profile.

Non-striped redundant-n profile is already so complex that I can't
figure out a working idea right now.

But if there is such way, I'm pretty happy to consider.

>
>>
>> My 2 cents...
>>
>> Regarding the current raid56 support, in order of preference:
>>
>> a. Fix the current bugs, without changing format. Zygo has an extensive list.
>
> I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?

Nope. Just see my write-intent code, already have prototype (just needs
new scrub based recovery code at mount time) working.

And based on my write-intent code, I don't think it's that hard to
implement a full journal.

Thanks,
Qu

>
>> b. Mostly fix the write hole, also without changing the format, by
>> only doing COW with full stripe writes. Yes you could somehow get
>> corrupt parity still and not know it until degraded operation produces
>> a bad reconstruction of data - but checksum will still catch that.
>> This kind of "unreplicated corruption" is not quite the same thing as
>> the write hole, because it isn't pernicious like the write hole.
>
> What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
>
>> c. A new de-clustered parity raid56 implementation that is not
>> backwards compatible.
>
> Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
>
> Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
>
>
>>
>> Ergo, I think it's best to not break the format twice. Even if a new
>> raid implementation is years off.
>
> I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
>
> I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
>
> Thanks
> Forza
>
>>
>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>> does full stripe COW won't matter even if the performance is worse
>> because no one should use parity raid for this workload anyway.
>>
>>
>> --
>> Chris Murphy
>
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-18  7:30                       ` Johannes Thumshirn
@ 2022-07-19 18:58                         ` Goffredo Baroncelli
  0 siblings, 0 replies; 88+ messages in thread
From: Goffredo Baroncelli @ 2022-07-19 18:58 UTC (permalink / raw)
  To: Johannes Thumshirn, Qu Wenruo, Qu Wenruo, linux-btrfs

On 18/07/2022 09.30, Johannes Thumshirn wrote:
> On 15.07.22 19:54, Goffredo Baroncelli wrote:
>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>> [...]
>>>
>>> Again if you're doing sub-stripe size writes, you're asking stupid things and
>>> then there's no reason to not give the user stupid answers.
>>>
>>
>> Qu is right, if we consider only full stripe write the "raid hole" problem
>> disappear, because if a "full stripe" is not fully written it is not
>> referenced either.
> 
> It's not that there wil lbe a new write hole, it's just that there is sub-optimal
> space consumption until we can either re-write or garbage collect the blocks.

May be that I was not very clear. Let me to repeat: if we assume that we
can write only full stripe (padding with 0, if smaller), we don't have the
write-hole problem at all, so we can also avoid using RST.

  
>>
>> Personally I think that the ZFS variable stripe size, may be interesting
> 
> But then we would need extra meta-data to describe the size of each stripe.

It is not needed. The stripe allocation is per extent. The layout of the stripe
depends only by the start of the extent and its length. Assuming that you
have n disks and a raid5 layout, you already know that each (n-1) data blocks,
there is a parity block in the extent.

The relation between the length of the extent and the real data stored is

extent-length = data-length + 4k*(data-length / 4k / (n-1))
extent-length += 4k if (data-length  %(4k * (n-1))) > 0

extent-length = size of the extent (which contains the parity block)
data-length = the real length of consecutive data
n = number of disk

Below some examples that show better my idea:

Assuming the following logical address

Disk1	0    12k  24k
Disk2	4k   16k  28k
Disk3	8k   20k  ....


first write: data size = 1 block

Disk1	D1 ...
Disk2	P1 ...
Disk3	...

Extent = (0, 8K),


second write; data size = 3 block ( D2, D2*, D3 )

Disk1	D1 D2* P3 ...
Disk2	P1 P2  ...
Disk3	D2 D3  ...

Extent = (8k, 20K)


Write a bigger data, and shape the stripe taller:

3rd write; data size 32k: (D6, D6*, ... D9*

Disk1	D1 D2* P3 P68 P6*8* P79 P7*9* ...
Disk2	P1 P2  D6 D6* D7    D7* ...
Disk3	D2 D3  D8 D8* D9    D9* ...

Extent = (28k, 48k)

The major drawbacks are:
- you can break the extent only at stripe boundary (up to 64K * n)
- the scrub is a bit more complex, because it involves some math around
   the extents start/length
- you need to have an extent that describe the stripe. I don't know if this
   requirements is fulfill by metadata

The schema above, has an huge simplification if we allow BTRFS to use BG with
dedicated stripe size. Moreover this would reduce the fragmentation, even tough it
requires a GC.

> 
>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>> we can store different BG with different number of disks. Let me to make an
>> example: if we have 10 disks, we could allocate:
>> 1 BG RAID1
>> 1 BG RAID5, spread over 4 disks only
>> 1 BG RAID5, spread over 8 disks only
>> 1 BG RAID5, spread over 10 disks
>>
>> So if we have short writes, we could put the extents in the RAID1 BG; for longer
>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by length
>> of the data.
>>
>> Yes this would require a sort of garbage collector to move the data to the biggest
>> raid5 BG, but this would avoid (or reduce) the fragmentation which affect the
>> variable stripe size.
>>
>> Doing so we don't need any disk format change and it would be backward compatible.
>>
>>
>> Moreover, if we could put the smaller BG in the faster disks, we could have a
>> decent tiering....
>>
>>
>>> If a user is concerned about the write or space amplicfication of sub-stripe
>>> writes on RAID56 he/she really needs to rethink the architecture.
>>>
>>>
>>>
>>> [1]
>>> S. K. Mishra and P. Mohapatra,
>>> "Performance study of RAID-5 disk arrays with data and parity cache,"
>>> Proceedings of the 1996 ICPP Workshop on Challenges for Parallel Processing,
>>> 1996, pp. 222-229 vol.1, doi: 10.1109/ICPP.1996.537164.
>>
> 

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-19  1:19                           ` Qu Wenruo
@ 2022-07-21 14:51                             ` Forza
  2022-07-24 11:27                               ` Qu Wenruo
  2022-07-25  0:00                             ` Zygo Blaxell
  1 sibling, 1 reply; 88+ messages in thread
From: Forza @ 2022-07-21 14:51 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs



On 2022-07-19 03:19, Qu Wenruo wrote:
> 
> 
> On 2022/7/19 05:49, Forza wrote:
>>
>>
>> ---- From: Chris Murphy <lists@colorremedies.com> -- Sent: 2022-07-15 
>> - 22:14 ----
>>
>>> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli 
>>> <kreijack@libero.it> wrote:
>>>>
>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>>> [...]
>>>>>
>>>>> Again if you're doing sub-stripe size writes, you're asking stupid 
>>>>> things and
>>>>> then there's no reason to not give the user stupid answers.
>>>>>
>>>>
>>>> Qu is right, if we consider only full stripe write the "raid hole" 
>>>> problem
>>>> disappear, because if a "full stripe" is not fully written it is not
>>>> referenced either.
>>>>
>>>>
>>>> Personally I think that the ZFS variable stripe size, may be 
>>>> interesting
>>>> to evaluate. Moreover, because the BTRFS disk format is quite flexible,
>>>> we can store different BG with different number of disks.
>>
>> We can create new types of BGs too. For example parity BGs.
>>
>>>> Let me to make an
>>>> example: if we have 10 disks, we could allocate:
>>>> 1 BG RAID1
>>>> 1 BG RAID5, spread over 4 disks only
>>>> 1 BG RAID5, spread over 8 disks only
>>>> 1 BG RAID5, spread over 10 disks
>>>>
>>>> So if we have short writes, we could put the extents in the RAID1 
>>>> BG; for longer
>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by 
>>>> length
>>>> of the data.
>>>>
>>>> Yes this would require a sort of garbage collector to move the data 
>>>> to the biggest
>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which 
>>>> affect the
>>>> variable stripe size.
>>>>
>>>> Doing so we don't need any disk format change and it would be 
>>>> backward compatible.
>>
>> Do we need to implement RAID56 in the traditional sense? As the 
>> user/sysadmin I care about redundancy and performance and cost. The 
>> option to create redundancy for any 'n drives is appealing from a cost 
>> perspective, otherwise I'd use RAID1/10.
> 
> Have you heard any recent problems related to dm-raid56?

No..?

> 
> If your answer is no, then I guess we already have an  answer to your
> question.
> 
>>
>> Since the current RAID56 mode have several important drawbacks
> 
> Let me to be clear:
> 
> If you can ensure you didn't hit power loss, or after a power loss do a
> scrub immediately before any new write, then current RAID56 is fine, at
> least not obviously worse than dm-raid56.
> 
> (There are still common problems shared between both btrfs raid56 and
> dm-raid56, like destructive-RMW)
> 
>> - and that it's officially not recommended for production use - it is 
>> a good idea to reconstruct new btrfs 'redundant-n' profiles that 
>> doesn't have the inherent issues of traditional RAID.
> 
> I'd say the complexity is hugely underestimated.

You are probably right. But is it solvable, and is there a vision of 
'something better' than traditional RAID56?

> 
>> For example a non-striped redundant-n profile as well as a striped 
>> redundant-n profile.
> 
> Non-striped redundant-n profile is already so complex that I can't
> figure out a working idea right now.
> 
> But if there is such way, I'm pretty happy to consider.

Can we borrow ideas from the PAR2/PAR3 format?

For each extent, create 'par' redundancy metadata that allows for n-% or 
n-copies of recovery, and that this metadata is also split on different 
disks to allow for n total drive-failures? Maybe parity data can be 
stored in parity BGs, in metadata itself or in special type of extents 
inside data BGs.

> 
>>
>>>
>>> My 2 cents...
>>>
>>> Regarding the current raid56 support, in order of preference:
>>>
>>> a. Fix the current bugs, without changing format. Zygo has an 
>>> extensive list.
>>
>> I agree that relatively simple fixes should be made. But it seems we 
>> will need quite a large rewrite to solve all issues? Is there a minium 
>> viable option here?
> 
> Nope. Just see my write-intent code, already have prototype (just needs
> new scrub based recovery code at mount time) working.
> 
> And based on my write-intent code, I don't think it's that hard to
> implement a full journal.
> 

This is good news. Do you see any other major issues that would need 
fixing before RADI56 can be considered production-ready?


> Thanks,
> Qu
> 
>>
>>> b. Mostly fix the write hole, also without changing the format, by
>>> only doing COW with full stripe writes. Yes you could somehow get
>>> corrupt parity still and not know it until degraded operation produces
>>> a bad reconstruction of data - but checksum will still catch that.
>>> This kind of "unreplicated corruption" is not quite the same thing as
>>> the write hole, because it isn't pernicious like the write hole.
>>
>> What is the difference to a)? Is write hole the worst issue? Judging 
>> from the #brtfs channel discussions there seems to be other quite 
>> severe issues, for example real data corruption risks in degraded mode.
>>
>>> c. A new de-clustered parity raid56 implementation that is not
>>> backwards compatible.
>>
>> Yes. We have a good opportunity to work out something much better than 
>> current implementations. We could have  redundant-n profiles that also 
>> works with tired storage like ssd/nvme similar to the metadata on ssd 
>> idea.
>>
>> Variable stripe width has been brought up before, but received cool 
>> responses. Why is that? IMO it could improve random 4k ios by doing 
>> equivalent to RAID1 instead of RMW, while also closing the write hole. 
>> Perhaps there is a middle ground to be found?
>>
>>
>>>
>>> Ergo, I think it's best to not break the format twice. Even if a new
>>> raid implementation is years off.
>>
>> I very agree here. Btrfs already suffers in public opinion from the 
>> lack of a stable and safe-for-data RAID56, and requiring several 
>> non-compatible chances isn't going to help.
>>
>> I also think it's important that the 'temporary' changes actually 
>> leads to a stable filesystem. Because what is the point otherwise?
>>
>> Thanks
>> Forza
>>
>>>
>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>> does full stripe COW won't matter even if the performance is worse
>>> because no one should use parity raid for this workload anyway.
>>>
>>>
>>> -- 
>>> Chris Murphy
>>
>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-21 14:51                             ` Forza
@ 2022-07-24 11:27                               ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-24 11:27 UTC (permalink / raw)
  To: Forza, Qu Wenruo, linux-btrfs



On 2022/7/21 22:51, Forza wrote:
>
>
> On 2022-07-19 03:19, Qu Wenruo wrote:
>>
>>
>> On 2022/7/19 05:49, Forza wrote:
>>>
>>>
>>> ---- From: Chris Murphy <lists@colorremedies.com> -- Sent: 2022-07-15
>>> - 22:14 ----
>>>
>>>> On Fri, Jul 15, 2022 at 1:55 PM Goffredo Baroncelli
>>>> <kreijack@libero.it> wrote:
>>>>>
>>>>> On 14/07/2022 09.46, Johannes Thumshirn wrote:
>>>>>> On 14.07.22 09:32, Qu Wenruo wrote:
>>>>>>> [...]
>>>>>>
>>>>>> Again if you're doing sub-stripe size writes, you're asking stupid
>>>>>> things and
>>>>>> then there's no reason to not give the user stupid answers.
>>>>>>
>>>>>
>>>>> Qu is right, if we consider only full stripe write the "raid hole"
>>>>> problem
>>>>> disappear, because if a "full stripe" is not fully written it is not
>>>>> referenced either.
>>>>>
>>>>>
>>>>> Personally I think that the ZFS variable stripe size, may be
>>>>> interesting
>>>>> to evaluate. Moreover, because the BTRFS disk format is quite
>>>>> flexible,
>>>>> we can store different BG with different number of disks.
>>>
>>> We can create new types of BGs too. For example parity BGs.
>>>
>>>>> Let me to make an
>>>>> example: if we have 10 disks, we could allocate:
>>>>> 1 BG RAID1
>>>>> 1 BG RAID5, spread over 4 disks only
>>>>> 1 BG RAID5, spread over 8 disks only
>>>>> 1 BG RAID5, spread over 10 disks
>>>>>
>>>>> So if we have short writes, we could put the extents in the RAID1
>>>>> BG; for longer
>>>>> writes we could use a RAID5 BG with 4 or 8 or 10 disks depending by
>>>>> length
>>>>> of the data.
>>>>>
>>>>> Yes this would require a sort of garbage collector to move the data
>>>>> to the biggest
>>>>> raid5 BG, but this would avoid (or reduce) the fragmentation which
>>>>> affect the
>>>>> variable stripe size.
>>>>>
>>>>> Doing so we don't need any disk format change and it would be
>>>>> backward compatible.
>>>
>>> Do we need to implement RAID56 in the traditional sense? As the
>>> user/sysadmin I care about redundancy and performance and cost. The
>>> option to create redundancy for any 'n drives is appealing from a
>>> cost perspective, otherwise I'd use RAID1/10.
>>
>> Have you heard any recent problems related to dm-raid56?
>
> No..?

Then, I'd say their write-intent + journal (PPL for RAID5, full journal
for RAID6) is a tried and true solution.

I see no reason not to follow.

>
>>
>> If your answer is no, then I guess we already have an  answer to your
>> question.
>>
>>>
>>> Since the current RAID56 mode have several important drawbacks
>>
>> Let me to be clear:
>>
>> If you can ensure you didn't hit power loss, or after a power loss do a
>> scrub immediately before any new write, then current RAID56 is fine, at
>> least not obviously worse than dm-raid56.
>>
>> (There are still common problems shared between both btrfs raid56 and
>> dm-raid56, like destructive-RMW)
>>
>>> - and that it's officially not recommended for production use - it is
>>> a good idea to reconstruct new btrfs 'redundant-n' profiles that
>>> doesn't have the inherent issues of traditional RAID.
>>
>> I'd say the complexity is hugely underestimated.
>
> You are probably right. But is it solvable, and is there a vision of
> 'something better' than traditional RAID56?

I'd say, maybe.

I prefer some encode at file extent level (like compression) to provide
extra data recovery, other than relying on stripe based RAID56.

The problem is, normally such encoding is to correct data corruption for
a small percentage, but for regular RAID1/10 or even small number of
disks RAID56, the percentage is not small.

(missing 1 disk in 3 disks RAID5, we're in fact recovery 50% of our data)

If we can find a good encode (probably used after compression), I'm 100%
fine to use that encoding, other than traditional RAID56.

>
>>
>>> For example a non-striped redundant-n profile as well as a striped
>>> redundant-n profile.
>>
>> Non-striped redundant-n profile is already so complex that I can't
>> figure out a working idea right now.
>>
>> But if there is such way, I'm pretty happy to consider.
>
> Can we borrow ideas from the PAR2/PAR3 format?
>
> For each extent, create 'par' redundancy metadata that allows for n-% or
> n-copies of recovery, and that this metadata is also split on different
> disks to allow for n total drive-failures? Maybe parity data can be
> stored in parity BGs, in metadata itself or in special type of extents
> inside data BGs.

The problem is still there, if there is anything representing a stripe,
and any calculate extra info based on stripes, then we can still hit the
write-hole problem.

If we do sub-stripe write, we have to update the checksum or whatever,
which can be out-of-sync during power loss.


If you mean an extra tree to store all these extra checksum/info (aka,
no longer need the stripe unit at all), then I guess it may be possible.

Like we use a special csum algorithm which may take way larger space
than our current 32 bytes per 4K, then I guess we may be able to get
extra redundancy.

There will be some problems like different metadata/data csum (metadata
csum is limited to 32bytes as it's inlined), and way larger metadata
usage for csum.

But those should be more or less solvable.

>
>>
>>>
>>>>
>>>> My 2 cents...
>>>>
>>>> Regarding the current raid56 support, in order of preference:
>>>>
>>>> a. Fix the current bugs, without changing format. Zygo has an
>>>> extensive list.
>>>
>>> I agree that relatively simple fixes should be made. But it seems we
>>> will need quite a large rewrite to solve all issues? Is there a
>>> minium viable option here?
>>
>> Nope. Just see my write-intent code, already have prototype (just needs
>> new scrub based recovery code at mount time) working.
>>
>> And based on my write-intent code, I don't think it's that hard to
>> implement a full journal.
>>
>
> This is good news. Do you see any other major issues that would need
> fixing before RADI56 can be considered production-ready?

Currently I have only finished write-intent bitmaps, which requires
after power loss, all devices are still available and data not touched
is still correct.

For powerloss + missing device, I have to go full journal, but the code
should be pretty similar thus I'm not that concerned.


The biggest problem remaining is, write-intent bitmap/full journal
requires regular devices, no support for zoned devices at all.

Thus zoned guys are not a big fan of this solution.

Thanks,
Qu

>
>
>> Thanks,
>> Qu
>>
>>>
>>>> b. Mostly fix the write hole, also without changing the format, by
>>>> only doing COW with full stripe writes. Yes you could somehow get
>>>> corrupt parity still and not know it until degraded operation produces
>>>> a bad reconstruction of data - but checksum will still catch that.
>>>> This kind of "unreplicated corruption" is not quite the same thing as
>>>> the write hole, because it isn't pernicious like the write hole.
>>>
>>> What is the difference to a)? Is write hole the worst issue? Judging
>>> from the #brtfs channel discussions there seems to be other quite
>>> severe issues, for example real data corruption risks in degraded mode.
>>>
>>>> c. A new de-clustered parity raid56 implementation that is not
>>>> backwards compatible.
>>>
>>> Yes. We have a good opportunity to work out something much better
>>> than current implementations. We could have  redundant-n profiles
>>> that also works with tired storage like ssd/nvme similar to the
>>> metadata on ssd idea.
>>>
>>> Variable stripe width has been brought up before, but received cool
>>> responses. Why is that? IMO it could improve random 4k ios by doing
>>> equivalent to RAID1 instead of RMW, while also closing the write
>>> hole. Perhaps there is a middle ground to be found?
>>>
>>>
>>>>
>>>> Ergo, I think it's best to not break the format twice. Even if a new
>>>> raid implementation is years off.
>>>
>>> I very agree here. Btrfs already suffers in public opinion from the
>>> lack of a stable and safe-for-data RAID56, and requiring several
>>> non-compatible chances isn't going to help.
>>>
>>> I also think it's important that the 'temporary' changes actually
>>> leads to a stable filesystem. Because what is the point otherwise?
>>>
>>> Thanks
>>> Forza
>>>
>>>>
>>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>>> does full stripe COW won't matter even if the performance is worse
>>>> because no one should use parity raid for this workload anyway.
>>>>
>>>>
>>>> --
>>>> Chris Murphy
>>>
>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-19  1:19                           ` Qu Wenruo
  2022-07-21 14:51                             ` Forza
@ 2022-07-25  0:00                             ` Zygo Blaxell
  2022-07-25  0:25                               ` Qu Wenruo
  2022-07-25 19:58                               ` Goffredo Baroncelli
  1 sibling, 2 replies; 88+ messages in thread
From: Zygo Blaxell @ 2022-07-25  0:00 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Forza, Chris Murphy, Goffredo Baroncelli, Johannes Thumshirn,
	Qu Wenruo, linux-btrfs

On Tue, Jul 19, 2022 at 09:19:21AM +0800, Qu Wenruo wrote:
> > > > Doing so we don't need any disk format change and it would be backward compatible.
> > 
> > Do we need to implement RAID56 in the traditional sense? As the
> user/sysadmin I care about redundancy and performance and cost. The
> option to create redundancy for any 'n drives is appealing from a cost
> perspective, otherwise I'd use RAID1/10.
> 
> Have you heard any recent problems related to dm-raid56?
> 
> If your answer is no, then I guess we already have an  answer to your
> question.

With plain dm-raid56 the problems were there since the beginning, so
they're not recent.  If there's a way to configure PPL or a journal
device with raid5 LVs on LVM, I can't find it.  AFAIK nobody who knows
what they're doing would choose dm-raid56 for high-value data, especially
when alternatives like ZFS exist.

Before btrfs, we had a single-digit-percentage rate of severe data losses
(more than 90% data lost) on filesystems and databases using mdadm +
ext3/4 with no journal in degraded mode.  Multiply by per-drive AFR
and that's a lot of full system rebuilds over the years.

> > Since the current RAID56 mode have several important drawbacks
> 
> Let me to be clear:
> 
> If you can ensure you didn't hit power loss, or after a power loss do a
> scrub immediately before any new write, then current RAID56 is fine, at
> least not obviously worse than dm-raid56.

I'm told that scrub doesn't repair parity errors on btrfs.  That was a
thing I got wrong in my raid5 bug list from 2020.  Scrub will fix data
blocks if they have csum errors, but it will not detect or correct
corruption in the parity blocks themselves.  AFAICT the only way to
get the parity blocks rewritten is to run something like balance,
which carries risks of its own due to the sheer volume of IO from
data and metadata updates.

Most of the raid56 bugs I've identified have nothing to do with power
loss.  The data on disks is fine, but the kernel can't read it correctly
in degraded mode, or the diagnostic data from scrub are clearly garbage.

I noticed you and others have done some work here recently, so some of
these issues might be fixed in 5.19.  I haven't re-run my raid5 tests
on post-5.18 kernels yet (there have been other bugs blocking testing).

> (There are still common problems shared between both btrfs raid56 and
> dm-raid56, like destructive-RMW)

Yeah, that's one of the critical things to fix because btrfs is in a good
position to do as well or better than dm-raid56.  btrfs has definitely
fallen behind the other available solutions in the 9 years since raid5 was
first added to btrfs, as btrfs implements only the basic configuration
of raid56 (no parity integrity or rmw journal) that is fully vulnerable
to write hole and drive-side data corruption.

> > - and that it's officially not recommended for production use - it
> is a good idea to reconstruct new btrfs 'redundant-n' profiles that
> doesn't have the inherent issues of traditional RAID.
> 
> I'd say the complexity is hugely underestimated.

I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
blocks inline with extents during writes) is not much more complex to
implement on btrfs than compression; however, the btrfs kernel code
couldn't read compressed data correctly for 12 years out of its 14-year
history, and nobody wants to wait another decade or more for raid5
to work.

It seems to me the biggest problem with write hole fixes is that all
the potential fixes have cost tradeoffs, and everybody wants to veto
the fix that has a cost they don't like.

We could implement multiple fix approaches at the same time, as AFAIK
most of the proposed solutions are orthogonal to each other.  e.g. a
write-ahead log can safely enable RMW at a higher IO cost, while the
allocator could place extents to avoid RMW and thereby avoid the logging
cost as much as possible (paid for by a deferred relocation/garbage
collection cost), and using both at the same time would combine both
benefits.  Both solutions can be used independently for filesystems at
extreme ends of the performance/capacity spectrum (if the filesystem is
never more than 50% full, then logging is all cost with no gain compared
to allocator avoidance of RMW, while a filesystem that is always near
full will have to journal writes and also throttle writes on the journal.

> > For example a non-striped redundant-n profile as well as a striped redundant-n profile.
> 
> Non-striped redundant-n profile is already so complex that I can't
> figure out a working idea right now.
> 
> But if there is such way, I'm pretty happy to consider.
> 
> > 
> > > 
> > > My 2 cents...
> > > 
> > > Regarding the current raid56 support, in order of preference:
> > > 
> > > a. Fix the current bugs, without changing format. Zygo has an extensive list.
> > 
> > I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?
> 
> Nope. Just see my write-intent code, already have prototype (just needs
> new scrub based recovery code at mount time) working.
> 
> And based on my write-intent code, I don't think it's that hard to
> implement a full journal.

FWIW I think we can get a very usable btrfs raid5 with a small format
change (add a journal for stripe RMW, though we might disagree about
details of how it should be structured and used) and fixes to the
read-repair and scrub problems.  The read-side problems in btrfs raid5
were always much more severe than the write hole.  As soon as a disk
goes offline, the read-repair code is unable to read all the surviving
data correctly, and the filesystem has to be kept inactive or data on
the disks will be gradually corrupted as bad parity gets mixed with data
and written back to the filesystem.

A few of the problems will require a deeper redesign, but IMHO they're not
important problems.  e.g. scrub can't identify which drive is corrupted
in all cases, because it has no csum on parity blocks.  The current
on-disk format needs every data block in the raid5 stripe to be occupied
by a file with a csum so scrub can eliminate every other block as the
possible source of mismatched parity.  While this could be fixed by
a future new raid5 profile (and/or csum tree) specifically designed
to avoid this, it's not something I'd insist on having before deploying
a fleet of btrfs raid5 boxes.  Silent corruption failures are so
rare on spinning disks that I'd use the feature maybe once a decade.
Silent corruption due to a failing or overheating HBA chip will most
likely affect multiple disks at once and trash the whole filesystem,
so individual drive-level corruption reporting isn't helpful.

> Thanks,
> Qu
> 
> > 
> > > b. Mostly fix the write hole, also without changing the format, by
> > > only doing COW with full stripe writes. Yes you could somehow get
> > > corrupt parity still and not know it until degraded operation produces
> > > a bad reconstruction of data - but checksum will still catch that.
> > > This kind of "unreplicated corruption" is not quite the same thing as
> > > the write hole, because it isn't pernicious like the write hole.
> > 
> > What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
> > 
> > > c. A new de-clustered parity raid56 implementation that is not
> > > backwards compatible.
> > 
> > Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
> > 
> > Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
> > 
> > 
> > > 
> > > Ergo, I think it's best to not break the format twice. Even if a new
> > > raid implementation is years off.
> > 
> > I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
> > 
> > I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
> > 
> > Thanks
> > Forza
> > 
> > > 
> > > Metadata centric workloads suck on parity raid anyway. If Btrfs always
> > > does full stripe COW won't matter even if the performance is worse
> > > because no one should use parity raid for this workload anyway.
> > > 
> > > 
> > > --
> > > Chris Murphy
> > 
> > 

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-25  0:00                             ` Zygo Blaxell
@ 2022-07-25  0:25                               ` Qu Wenruo
  2022-07-25  5:41                                 ` Zygo Blaxell
  2022-07-25 19:58                               ` Goffredo Baroncelli
  1 sibling, 1 reply; 88+ messages in thread
From: Qu Wenruo @ 2022-07-25  0:25 UTC (permalink / raw)
  To: Zygo Blaxell
  Cc: Forza, Chris Murphy, Goffredo Baroncelli, Johannes Thumshirn,
	Qu Wenruo, linux-btrfs



On 2022/7/25 08:00, Zygo Blaxell wrote:
> On Tue, Jul 19, 2022 at 09:19:21AM +0800, Qu Wenruo wrote:
>>>>> Doing so we don't need any disk format change and it would be backward compatible.
>>>
>>> Do we need to implement RAID56 in the traditional sense? As the
>> user/sysadmin I care about redundancy and performance and cost. The
>> option to create redundancy for any 'n drives is appealing from a cost
>> perspective, otherwise I'd use RAID1/10.
>>
>> Have you heard any recent problems related to dm-raid56?
>>
>> If your answer is no, then I guess we already have an  answer to your
>> question.
>
> With plain dm-raid56 the problems were there since the beginning, so
> they're not recent.

Are you talking about mdraid? They go internal write-intent bitmap and
PPL by default.

>  If there's a way to configure PPL or a journal
> device with raid5 LVs on LVM, I can't find it.

LVM is another story.

>  AFAIK nobody who knows
> what they're doing would choose dm-raid56 for high-value data, especially
> when alternatives like ZFS exist.

Isn't it the opposite? mdraid is what most people go, other than LVM raid.

>
> Before btrfs, we had a single-digit-percentage rate of severe data losses
> (more than 90% data lost) on filesystems and databases using mdadm +
> ext3/4 with no journal in degraded mode.  Multiply by per-drive AFR
> and that's a lot of full system rebuilds over the years.
>
>>> Since the current RAID56 mode have several important drawbacks
>>
>> Let me to be clear:
>>
>> If you can ensure you didn't hit power loss, or after a power loss do a
>> scrub immediately before any new write, then current RAID56 is fine, at
>> least not obviously worse than dm-raid56.
>
> I'm told that scrub doesn't repair parity errors on btrfs.

That's totally untrue.

You can easily verify that using "btrfs check --check-data-csum", as
recent btrfs-progs has the extra code to verify the rebuilt data using
parity.

In fact, I'm testing my write-intent bitmaps code with manually
corrupted parity to emulate a power loss after write-intent bitmaps update.

And I must say, the scrub code works as expected.



The myth may come from some bad advice on only scrubbing a single device
for RAID56 to avoid duplicated IO.

But the truth is, if only scrubbing one single device, for data stripes
on that device, if no csum error detected, scrub won't check the parity
or the other data stripes in the same vertical stripe.

On the other hand, if scrub is checking the parity stripe, it will also
check the csum for the data stripes in the same vertical stripe, and
rewrite the parity if needed.

>  That was a
> thing I got wrong in my raid5 bug list from 2020.  Scrub will fix data
> blocks if they have csum errors, but it will not detect or correct
> corruption in the parity blocks themselves.

That's exactly what I mentioned, the user is trying to be a smartass
without knowing the details.

Although I think we should enhance the man page to discourage the usage
of single device scrub.

By default, we scrub all devices (using mount point).

>  AFAICT the only way to
> get the parity blocks rewritten is to run something like balance,
> which carries risks of its own due to the sheer volume of IO from
> data and metadata updates.

Completely incorrect.

>
> Most of the raid56 bugs I've identified have nothing to do with power
> loss.  The data on disks is fine, but the kernel can't read it correctly
> in degraded mode, or the diagnostic data from scrub are clearly garbage.

Unable to read in degraded mode just means parity is out-of-sync with data.

There are several other bugs related to this, mostly related to the
cached raid bio and how we rebuild the data. (aka, btrfs/125)
Thankfully I have submitted patches for that bug and now btrfs/125
should pass without problems.

But the powerloss can still lead to out-of-sync parity and that's why
I'm fixing the problem using write-intent-bitmaps.

>
> I noticed you and others have done some work here recently, so some of
> these issues might be fixed in 5.19.  I haven't re-run my raid5 tests
> on post-5.18 kernels yet (there have been other bugs blocking testing).
>
>> (There are still common problems shared between both btrfs raid56 and
>> dm-raid56, like destructive-RMW)
>
> Yeah, that's one of the critical things to fix because btrfs is in a good
> position to do as well or better than dm-raid56.  btrfs has definitely
> fallen behind the other available solutions in the 9 years since raid5 was
> first added to btrfs, as btrfs implements only the basic configuration
> of raid56 (no parity integrity or rmw journal) that is fully vulnerable
> to write hole and drive-side data corruption.
>
>>> - and that it's officially not recommended for production use - it
>> is a good idea to reconstruct new btrfs 'redundant-n' profiles that
>> doesn't have the inherent issues of traditional RAID.
>>
>> I'd say the complexity is hugely underestimated.
>
> I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
> blocks inline with extents during writes) is not much more complex to
> implement on btrfs than compression; however, the btrfs kernel code
> couldn't read compressed data correctly for 12 years out of its 14-year
> history, and nobody wants to wait another decade or more for raid5
> to work.
>
> It seems to me the biggest problem with write hole fixes is that all
> the potential fixes have cost tradeoffs, and everybody wants to veto
> the fix that has a cost they don't like.

Well, that's why I prefer multiple solutions for end users to choose,
other than really trying to get a silver bullet solution.

(That's also why I'm recently trying to separate block group tree from
extent tree v2, as I really believe progressive improvement over a death
ball feature)

Thanks,
Qu

>
> We could implement multiple fix approaches at the same time, as AFAIK
> most of the proposed solutions are orthogonal to each other.  e.g. a
> write-ahead log can safely enable RMW at a higher IO cost, while the
> allocator could place extents to avoid RMW and thereby avoid the logging
> cost as much as possible (paid for by a deferred relocation/garbage
> collection cost), and using both at the same time would combine both
> benefits.  Both solutions can be used independently for filesystems at
> extreme ends of the performance/capacity spectrum (if the filesystem is
> never more than 50% full, then logging is all cost with no gain compared
> to allocator avoidance of RMW, while a filesystem that is always near
> full will have to journal writes and also throttle writes on the journal.
>
>>> For example a non-striped redundant-n profile as well as a striped redundant-n profile.
>>
>> Non-striped redundant-n profile is already so complex that I can't
>> figure out a working idea right now.
>>
>> But if there is such way, I'm pretty happy to consider.
>>
>>>
>>>>
>>>> My 2 cents...
>>>>
>>>> Regarding the current raid56 support, in order of preference:
>>>>
>>>> a. Fix the current bugs, without changing format. Zygo has an extensive list.
>>>
>>> I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?
>>
>> Nope. Just see my write-intent code, already have prototype (just needs
>> new scrub based recovery code at mount time) working.
>>
>> And based on my write-intent code, I don't think it's that hard to
>> implement a full journal.
>
> FWIW I think we can get a very usable btrfs raid5 with a small format
> change (add a journal for stripe RMW, though we might disagree about
> details of how it should be structured and used) and fixes to the
> read-repair and scrub problems.  The read-side problems in btrfs raid5
> were always much more severe than the write hole.  As soon as a disk
> goes offline, the read-repair code is unable to read all the surviving
> data correctly, and the filesystem has to be kept inactive or data on
> the disks will be gradually corrupted as bad parity gets mixed with data
> and written back to the filesystem.
>
> A few of the problems will require a deeper redesign, but IMHO they're not
> important problems.  e.g. scrub can't identify which drive is corrupted
> in all cases, because it has no csum on parity blocks.  The current
> on-disk format needs every data block in the raid5 stripe to be occupied
> by a file with a csum so scrub can eliminate every other block as the
> possible source of mismatched parity.  While this could be fixed by
> a future new raid5 profile (and/or csum tree) specifically designed
> to avoid this, it's not something I'd insist on having before deploying
> a fleet of btrfs raid5 boxes.  Silent corruption failures are so
> rare on spinning disks that I'd use the feature maybe once a decade.
> Silent corruption due to a failing or overheating HBA chip will most
> likely affect multiple disks at once and trash the whole filesystem,
> so individual drive-level corruption reporting isn't helpful.
>
>> Thanks,
>> Qu
>>
>>>
>>>> b. Mostly fix the write hole, also without changing the format, by
>>>> only doing COW with full stripe writes. Yes you could somehow get
>>>> corrupt parity still and not know it until degraded operation produces
>>>> a bad reconstruction of data - but checksum will still catch that.
>>>> This kind of "unreplicated corruption" is not quite the same thing as
>>>> the write hole, because it isn't pernicious like the write hole.
>>>
>>> What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
>>>
>>>> c. A new de-clustered parity raid56 implementation that is not
>>>> backwards compatible.
>>>
>>> Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
>>>
>>> Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
>>>
>>>
>>>>
>>>> Ergo, I think it's best to not break the format twice. Even if a new
>>>> raid implementation is years off.
>>>
>>> I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
>>>
>>> I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
>>>
>>> Thanks
>>> Forza
>>>
>>>>
>>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>>> does full stripe COW won't matter even if the performance is worse
>>>> because no one should use parity raid for this workload anyway.
>>>>
>>>>
>>>> --
>>>> Chris Murphy
>>>
>>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-25  0:25                               ` Qu Wenruo
@ 2022-07-25  5:41                                 ` Zygo Blaxell
  2022-07-25  7:49                                   ` Qu Wenruo
  0 siblings, 1 reply; 88+ messages in thread
From: Zygo Blaxell @ 2022-07-25  5:41 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Forza, Chris Murphy, Goffredo Baroncelli, Johannes Thumshirn,
	Qu Wenruo, linux-btrfs

On Mon, Jul 25, 2022 at 08:25:44AM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/7/25 08:00, Zygo Blaxell wrote:
> > On Tue, Jul 19, 2022 at 09:19:21AM +0800, Qu Wenruo wrote:
> > > > > > Doing so we don't need any disk format change and it would be backward compatible.
> > > > 
> > > > Do we need to implement RAID56 in the traditional sense? As the
> > > user/sysadmin I care about redundancy and performance and cost. The
> > > option to create redundancy for any 'n drives is appealing from a cost
> > > perspective, otherwise I'd use RAID1/10.
> > > 
> > > Have you heard any recent problems related to dm-raid56?
> > > 
> > > If your answer is no, then I guess we already have an  answer to your
> > > question.
> > 
> > With plain dm-raid56 the problems were there since the beginning, so
> > they're not recent.
> 
> Are you talking about mdraid? They go internal write-intent bitmap and
> PPL by default.

resync is the default for mdadm raid5, not PPL.  Write-intent and PPL
are mutually exclusive options.  mdadm raid5 doesn't default to bitmap
either.  (Verified with mdadm v4.2 - 2021-12-30).

> >  If there's a way to configure PPL or a journal
> > device with raid5 LVs on LVM, I can't find it.
> 
> LVM is another story.
> 
> >  AFAIK nobody who knows
> > what they're doing would choose dm-raid56 for high-value data, especially
> > when alternatives like ZFS exist.
> 
> Isn't it the opposite? mdraid is what most people go, other than LVM raid.

You said dm-raid, so I thought we were talking about dm-raid here.
It's a different interface to the core mdadm raid code, so the practical
differences between dm-raid and md-raid for most users are in what lvm
exposes (or does not expose).

> > Before btrfs, we had a single-digit-percentage rate of severe data losses
> > (more than 90% data lost) on filesystems and databases using mdadm +
> > ext3/4 with no journal in degraded mode.  Multiply by per-drive AFR
> > and that's a lot of full system rebuilds over the years.
> > 
> > > > Since the current RAID56 mode have several important drawbacks
> > > 
> > > Let me to be clear:
> > > 
> > > If you can ensure you didn't hit power loss, or after a power loss do a
> > > scrub immediately before any new write, then current RAID56 is fine, at
> > > least not obviously worse than dm-raid56.
> > 
> > I'm told that scrub doesn't repair parity errors on btrfs.
> 
> That's totally untrue.
> 
> You can easily verify that using "btrfs check --check-data-csum", as
> recent btrfs-progs has the extra code to verify the rebuilt data using
> parity.
> 
> In fact, I'm testing my write-intent bitmaps code with manually
> corrupted parity to emulate a power loss after write-intent bitmaps update.
> 
> And I must say, the scrub code works as expected.

That's good, but if it's true, it's a (welcome) change since last week.
Every time I've run a raid5 repair test with a single corrupted disk,
there has been some lost data, both from scrub and reads.  5.18.12 today
behaves the way I'm used to, with read repair unable to repair csum
errors and scrub leaving a few uncorrected blocks behind.

> The myth may come from some bad advice on only scrubbing a single device
> for RAID56 to avoid duplicated IO.
> 
> But the truth is, if only scrubbing one single device, for data stripes
> on that device, if no csum error detected, scrub won't check the parity
> or the other data stripes in the same vertical stripe.
> 
> On the other hand, if scrub is checking the parity stripe, it will also
> check the csum for the data stripes in the same vertical stripe, and
> rewrite the parity if needed.
> 
> >  That was a
> > thing I got wrong in my raid5 bug list from 2020.  Scrub will fix data
> > blocks if they have csum errors, but it will not detect or correct
> > corruption in the parity blocks themselves.
> 
> That's exactly what I mentioned, the user is trying to be a smartass
> without knowing the details.
> 
> Although I think we should enhance the man page to discourage the usage
> of single device scrub.

If we have something better to replace it now, sure.  The reason for
running the scrub on devices sequentially was because it behaved so
terribly when the per-device threads ran in parallel.  If scrub is now
behaving differently on raid56 then the man page should be updated to
reflect that.

> By default, we scrub all devices (using mount point).

The scrub userspace code enumerates the devices and runs a separate
thread to scrub each one.  Running them on one device at a time makes
those threads run sequentially instead of in parallel, and avoids a
lot of bad stuff with competing disk accesses and race conditions.
See below for a recent example.

> >  AFAICT the only way to
> > get the parity blocks rewritten is to run something like balance,
> > which carries risks of its own due to the sheer volume of IO from
> > data and metadata updates.
> 
> Completely incorrect.

And yet consistent with testing evidence going back 6 years so far.

If scrub works, it should be possible to corrupt one drive, scrub,
then corrupt the other drive, scrub again, and have zero errors
and zero kernel crashes.  Instead:

	# mkfs.btrfs -draid5 -mraid1 -f /dev/vdb /dev/vdc
	# mount -ospace_cache=v2,compress=zstd /dev/vdb /testfs
	# cp -a /testdata/. /testfs/. &  # 40TB of files, average size 23K

	[...wait a few minutes for some data, we don't need the whole thing...]

	# compsize /testfs/.
	Processed 15271 files, 7901 regular extents (7909 refs), 6510 inline.
	Type       Perc     Disk Usage   Uncompressed Referenced  
	TOTAL       73%      346M         472M         473M       
	none       100%      253M         253M         253M       
	zstd        42%       92M         219M         219M       

	# cat /dev/zero > /dev/vdb
	# sync
	# btrfs scrub start /dev/vdb  # or '/testfs', doesn't matter
	# cat /dev/zero > /dev/vdc
	# sync

	# btrfs scrub start /dev/vdc  # or '/testfs', doesn't matter
	ERROR: there are uncorrectable errors
	# btrfs scrub status -d .
	UUID:             8237e122-35af-40ef-80bc-101693e878e3

	Scrub device /dev/vdb (id 1)
		no stats available

	Scrub device /dev/vdc (id 2) history
	Scrub started:    Mon Jul 25 00:02:25 2022
	Status:           finished
	Duration:         0:00:22
	Total to scrub:   2.01GiB
	Rate:             1.54MiB/s
	Error summary:    csum=1690
	  Corrected:      1032
	  Uncorrectable:  658
	  Unverified:     0
	# cat /proc/version
	Linux version 5.19.0-ba37a9d53d71-for-next+ (zblaxell@tester) (gcc (Debian 11.3.0-3) 11.3.0, GNU ld (GNU Binutils for Debian) 2.38) #82 SMP PREEMPT_DYNAMIC Sun Jul 24 15:12:57 EDT 2022

Running scrub threads in parallel sometimes triggers stuff like this,
which killed one of the test runs while I was writing this:

	[ 1304.696921] BTRFS info (device vdb): read error corrected: ino 411 off 135168 (dev /dev/vdb sector 3128840)
	[ 1304.697705] BTRFS info (device vdb): read error corrected: ino 411 off 139264 (dev /dev/vdb sector 3128848)
	[ 1304.701196] ==================================================================
	[ 1304.716463] ------------[ cut here ]------------
	[ 1304.717094] BUG: KFENCE: use-after-free read in free_io_failure+0x157/0x210

	[ 1304.723346] kernel BUG at fs/btrfs/extent_io.c:2350!
	[ 1304.725076] Use-after-free read at 0x000000001e0043a6 (in kfence-#228):
	[ 1304.725103]  free_io_failure+0x157/0x210
	[ 1304.725115]  clean_io_failure+0x11d/0x260
	[ 1304.725126]  end_compressed_bio_read+0x2a9/0x470
	[ 1304.727698] invalid opcode: 0000 [#1] PREEMPT SMP PTI
	[ 1304.729516]  bio_endio+0x361/0x3c0
	[ 1304.731048] CPU: 1 PID: 12615 Comm: kworker/u8:10 Not tainted 5.19.0-ba37a9d53d71-for-next+ #82 d82f965b2e84525cfbba07129899b46c497cda69
	[ 1304.733084]  rbio_orig_end_io+0x127/0x1c0
	[ 1304.736876] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
	[ 1304.738720]  __raid_recover_end_io+0x405/0x8f0
	[ 1304.740310] Workqueue: btrfs-endio btrfs_end_bio_work
	[ 1304.748199]  raid_recover_end_io_work+0x8c/0xb0

	[ 1304.750028] RIP: 0010:repair_io_failure+0x359/0x4b0
	[ 1304.752434]  process_one_work+0x4e5/0xaa0
	[ 1304.752449]  worker_thread+0x32e/0x720
	[ 1304.754214] Code: 2b e8 2b 2f 79 ff 48 c7 c6 70 06 ac 91 48 c7 c7 00 b9 14 94 e8 38 00 73 ff 48 8d bd 48 ff ff ff e8 8c 7a 26 00 e9 f6 fd ff ff <0f> 0b e8 10 be 5e 01 85 c0 74 cc 48 c7 c7 f0 1c 45 94 e8 30 ab 98
	[ 1304.756561]  kthread+0x1ab/0x1e0
	[ 1304.758398] RSP: 0018:ffffa429c6adbb10 EFLAGS: 00010246
	[ 1304.759278]  ret_from_fork+0x22/0x30

	[ 1304.761343] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000

	[ 1304.762308] kfence-#228: 0x00000000cc0e17b4-0x0000000004ce48de, size=48, cache=kmalloc-64

	[ 1304.763692] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
	[ 1304.764649] allocated by task 12615 on cpu 1 at 1304.670070s:
	[ 1304.765617] RBP: ffffa429c6adbc08 R08: 0000000000000000 R09: 0000000000000000
	[ 1304.766421]  btrfs_repair_one_sector+0x370/0x500
	[ 1304.767638] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9108baaec000
	[ 1304.768341]  end_compressed_bio_read+0x187/0x470
	[ 1304.770163] R13: 0000000000000000 R14: ffffe44885d55040 R15: ffff9108114e66a4
	[ 1304.770993]  bio_endio+0x361/0x3c0
	[ 1304.772226] FS:  0000000000000000(0000) GS:ffff9109b7200000(0000) knlGS:0000000000000000
	[ 1304.773128]  btrfs_end_bio_work+0x1f/0x30
	[ 1304.773914] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
	[ 1304.774856]  process_one_work+0x4e5/0xaa0
	[ 1304.774869]  worker_thread+0x32e/0x720
	[ 1304.775172] CR2: 00007fb7a88c1738 CR3: 00000000bc03e002 CR4: 0000000000170ee0
	[ 1304.776397]  kthread+0x1ab/0x1e0
	[ 1304.776429]  ret_from_fork+0x22/0x30
	[ 1304.778282] Call Trace:

	[ 1304.779009] freed by task 21948 on cpu 2 at 1304.694620s:
	[ 1304.781760]  <TASK>
	[ 1304.782419]  free_io_failure+0x19a/0x210
	[ 1304.783213]  ? __bio_clone+0x1c0/0x1c0
	[ 1304.783952]  clean_io_failure+0x11d/0x260
	[ 1304.783963]  end_compressed_bio_read+0x2a9/0x470
	[ 1304.784263]  clean_io_failure+0x21a/0x260
	[ 1304.785674]  bio_endio+0x361/0x3c0
	[ 1304.785995]  end_compressed_bio_read+0x2a9/0x470
	[ 1304.787645]  btrfs_end_bio_work+0x1f/0x30
	[ 1304.788597]  bio_endio+0x361/0x3c0
	[ 1304.789674]  process_one_work+0x4e5/0xaa0
	[ 1304.790786]  btrfs_end_bio_work+0x1f/0x30
	[ 1304.791776]  worker_thread+0x32e/0x720
	[ 1304.791788]  kthread+0x1ab/0x1e0
	[ 1304.792895]  process_one_work+0x4e5/0xaa0
	[ 1304.793882]  ret_from_fork+0x22/0x30
	[ 1304.795043]  worker_thread+0x32e/0x720

	[ 1304.795802] CPU: 3 PID: 12616 Comm: kworker/u8:11 Not tainted 5.19.0-ba37a9d53d71-for-next+ #82 d82f965b2e84525cfbba07129899b46c497cda69
	[ 1304.796945]  ? _raw_spin_unlock_irqrestore+0x7d/0xa0
	[ 1304.797662] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
	[ 1304.798453]  ? process_one_work+0xaa0/0xaa0
	[ 1304.799175] Workqueue: btrfs-endio-raid56 raid_recover_end_io_work
	[ 1304.799739]  kthread+0x1ab/0x1e0

	[ 1304.801288] ==================================================================
	[ 1304.801873]  ? kthread_complete_and_exit+0x40/0x40
	[ 1304.809362] ==================================================================
	[ 1304.809933]  ret_from_fork+0x22/0x30
	[ 1304.809977]  </TASK>
	[ 1304.809982] Modules linked in:
	[ 1304.810068] ---[ end trace 0000000000000000 ]---
	[ 1304.810079] RIP: 0010:repair_io_failure+0x359/0x4b0
	[ 1304.810092] Code: 2b e8 2b 2f 79 ff 48 c7 c6 70 06 ac 91 48 c7 c7 00 b9 14 94 e8 38 00 73 ff 48 8d bd 48 ff ff ff e8 8c 7a 26 00 e9 f6 fd ff ff <0f> 0b e8 10 be 5e 01 85 c0 74 cc 48 c7 c7 f0 1c 45 94 e8 30 ab 98
	[ 1304.810114] RSP: 0018:ffffa429c6adbb10 EFLAGS: 00010246
	[ 1304.810125] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
	[ 1304.810133] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
	[ 1304.810140] RBP: ffffa429c6adbc08 R08: 0000000000000000 R09: 0000000000000000
	[ 1304.810149] R10: 0000000000000000 R11: 0000000000000000 R12: ffff9108baaec000
	[ 1304.810157] R13: 0000000000000000 R14: ffffe44885d55040 R15: ffff9108114e66a4
	[ 1304.810165] FS:  0000000000000000(0000) GS:ffff9109b7200000(0000) knlGS:0000000000000000
	[ 1304.810175] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
	[ 1304.810184] CR2: 00007fb7a88c1738 CR3: 00000000bc03e002 CR4: 0000000000170ee0
	[ 1304.903432] BUG: KFENCE: invalid free in free_io_failure+0x19a/0x210

	[ 1304.906815] Invalid free of 0x00000000cc0e17b4 (in kfence-#228):
	[ 1304.909006]  free_io_failure+0x19a/0x210
	[ 1304.909666]  clean_io_failure+0x11d/0x260
	[ 1304.910358]  end_compressed_bio_read+0x2a9/0x470
	[ 1304.911121]  bio_endio+0x361/0x3c0
	[ 1304.911722]  rbio_orig_end_io+0x127/0x1c0
	[ 1304.912405]  __raid_recover_end_io+0x405/0x8f0
	[ 1304.919917]  raid_recover_end_io_work+0x8c/0xb0
	[ 1304.927494]  process_one_work+0x4e5/0xaa0
	[ 1304.934191]  worker_thread+0x32e/0x720
	[ 1304.940524]  kthread+0x1ab/0x1e0
	[ 1304.945963]  ret_from_fork+0x22/0x30

	[ 1304.953057] kfence-#228: 0x00000000cc0e17b4-0x0000000004ce48de, size=48, cache=kmalloc-64

	[ 1304.955733] allocated by task 12615 on cpu 1 at 1304.670070s:
	[ 1304.957225]  btrfs_repair_one_sector+0x370/0x500
	[ 1304.958574]  end_compressed_bio_read+0x187/0x470
	[ 1304.959937]  bio_endio+0x361/0x3c0
	[ 1304.960960]  btrfs_end_bio_work+0x1f/0x30
	[ 1304.962193]  process_one_work+0x4e5/0xaa0
	[ 1304.963403]  worker_thread+0x32e/0x720
	[ 1304.965498]  kthread+0x1ab/0x1e0
	[ 1304.966515]  ret_from_fork+0x22/0x30

	[ 1304.968681] freed by task 21948 on cpu 2 at 1304.694620s:
	[ 1304.970160]  free_io_failure+0x19a/0x210
	[ 1304.971725]  clean_io_failure+0x11d/0x260
	[ 1304.973082]  end_compressed_bio_read+0x2a9/0x470
	[ 1304.974277]  bio_endio+0x361/0x3c0
	[ 1304.975245]  btrfs_end_bio_work+0x1f/0x30
	[ 1304.976623]  process_one_work+0x4e5/0xaa0
	[ 1304.979141]  worker_thread+0x32e/0x720
	[ 1304.980044]  kthread+0x1ab/0x1e0
	[ 1304.981002]  ret_from_fork+0x22/0x30

	[ 1304.982520] CPU: 2 PID: 12616 Comm: kworker/u8:11 Tainted: G    B D           5.19.0-ba37a9d53d71-for-next+ #82 d82f965b2e84525cfbba07129899b46c497cda69
	[ 1304.986522] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
	[ 1304.988636] Workqueue: btrfs-endio-raid56 raid_recover_end_io_work
	[ 1304.990234] ==================================================================

On kernels without KASAN or page poisoning, that use-after-free might lead
to a hang at the end of a btrfs replace.  I don't know exactly what's
going on there--there is often a hang at the end of a raid5 replace,
it's caused by a mismatch between the count of active bios and the actual
number of active bios, and a use-after-free might be causing that by
forgetting to decrement the counter.  There are multiple overlapping
bugs in btrfs raid5 and it's hard to reliably separate them until some
of them get fixed.

Another data point:  I ran 5 test runs while writing this, and the third
one did fix all the errors in scrub.  It sometimes does happen over test
cases of a few gigabytes.  It's just not anywhere near reliable enough
to fix a 50TB array with one busted disk.

I think you need better test cases.  btrfs raid5 has been broken like this
since the beginning, with failures that can be demonstrated in minutes.
btrfs raid1 can run these tests all day.

> > Most of the raid56 bugs I've identified have nothing to do with power
> > loss.  The data on disks is fine, but the kernel can't read it correctly
> > in degraded mode, or the diagnostic data from scrub are clearly garbage.
> 
> Unable to read in degraded mode just means parity is out-of-sync with data.

No, the degraded mode case is different.  It has distinct behavior from
the above test case where all the drives are online but csums are failing.
In degraded mode one of the devices is unavailable, so the read code
is trying to reconstruct data on the fly.  The parity and data on disk
is often OK on the surviving disks if I dump it out by hand, and often
all the data can be recovered by 'btrfs replace' without error (as
long as 'btrfs replace' is the only active process on the filesystem).

Rebooting the test VM will make a different set of data unreadable
through the filesystem, and the set of unreadable blocks changes over
time if running something like:

	sysctl vm.drop_caches=3; find -type f -exec cat {} + >/dev/null

in a loop, especially if something is writing to the filesystem at the
same time.  Note there is never a write hole in these test cases--the
filesystem is always cleanly umounted, and sometimes there's no umount
at all, one device is simply disconnected with no umount or reboot.

> There are several other bugs related to this, mostly related to the
> cached raid bio and how we rebuild the data. (aka, btrfs/125)
> Thankfully I have submitted patches for that bug and now btrfs/125
> should pass without problems.

xfstests btrfs/125 is an extremely simple test case.  I'm using btrfs
raid5 on 20-80TB filesystems, millions to billions of files.  The error
rate is quantitatively low (only 0.01% of data is lost after one disk
failure) but it should be zero, as none of my test cases involve write
hole, nodatacow, or raid5 metadata.

for-next and misc-next are still quite broken, though to be fair they
definitely have issues beyond raid5.  5.18.12 can get through the
test without tripping over KASAN or blowing up the metadata, but it
has uncorrectable errors and fake read errors:

	# btrfs scrub start -Bd /testfs/

	Scrub device /dev/vdb (id 1) done
	Scrub started:    Mon Jul 25 00:49:28 2022
	Status:           finished
	Duration:         0:03:03
	Total to scrub:   4.01GiB
	Rate:             1.63MiB/s
	Error summary:    read=3 csum=7578
	  Corrected:      7577
	  Uncorrectable:  4
	  Unverified:     1

I know the read errors are fake because /dev/vdb is a file on a tmpfs.

> But the powerloss can still lead to out-of-sync parity and that's why
> I'm fixing the problem using write-intent-bitmaps.

None of my test cases involve write hole, as I know write-hole test cases
will always fail.  There's no point in testing write hole if recovery
from much simpler failures isn't working yet.

> > I noticed you and others have done some work here recently, so some of
> > these issues might be fixed in 5.19.  I haven't re-run my raid5 tests
> > on post-5.18 kernels yet (there have been other bugs blocking testing).
> > 
> > > (There are still common problems shared between both btrfs raid56 and
> > > dm-raid56, like destructive-RMW)
> > 
> > Yeah, that's one of the critical things to fix because btrfs is in a good
> > position to do as well or better than dm-raid56.  btrfs has definitely
> > fallen behind the other available solutions in the 9 years since raid5 was
> > first added to btrfs, as btrfs implements only the basic configuration
> > of raid56 (no parity integrity or rmw journal) that is fully vulnerable
> > to write hole and drive-side data corruption.
> > 
> > > > - and that it's officially not recommended for production use - it
> > > is a good idea to reconstruct new btrfs 'redundant-n' profiles that
> > > doesn't have the inherent issues of traditional RAID.
> > > 
> > > I'd say the complexity is hugely underestimated.
> > 
> > I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
> > blocks inline with extents during writes) is not much more complex to
> > implement on btrfs than compression; however, the btrfs kernel code
> > couldn't read compressed data correctly for 12 years out of its 14-year
> > history, and nobody wants to wait another decade or more for raid5
> > to work.
> > 
> > It seems to me the biggest problem with write hole fixes is that all
> > the potential fixes have cost tradeoffs, and everybody wants to veto
> > the fix that has a cost they don't like.
> 
> Well, that's why I prefer multiple solutions for end users to choose,
> other than really trying to get a silver bullet solution.
> 
> (That's also why I'm recently trying to separate block group tree from
> extent tree v2, as I really believe progressive improvement over a death
> ball feature)

Yeah I'm definitely in favor of getting bgtree done sooner rather
than later.  It's a simple, stand-alone feature that has well known
beneficial effect.  If the extent tree v2 project wants to do something
incompatible with it later on, that's extent tree v2's problem, not a
reason to block bgtree in the short term.

> Thanks,
> Qu
> 
> > 
> > We could implement multiple fix approaches at the same time, as AFAIK
> > most of the proposed solutions are orthogonal to each other.  e.g. a
> > write-ahead log can safely enable RMW at a higher IO cost, while the
> > allocator could place extents to avoid RMW and thereby avoid the logging
> > cost as much as possible (paid for by a deferred relocation/garbage
> > collection cost), and using both at the same time would combine both
> > benefits.  Both solutions can be used independently for filesystems at
> > extreme ends of the performance/capacity spectrum (if the filesystem is
> > never more than 50% full, then logging is all cost with no gain compared
> > to allocator avoidance of RMW, while a filesystem that is always near
> > full will have to journal writes and also throttle writes on the journal.
> > 
> > > > For example a non-striped redundant-n profile as well as a striped redundant-n profile.
> > > 
> > > Non-striped redundant-n profile is already so complex that I can't
> > > figure out a working idea right now.
> > > 
> > > But if there is such way, I'm pretty happy to consider.
> > > 
> > > > 
> > > > > 
> > > > > My 2 cents...
> > > > > 
> > > > > Regarding the current raid56 support, in order of preference:
> > > > > 
> > > > > a. Fix the current bugs, without changing format. Zygo has an extensive list.
> > > > 
> > > > I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?
> > > 
> > > Nope. Just see my write-intent code, already have prototype (just needs
> > > new scrub based recovery code at mount time) working.
> > > 
> > > And based on my write-intent code, I don't think it's that hard to
> > > implement a full journal.
> > 
> > FWIW I think we can get a very usable btrfs raid5 with a small format
> > change (add a journal for stripe RMW, though we might disagree about
> > details of how it should be structured and used) and fixes to the
> > read-repair and scrub problems.  The read-side problems in btrfs raid5
> > were always much more severe than the write hole.  As soon as a disk
> > goes offline, the read-repair code is unable to read all the surviving
> > data correctly, and the filesystem has to be kept inactive or data on
> > the disks will be gradually corrupted as bad parity gets mixed with data
> > and written back to the filesystem.
> > 
> > A few of the problems will require a deeper redesign, but IMHO they're not
> > important problems.  e.g. scrub can't identify which drive is corrupted
> > in all cases, because it has no csum on parity blocks.  The current
> > on-disk format needs every data block in the raid5 stripe to be occupied
> > by a file with a csum so scrub can eliminate every other block as the
> > possible source of mismatched parity.  While this could be fixed by
> > a future new raid5 profile (and/or csum tree) specifically designed
> > to avoid this, it's not something I'd insist on having before deploying
> > a fleet of btrfs raid5 boxes.  Silent corruption failures are so
> > rare on spinning disks that I'd use the feature maybe once a decade.
> > Silent corruption due to a failing or overheating HBA chip will most
> > likely affect multiple disks at once and trash the whole filesystem,
> > so individual drive-level corruption reporting isn't helpful.
> > 
> > > Thanks,
> > > Qu
> > > 
> > > > 
> > > > > b. Mostly fix the write hole, also without changing the format, by
> > > > > only doing COW with full stripe writes. Yes you could somehow get
> > > > > corrupt parity still and not know it until degraded operation produces
> > > > > a bad reconstruction of data - but checksum will still catch that.
> > > > > This kind of "unreplicated corruption" is not quite the same thing as
> > > > > the write hole, because it isn't pernicious like the write hole.
> > > > 
> > > > What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
> > > > 
> > > > > c. A new de-clustered parity raid56 implementation that is not
> > > > > backwards compatible.
> > > > 
> > > > Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
> > > > 
> > > > Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
> > > > 
> > > > 
> > > > > 
> > > > > Ergo, I think it's best to not break the format twice. Even if a new
> > > > > raid implementation is years off.
> > > > 
> > > > I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
> > > > 
> > > > I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
> > > > 
> > > > Thanks
> > > > Forza
> > > > 
> > > > > 
> > > > > Metadata centric workloads suck on parity raid anyway. If Btrfs always
> > > > > does full stripe COW won't matter even if the performance is worse
> > > > > because no one should use parity raid for this workload anyway.
> > > > > 
> > > > > 
> > > > > --
> > > > > Chris Murphy
> > > > 
> > > > 
> 

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-25  5:41                                 ` Zygo Blaxell
@ 2022-07-25  7:49                                   ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-25  7:49 UTC (permalink / raw)
  To: Zygo Blaxell
  Cc: Forza, Chris Murphy, Goffredo Baroncelli, Johannes Thumshirn,
	Qu Wenruo, linux-btrfs



On 2022/7/25 13:41, Zygo Blaxell wrote:
> On Mon, Jul 25, 2022 at 08:25:44AM +0800, Qu Wenruo wrote:
[...]
>>
>> You can easily verify that using "btrfs check --check-data-csum", as
>> recent btrfs-progs has the extra code to verify the rebuilt data using
>> parity.
>>
>> In fact, I'm testing my write-intent bitmaps code with manually
>> corrupted parity to emulate a power loss after write-intent bitmaps update.
>>
>> And I must say, the scrub code works as expected.
>
> That's good, but if it's true, it's a (welcome) change since last week.
> Every time I've run a raid5 repair test with a single corrupted disk,
> there has been some lost data, both from scrub and reads.  5.18.12 today
> behaves the way I'm used to, with read repair unable to repair csum
> errors and scrub leaving a few uncorrected blocks behind.

Have you tried misc-next?

The following patches are not yet in upstream nor backported:

btrfs: raid56: don't trust any cached sector in __raid56_parity_recover()
btrfs: update stripe_sectors::uptodate in steal_rbio
btrfs: only write the sectors in the vertical stripe which has data stripes


>
>> The myth may come from some bad advice on only scrubbing a single device
>> for RAID56 to avoid duplicated IO.
>>
>> But the truth is, if only scrubbing one single device, for data stripes
>> on that device, if no csum error detected, scrub won't check the parity
>> or the other data stripes in the same vertical stripe.
>>
>> On the other hand, if scrub is checking the parity stripe, it will also
>> check the csum for the data stripes in the same vertical stripe, and
>> rewrite the parity if needed.
>>
>>>   That was a
>>> thing I got wrong in my raid5 bug list from 2020.  Scrub will fix data
>>> blocks if they have csum errors, but it will not detect or correct
>>> corruption in the parity blocks themselves.
>>
>> That's exactly what I mentioned, the user is trying to be a smartass
>> without knowing the details.
>>
>> Although I think we should enhance the man page to discourage the usage
>> of single device scrub.
>
> If we have something better to replace it now, sure.  The reason for
> running the scrub on devices sequentially was because it behaved so
> terribly when the per-device threads ran in parallel.

Really? For mirror/stripe based profiles they should be fine.

Since each device scrubbing is only doing IO from that device (if no
rebuild is needed).
Although things like extent and csum tree iteration would cause some
conflicts, I don't think that would be a big problem as tree block
caching should work pretty well.

It's RAID56 we're doing racing between each other, as for parity
scrubbing, we will do extra IO from data stripes, thus it will cause
performance problems.

To that aspect, we indeed need a better interface for RAID56 scrubbing.
But that's RAID56 only.

>  If scrub is now
> behaving differently on raid56 then the man page should be updated to
> reflect that.
>
>> By default, we scrub all devices (using mount point).
>
> The scrub userspace code enumerates the devices and runs a separate
> thread to scrub each one.  Running them on one device at a time makes
> those threads run sequentially instead of in parallel, and avoids a
> lot of bad stuff with competing disk accesses and race conditions.
> See below for a recent example.
>
>>>   AFAICT the only way to
>>> get the parity blocks rewritten is to run something like balance,
>>> which carries risks of its own due to the sheer volume of IO from
>>> data and metadata updates.
>>
>> Completely incorrect.
>
> And yet consistent with testing evidence going back 6 years so far.
>
> If scrub works, it should be possible to corrupt one drive, scrub,
> then corrupt the other drive, scrub again, and have zero errors
> and zero kernel crashes.  Instead:
>
> 	# mkfs.btrfs -draid5 -mraid1 -f /dev/vdb /dev/vdc
> 	# mount -ospace_cache=v2,compress=zstd /dev/vdb /testfs
> 	# cp -a /testdata/. /testfs/. &  # 40TB of files, average size 23K
>
> 	[...wait a few minutes for some data, we don't need the whole thing...]
>
> 	# compsize /testfs/.
> 	Processed 15271 files, 7901 regular extents (7909 refs), 6510 inline.
> 	Type       Perc     Disk Usage   Uncompressed Referenced
> 	TOTAL       73%      346M         472M         473M
> 	none       100%      253M         253M         253M
> 	zstd        42%       92M         219M         219M
>
> 	# cat /dev/zero > /dev/vdb
> 	# sync
> 	# btrfs scrub start /dev/vdb  # or '/testfs', doesn't matter
> 	# cat /dev/zero > /dev/vdc
> 	# sync
>
> 	# btrfs scrub start /dev/vdc  # or '/testfs', doesn't matter
> 	ERROR: there are uncorrectable errors
> 	# btrfs scrub status -d .
> 	UUID:             8237e122-35af-40ef-80bc-101693e878e3
>
> 	Scrub device /dev/vdb (id 1)
> 		no stats available
>
> 	Scrub device /dev/vdc (id 2) history
> 	Scrub started:    Mon Jul 25 00:02:25 2022
> 	Status:           finished
> 	Duration:         0:00:22
> 	Total to scrub:   2.01GiB
> 	Rate:             1.54MiB/s
> 	Error summary:    csum=1690
> 	  Corrected:      1032
> 	  Uncorrectable:  658
> 	  Unverified:     0
> 	# cat /proc/version
> 	Linux version 5.19.0-ba37a9d53d71-for-next+ (zblaxell@tester) (gcc (Debian 11.3.0-3) 11.3.0, GNU ld (GNU Binutils for Debian) 2.38) #82 SMP PREEMPT_DYNAMIC Sun Jul 24 15:12:57 EDT 2022
>
> Running scrub threads in parallel sometimes triggers stuff like this,
> which killed one of the test runs while I was writing this:
>
> 	[ 1304.696921] BTRFS info (device vdb): read error corrected: ino 411 off 135168 (dev /dev/vdb sector 3128840)
> 	[ 1304.697705] BTRFS info (device vdb): read error corrected: ino 411 off 139264 (dev /dev/vdb sector 3128848)
> 	[ 1304.701196] ==================================================================
> 	[ 1304.716463] ------------[ cut here ]------------
> 	[ 1304.717094] BUG: KFENCE: use-after-free read in free_io_failure+0x157/0x210
>
> 	[ 1304.723346] kernel BUG at fs/btrfs/extent_io.c:2350!
> 	[ 1304.725076] Use-after-free read at 0x000000001e0043a6 (in kfence-#228):
> 	[ 1304.725103]  free_io_failure+0x157/0x210
> 	[ 1304.725115]  clean_io_failure+0x11d/0x260
> 	[ 1304.725126]  end_compressed_bio_read+0x2a9/0x470

This looks like a problem relater to read-repair code with compression,
HCH is also working on this, in the long run we would get rid of io
failure record completely.

Have you tried without compression?

[...]
>
> On kernels without KASAN or page poisoning, that use-after-free might lead
> to a hang at the end of a btrfs replace.  I don't know exactly what's
> going on there--there is often a hang at the end of a raid5 replace,
> it's caused by a mismatch between the count of active bios and the actual
> number of active bios, and a use-after-free might be causing that by
> forgetting to decrement the counter.  There are multiple overlapping
> bugs in btrfs raid5 and it's hard to reliably separate them until some
> of them get fixed.
>
> Another data point:  I ran 5 test runs while writing this, and the third
> one did fix all the errors in scrub.  It sometimes does happen over test
> cases of a few gigabytes.  It's just not anywhere near reliable enough
> to fix a 50TB array with one busted disk.
>
> I think you need better test cases.  btrfs raid5 has been broken like this
> since the beginning, with failures that can be demonstrated in minutes.
> btrfs raid1 can run these tests all day.

I'd say, the compression is adding a completely different complexity
into the equation.

Yep, for sysadmin's point of view, this is completely fine, but to us to
locate the problems, I'd prefer something without compression, just
RAID56 and plain file operations to see if it's really compression or
other things screwed up.

Remember, for read time repair, btrfs is still just trying the next
mirror, no difference than RAID1.

It's the RAID56 recovery code converting the 2nd mirror by using extra
P/Q and data stripes to rebuild the data.

We had problems which can cause exact the same unrepairable problems in
the RAID56 repair path (not read-repair path), in that case, please try
misc-next to see if there is any improvement (better without compression).

>
>>> Most of the raid56 bugs I've identified have nothing to do with power
>>> loss.  The data on disks is fine, but the kernel can't read it correctly
>>> in degraded mode, or the diagnostic data from scrub are clearly garbage.
>>
>> Unable to read in degraded mode just means parity is out-of-sync with data.
>
> No, the degraded mode case is different.  It has distinct behavior from
> the above test case where all the drives are online but csums are failing.
> In degraded mode one of the devices is unavailable, so the read code
> is trying to reconstruct data on the fly.  The parity and data on disk
> is often OK on the surviving disks if I dump it out by hand, and often
> all the data can be recovered by 'btrfs replace' without error (as
> long as 'btrfs replace' is the only active process on the filesystem).
>
> Rebooting the test VM will make a different set of data unreadable
> through the filesystem, and the set of unreadable blocks changes over
> time if running something like:
>
> 	sysctl vm.drop_caches=3; find -type f -exec cat {} + >/dev/null
>
> in a loop, especially if something is writing to the filesystem at the
> same time.  Note there is never a write hole in these test cases--the
> filesystem is always cleanly umounted, and sometimes there's no umount
> at all, one device is simply disconnected with no umount or reboot.
>
>> There are several other bugs related to this, mostly related to the
>> cached raid bio and how we rebuild the data. (aka, btrfs/125)
>> Thankfully I have submitted patches for that bug and now btrfs/125
>> should pass without problems.
>
> xfstests btrfs/125 is an extremely simple test case.  I'm using btrfs
> raid5 on 20-80TB filesystems, millions to billions of files.

That's the difference between developer and end users, and I totally
understand you want to really go heavy testing and squeeze out every bug.

But from a developer's view, we prefer to fix bugs one by one.

Btrfs/125 is a very short but effective test case to show how the
original RAID56 repair path has problems mostly related to its cached
radi56 behavior.

I'm not saying the test case representing all the problems, but it's a
very quick indicator of whether your code base has the fixes for RAID56
code.

>  The error
> rate is quantitatively low (only 0.01% of data is lost after one disk
> failure) but it should be zero, as none of my test cases involve write
> hole, nodatacow, or raid5 metadata.
>
> for-next and misc-next are still quite broken, though to be fair they
> definitely have issues beyond raid5.

I'm more interesting in how misc-next is broken.

I know that previous misc-next has some problems related to page
faulting and can hang fsstress easily.

But that should be fixed in recent misc-next, thus I strongly recommend
to try it (and without compression) to see if there is any improvement
for RAID56.

Thanks,
Qu

>  5.18.12 can get through the
> test without tripping over KASAN or blowing up the metadata, but it
> has uncorrectable errors and fake read errors:
>
> 	# btrfs scrub start -Bd /testfs/
>
> 	Scrub device /dev/vdb (id 1) done
> 	Scrub started:    Mon Jul 25 00:49:28 2022
> 	Status:           finished
> 	Duration:         0:03:03
> 	Total to scrub:   4.01GiB
> 	Rate:             1.63MiB/s
> 	Error summary:    read=3 csum=7578
> 	  Corrected:      7577
> 	  Uncorrectable:  4
> 	  Unverified:     1
>
> I know the read errors are fake because /dev/vdb is a file on a tmpfs.
>
>> But the powerloss can still lead to out-of-sync parity and that's why
>> I'm fixing the problem using write-intent-bitmaps.
>
> None of my test cases involve write hole, as I know write-hole test cases
> will always fail.  There's no point in testing write hole if recovery
> from much simpler failures isn't working yet.
>
>>> I noticed you and others have done some work here recently, so some of
>>> these issues might be fixed in 5.19.  I haven't re-run my raid5 tests
>>> on post-5.18 kernels yet (there have been other bugs blocking testing).
>>>
>>>> (There are still common problems shared between both btrfs raid56 and
>>>> dm-raid56, like destructive-RMW)
>>>
>>> Yeah, that's one of the critical things to fix because btrfs is in a good
>>> position to do as well or better than dm-raid56.  btrfs has definitely
>>> fallen behind the other available solutions in the 9 years since raid5 was
>>> first added to btrfs, as btrfs implements only the basic configuration
>>> of raid56 (no parity integrity or rmw journal) that is fully vulnerable
>>> to write hole and drive-side data corruption.
>>>
>>>>> - and that it's officially not recommended for production use - it
>>>> is a good idea to reconstruct new btrfs 'redundant-n' profiles that
>>>> doesn't have the inherent issues of traditional RAID.
>>>>
>>>> I'd say the complexity is hugely underestimated.
>>>
>>> I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
>>> blocks inline with extents during writes) is not much more complex to
>>> implement on btrfs than compression; however, the btrfs kernel code
>>> couldn't read compressed data correctly for 12 years out of its 14-year
>>> history, and nobody wants to wait another decade or more for raid5
>>> to work.
>>>
>>> It seems to me the biggest problem with write hole fixes is that all
>>> the potential fixes have cost tradeoffs, and everybody wants to veto
>>> the fix that has a cost they don't like.
>>
>> Well, that's why I prefer multiple solutions for end users to choose,
>> other than really trying to get a silver bullet solution.
>>
>> (That's also why I'm recently trying to separate block group tree from
>> extent tree v2, as I really believe progressive improvement over a death
>> ball feature)
>
> Yeah I'm definitely in favor of getting bgtree done sooner rather
> than later.  It's a simple, stand-alone feature that has well known
> beneficial effect.  If the extent tree v2 project wants to do something
> incompatible with it later on, that's extent tree v2's problem, not a
> reason to block bgtree in the short term.
>
>> Thanks,
>> Qu
>>
>>>
>>> We could implement multiple fix approaches at the same time, as AFAIK
>>> most of the proposed solutions are orthogonal to each other.  e.g. a
>>> write-ahead log can safely enable RMW at a higher IO cost, while the
>>> allocator could place extents to avoid RMW and thereby avoid the logging
>>> cost as much as possible (paid for by a deferred relocation/garbage
>>> collection cost), and using both at the same time would combine both
>>> benefits.  Both solutions can be used independently for filesystems at
>>> extreme ends of the performance/capacity spectrum (if the filesystem is
>>> never more than 50% full, then logging is all cost with no gain compared
>>> to allocator avoidance of RMW, while a filesystem that is always near
>>> full will have to journal writes and also throttle writes on the journal.
>>>
>>>>> For example a non-striped redundant-n profile as well as a striped redundant-n profile.
>>>>
>>>> Non-striped redundant-n profile is already so complex that I can't
>>>> figure out a working idea right now.
>>>>
>>>> But if there is such way, I'm pretty happy to consider.
>>>>
>>>>>
>>>>>>
>>>>>> My 2 cents...
>>>>>>
>>>>>> Regarding the current raid56 support, in order of preference:
>>>>>>
>>>>>> a. Fix the current bugs, without changing format. Zygo has an extensive list.
>>>>>
>>>>> I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?
>>>>
>>>> Nope. Just see my write-intent code, already have prototype (just needs
>>>> new scrub based recovery code at mount time) working.
>>>>
>>>> And based on my write-intent code, I don't think it's that hard to
>>>> implement a full journal.
>>>
>>> FWIW I think we can get a very usable btrfs raid5 with a small format
>>> change (add a journal for stripe RMW, though we might disagree about
>>> details of how it should be structured and used) and fixes to the
>>> read-repair and scrub problems.  The read-side problems in btrfs raid5
>>> were always much more severe than the write hole.  As soon as a disk
>>> goes offline, the read-repair code is unable to read all the surviving
>>> data correctly, and the filesystem has to be kept inactive or data on
>>> the disks will be gradually corrupted as bad parity gets mixed with data
>>> and written back to the filesystem.
>>>
>>> A few of the problems will require a deeper redesign, but IMHO they're not
>>> important problems.  e.g. scrub can't identify which drive is corrupted
>>> in all cases, because it has no csum on parity blocks.  The current
>>> on-disk format needs every data block in the raid5 stripe to be occupied
>>> by a file with a csum so scrub can eliminate every other block as the
>>> possible source of mismatched parity.  While this could be fixed by
>>> a future new raid5 profile (and/or csum tree) specifically designed
>>> to avoid this, it's not something I'd insist on having before deploying
>>> a fleet of btrfs raid5 boxes.  Silent corruption failures are so
>>> rare on spinning disks that I'd use the feature maybe once a decade.
>>> Silent corruption due to a failing or overheating HBA chip will most
>>> likely affect multiple disks at once and trash the whole filesystem,
>>> so individual drive-level corruption reporting isn't helpful.
>>>
>>>> Thanks,
>>>> Qu
>>>>
>>>>>
>>>>>> b. Mostly fix the write hole, also without changing the format, by
>>>>>> only doing COW with full stripe writes. Yes you could somehow get
>>>>>> corrupt parity still and not know it until degraded operation produces
>>>>>> a bad reconstruction of data - but checksum will still catch that.
>>>>>> This kind of "unreplicated corruption" is not quite the same thing as
>>>>>> the write hole, because it isn't pernicious like the write hole.
>>>>>
>>>>> What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
>>>>>
>>>>>> c. A new de-clustered parity raid56 implementation that is not
>>>>>> backwards compatible.
>>>>>
>>>>> Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
>>>>>
>>>>> Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
>>>>>
>>>>>
>>>>>>
>>>>>> Ergo, I think it's best to not break the format twice. Even if a new
>>>>>> raid implementation is years off.
>>>>>
>>>>> I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
>>>>>
>>>>> I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
>>>>>
>>>>> Thanks
>>>>> Forza
>>>>>
>>>>>>
>>>>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>>>>> does full stripe COW won't matter even if the performance is worse
>>>>>> because no one should use parity raid for this workload anyway.
>>>>>>
>>>>>>
>>>>>> --
>>>>>> Chris Murphy
>>>>>
>>>>>
>>

^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-25  0:00                             ` Zygo Blaxell
  2022-07-25  0:25                               ` Qu Wenruo
@ 2022-07-25 19:58                               ` Goffredo Baroncelli
  2022-07-25 21:29                                 ` Qu Wenruo
  1 sibling, 1 reply; 88+ messages in thread
From: Goffredo Baroncelli @ 2022-07-25 19:58 UTC (permalink / raw)
  To: Zygo Blaxell, Qu Wenruo
  Cc: Forza, Chris Murphy, Johannes Thumshirn, Qu Wenruo, linux-btrfs

On 25/07/2022 02.00, Zygo Blaxell wrote:
> On Tue, Jul 19, 2022 at 09:19:21AM +0800, Qu Wenruo wrote:
[...]
> 
> I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
> blocks inline with extents during writes) is not much more complex to
> implement on btrfs than compression; however, the btrfs kernel code
> couldn't read compressed data correctly for 12 years out of its 14-year
> history, and nobody wants to wait another decade or more for raid5
> to work.
> 
> It seems to me the biggest problem with write hole fixes is that all
> the potential fixes have cost tradeoffs, and everybody wants to veto
> the fix that has a cost they don't like.
> 
> We could implement multiple fix approaches at the same time, as AFAIK
> most of the proposed solutions are orthogonal to each other.  e.g. a
> write-ahead log can safely enable RMW at a higher IO cost, while the
> allocator could place extents to avoid RMW and thereby avoid the logging
> cost as much as possible (paid for by a deferred relocation/garbage
> collection cost), and using both at the same time would combine both
> benefits.  Both solutions can be used independently for filesystems at
> extreme ends of the performance/capacity spectrum (if the filesystem is
> never more than 50% full, then logging is all cost with no gain compared
> to allocator avoidance of RMW, while a filesystem that is always near
> full will have to journal writes and also throttle writes on the journal.

Kudos to Zygo; I have to say that I never encountered before a so clearly
explanation of the complexity around btrfs raid5/6 problems and the related
solutions.

> 
>>> For example a non-striped redundant-n profile as well as a striped redundant-n profile.
>>
>> Non-striped redundant-n profile is already so complex that I can't
>> figure out a working idea right now.
>>
>> But if there is such way, I'm pretty happy to consider.
>>
>>>
>>>>
>>>> My 2 cents...
>>>>
>>>> Regarding the current raid56 support, in order of preference:
>>>>
>>>> a. Fix the current bugs, without changing format. Zygo has an extensive list.
>>>
>>> I agree that relatively simple fixes should be made. But it seems we will need quite a large rewrite to solve all issues? Is there a minium viable option here?
>>
>> Nope. Just see my write-intent code, already have prototype (just needs
>> new scrub based recovery code at mount time) working.
>>
>> And based on my write-intent code, I don't think it's that hard to
>> implement a full journal.
> 
> FWIW I think we can get a very usable btrfs raid5 with a small format
> change (add a journal for stripe RMW, though we might disagree about
> details of how it should be structured and used)...

Again, I have to agree with Zygo. Even tough I am fascinating by a solution
like ZFS (parity block inside the extent), I think that a journal (and a
write intent log) is a more pragmatic approach:
- this kind of solution is below the btrfs bg; this would avoid to add further
   pressure on the metadata
- being below to the other btrfs structure may be shaped more easily with
   less risk of incompatibility

It is true that a ZFS solution may be more faster in some workload, but
I think that these are very few:
   - for high throughput, you likely write the full stripe which doesn't need
     journal/ppl
   - for small block update, a journal is more efficient than rewrite the
     full stripe


I hope that the end of Qu activities, will be a more robust raid5 btrfs
implementation, which will in turn increase the number of user, and which
in turn increase the pressure to improve this part of btrfs.

My only suggestion is to evaluate if we need to develop a write intent log
and then a journal, instead of developing the journal alone. I think that
two disk format changes are too much.


BR
G.Baroncelli
> and fixes to the
> read-repair and scrub problems.  The read-side problems in btrfs raid5
> were always much more severe than the write hole.  As soon as a disk
> goes offline, the read-repair code is unable to read all the surviving
> data correctly, and the filesystem has to be kept inactive or data on
> the disks will be gradually corrupted as bad parity gets mixed with data
> and written back to the filesystem.
> 
> A few of the problems will require a deeper redesign, but IMHO they're not
> important problems.  e.g. scrub can't identify which drive is corrupted
> in all cases, because it has no csum on parity blocks.  The current
> on-disk format needs every data block in the raid5 stripe to be occupied
> by a file with a csum so scrub can eliminate every other block as the
> possible source of mismatched parity.  While this could be fixed by
> a future new raid5 profile (and/or csum tree) specifically designed
> to avoid this, it's not something I'd insist on having before deploying
> a fleet of btrfs raid5 boxes.  Silent corruption failures are so
> rare on spinning disks that I'd use the feature maybe once a decade.
> Silent corruption due to a failing or overheating HBA chip will most
> likely affect multiple disks at once and trash the whole filesystem,
> so individual drive-level corruption reporting isn't helpful.
> 
>> Thanks,
>> Qu
>>
>>>
>>>> b. Mostly fix the write hole, also without changing the format, by
>>>> only doing COW with full stripe writes. Yes you could somehow get
>>>> corrupt parity still and not know it until degraded operation produces
>>>> a bad reconstruction of data - but checksum will still catch that.
>>>> This kind of "unreplicated corruption" is not quite the same thing as
>>>> the write hole, because it isn't pernicious like the write hole.
>>>
>>> What is the difference to a)? Is write hole the worst issue? Judging from the #brtfs channel discussions there seems to be other quite severe issues, for example real data corruption risks in degraded mode.
>>>
>>>> c. A new de-clustered parity raid56 implementation that is not
>>>> backwards compatible.
>>>
>>> Yes. We have a good opportunity to work out something much better than current implementations. We could have  redundant-n profiles that also works with tired storage like ssd/nvme similar to the metadata on ssd idea.
>>>
>>> Variable stripe width has been brought up before, but received cool responses. Why is that? IMO it could improve random 4k ios by doing equivalent to RAID1 instead of RMW, while also closing the write hole. Perhaps there is a middle ground to be found?
>>>
>>>
>>>>
>>>> Ergo, I think it's best to not break the format twice. Even if a new
>>>> raid implementation is years off.
>>>
>>> I very agree here. Btrfs already suffers in public opinion from the lack of a stable and safe-for-data RAID56, and requiring several non-compatible chances isn't going to help.
>>>
>>> I also think it's important that the 'temporary' changes actually leads to a stable filesystem. Because what is the point otherwise?
>>>
>>> Thanks
>>> Forza
>>>
>>>>
>>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>>> does full stripe COW won't matter even if the performance is worse
>>>> because no one should use parity raid for this workload anyway.
>>>>
>>>>
>>>> --
>>>> Chris Murphy
>>>
>>>

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli <kreijackATinwind.it>
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


^ permalink raw reply	[flat|nested] 88+ messages in thread

* Re: RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree")
  2022-07-25 19:58                               ` Goffredo Baroncelli
@ 2022-07-25 21:29                                 ` Qu Wenruo
  0 siblings, 0 replies; 88+ messages in thread
From: Qu Wenruo @ 2022-07-25 21:29 UTC (permalink / raw)
  To: kreijack, Zygo Blaxell
  Cc: Forza, Chris Murphy, Johannes Thumshirn, Qu Wenruo, linux-btrfs



On 2022/7/26 03:58, Goffredo Baroncelli wrote:
> On 25/07/2022 02.00, Zygo Blaxell wrote:
>> On Tue, Jul 19, 2022 at 09:19:21AM +0800, Qu Wenruo wrote:
> [...]
>>
>> I'd agree with that.  e.g. some btrfs equivalent of ZFS raidZ (put parity
>> blocks inline with extents during writes) is not much more complex to
>> implement on btrfs than compression; however, the btrfs kernel code
>> couldn't read compressed data correctly for 12 years out of its 14-year
>> history, and nobody wants to wait another decade or more for raid5
>> to work.
>>
>> It seems to me the biggest problem with write hole fixes is that all
>> the potential fixes have cost tradeoffs, and everybody wants to veto
>> the fix that has a cost they don't like.
>>
>> We could implement multiple fix approaches at the same time, as AFAIK
>> most of the proposed solutions are orthogonal to each other.  e.g. a
>> write-ahead log can safely enable RMW at a higher IO cost, while the
>> allocator could place extents to avoid RMW and thereby avoid the logging
>> cost as much as possible (paid for by a deferred relocation/garbage
>> collection cost), and using both at the same time would combine both
>> benefits.  Both solutions can be used independently for filesystems at
>> extreme ends of the performance/capacity spectrum (if the filesystem is
>> never more than 50% full, then logging is all cost with no gain compared
>> to allocator avoidance of RMW, while a filesystem that is always near
>> full will have to journal writes and also throttle writes on the journal.
>
> Kudos to Zygo; I have to say that I never encountered before a so clearly
> explanation of the complexity around btrfs raid5/6 problems and the related
> solutions.
>
>>
>>>> For example a non-striped redundant-n profile as well as a striped
>>>> redundant-n profile.
>>>
>>> Non-striped redundant-n profile is already so complex that I can't
>>> figure out a working idea right now.
>>>
>>> But if there is such way, I'm pretty happy to consider.
>>>
>>>>
>>>>>
>>>>> My 2 cents...
>>>>>
>>>>> Regarding the current raid56 support, in order of preference:
>>>>>
>>>>> a. Fix the current bugs, without changing format. Zygo has an
>>>>> extensive list.
>>>>
>>>> I agree that relatively simple fixes should be made. But it seems we
>>>> will need quite a large rewrite to solve all issues? Is there a
>>>> minium viable option here?
>>>
>>> Nope. Just see my write-intent code, already have prototype (just needs
>>> new scrub based recovery code at mount time) working.
>>>
>>> And based on my write-intent code, I don't think it's that hard to
>>> implement a full journal.
>>
>> FWIW I think we can get a very usable btrfs raid5 with a small format
>> change (add a journal for stripe RMW, though we might disagree about
>> details of how it should be structured and used)...
>
> Again, I have to agree with Zygo. Even tough I am fascinating by a solution
> like ZFS (parity block inside the extent), I think that a journal (and a
> write intent log) is a more pragmatic approach:
> - this kind of solution is below the btrfs bg; this would avoid to add
> further
>    pressure on the metadata
> - being below to the other btrfs structure may be shaped more easily with
>    less risk of incompatibility
>
> It is true that a ZFS solution may be more faster in some workload, but
> I think that these are very few:
>    - for high throughput, you likely write the full stripe which doesn't
> need
>      journal/ppl
>    - for small block update, a journal is more efficient than rewrite the
>      full stripe
>
>
> I hope that the end of Qu activities, will be a more robust raid5 btrfs
> implementation, which will in turn increase the number of user, and which
> in turn increase the pressure to improve this part of btrfs.
>
> My only suggestion is to evaluate if we need to develop a write intent log
> and then a journal, instead of developing the journal alone. I think that
> two disk format changes are too much.

That won't be a problem.

For write-intent, we only need 4K, while during the development, I have
reserved 1MiB for write-intent and future journal.

Thus the format change will only be once.

Furthermore, that 1MiB can be tuned to be larger easily for journal.
And for existing RAID56 users, there will be a pretty quick way to
convert to the new write-intent/journal feature.

Thanks,
Qu

>
>
> BR
> G.Baroncelli
>> and fixes to the
>> read-repair and scrub problems.  The read-side problems in btrfs raid5
>> were always much more severe than the write hole.  As soon as a disk
>> goes offline, the read-repair code is unable to read all the surviving
>> data correctly, and the filesystem has to be kept inactive or data on
>> the disks will be gradually corrupted as bad parity gets mixed with data
>> and written back to the filesystem.
>>
>> A few of the problems will require a deeper redesign, but IMHO they're
>> not
>> important problems.  e.g. scrub can't identify which drive is corrupted
>> in all cases, because it has no csum on parity blocks.  The current
>> on-disk format needs every data block in the raid5 stripe to be occupied
>> by a file with a csum so scrub can eliminate every other block as the
>> possible source of mismatched parity.  While this could be fixed by
>> a future new raid5 profile (and/or csum tree) specifically designed
>> to avoid this, it's not something I'd insist on having before deploying
>> a fleet of btrfs raid5 boxes.  Silent corruption failures are so
>> rare on spinning disks that I'd use the feature maybe once a decade.
>> Silent corruption due to a failing or overheating HBA chip will most
>> likely affect multiple disks at once and trash the whole filesystem,
>> so individual drive-level corruption reporting isn't helpful.
>>
>>> Thanks,
>>> Qu
>>>
>>>>
>>>>> b. Mostly fix the write hole, also without changing the format, by
>>>>> only doing COW with full stripe writes. Yes you could somehow get
>>>>> corrupt parity still and not know it until degraded operation produces
>>>>> a bad reconstruction of data - but checksum will still catch that.
>>>>> This kind of "unreplicated corruption" is not quite the same thing as
>>>>> the write hole, because it isn't pernicious like the write hole.
>>>>
>>>> What is the difference to a)? Is write hole the worst issue? Judging
>>>> from the #brtfs channel discussions there seems to be other quite
>>>> severe issues, for example real data corruption risks in degraded mode.
>>>>
>>>>> c. A new de-clustered parity raid56 implementation that is not
>>>>> backwards compatible.
>>>>
>>>> Yes. We have a good opportunity to work out something much better
>>>> than current implementations. We could have  redundant-n profiles
>>>> that also works with tired storage like ssd/nvme similar to the
>>>> metadata on ssd idea.
>>>>
>>>> Variable stripe width has been brought up before, but received cool
>>>> responses. Why is that? IMO it could improve random 4k ios by doing
>>>> equivalent to RAID1 instead of RMW, while also closing the write
>>>> hole. Perhaps there is a middle ground to be found?
>>>>
>>>>
>>>>>
>>>>> Ergo, I think it's best to not break the format twice. Even if a new
>>>>> raid implementation is years off.
>>>>
>>>> I very agree here. Btrfs already suffers in public opinion from the
>>>> lack of a stable and safe-for-data RAID56, and requiring several
>>>> non-compatible chances isn't going to help.
>>>>
>>>> I also think it's important that the 'temporary' changes actually
>>>> leads to a stable filesystem. Because what is the point otherwise?
>>>>
>>>> Thanks
>>>> Forza
>>>>
>>>>>
>>>>> Metadata centric workloads suck on parity raid anyway. If Btrfs always
>>>>> does full stripe COW won't matter even if the performance is worse
>>>>> because no one should use parity raid for this workload anyway.
>>>>>
>>>>>
>>>>> --
>>>>> Chris Murphy
>>>>
>>>>
>

^ permalink raw reply	[flat|nested] 88+ messages in thread

end of thread, other threads:[~2022-07-25 21:33 UTC | newest]

Thread overview: 88+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-16 14:31 [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Johannes Thumshirn
2022-05-16 14:31 ` [RFC ONLY 1/8] btrfs: add raid stripe tree definitions Johannes Thumshirn
2022-05-17  7:39   ` Qu Wenruo
2022-05-17  7:45     ` Johannes Thumshirn
2022-05-17  7:56       ` Qu Wenruo
2022-05-16 14:31 ` [RFC ONLY 2/8] btrfs: move btrfs_io_context to volumes.h Johannes Thumshirn
2022-05-17  7:42   ` Qu Wenruo
2022-05-17  7:51     ` Johannes Thumshirn
2022-05-17  7:58       ` Qu Wenruo
2022-05-17  8:01         ` Johannes Thumshirn
2022-05-16 14:31 ` [RFC ONLY 3/8] btrfs: read raid-stripe-tree from disk Johannes Thumshirn
2022-05-17  8:09   ` Qu Wenruo
2022-05-17  8:13     ` Johannes Thumshirn
2022-05-17  8:28       ` Qu Wenruo
2022-05-18 11:29         ` Johannes Thumshirn
2022-05-19  8:36           ` Qu Wenruo
2022-05-19  8:39             ` Johannes Thumshirn
2022-05-19 10:37               ` Qu Wenruo
2022-05-19 11:44                 ` Johannes Thumshirn
2022-05-19 11:48                   ` Qu Wenruo
2022-05-19 11:53                     ` Johannes Thumshirn
2022-05-19 13:26                       ` Qu Wenruo
2022-05-19 13:49                         ` Johannes Thumshirn
2022-05-19 22:56                           ` Qu Wenruo
2022-05-20  8:27                             ` Johannes Thumshirn
2022-05-16 14:31 ` [RFC ONLY 4/8] btrfs: add boilerplate code to insert raid extent Johannes Thumshirn
2022-05-17  7:53   ` Qu Wenruo
2022-05-17  8:00   ` Qu Wenruo
2022-05-17  8:05     ` Johannes Thumshirn
2022-05-17  8:09       ` Qu Wenruo
2022-05-16 14:31 ` [RFC ONLY 5/8] btrfs: add code to delete " Johannes Thumshirn
2022-05-17  8:06   ` Qu Wenruo
2022-05-17  8:10     ` Johannes Thumshirn
2022-05-17  8:14       ` Qu Wenruo
2022-05-17  8:20         ` Johannes Thumshirn
2022-05-17  8:31           ` Qu Wenruo
2022-05-16 14:31 ` [RFC ONLY 6/8] btrfs: add code to read " Johannes Thumshirn
2022-05-16 14:55   ` Josef Bacik
2022-05-16 14:31 ` [RFC ONLY 7/8] btrfs: zoned: allow zoned RAID1 Johannes Thumshirn
2022-05-16 14:31 ` [RFC ONLY 8/8] btrfs: add raid stripe tree pretty printer Johannes Thumshirn
2022-05-16 14:58 ` [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree Josef Bacik
2022-05-16 15:04   ` Johannes Thumshirn
2022-05-16 15:10     ` Josef Bacik
2022-05-16 15:47       ` Johannes Thumshirn
2022-05-17  7:23 ` Nikolay Borisov
2022-05-17  7:31   ` Qu Wenruo
2022-05-17  7:41     ` Johannes Thumshirn
2022-05-17  7:32   ` Johannes Thumshirn
2022-07-13 10:54 ` RAID56 discussion related to RST. (Was "Re: [RFC ONLY 0/8] btrfs: introduce raid-stripe-tree") Qu Wenruo
2022-07-13 11:43   ` Johannes Thumshirn
2022-07-13 12:01     ` Qu Wenruo
2022-07-13 12:42       ` Johannes Thumshirn
2022-07-13 13:47         ` Qu Wenruo
2022-07-13 14:01           ` Johannes Thumshirn
2022-07-13 15:24             ` Lukas Straub
2022-07-13 15:28               ` Johannes Thumshirn
2022-07-14  1:08             ` Qu Wenruo
2022-07-14  7:08               ` Johannes Thumshirn
2022-07-14  7:32                 ` Qu Wenruo
2022-07-14  7:46                   ` Johannes Thumshirn
2022-07-14  7:53                     ` Qu Wenruo
2022-07-15 17:54                     ` Goffredo Baroncelli
2022-07-15 19:08                       ` Thiago Ramon
2022-07-16  0:34                         ` Qu Wenruo
2022-07-16 11:11                           ` Qu Wenruo
2022-07-16 13:52                             ` Thiago Ramon
2022-07-16 14:26                               ` Goffredo Baroncelli
2022-07-17 17:58                                 ` Goffredo Baroncelli
2022-07-17  0:30                               ` Qu Wenruo
2022-07-17 15:18                                 ` Thiago Ramon
2022-07-17 22:01                                   ` Qu Wenruo
2022-07-17 23:00                           ` Zygo Blaxell
2022-07-18  1:04                             ` Qu Wenruo
2022-07-15 20:14                       ` Chris Murphy
2022-07-18  7:33                         ` Johannes Thumshirn
2022-07-18  8:03                           ` Qu Wenruo
2022-07-18 21:49                         ` Forza
2022-07-19  1:19                           ` Qu Wenruo
2022-07-21 14:51                             ` Forza
2022-07-24 11:27                               ` Qu Wenruo
2022-07-25  0:00                             ` Zygo Blaxell
2022-07-25  0:25                               ` Qu Wenruo
2022-07-25  5:41                                 ` Zygo Blaxell
2022-07-25  7:49                                   ` Qu Wenruo
2022-07-25 19:58                               ` Goffredo Baroncelli
2022-07-25 21:29                                 ` Qu Wenruo
2022-07-18  7:30                       ` Johannes Thumshirn
2022-07-19 18:58                         ` Goffredo Baroncelli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.