linux-btrfs.vger.kernel.org archive mirror
* [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
@ 2024-01-18  8:54 Naohiro Aota
  2024-01-18  8:54 ` [PATCH 1/2] btrfs: introduce inline_csum_mode to tweak inline checksum behavior Naohiro Aota
                   ` (5 more replies)
  0 siblings, 6 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-18  8:54 UTC (permalink / raw)
  To: linux-btrfs; +Cc: wangyugui, Naohiro Aota

There was a report of write performance regression on 6.5-rc4 on RAID0
(4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
and doing the checksum inline can be bad for performance on RAID0
setup [2]. 

[1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
[2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/

While doing the fast checksum inline is good for a single (or two) device
setup, it is not fast enough for multi-device striped writing.

So, this series first introduces fs_devices->inline_csum_mode and its
sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
it disables the inline checksum when it finds a block group striping
writes across multiple devices.
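
For reference, the combined effect of the two patches on the submission
decision can be modeled by the small standalone sketch below. This is a
simplified userspace approximation, not the kernel code: it ignores the
other conditions in should_async_write() (e.g. zoned metadata handling),
and the struct and helper names here are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the three values of the new sysfs knob (patch 1). */
enum csum_mode { CSUM_AUTO, CSUM_FORCE_ON, CSUM_FORCE_OFF };

struct fs_state {
        enum csum_mode mode;    /* /sys/fs/btrfs/<uuid>/inline_csum */
        bool csum_impl_fast;    /* BTRFS_FS_CSUM_IMPL_FAST */
        bool striped_writing;   /* set by patch 2: RAID0/10/5/6, >2 devices */
};

/* Returns true when the checksum should be offloaded to a workqueue. */
static bool should_offload_csum(const struct fs_state *fs)
{
        if (fs->mode == CSUM_FORCE_ON)
                return false;   /* always checksum in the submitter */
        if (fs->mode == CSUM_AUTO && fs->csum_impl_fast && !fs->striped_writing)
                return false;   /* fast csum and no striped writing */
        return true;            /* forced off, slow csum, or striped FS */
}

int main(void)
{
        struct fs_state single = { CSUM_AUTO, true, false };
        struct fs_state raid0x6 = { CSUM_AUTO, true, true };

        printf("single dev, auto: offload=%d\n", should_offload_csum(&single));
        printf("6-dev RAID0, auto: offload=%d\n", should_offload_csum(&raid0x6));
        return 0;
}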

Naohiro Aota (2):
  btrfs: introduce inline_csum_mode to tweak inline checksum behavior
  btrfs: detect multi-dev stripe and disable automatic inline checksum

 fs/btrfs/bio.c     | 14 ++++++++++++--
 fs/btrfs/sysfs.c   | 39 +++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c | 20 ++++++++++++++++++++
 fs/btrfs/volumes.h | 21 +++++++++++++++++++++
 4 files changed, 92 insertions(+), 2 deletions(-)

-- 
2.43.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/2] btrfs: introduce inline_csum_mode to tweak inline checksum behavior
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
@ 2024-01-18  8:54 ` Naohiro Aota
  2024-01-18  8:54 ` [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum Naohiro Aota
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-18  8:54 UTC (permalink / raw)
  To: linux-btrfs; +Cc: wangyugui, Naohiro Aota

We disable offloading the checksum to workqueues and do it inline when the
checksum algorithm is fast. However, as reported in the links below, RAID0
with multiple devices may suffer from the inline checksum, because even a
"fast checksum" cannot keep up with RAID0 writes.

To measure the effectiveness of the inline checksum, it is useful to have a
switch to control it.

This commit introduces fs_devices->inline_csum_mode, so that a FS user can
change the behavior by writing to /sys/fs/btrfs/<uuid>/inline_csum. The
default is "auto", which is the same as the previous behavior. Alternatively,
you can set "on" or "off" to always or never use the inline checksum.

Link: https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
Link: https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/bio.c     |  8 +++++++-
 fs/btrfs/sysfs.c   | 39 +++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h | 19 +++++++++++++++++++
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 2d20215548db..222ee52a3af1 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -609,8 +609,14 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
 
 static bool should_async_write(struct btrfs_bio *bbio)
 {
+	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
+
+	if (fs_devices->inline_csum_mode == BTRFS_INLINE_CSUM_FORCE_ON)
+		return false;
+
 	/* Submit synchronously if the checksum implementation is fast. */
-	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+	if (fs_devices->inline_csum_mode == BTRFS_INLINE_CSUM_AUTO &&
+	    test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
 		return false;
 
 	/*
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 84c05246ffd8..f7491bc2950e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1306,6 +1306,44 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
 BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
 	      btrfs_bg_reclaim_threshold_store);
 
+static ssize_t btrfs_inline_csum_show(struct kobject *kobj,
+				      struct kobj_attribute *a,
+				      char *buf)
+{
+	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+
+	switch (fs_devices->inline_csum_mode) {
+	case BTRFS_INLINE_CSUM_AUTO:
+		return sysfs_emit(buf, "auto\n");
+	case BTRFS_INLINE_CSUM_FORCE_ON:
+		return sysfs_emit(buf, "on\n");
+	case BTRFS_INLINE_CSUM_FORCE_OFF:
+		return sysfs_emit(buf, "off\n");
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
+}
+
+static ssize_t btrfs_inline_csum_store(struct kobject *kobj,
+				       struct kobj_attribute *a,
+				       const char *buf, size_t len)
+{
+	struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+
+	if (sysfs_streq(buf, "auto"))
+		fs_devices->inline_csum_mode = BTRFS_INLINE_CSUM_AUTO;
+	else if (sysfs_streq(buf, "on"))
+		fs_devices->inline_csum_mode = BTRFS_INLINE_CSUM_FORCE_ON;
+	else if (sysfs_streq(buf, "off"))
+		fs_devices->inline_csum_mode = BTRFS_INLINE_CSUM_FORCE_OFF;
+	else
+		return -EINVAL;
+
+	return len;
+}
+BTRFS_ATTR_RW(, inline_csum, btrfs_inline_csum_show, btrfs_inline_csum_store);
+
 /*
  * Per-filesystem information and stats.
  *
@@ -1325,6 +1363,7 @@ static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, bg_reclaim_threshold),
 	BTRFS_ATTR_PTR(, commit_stats),
 	BTRFS_ATTR_PTR(, temp_fsid),
+	BTRFS_ATTR_PTR(, inline_csum),
 	NULL,
 };
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53f87f398da7..f21cfe268be9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -276,6 +276,22 @@ enum btrfs_read_policy {
 	BTRFS_NR_READ_POLICY,
 };
 
+/*
+ * Checksum mode - do it in btrfs_submit_chunk() or offload it.
+ */
+enum btrfs_inline_csum_mode {
+	/*
+	 * Choose inline checksum or offloading automatically. Do it
+	 * inline if the checksum is fast, or offload to workqueues
+	 * otherwise.
+	 */
+	BTRFS_INLINE_CSUM_AUTO,
+	/* Never offload checksum to workqueues. */
+	BTRFS_INLINE_CSUM_FORCE_ON,
+	/* Always offload checksum to workqueues. */
+	BTRFS_INLINE_CSUM_FORCE_OFF,
+};
+
 struct btrfs_fs_devices {
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 
@@ -380,6 +396,9 @@ struct btrfs_fs_devices {
 
 	/* Policy used to read the mirrored stripes. */
 	enum btrfs_read_policy read_policy;
+
+	/* Checksum mode - do it inline or offload it. */
+	enum btrfs_inline_csum_mode inline_csum_mode;
 };
 
 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)	\
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
  2024-01-18  8:54 ` [PATCH 1/2] btrfs: introduce inline_csum_mode to tweak inline checksum behavior Naohiro Aota
@ 2024-01-18  8:54 ` Naohiro Aota
  2024-01-19 15:29   ` Johannes Thumshirn
  2024-01-18  9:12 ` [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Roman Mamedov
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: Naohiro Aota @ 2024-01-18  8:54 UTC (permalink / raw)
  To: linux-btrfs; +Cc: wangyugui, Naohiro Aota

Currently, we disable offloading the checksum to workqueues if the checksum
algorithm implementation is fast. However, it may not be fast enough for
multi-device striping, as reported in the links below.

For example, below is a result of running the following fio benchmark on a
6-device RAID0 (both data and metadata) btrfs.

fio --group_reporting --rw=write --fallocate=none \
     --direct=0 --ioengine=libaio --iodepth=32 --end_fsync=1 \
     --filesize=200G bs=$((64 * 6))k \
     --time_based --runtime=300s \
     --directory=/mnt --name=writer --numjobs=6

with this patch (offloading checksum to workqueues)
    WRITE: bw=2084MiB/s (2185MB/s), 2084MiB/s-2084MiB/s (2185MB/s-2185MB/s), io=750GiB (806GB), run=368752-368752msec
without this patch (inline checksum)
    WRITE: bw=1447MiB/s (1517MB/s), 1447MiB/s-1447MiB/s (1517MB/s-1517MB/s), io=517GiB (555GB), run=366017-366017msec

So, it is better to disable the inline checksum when there is a block group
striping writes across several devices (RAID0/10/5/6). For now, I simply set
the threshold to 2: if a chunk writes to more than 2 devices, the inline
checksum is disabled.
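
In other words, the added check boils down to the standalone sketch below
(my paraphrase, not the kernel code; the devs_max == 0 test corresponds to
profiles without a fixed device limit, i.e. RAID0/10/5/6):

#include <stdbool.h>
#include <stdio.h>

#define INLINE_CSUM_MAX_DEVS 2          /* threshold used by this patch */

/*
 * A chunk counts as "striped writing" when its profile has no fixed
 * device limit (devs_max == 0, as for RAID0/10/5/6) and it spans more
 * than INLINE_CSUM_MAX_DEVS stripes.
 */
static bool chunk_is_striped_writing(int devs_max, int num_stripes)
{
        if (devs_max != 0)      /* SINGLE/DUP/RAID1*: bounded device count */
                return false;
        return num_stripes > INLINE_CSUM_MAX_DEVS;
}

int main(void)
{
        printf("RAID0, 6 stripes: %d\n", chunk_is_striped_writing(0, 6)); /* 1 */
        printf("RAID0, 2 stripes: %d\n", chunk_is_striped_writing(0, 2)); /* 0 */
        printf("RAID1, 2 stripes: %d\n", chunk_is_striped_writing(2, 2)); /* 0 */
        return 0;
}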

For reference, here is a result on a 1-device RAID0 setup. As expected, no
degradation is introduced.

with this patch:
    WRITE: bw=445MiB/s (467MB/s), 445MiB/s-445MiB/s (467MB/s-467MB/s), io=302GiB (324GB), run=694199-694199msec
without this patch
    WRITE: bw=441MiB/s (462MB/s), 441MiB/s-441MiB/s (462MB/s-462MB/s), io=300GiB (322GB), run=696125-696125msec

Link: https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
Link: https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/bio.c     |  8 ++++++--
 fs/btrfs/volumes.c | 20 ++++++++++++++++++++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 222ee52a3af1..dfdba72ea0da 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -614,9 +614,13 @@ static bool should_async_write(struct btrfs_bio *bbio)
 	if (fs_devices->inline_csum_mode == BTRFS_INLINE_CSUM_FORCE_ON)
 		return false;
 
-	/* Submit synchronously if the checksum implementation is fast. */
+	/*
+	 * Submit synchronously if the checksum implementation is
+	 * fast, and it is not backed by multiple devices striping.
+	 */
 	if (fs_devices->inline_csum_mode == BTRFS_INLINE_CSUM_AUTO &&
-	    test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+	    test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags) &&
+	    !fs_devices->striped_writing)
 		return false;
 
 	/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d67785be2c77..79c1af049e9e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -41,6 +41,12 @@
 					 BTRFS_BLOCK_GROUP_RAID10 | \
 					 BTRFS_BLOCK_GROUP_RAID56_MASK)
 
+/*
+ * Maximum number of devices automatically enabling inline checksum for
+ * a striped write.
+ */
+#define BTRFS_INLINE_CSUM_MAX_DEVS 2
+
 struct btrfs_io_geometry {
 	u32 stripe_index;
 	u32 stripe_nr;
@@ -5124,6 +5130,19 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
 	btrfs_set_fs_incompat(info, RAID1C34);
 }
 
+static void check_striped_block_group(struct btrfs_fs_info *info, u64 type, int num_stripes)
+{
+	if (btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max != 0 ||
+	    num_stripes <= BTRFS_INLINE_CSUM_MAX_DEVS)
+		return;
+
+	/*
+	 * Found a block group writing to multiple devices, disable
+	 * inline automatic checksum.
+	 */
+	info->fs_devices->striped_writing = true;
+}
+
 /*
  * Structure used internally for btrfs_create_chunk() function.
  * Wraps needed parameters.
@@ -5592,6 +5611,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 
 	check_raid56_incompat_flag(info, type);
 	check_raid1c34_incompat_flag(info, type);
+	check_striped_block_group(info, type, map->num_stripes);
 
 	return block_group;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f21cfe268be9..c32501985c64 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -384,6 +384,8 @@ struct btrfs_fs_devices {
 	bool seeding;
 	/* The mount needs to use a randomly generated fsid. */
 	bool temp_fsid;
+	/* Has a block group writing stripes to multiple devices. (RAID0/10/5/6). */
+	bool striped_writing;
 
 	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
  2024-01-18  8:54 ` [PATCH 1/2] btrfs: introduce inline_csum_mode to tweak inline checksum behavior Naohiro Aota
  2024-01-18  8:54 ` [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum Naohiro Aota
@ 2024-01-18  9:12 ` Roman Mamedov
  2024-01-19 15:49   ` David Sterba
  2024-01-22  7:17   ` Naohiro Aota
  2024-01-19 15:30 ` Johannes Thumshirn
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 17+ messages in thread
From: Roman Mamedov @ 2024-01-18  9:12 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: linux-btrfs, wangyugui

On Thu, 18 Jan 2024 17:54:49 +0900
Naohiro Aota <naohiro.aota@wdc.com> wrote:

> There was a report of write performance regression on 6.5-rc4 on RAID0
> (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> and doing the checksum inline can be bad for performance on RAID0
> setup [2]. 
> 
> [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> 
> While inlining the fast checksum is good for single (or two) device,
> but it is not fast enough for multi-device striped writing.

Personal opinion: it is a very awkward criterion for enabling or disabling
the inline mode. There can be a RAID0 of SATA HDDs/SSDs that will be slower
than a single PCI-E 4.0 NVMe SSD. In [1] the inline mode slashes the
performance from 4 GB/sec to 1.5 GB/sec. A single modern SSD is capable of up
to 6 GB/sec.

Secondly, less often, there can be a hardware RAID which presents itself to the
OS as a single device, but is also very fast.

Sure, basing such a decision on anything else, such as a benchmark of the
actual block device, may not be as feasible.

> So, this series first introduces fs_devices->inline_csum_mode and its
> sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> it disables inline checksum when it find a block group striped writing
> into multiple devices.

Has it been determined what improvement enabling the inline mode brings at all,
and in which setups? Maybe just disable it by default and provide this tweak
option?

> Naohiro Aota (2):
>   btrfs: introduce inline_csum_mode to tweak inline checksum behavior
>   btrfs: detect multi-dev stripe and disable automatic inline checksum
> 
>  fs/btrfs/bio.c     | 14 ++++++++++++--
>  fs/btrfs/sysfs.c   | 39 +++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c | 20 ++++++++++++++++++++
>  fs/btrfs/volumes.h | 21 +++++++++++++++++++++
>  4 files changed, 92 insertions(+), 2 deletions(-)
> 


-- 
With respect,
Roman

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum
  2024-01-18  8:54 ` [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum Naohiro Aota
@ 2024-01-19 15:29   ` Johannes Thumshirn
  2024-01-22  8:02     ` Naohiro Aota
  2024-01-22 21:11     ` David Sterba
  0 siblings, 2 replies; 17+ messages in thread
From: Johannes Thumshirn @ 2024-01-19 15:29 UTC (permalink / raw)
  To: Naohiro Aota, linux-btrfs; +Cc: wangyugui

On 18.01.24 09:55, Naohiro Aota wrote:
> +static void check_striped_block_group(struct btrfs_fs_info *info, u64 type, int num_stripes)
> +{
> +	if (btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max != 0 ||
> +	    num_stripes <= BTRFS_INLINE_CSUM_MAX_DEVS)
> +		return;
> +
> +	/*
> +	 * Found a block group writing to multiple devices, disable
> +	 * inline automatic checksum.
> +	 */
> +	info->fs_devices->striped_writing = true;
> +}
> +

This function adds some overly long lines.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
                   ` (2 preceding siblings ...)
  2024-01-18  9:12 ` [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Roman Mamedov
@ 2024-01-19 15:30 ` Johannes Thumshirn
  2024-01-19 16:01 ` David Sterba
  2024-01-24  0:19 ` Wang Yugui
  5 siblings, 0 replies; 17+ messages in thread
From: Johannes Thumshirn @ 2024-01-19 15:30 UTC (permalink / raw)
  To: Naohiro Aota, linux-btrfs; +Cc: wangyugui

Apart from the style nit in 2/2:
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  9:12 ` [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Roman Mamedov
@ 2024-01-19 15:49   ` David Sterba
  2024-01-22 15:31     ` Naohiro Aota
  2024-01-22  7:17   ` Naohiro Aota
  1 sibling, 1 reply; 17+ messages in thread
From: David Sterba @ 2024-01-19 15:49 UTC (permalink / raw)
  To: Roman Mamedov; +Cc: Naohiro Aota, linux-btrfs, wangyugui

On Thu, Jan 18, 2024 at 02:12:31PM +0500, Roman Mamedov wrote:
> On Thu, 18 Jan 2024 17:54:49 +0900
> Naohiro Aota <naohiro.aota@wdc.com> wrote:
> 
> > There was a report of write performance regression on 6.5-rc4 on RAID0
> > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > and doing the checksum inline can be bad for performance on RAID0
> > setup [2]. 
> > 
> > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > 
> > While inlining the fast checksum is good for single (or two) device,
> > but it is not fast enough for multi-device striped writing.
> 
> Personal opinion, it is a very awkward criteria to enable or disable the
> inline mode. There can be a RAID0 of SATA HDDs/SSDs that will be slower than a
> single PCI-E 4.0 NVMe SSD. In [1] the inline mode slashes the performance from
> 4 GB/sec to 1.5 GB/sec. A single modern SSD is capable of up to 6 GB/sec.
> 
> Secondly, less often, there can be a hardware RAID which presents itself to the
> OS as a single device, but is also very fast.

Yeah, I find that the decision logic does not adapt well to various types of
underlying hardware. While multi-device striped writing sounds like a good
heuristic, it can still lead to worse performance.

> Sure, basing such decision on anything else, such as benchmark of the
> actual block device may not be as feasible.

In an ideal case it adapts to the current load or device capabilities, which
needs a feedback loop and status tracking for the offloading decision.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
                   ` (3 preceding siblings ...)
  2024-01-19 15:30 ` Johannes Thumshirn
@ 2024-01-19 16:01 ` David Sterba
  2024-01-22 15:12   ` Naohiro Aota
  2024-01-24  0:19 ` Wang Yugui
  5 siblings, 1 reply; 17+ messages in thread
From: David Sterba @ 2024-01-19 16:01 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: linux-btrfs, wangyugui

On Thu, Jan 18, 2024 at 05:54:49PM +0900, Naohiro Aota wrote:
> There was a report of write performance regression on 6.5-rc4 on RAID0
> (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> and doing the checksum inline can be bad for performance on RAID0
> setup [2]. 

First, please don't name it 'inline checksum', it's so confusing because
we have 'inline' as inline files and also the inline checksums stored in
the b-tree nodes.

> [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> 
> While inlining the fast checksum is good for single (or two) device,
> but it is not fast enough for multi-device striped writing.
> 
> So, this series first introduces fs_devices->inline_csum_mode and its
> sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> it disables inline checksum when it find a block group striped writing
> into multiple devices.

How is one supposed to know if and how the sysfs knob should be set?
This depends on the device speed(s), profiles and number of devices; can
the same decision logic be replicated inside btrfs? Such tuning should
be done automatically (similar things are done in other subsystems like
memory management).

With this type of setting we'll get people randomly flipping it on/off
to see if it fixes performance, without actually looking at whether it's
relevant or not. We've seen this with random advice circulating around the
internet on how to fix enospc problems; it's next to impossible to stop
that, so I really don't want to allow that for performance.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  9:12 ` [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Roman Mamedov
  2024-01-19 15:49   ` David Sterba
@ 2024-01-22  7:17   ` Naohiro Aota
  1 sibling, 0 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-22  7:17 UTC (permalink / raw)
  To: Roman Mamedov; +Cc: linux-btrfs, wangyugui, hch, clm

On Thu, Jan 18, 2024 at 02:12:31PM +0500, Roman Mamedov wrote:
> On Thu, 18 Jan 2024 17:54:49 +0900
> Naohiro Aota <naohiro.aota@wdc.com> wrote:
>
> > There was a report of write performance regression on 6.5-rc4 on RAID0
> > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > and doing the checksum inline can be bad for performance on RAID0
> > setup [2].
> >
> > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> >
> > While inlining the fast checksum is good for single (or two) device,
> > but it is not fast enough for multi-device striped writing.
>
> Personal opinion, it is a very awkward criteria to enable or disable the
> inline mode. There can be a RAID0 of SATA HDDs/SSDs that will be slower than a
> single PCI-E 4.0 NVMe SSD. In [1] the inline mode slashes the performance from
> 4 GB/sec to 1.5 GB/sec. A single modern SSD is capable of up to 6 GB/sec.
>
> Secondly, less often, there can be a hardware RAID which presents itself to the
> OS as a single device, but is also very fast.
>
> Sure, basing such decision on anything else, such as benchmark of the
> actual block device may not be as feasible.
>
> > So, this series first introduces fs_devices->inline_csum_mode and its
> > sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> > it disables inline checksum when it find a block group striped writing
> > into multiple devices.
>
> Has it been determined what improvement enabling the inline mode brings at all,
> and in which setups? Maybe just disable it by default and provide this tweak
> option?

Note: as mentioned by David, I'm going to say "sync checksum" instead of
"inline checksum".

I'm going to list the benchmark results here.

The sync checksum was introduced by this patch.

https://lore.kernel.org/linux-btrfs/20230503070615.1029820-2-hch@lst.de/

The benchmark described in the patch originates from this email by Chris.

https://lore.kernel.org/linux-btrfs/eb544c31-7a74-d503-83f0-4dc226917d1a@meta.com/

* Device: Intel Optane

* workqueue checksum (Unpatched):
  write: IOPS=3316, BW=3316MiB/s (3477MB/s)(200GiB/61757msec); 0 zone resets

* Sync checksum (synchronous CRCs):
  write: IOPS=4882, BW=4882MiB/s (5119MB/s)(200GiB/41948msec); 0 zone resets

Christoph also did the same on KVM with consumer drives and got a similar
result. Furthermore, even with "non-accelerated crc32 code", "the workqueue
offload only looked better for really large writes, and then only
marginally."

https://lore.kernel.org/linux-btrfs/20230325081341.GB7353@lst.de/

Then, Wang Yugui reported a regression on both SINGLE and RAID0 setups.

https://lore.kernel.org/linux-btrfs/20230811222321.2AD2.409509F4@e16-tech.com/

* CPU: E5 2680 v2, two NUMA nodes
* RAM: 192G
* Device: NVMe SSD PCIe3 x4
* Btrfs profile: data=raid0, metadata=raid1
  - all PCIe3 NVMe SSD are connected to one NVMe HBA/one numa node.

* workqueue checksum: RAID0:
  WRITE: bw=3858MiB/s (4045MB/s)
  WRITE: bw=3781MiB/s (3965MB/s)
* sync checksum: RAID0:
  WRITE: bw=1311MiB/s (1375MB/s)
  WRITE: bw=1435MiB/s (1504MB/s)

* workqueue checksum: SINGLE:
  WRITE: bw=3004MiB/s (3149MB/s)
  WRITE: bw=2851MiB/s (2990MB/s)
* sync checksum: SINGLE:
  WRITE: bw=1337MiB/s (1402MB/s)
  WRITE: bw=1413MiB/s (1481MB/s)

So, workqueue (old) method is way better on his machine.

After a while, I reported that the workqueue checksum is better than the
sync checksum in a 6-SSD RAID0 case.

https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/

* CPU: Intel(R) Xeon(R) Platinum 8260 CPU, two NUMA nodes, 96 CPUs
* RAM: 1024G

On 6 SSDs RAID0
* workqueue checksum:
  WRITE: bw=2106MiB/s (2208MB/s), 2106MiB/s-2106MiB/s (2208MB/s-2208MB/s), io=760GiB (816GB), run=369705-369705msec
* sync checksum:
  WRITE: bw=1391MiB/s (1458MB/s), 1391MiB/s-1391MiB/s (1458MB/s-1458MB/s), io=499GiB (536GB), run=367312-367312msec

Or, even with 1 SSD setup (still RAID0):
* workqueue checksum:
  WRITE: bw=437MiB/s (459MB/s), 437MiB/s-437MiB/s (459MB/s-459MB/s), io=299GiB (321GB), run=699787-699787msec
* sync checksum:
  WRITE: bw=442MiB/s (464MB/s), 442MiB/s-442MiB/s (464MB/s-464MB/s), io=302GiB (324GB), run=698553-698553msec

Like Wang Yugui, I got better performance with the workqueue checksum.

I also tested it on an emulated fast device (null_blk with irqmode=0)
today. The device is formatted with the default profile.

* CPU: Intel(R) Xeon(R) Platinum 8260 CPU, two NUMA nodes, 96 CPUs
* RAM: 1024G
* Device: null_blk with irqmode=0, use_per_node_hctx=1, memory_backed=1, size=512000 (512GB)
* Btrfs profile: data=single, metadata=dup

I ran this fio command with this series applied, using the sysfs knob to
tweak the checksum mode.

fio --group_reporting --eta=always --eta-interval=30s --eta-newline=30s \
    --rw=write \
    --direct=0 --ioengine=psync \
    --filesize=${filesize} \
    --blocksize=1m \
    --end_fsync=1 \
    --directory=/mnt \
    --name=writer --numjobs=${numjobs}

I tried several setups, but I could not get better performance with the sync
checksum. Examples are shown below.

With numjobs=96, filesize=2GB
* workqueue checksum: (writing "off" to the newly added sysfs file)
  WRITE: bw=1776MiB/s (1862MB/s), 1776MiB/s-1776MiB/s (1862MB/s-1862MB/s), io=192GiB (206GB), run=110733-110733msec
* sync checksum:      (writing "on" to the sysfs file)
  WRITE: bw=1037MiB/s (1088MB/s), 1037MiB/s-1037MiB/s (1088MB/s-1088MB/s), io=192GiB (206GB), run=189550-189550msec

With numjobs=368, filesize=512MB
* workqueue checksum:
  WRITE: bw=1726MiB/s (1810MB/s), 1726MiB/s-1726MiB/s (1810MB/s-1810MB/s), io=192GiB (206GB), run=113902-113902msec
* sync checksum
  WRITE: bw=1221MiB/s (1280MB/s), 1221MiB/s-1221MiB/s (1280MB/s-1280MB/s), io=192GiB (206GB), run=161060-161060msec

Also, I ran a similar experiment on a different machine, which has 32 CPUs
and 128 GB of RAM. Since it has less RAM, the filesize is also smaller than
above. And, again, the workqueue checksum is slightly better.

* workqueue checksum:
  WRITE: bw=298MiB/s (313MB/s), 298MiB/s-298MiB/s (313MB/s-313MB/s), io=32.0GiB (34.4GB), run=109883-109883msec
* sync checksum
  WRITE: bw=275MiB/s (288MB/s), 275MiB/s-275MiB/s (288MB/s-288MB/s), io=32.0GiB (34.4GB), run=119169-119169msec


When I started writing this reply, I thought the proper criterion might be
the number of CPUs, or some balance of the number of CPUs vs. disks. But now,
as I could not get the "sync checksum" to be better on any setup, I'm getting
puzzled. Is the "sync checksum" still really effective? Maybe it's good on a
machine with fewer CPUs (~4?) and a single device?

In addition, we are also going to have a change on the workqueue side, which
changes the maximum number of concurrent work items and is especially
effective for NUMA machines.

https://lore.kernel.org/all/20240113002911.406791-1-tj@kernel.org/

Anyway, we need more benchmark results to see the effect of "sync checksum"
and "workqueue checksum".

>
> > Naohiro Aota (2):
> >   btrfs: introduce inline_csum_mode to tweak inline checksum behavior
> >   btrfs: detect multi-dev stripe and disable automatic inline checksum
> >
> >  fs/btrfs/bio.c     | 14 ++++++++++++--
> >  fs/btrfs/sysfs.c   | 39 +++++++++++++++++++++++++++++++++++++++
> >  fs/btrfs/volumes.c | 20 ++++++++++++++++++++
> >  fs/btrfs/volumes.h | 21 +++++++++++++++++++++
> >  4 files changed, 92 insertions(+), 2 deletions(-)
> >
>
>
> --
> With respect,
> Roman

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum
  2024-01-19 15:29   ` Johannes Thumshirn
@ 2024-01-22  8:02     ` Naohiro Aota
  2024-01-22 21:11     ` David Sterba
  1 sibling, 0 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-22  8:02 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: linux-btrfs, wangyugui

On Fri, Jan 19, 2024 at 03:29:13PM +0000, Johannes Thumshirn wrote:
> On 18.01.24 09:55, Naohiro Aota wrote:
> > +static void check_striped_block_group(struct btrfs_fs_info *info, u64 type, int num_stripes)
> > +{
> > +	if (btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max != 0 ||
> > +	    num_stripes <= BTRFS_INLINE_CSUM_MAX_DEVS)
> > +		return;
> > +
> > +	/*
> > +	 * Found a block group writing to multiple devices, disable
> > +	 * inline automatic checksum.
> > +	 */
> > +	info->fs_devices->striped_writing = true;
> > +}
> > +
> 
> This function adds some overly long lines.

Oh, checkpatch didn't give an error. But, I'll move "num_stripes" to the
next line in the next version, well, if it's still needed.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-19 16:01 ` David Sterba
@ 2024-01-22 15:12   ` Naohiro Aota
  2024-01-22 21:19     ` David Sterba
  0 siblings, 1 reply; 17+ messages in thread
From: Naohiro Aota @ 2024-01-22 15:12 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, wangyugui

On Fri, Jan 19, 2024 at 05:01:01PM +0100, David Sterba wrote:
> On Thu, Jan 18, 2024 at 05:54:49PM +0900, Naohiro Aota wrote:
> > There was a report of write performance regression on 6.5-rc4 on RAID0
> > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > and doing the checksum inline can be bad for performance on RAID0
> > setup [2]. 
> 
> First, please don't name it 'inline checksum', it's so confusing because
> we have 'inline' as inline files and also the inline checksums stored in
> the b-tree nodes.

Sure. Sorry for the confusing naming. Is it OK to call it "sync checksum",
and "workqueue checksum" for the opposite?

> 
> > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > 
> > While inlining the fast checksum is good for single (or two) device,
> > but it is not fast enough for multi-device striped writing.
> > 
> > So, this series first introduces fs_devices->inline_csum_mode and its
> > sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> > it disables inline checksum when it find a block group striped writing
> > into multiple devices.
> 
> How is one supposed to know if and how the sysfs knob should be set?
> This depends on the device speed(s), profiles and number of devices, can
> the same decision logic be replicated inside btrfs? Such tuning should
> be done automatically (similar things are done in other subystems like
> memory management).

Yeah, I first thought it was OK to turn the sync checksum off automatically
in, e.g., the RAID0 case. But, as reported in [1], it turns out to be
difficult. It might also depend on the CPUs.

[1] https://lore.kernel.org/linux-btrfs/irc2v7zqrpbkeehhysq7fccwmguujnkrktknl3d23t2ecwope6@o62qzd4yyxt2/T/#u

> With such type of setting we'll get people randomly flipping it on/off
> and see if it fixes performance, without actually looking if it's
> relevant or not. We've seen this with random advice circling around
> internet how to fix enospc problems, it's next to impossible to stop
> that so I really don't want to allow that for performance.

Yes, I agree it's nasty to have a random switch.

But, in [1], I can't find a setup that has better performance with the sync
checksum (not even a SINGLE setup). So, I think we need to rethink and
examine the effectiveness of the sync checksum vs the workqueue checksum.
For that evaluation, I'd like to leave the sysfs knob under
CONFIG_BTRFS_DEBUG.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-19 15:49   ` David Sterba
@ 2024-01-22 15:31     ` Naohiro Aota
  0 siblings, 0 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-22 15:31 UTC (permalink / raw)
  To: David Sterba; +Cc: Roman Mamedov, linux-btrfs, wangyugui

On Fri, Jan 19, 2024 at 04:49:27PM +0100, David Sterba wrote:
> On Thu, Jan 18, 2024 at 02:12:31PM +0500, Roman Mamedov wrote:
> > On Thu, 18 Jan 2024 17:54:49 +0900
> > Naohiro Aota <naohiro.aota@wdc.com> wrote:
> > 
> > > There was a report of write performance regression on 6.5-rc4 on RAID0
> > > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > > and doing the checksum inline can be bad for performance on RAID0
> > > setup [2]. 
> > > 
> > > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > > 
> > > While inlining the fast checksum is good for single (or two) device,
> > > but it is not fast enough for multi-device striped writing.
> > 
> > Personal opinion, it is a very awkward criteria to enable or disable the
> > inline mode. There can be a RAID0 of SATA HDDs/SSDs that will be slower than a
> > single PCI-E 4.0 NVMe SSD. In [1] the inline mode slashes the performance from
> > 4 GB/sec to 1.5 GB/sec. A single modern SSD is capable of up to 6 GB/sec.
> > 
> > Secondly, less often, there can be a hardware RAID which presents itself to the
> > OS as a single device, but is also very fast.
> 
> Yeah I find the decision logic not adapting well to various types of
> underlying hardware. While the multi-device and striped sounds like a
> good heuristic it can still lead to worse performance.
> 
> > Sure, basing such decision on anything else, such as benchmark of the
> > actual block device may not be as feasible.
> 
> In an ideal case it adapts to current load or device capabilities, which
> needs a feedback loop and tracking the status for the offloading
> decision.

Yeah, it would be best if we could implement it properly...

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum
  2024-01-19 15:29   ` Johannes Thumshirn
  2024-01-22  8:02     ` Naohiro Aota
@ 2024-01-22 21:11     ` David Sterba
  1 sibling, 0 replies; 17+ messages in thread
From: David Sterba @ 2024-01-22 21:11 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: Naohiro Aota, linux-btrfs, wangyugui

On Fri, Jan 19, 2024 at 03:29:13PM +0000, Johannes Thumshirn wrote:
> On 18.01.24 09:55, Naohiro Aota wrote:
> > +static void check_striped_block_group(struct btrfs_fs_info *info, u64 type, int num_stripes)
> > +{
> > +	if (btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max != 0 ||
> > +	    num_stripes <= BTRFS_INLINE_CSUM_MAX_DEVS)
> > +		return;
> > +
> > +	/*
> > +	 * Found a block group writing to multiple devices, disable
> > +	 * inline automatic checksum.
> > +	 */
> > +	info->fs_devices->striped_writing = true;
> > +}
> > +
> 
> This function adds some overly long lines.

The line length limits are not strict and checkpatch has been updated
from 80 to 100, so it's up to our taste. For function prototypes, around
85-ish is OK.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-22 15:12   ` Naohiro Aota
@ 2024-01-22 21:19     ` David Sterba
  0 siblings, 0 replies; 17+ messages in thread
From: David Sterba @ 2024-01-22 21:19 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: David Sterba, linux-btrfs, wangyugui

On Mon, Jan 22, 2024 at 03:12:27PM +0000, Naohiro Aota wrote:
> On Fri, Jan 19, 2024 at 05:01:01PM +0100, David Sterba wrote:
> > On Thu, Jan 18, 2024 at 05:54:49PM +0900, Naohiro Aota wrote:
> > > There was a report of write performance regression on 6.5-rc4 on RAID0
> > > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > > and doing the checksum inline can be bad for performance on RAID0
> > > setup [2]. 
> > 
> > First, please don't name it 'inline checksum', it's so confusing because
> > we have 'inline' as inline files and also the inline checksums stored in
> > the b-tree nodes.
> 
> Sure. Sorry for the confusing naming. Is it OK to call it "sync checksum"?
> and "workqueue checksum" for the opposite?

Well 'sync' also already has a meaning :) we could call it 'checksum
offloading'.

> > > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > > 
> > > While inlining the fast checksum is good for single (or two) device,
> > > but it is not fast enough for multi-device striped writing.
> > > 
> > > So, this series first introduces fs_devices->inline_csum_mode and its
> > > sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> > > it disables inline checksum when it find a block group striped writing
> > > into multiple devices.
> > 
> > How is one supposed to know if and how the sysfs knob should be set?
> > This depends on the device speed(s), profiles and number of devices, can
> > the same decision logic be replicated inside btrfs? Such tuning should
> > be done automatically (similar things are done in other subystems like
> > memory management).
> 
> Yeah, I first thought it was OK to turn sync checksum off automatically on
> e.g, RAID0 case. But, as reported in [1], it becomes difficult. It might
> depend also on CPUs.
> 
> [1] https://lore.kernel.org/linux-btrfs/irc2v7zqrpbkeehhysq7fccwmguujnkrktknl3d23t2ecwope6@o62qzd4yyxt2/T/#u
> 
> > With such type of setting we'll get people randomly flipping it on/off
> > and see if it fixes performance, without actually looking if it's
> > relevant or not. We've seen this with random advice circling around
> > internet how to fix enospc problems, it's next to impossible to stop
> > that so I really don't want to allow that for performance.
> 
> Yes, I agree it's nasty to have a random switch.
> 
> But, in [1], I can't find a setup that has a better performance on sync
> checksum (even for SINGLE setup). So, I think, we need to rethink and
> examine the effectiveness of sync checksum vs workqueue checksum. So, for
> the evaluation, I'd like to leave the sysfs knob under CONFIG_BTRFS_DEBUG.

OK to have it for debugging and evaluation. Thanks for the results [1];
clearly the switch to sync checksums was not backed by enough
benchmarks. As a fallback we can revert the change, at the cost of
pessimizing the one case where it helped compared to the several others
where it makes things worse.

We might find a heuristic based on device type and turn it back on
selectively, e.g. for NVMe-like devices.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
                   ` (4 preceding siblings ...)
  2024-01-19 16:01 ` David Sterba
@ 2024-01-24  0:19 ` Wang Yugui
  2024-01-29 12:56   ` Wang Yugui
  5 siblings, 1 reply; 17+ messages in thread
From: Wang Yugui @ 2024-01-24  0:19 UTC (permalink / raw)
  To: Naohiro Aota; +Cc: linux-btrfs

Hi,

> There was a report of write performance regression on 6.5-rc4 on RAID0
> (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> and doing the checksum inline can be bad for performance on RAID0
> setup [2]. 
> 
> [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> 
> While inlining the fast checksum is good for single (or two) device,
> but it is not fast enough for multi-device striped writing.
> 
> So, this series first introduces fs_devices->inline_csum_mode and its
> sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> it disables inline checksum when it find a block group striped writing
> into multiple devices.

We had struct btrfs_inode's sync_writers in kernel 6.1.y, but it was dropped
in recent kernels.

Was struct btrfs_inode's sync_writers not implemented very well?

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2024/01/24



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-24  0:19 ` Wang Yugui
@ 2024-01-29 12:56   ` Wang Yugui
  2024-01-30  1:38     ` Naohiro Aota
  0 siblings, 1 reply; 17+ messages in thread
From: Wang Yugui @ 2024-01-29 12:56 UTC (permalink / raw)
  To: Naohiro Aota, linux-btrfs

Hi,

> Hi,
> 
> > There was a report of write performance regression on 6.5-rc4 on RAID0
> > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > and doing the checksum inline can be bad for performance on RAID0
> > setup [2]. 
> > 
> > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > 
> > While inlining the fast checksum is good for single (or two) device,
> > but it is not fast enough for multi-device striped writing.
> > 
> > So, this series first introduces fs_devices->inline_csum_mode and its
> > sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> > it disables inline checksum when it find a block group striped writing
> > into multiple devices.
> 
> We have struct btrfs_inode | sync_writers in kernel 6.1.y, but dropped in recent
> kernel. 
> 
> Is btrfs_inode | sync_writers not implemented very well?

I tried the logic below, something like the old 'btrfs_inode' sync_writers:
- checksum of metadata: always sync
- checksum of data: async only when the depth is over 1,
	to reduce task switches under low load,
	to use more CPU cores under high load.

The performance test result is not good:
	2GiB/s (checksum of data always async) -> 2.1GiB/s under low load.
	4GiB/s (checksum of data always async) -> 2788MiB/s under high load.

but the info may be useful, so I post it here.


- checksum of metadata: always sync

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 12b12443efaa..8ef968f0957d 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -598,7 +598,7 @@ static void run_one_async_free(struct btrfs_work *work)
 static bool should_async_write(struct btrfs_bio *bbio)
 {
 	/* Submit synchronously if the checksum implementation is fast. */
-	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
+	if ((bbio->bio.bi_opf & REQ_META) && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
 		return false;
 
 	/*


- checksum of data: async only when the depth is over 1, to reduce task switches.

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index efb894967f55..f90b6e8cf53c 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -626,6 +626,9 @@ static bool should_async_write(struct btrfs_bio *bbio)
 	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
 		return false;
 
+	if (!(bbio->bio.bi_opf & REQ_META) && atomic_read(&bbio->fs_info->depth_checksum_data)==0 )
+		return false;
+
 	return true;
 }
 
@@ -725,11 +728,21 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
 		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
 		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
 		    !btrfs_is_data_reloc_root(inode->root)) {
-			if (should_async_write(bbio) &&
-			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
-				goto done;
-
+			if (should_async_write(bbio)){
+				if (!(bbio->bio.bi_opf & REQ_META))
+					atomic_inc(&bbio->fs_info->depth_checksum_data);
+				ret = btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num);
+				if (!(bbio->bio.bi_opf & REQ_META))
+					atomic_dec(&bbio->fs_info->depth_checksum_data);
+				if(ret)
+					goto done;
+			}
+
+			if (!(bbio->bio.bi_opf & REQ_META))
+				atomic_inc(&bbio->fs_info->depth_checksum_data);
 			ret = btrfs_bio_csum(bbio);
+			if (!(bbio->bio.bi_opf & REQ_META))
+				atomic_dec(&bbio->fs_info->depth_checksum_data);
 			if (ret)
 				goto fail_put_bio;
 		} else if (use_append) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d7b127443c9a..3fd89be7610a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2776,6 +2776,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
+	atomic_set(&fs_info->depth_checksum_data, 0);
 
 	INIT_LIST_HEAD(&fs_info->ordered_roots);
 	spin_lock_init(&fs_info->ordered_root_lock);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 7443bf014639..123cc8fa9be1 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -596,6 +596,7 @@ struct btrfs_fs_info {
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	u32 thread_pool_size;
+	atomic_t depth_checksum_data;
 
 	struct kobject *space_info_kobj;
 	struct kobject *qgroups_kobj;

Best Regards
Wang Yugui (wangyugui@e16-tech.com)
2024/01/25



^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: Re: [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS
  2024-01-29 12:56   ` Wang Yugui
@ 2024-01-30  1:38     ` Naohiro Aota
  0 siblings, 0 replies; 17+ messages in thread
From: Naohiro Aota @ 2024-01-30  1:38 UTC (permalink / raw)
  To: Wang Yugui; +Cc: linux-btrfs

On Mon, Jan 29, 2024 at 08:56:22PM +0800, Wang Yugui wrote:
> Hi,
> 
> > Hi,
> > 
> > > There was a report of write performance regression on 6.5-rc4 on RAID0
> > > (4 devices) btrfs [1]. Then, I reported that BTRFS_FS_CSUM_IMPL_FAST
> > > and doing the checksum inline can be bad for performance on RAID0
> > > setup [2]. 
> > > 
> > > [1] https://lore.kernel.org/linux-btrfs/20230731152223.4EFB.409509F4@e16-tech.com/
> > > [2] https://lore.kernel.org/linux-btrfs/p3vo3g7pqn664mhmdhlotu5dzcna6vjtcoc2hb2lsgo2fwct7k@xzaxclba5tae/
> > > 
> > > While inlining the fast checksum is good for single (or two) device,
> > > but it is not fast enough for multi-device striped writing.
> > > 
> > > So, this series first introduces fs_devices->inline_csum_mode and its
> > > sysfs interface to tweak the inline csum behavior (auto/on/off). Then,
> > > it disables inline checksum when it find a block group striped writing
> > > into multiple devices.
> > 
> > We have struct btrfs_inode | sync_writers in kernel 6.1.y, but dropped in recent
> > kernel. 
> > 
> > Is btrfs_inode | sync_writers not implemented very well?
> 
> I tried the logic blow, some like ' btrfs_inode | sync_writers'.
> - checksum of metadata always sync
> - checksum of data async only when depth over 1,
> 	to reduce task switch when low load.
> 	to use more cpu core when high load.
> 
> performance test result is not good
> 	2GiB/s(checksum of data always async) -> 2.1GiB/s when low load.
> 	4GiB/s(checksum of data always async) -> 2788MiB/s when high load.
> 
> but the info  maybe useful, so post it here.
> 
> 
> - checksum of metadata always sync
> 
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index 12b12443efaa..8ef968f0957d 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -598,7 +598,7 @@ static void run_one_async_free(struct btrfs_work *work)
>  static bool should_async_write(struct btrfs_bio *bbio)
>  {
>  	/* Submit synchronously if the checksum implementation is fast. */
> -	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
> +	if ((bbio->bio.bi_opf & REQ_META) && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
>  		return false;
>  
>  	/*
> 
> 
> - checksum of data async only when depth over 1, to reduce task switch.
> 
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index efb894967f55..f90b6e8cf53c 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -626,6 +626,9 @@ static bool should_async_write(struct btrfs_bio *bbio)
>  	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
>  		return false;
>  
> +	if (!(bbio->bio.bi_opf & REQ_META) && atomic_read(&bbio->fs_info->depth_checksum_data)==0 )
> +		return false;
> +
>  	return true;
>  }
>  
> @@ -725,11 +728,21 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
>  		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
>  		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
>  		    !btrfs_is_data_reloc_root(inode->root)) {
> -			if (should_async_write(bbio) &&
> -			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
> -				goto done;
> -
> +			if (should_async_write(bbio)){
> +				if (!(bbio->bio.bi_opf & REQ_META))
> +					atomic_inc(&bbio->fs_info->depth_checksum_data);
> +				ret = btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num);
> +				if (!(bbio->bio.bi_opf & REQ_META))
> +					atomic_dec(&bbio->fs_info->depth_checksum_data);

This does not look well implemented. btrfs_wq_submit_bio() returns
soon after it queues the checksum work to the workqueue, without processing
any actual work.

> +				if(ret)
> +					goto done;
> +			}
> +
> +			if (!(bbio->bio.bi_opf & REQ_META))
> +				atomic_inc(&bbio->fs_info->depth_checksum_data);
>  			ret = btrfs_bio_csum(bbio);
> +			if (!(bbio->bio.bi_opf & REQ_META))
> +				atomic_dec(&bbio->fs_info->depth_checksum_data);

These lines seem fine. But, as the btrfs_wq_submit_bio() side is not well
implemented, it will work like this:

- The first data bio goes to the sync checksum (btrfs_bio_csum()) because
  depth_checksum_data == 0.
- While at it, the second one comes and goes to btrfs_wq_submit_bio()
  because depth_checksum_data == 1.
- Then, depth_checksum_data soon decrements back to 1. The same happens for
  the other parallel IOs.
- Once the checksum of the first bio finishes, depth_checksum_data == 0
  again, accepting the sync checksum. But, at this time, the workqueue may
  already have a lot of work queued.

>  			if (ret)
>  				goto fail_put_bio;
>  		} else if (use_append) {
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index d7b127443c9a..3fd89be7610a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2776,6 +2776,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>  
>  	fs_info->thread_pool_size = min_t(unsigned long,
>  					  num_online_cpus() + 2, 8);
> +	atomic_set(&fs_info->depth_checksum_data, 0);
>  
>  	INIT_LIST_HEAD(&fs_info->ordered_roots);
>  	spin_lock_init(&fs_info->ordered_root_lock);
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index 7443bf014639..123cc8fa9be1 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -596,6 +596,7 @@ struct btrfs_fs_info {
>  	struct task_struct *transaction_kthread;
>  	struct task_struct *cleaner_kthread;
>  	u32 thread_pool_size;
> +	atomic_t depth_checksum_data;
>  
>  	struct kobject *space_info_kobj;
>  	struct kobject *qgroups_kobj;
> 
> Best Regards
> Wang Yugui (wangyugui@e16-tech.com)
> 2024/01/25
> 
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2024-01-30  1:39 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-18  8:54 [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Naohiro Aota
2024-01-18  8:54 ` [PATCH 1/2] btrfs: introduce inline_csum_mode to tweak inline checksum behavior Naohiro Aota
2024-01-18  8:54 ` [PATCH 2/2] btrfs: detect multi-dev stripe and disable automatic inline checksum Naohiro Aota
2024-01-19 15:29   ` Johannes Thumshirn
2024-01-22  8:02     ` Naohiro Aota
2024-01-22 21:11     ` David Sterba
2024-01-18  9:12 ` [PATCH 0/2] btrfs: disable inline checksum for multi-dev striped FS Roman Mamedov
2024-01-19 15:49   ` David Sterba
2024-01-22 15:31     ` Naohiro Aota
2024-01-22  7:17   ` Naohiro Aota
2024-01-19 15:30 ` Johannes Thumshirn
2024-01-19 16:01 ` David Sterba
2024-01-22 15:12   ` Naohiro Aota
2024-01-22 21:19     ` David Sterba
2024-01-24  0:19 ` Wang Yugui
2024-01-29 12:56   ` Wang Yugui
2024-01-30  1:38     ` Naohiro Aota
