* [PATCH] btrfs: scrub: per-device bandwidth control
@ 2021-05-18 14:49 David Sterba
  2021-05-18 16:52 ` Holger Hoffstätte
                   ` (3 more replies)
  0 siblings, 4 replies; 20+ messages in thread
From: David Sterba @ 2021-05-18 14:49 UTC (permalink / raw)
  To: linux-btrfs; +Cc: David Sterba

Add a sysfs interface to limit IO during scrub. We used to rely on the
ionice interface for that, e.g. the idle class kept the system usable
while scrub was running. This changed when mq-deadline became
widespread, as it does not implement the scheduling classes. That was a
CFQ feature and CFQ got deleted. We've received numerous complaints
from users about degraded performance.

Currently only BFQ supports the scheduling classes, but it's not a
common scheduler and we can't ask everybody to switch to it.

Alternatively, the cgroup io limiting can be used, but that is also a
non-trivial setup (v2 required, the controller must be enabled on the
system). This can still be used if desired.

Another idea that has been explored: piggy-back on ionice (which is set
per-process and is accessible) and interpret the class and classdata as
bandwidth limits. This does not have enough flexibility, as there are
only 8 allowed values and we'd have to map a fixed limit to each of
them. Also, adjusting the value would require looking up the process
that currently runs scrub on the given device, and the value is not
sticky, so it would have to be set again each time scrub runs.

Running out of options, sysfs does not look that bad:

- it's accessible from scripts or udev rules
- the name is similar to what MD-RAID has
  (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max)
- the value is sticky at least for the time the filesystem is mounted
- adjusting the value has immediate effect
- sysfs is available in constrained environments (e.g. system rescue)
- the limit also applies to device replace

Sysfs:

- raw value is in bytes
- values written to the file accept suffixes like K, M
- file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
- 0 means use the default IO priority (no limit)

The scheduler is a simple deadline one and the accuracy is up to the
nearest 128K.
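
For illustration, a minimal usage sketch (FSID is a placeholder for the
actual filesystem UUID, devid 1 is assumed):

  # limit scrub/replace on devid 1 to 100MiB/s
  echo 100M > /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max

  # read back the raw value in bytes
  cat /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max
  104857600

  # remove the limit again
  echo 0 > /sys/fs/btrfs/FSID/devinfo/1/scrub_speed_max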

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c   | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/sysfs.c   | 28 +++++++++++++++++++++
 fs/btrfs/volumes.h |  3 +++
 3 files changed, 92 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 485cda3eb8d7..83f3149253ea 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -165,6 +165,10 @@ struct scrub_ctx {
 	int			readonly;
 	int			pages_per_rd_bio;
 
+	/* State of IO submission throttling affecting the associated device */
+	ktime_t			throttle_deadline;
+	u64			throttle_sent;
+
 	int			is_dev_replace;
 	u64			write_pointer;
 
@@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
 	init_waitqueue_head(&sctx->list_wait);
+	sctx->throttle_deadline = 0;
 
 	WARN_ON(sctx->wr_curr_bio != NULL);
 	mutex_init(&sctx->wr_lock);
@@ -1988,6 +1993,60 @@ static void scrub_page_put(struct scrub_page *spage)
 	}
 }
 
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+	const int time_slice = 1000;
+	struct scrub_bio *sbio;
+	struct btrfs_device *device;
+	s64 delta;
+	ktime_t now;
+	u32 div;
+	u64 bwlimit;
+
+	sbio = sctx->bios[sctx->curr];
+	device = sbio->dev;
+	bwlimit = READ_ONCE(device->scrub_speed_max);
+	if (bwlimit == 0)
+		return;
+
+	/*
+	 * Slice is divided into intervals when the IO is submitted, adjust by
+	 * bwlimit and maximum of 64 intervals.
+	 */
+	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+	div = min_t(u32, 64, div);
+
+	/* Start new epoch, set deadline */
+	now = ktime_get();
+	if (sctx->throttle_deadline == 0) {
+		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+		sctx->throttle_sent = 0;
+	}
+
+	/* Still in the time to send? */
+	if (ktime_before(now, sctx->throttle_deadline)) {
+		/* If current bio is within the limit, send it */
+		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+		if (sctx->throttle_sent <= bwlimit / div)
+			return;
+
+		/* We're over the limit, sleep until the rest of the slice */
+		delta = ktime_ms_delta(sctx->throttle_deadline, now);
+	} else {
+		/* New request after deadline, start new epoch */
+		delta = 0;
+	}
+
+	if (delta)
+		schedule_timeout_interruptible(delta * HZ / 1000);
+	/* Next call will start the deadline period */
+	sctx->throttle_deadline = 0;
+}
+
 static void scrub_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_bio *sbio;
@@ -1995,6 +2054,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	if (sctx->curr == -1)
 		return;
 
+	scrub_throttle(sctx);
+
 	sbio = sctx->bios[sctx->curr];
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 436ac7b4b334..c45d9b6dfdb5 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1455,6 +1455,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
 }
 BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
 
+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
+					     struct kobj_attribute *a,
+					     char *buf)
+{
+	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+						   devid_kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n",
+			 READ_ONCE(device->scrub_speed_max));
+}
+
+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
+					      struct kobj_attribute *a,
+					      const char *buf, size_t len)
+{
+	struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+						   devid_kobj);
+	char *endptr;
+	unsigned long long limit;
+
+	limit = memparse(buf, &endptr);
+	WRITE_ONCE(device->scrub_speed_max, limit);
+	return len;
+}
+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
+	      btrfs_devinfo_scrub_speed_max_store);
+
 static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
 					    struct kobj_attribute *a, char *buf)
 {
@@ -1472,6 +1499,7 @@ static struct attribute *devid_attrs[] = {
 	BTRFS_ATTR_PTR(devid, in_fs_metadata),
 	BTRFS_ATTR_PTR(devid, missing),
 	BTRFS_ATTR_PTR(devid, replace_target),
+	BTRFS_ATTR_PTR(devid, scrub_speed_max),
 	BTRFS_ATTR_PTR(devid, writeable),
 	NULL
 };
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9c0d84e5ec06..063ce999b9d3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -143,6 +143,9 @@ struct btrfs_device {
 	struct completion kobj_unregister;
 	/* For sysfs/FSID/devinfo/devid/ */
 	struct kobject devid_kobj;
+
+	/* Bandwidth limit for scrub, in bytes */
+	u64 scrub_speed_max;
 };
 
 /*
-- 
2.29.2


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 14:49 [PATCH] btrfs: scrub: per-device bandwidth control David Sterba
@ 2021-05-18 16:52 ` Holger Hoffstätte
  2021-05-18 20:46   ` David Sterba
  2021-05-18 20:15 ` waxhead
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 20+ messages in thread
From: Holger Hoffstätte @ 2021-05-18 16:52 UTC (permalink / raw)
  To: linux-btrfs

On 2021-05-18 16:49, David Sterba wrote:
> Add sysfs interface to limit io during scrub. We relied on the ionice

This is very useful, thank you! Running scrubs on three devices with
10M/20M/30M bandwidth limit showed up in iotop/nmon exactly as expected,
and dynamically changing the values to speed up/slow down also worked
right away - very nice.

My only suggestion would be:

> - raw value is in bytes

...for this to be in megabytes only (maybe also renaming scrub_speed_max to
scrub_speed_max_mb?), because otherwise everyone will forget the unit and wonder
why scrub is running at 50 bytes/sec. IMHO bytes/kbytes are not really practical
scrub speeds. Other than that, have a:

Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>

cheers,
Holger

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 14:49 [PATCH] btrfs: scrub: per-device bandwidth control David Sterba
  2021-05-18 16:52 ` Holger Hoffstätte
@ 2021-05-18 20:15 ` waxhead
  2021-05-18 21:06   ` David Sterba
  2021-05-19  6:53 ` Johannes Thumshirn
  2021-05-20  7:43 ` Geert Uytterhoeven
  3 siblings, 1 reply; 20+ messages in thread
From: waxhead @ 2021-05-18 20:15 UTC (permalink / raw)
  To: David Sterba, linux-btrfs

David Sterba wrote:
> Add sysfs interface to limit io during scrub. We relied on the ionice
> interface to do that, eg. the idle class let the system usable while
> scrub was running. This has changed when mq-deadline got widespread and
> did not implement the scheduling classes. That was a CFQ thing that got
> deleted. We've got numerous complaints from users about degraded
> performance.
> 
> Currently only BFQ supports that but it's not a common scheduler and we
> can't ask everybody to switch to it.
> 
> Alternatively the cgroup io limiting can be used but that also a
> non-trivial setup (v2 required, the controller must be enabled on the
> system). This can still be used if desired.
> 
> Other ideas that have been explored: piggy-back on ionice (that is set
> per-process and is accessible) and interpret the class and classdata as
> bandwidth limits, but this does not have enough flexibility as there are
> only 8 allowed and we'd have to map fixed limits to each value. Also
> adjusting the value would need to lookup the process that currently runs
> scrub on the given device, and the value is not sticky so would have to
> be adjusted each time scrub runs.
> 
> Running out of options, sysfs does not look that bad:
> 
> - it's accessible from scripts, or udev rules
> - the name is similar to what MD-RAID has
>    (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max)
> - the value is sticky at least for filesystem mount time
> - adjusting the value has immediate effect
> - sysfs is available in constrained environments (eg. system rescue)
> - the limit also applies to device replace
> 
> Sysfs:
> 
> - raw value is in bytes
> - values written to the file accept suffixes like K, M
> - file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
> - 0 means use default priority of IO
> 

Being just an average user, I wonder whether this feature is an absolute
hard limit or not. E.g. would this limit be applied all the time, even
if the filesystem is idle, or is it dynamic in the sense that scrub can
run at full speed if the filesystem is (reasonably) idle?

If this is or could somehow be made dynamic, would it be possible to set
the value to something like for example 30M:15M, which may be
interpreted as 30MiB/s when idle and 15MiB/s when not idle.

I myself would actually much prefer if it were possible to set a
reserved bandwidth for all non-scrub related stuff, for example up to a
maximum of 75% of the bandwidth always reserved for non-scrub related
reads/writes. This would allow the scrub to run at full speed if no
other I/O is taking place, and otherwise the scrub would be limited
to 25%.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 16:52 ` Holger Hoffstätte
@ 2021-05-18 20:46   ` David Sterba
  0 siblings, 0 replies; 20+ messages in thread
From: David Sterba @ 2021-05-18 20:46 UTC (permalink / raw)
  To: Holger Hoffstätte; +Cc: linux-btrfs, Zygo Blaxell

On Tue, May 18, 2021 at 06:52:50PM +0200, Holger Hoffstätte wrote:
> On 2021-05-18 16:49, David Sterba wrote:
> > Add sysfs interface to limit io during scrub. We relied on the ionice
> > - raw value is in bytes
> 
> ..for this to be in megabytes only (maybe also renaming scrub_speed_max to
> scrub_speed_max_mb?), because otherwise everyone will forget the unit and wonder
> why scrub is running with 50 bytes/sec. IMHO bytes/kbytes are not really practical
> scrub speeds.

The granularity is 128K and the implementation checks whether the io is
over the quota, so a limit of 50 b/sec effectively becomes 128 K/sec.
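
As a worked example of the throttling math (numbers derived from the
patch, purely for illustration): with the limit set to 100M the divisor
works out to 6 (100M / 16M in integer division, clamped between 1 and
64), so the one second slice is split into ~166ms intervals and each
interval may submit about bwlimit / 6, roughly 16.7 MiB, before the
submitting thread sleeps until that interval's deadline.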

Regarding the units, Zygo mentioned a usecase where the limit was set
to ~100K/s while the system was booting and then lifted as needed, so
the unit would need to be at least kilobytes.

I understand the concerns about units; it's not consistent among sysfs
files, at least there's a convention to use the "_kb" suffix, but even
that is not applied consistently. My idea is that a suffix, when
specified, is user friendly enough and allows specifying a wide range
of numbers nicely, without the extra multiplication needed when the
unit is fixed. E.g. 'echo 500K', 'echo 5M' or even 'echo 1G' looks
obvious, but typos can happen when the unit is a fixed KB and the value
becomes 50000 instead of 500000.
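
For reference, memparse() treats the suffixes as binary multipliers, so
(illustration only):

  echo 500K  ->  512000 bytes
  echo 5M    ->  5242880 bytes
  echo 1G    ->  1073741824 bytes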

> Other than that have a:
> 
> Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>

Thanks!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 20:15 ` waxhead
@ 2021-05-18 21:06   ` David Sterba
  0 siblings, 0 replies; 20+ messages in thread
From: David Sterba @ 2021-05-18 21:06 UTC (permalink / raw)
  To: waxhead; +Cc: David Sterba, linux-btrfs

On Tue, May 18, 2021 at 10:15:57PM +0200, waxhead wrote:
> Being just the average user I ponder if this feature is an absolute 
> hard-limit or not? E.g. would this limit be applied all the time even if 
> the filesystem is idle? or is it dynamic in the sense that scrub can run 
> a full speed if the filesystem is (reasonably) idle.

The limit is absolute, so even if the device is idle, scrub won't go
faster.  Determining "is it idle" is the hard part and would require
information that the io scheduler has: interpret the numbers (e.g.
in-flight requests and overall size) and hope that they match the
actual device performance well enough to make the guesses.

I vaguely remember trying something like that but it became too similar
to a full blown io scheduler, so that's not the right way. There used to
be some kind of block device congestion query but this got removed.

> If this is or could somehow be dynamic would it be possible to set the 
> value to something like for example 30M:15M which may be interpreted as 
> 30MiB/s when not idle and 15MiB/s when not idle.
> 
> I myself would actually much prefer if it was possible to set a reserved 
> bandwidth for all non-scrub related stuff. for example maybe up to max 
> 75% of the bandwidth always reserved for non-scrub related read/writes.
> This would allow the scrub to run at full speed if no other I/O is is 
> taking place in which case the scrub would be limited to 25%.

This looks similar to the md-raid minimum speed setting; for scrub I
picked only the maximum, but there's /proc/sys/dev/raid/speed_limit_min,
which in your example would be 15MiB/s (and _max 30MiB/s). This could be
implemented once there's a way to know the idle status. Maybe something
will emerge over time; right now I don't know how to do it.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 14:49 [PATCH] btrfs: scrub: per-device bandwidth control David Sterba
  2021-05-18 16:52 ` Holger Hoffstätte
  2021-05-18 20:15 ` waxhead
@ 2021-05-19  6:53 ` Johannes Thumshirn
  2021-05-19 14:26   ` David Sterba
  2021-05-20  7:43 ` Geert Uytterhoeven
  3 siblings, 1 reply; 20+ messages in thread
From: Johannes Thumshirn @ 2021-05-19  6:53 UTC (permalink / raw)
  To: David Sterba, linux-btrfs

On 18/05/2021 16:52, David Sterba wrote:
> Add sysfs interface to limit io during scrub. We relied on the ionice
> interface to do that, eg. the idle class let the system usable while
> scrub was running. This has changed when mq-deadline got widespread and
> did not implement the scheduling classes. That was a CFQ thing that got
> deleted. We've got numerous complaints from users about degraded
> performance.
> 
> Currently only BFQ supports that but it's not a common scheduler and we
> can't ask everybody to switch to it.
> 
> Alternatively the cgroup io limiting can be used but that also a
> non-trivial setup (v2 required, the controller must be enabled on the
> system). This can still be used if desired.
> 
> Other ideas that have been explored: piggy-back on ionice (that is set
> per-process and is accessible) and interpret the class and classdata as
> bandwidth limits, but this does not have enough flexibility as there are
> only 8 allowed and we'd have to map fixed limits to each value. Also
> adjusting the value would need to lookup the process that currently runs
> scrub on the given device, and the value is not sticky so would have to
> be adjusted each time scrub runs.
> 
> Running out of options, sysfs does not look that bad:
> 
> - it's accessible from scripts, or udev rules
> - the name is similar to what MD-RAID has
>   (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max)
> - the value is sticky at least for filesystem mount time
> - adjusting the value has immediate effect
> - sysfs is available in constrained environments (eg. system rescue)
> - the limit also applies to device replace
> 
> Sysfs:
> 
> - raw value is in bytes
> - values written to the file accept suffixes like K, M
> - file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
> - 0 means use default priority of IO
> 
> The scheduler is a simple deadline one and the accuracy is up to nearest
> 128K.
> 
> Signed-off-by: David Sterba <dsterba@suse.com>

This looks very good :)
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

I wonder if this interface would make sense for limiting balance bandwidth as well?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19  6:53 ` Johannes Thumshirn
@ 2021-05-19 14:26   ` David Sterba
  2021-05-19 15:32     ` Johannes Thumshirn
  0 siblings, 1 reply; 20+ messages in thread
From: David Sterba @ 2021-05-19 14:26 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: David Sterba, linux-btrfs

On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
> On 18/05/2021 16:52, David Sterba wrote:
> I wonder if this interface would make sense for limiting balance
> bandwidth as well?

Balance is not contained to one device, which is what makes the scrub
case easy. For balance there are data and metadata involved, both read
and write, across several threads, so this is really something that the
cgroups io controller is supposed to do.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19 14:26   ` David Sterba
@ 2021-05-19 15:32     ` Johannes Thumshirn
  2021-05-19 16:20       ` Graham Cobb
  2021-05-20 12:28       ` David Sterba
  0 siblings, 2 replies; 20+ messages in thread
From: Johannes Thumshirn @ 2021-05-19 15:32 UTC (permalink / raw)
  To: dsterba; +Cc: David Sterba, linux-btrfs

On 19/05/2021 16:28, David Sterba wrote:
> On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
>> On 18/05/2021 16:52, David Sterba wrote:
>> I wonder if this interface would make sense for limiting balance
>> bandwidth as well?
> 
> Balance is not contained to one device, so this makes the scrub case
> easy. For balance there are data and metadata involved, both read and
> write accross several threads so this is really something that the
> cgroups io controler is supposed to do.
> 

For a user-initiated balance a cgroups io controller would work well,
yes.  But as a QoS mechanism to decide when it's a good time to
rebalance one or more block groups, e.g. with a zoned file system or
when we switch to auto-balance in the future, I'm not sure cgroups
would help us much.

But this may just be my lack of knowledge as well.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19 15:32     ` Johannes Thumshirn
@ 2021-05-19 16:20       ` Graham Cobb
  2021-05-20 12:41         ` David Sterba
  2021-05-21  7:18         ` Zygo Blaxell
  2021-05-20 12:28       ` David Sterba
  1 sibling, 2 replies; 20+ messages in thread
From: Graham Cobb @ 2021-05-19 16:20 UTC (permalink / raw)
  To: Johannes Thumshirn, dsterba; +Cc: linux-btrfs

On 19/05/2021 16:32, Johannes Thumshirn wrote:
> On 19/05/2021 16:28, David Sterba wrote:
>> On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
>>> On 18/05/2021 16:52, David Sterba wrote:
>>> I wonder if this interface would make sense for limiting balance
>>> bandwidth as well?
>>
>> Balance is not contained to one device, so this makes the scrub case
>> easy. For balance there are data and metadata involved, both read and
>> write accross several threads so this is really something that the
>> cgroups io controler is supposed to do.
>>
> 
> For a user initiated balance a cgroups io controller would work well, yes.

Hmmm. I might give this a try. On my main mail server balance takes a
long time and a lot of IO, which is why I created my "balance_slowly"
script: it shuts down mail (and some other services), runs balance for
20 minutes, then cancels the balance and lets mail run for 10 minutes,
then resumes the balance for another 20 minutes, and so on. Using this
each month, a balance takes over 24 hours, but at least the only
problem is short mail delays for 1 day a month, not timeouts, users
seeing mail error reports, etc.
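
A rough sketch of one such cycle (illustrative only, not the actual
script; the mount point, service names and timings are placeholders,
pause/resume is shown instead of cancel/restart, and error handling is
omitted):

  systemctl stop postfix dovecot
  btrfs balance resume /srv/mail &  # 'btrfs balance start ...' on the first run
  sleep 1200                        # let balance run for ~20 minutes
  btrfs balance pause /srv/mail     # balance stops at a block group boundary
  systemctl start postfix dovecot
  sleep 600                         # let mail run for ~10 minutes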

Before I did this, the impact was horrible: btrfs spent all its time
doing backref searches and any process which touched the filesystem (for
example to deliver a tiny email) could be stuck for over an hour.

I am wondering whether the cgroups io controller would help, or whether
it would cause a priority inversion because the backref searches
couldn't do the IO they needed, so the delays for other processes
locked out would get even **longer**. Any thoughts?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-18 14:49 [PATCH] btrfs: scrub: per-device bandwidth control David Sterba
                   ` (2 preceding siblings ...)
  2021-05-19  6:53 ` Johannes Thumshirn
@ 2021-05-20  7:43 ` Geert Uytterhoeven
  2021-05-20 12:55   ` David Sterba
  2021-05-20 13:14   ` Arnd Bergmann
  3 siblings, 2 replies; 20+ messages in thread
From: Geert Uytterhoeven @ 2021-05-20  7:43 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, linux-kernel, Arnd Bergmann

 	Hi David,

On Tue, 18 May 2021, David Sterba wrote:
> Add sysfs interface to limit io during scrub. We relied on the ionice
> interface to do that, eg. the idle class let the system usable while
> scrub was running. This has changed when mq-deadline got widespread and
> did not implement the scheduling classes. That was a CFQ thing that got
> deleted. We've got numerous complaints from users about degraded
> performance.
>
> Currently only BFQ supports that but it's not a common scheduler and we
> can't ask everybody to switch to it.
>
> Alternatively the cgroup io limiting can be used but that also a
> non-trivial setup (v2 required, the controller must be enabled on the
> system). This can still be used if desired.
>
> Other ideas that have been explored: piggy-back on ionice (that is set
> per-process and is accessible) and interpret the class and classdata as
> bandwidth limits, but this does not have enough flexibility as there are
> only 8 allowed and we'd have to map fixed limits to each value. Also
> adjusting the value would need to lookup the process that currently runs
> scrub on the given device, and the value is not sticky so would have to
> be adjusted each time scrub runs.
>
> Running out of options, sysfs does not look that bad:
>
> - it's accessible from scripts, or udev rules
> - the name is similar to what MD-RAID has
>  (/proc/sys/dev/raid/speed_limit_max or /sys/block/mdX/md/sync_speed_max)
> - the value is sticky at least for filesystem mount time
> - adjusting the value has immediate effect
> - sysfs is available in constrained environments (eg. system rescue)
> - the limit also applies to device replace
>
> Sysfs:
>
> - raw value is in bytes
> - values written to the file accept suffixes like K, M
> - file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
> - 0 means use default priority of IO
>
> The scheduler is a simple deadline one and the accuracy is up to nearest
> 128K.
>
> Signed-off-by: David Sterba <dsterba@suse.com>

Thanks for your patch, which is now commit b4a9f4bee31449bc ("btrfs:
scrub: per-device bandwidth control") in linux-next.

noreply@ellerman.id.au reported the following failures for e.g.
m68k/defconfig:

ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!

> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -1988,6 +1993,60 @@ static void scrub_page_put(struct scrub_page *spage)
> 	}
> }
>
> +/*
> + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
> + * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
> + */
> +static void scrub_throttle(struct scrub_ctx *sctx)
> +{
> +	const int time_slice = 1000;
> +	struct scrub_bio *sbio;
> +	struct btrfs_device *device;
> +	s64 delta;
> +	ktime_t now;
> +	u32 div;
> +	u64 bwlimit;
> +
> +	sbio = sctx->bios[sctx->curr];
> +	device = sbio->dev;
> +	bwlimit = READ_ONCE(device->scrub_speed_max);
> +	if (bwlimit == 0)
> +		return;
> +
> +	/*
> +	 * Slice is divided into intervals when the IO is submitted, adjust by
> +	 * bwlimit and maximum of 64 intervals.
> +	 */
> +	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
> +	div = min_t(u32, 64, div);
> +
> +	/* Start new epoch, set deadline */
> +	now = ktime_get();
> +	if (sctx->throttle_deadline == 0) {
> +		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);

ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!

div_u64(bwlimit, div)

> +		sctx->throttle_sent = 0;
> +	}
> +
> +	/* Still in the time to send? */
> +	if (ktime_before(now, sctx->throttle_deadline)) {
> +		/* If current bio is within the limit, send it */
> +		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> +		if (sctx->throttle_sent <= bwlimit / div)
> +			return;
> +
> +		/* We're over the limit, sleep until the rest of the slice */
> +		delta = ktime_ms_delta(sctx->throttle_deadline, now);
> +	} else {
> +		/* New request after deadline, start new epoch */
> +		delta = 0;
> +	}
> +
> +	if (delta)
> +		schedule_timeout_interruptible(delta * HZ / 1000);

ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!

I'm a bit surprised gcc doesn't emit code for the division by the
constant 1000, but emits a call to __divdi3().  So this has to become
div_u64(), too.

> +	/* Next call will start the deadline period */
> +	sctx->throttle_deadline = 0;
> +}

BTW, any chance you can start adding lore Link: tags to your commits, to
make it easier to find the email thread to reply to when reporting a
regression?

Thanks!

Gr{oetje,eeting}s,

 						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
 							    -- Linus Torvalds

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19 15:32     ` Johannes Thumshirn
  2021-05-19 16:20       ` Graham Cobb
@ 2021-05-20 12:28       ` David Sterba
  1 sibling, 0 replies; 20+ messages in thread
From: David Sterba @ 2021-05-20 12:28 UTC (permalink / raw)
  To: Johannes Thumshirn; +Cc: dsterba, David Sterba, linux-btrfs

On Wed, May 19, 2021 at 03:32:48PM +0000, Johannes Thumshirn wrote:
> On 19/05/2021 16:28, David Sterba wrote:
> > On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
> >> On 18/05/2021 16:52, David Sterba wrote:
> >> I wonder if this interface would make sense for limiting balance
> >> bandwidth as well?
> > 
> > Balance is not contained to one device, so this makes the scrub case
> > easy. For balance there are data and metadata involved, both read and
> > write accross several threads so this is really something that the
> > cgroups io controler is supposed to do.
> 
> For a user initiated balance a cgroups io controller would work well, yes.
> But for having a QoS mechanism to decide when it's a good time to rebalance
> one or more block groups, i.e. with a zoned file system or when we switch
> to auto-balance in the future I'm not sure if cgroups would help us much
> in this case.

Hm right, the background jobs can't be started with the cgroup
controllers. I'm not sure if there's an internal cgroup API to do that,
and it would also interfere with the system cgroup settings.

For zoned mode the bg reclaim is crucial to make it work and the
thresholds should be set to make the reclaim as fast as possible.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19 16:20       ` Graham Cobb
@ 2021-05-20 12:41         ` David Sterba
  2021-05-21  7:18         ` Zygo Blaxell
  1 sibling, 0 replies; 20+ messages in thread
From: David Sterba @ 2021-05-20 12:41 UTC (permalink / raw)
  To: Graham Cobb; +Cc: Johannes Thumshirn, dsterba, linux-btrfs

On Wed, May 19, 2021 at 05:20:50PM +0100, Graham Cobb wrote:
> On 19/05/2021 16:32, Johannes Thumshirn wrote:
> > On 19/05/2021 16:28, David Sterba wrote:
> >> On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
> >>> On 18/05/2021 16:52, David Sterba wrote:
> >>> I wonder if this interface would make sense for limiting balance
> >>> bandwidth as well?
> >>
> >> Balance is not contained to one device, so this makes the scrub case
> >> easy. For balance there are data and metadata involved, both read and
> >> write accross several threads so this is really something that the
> >> cgroups io controler is supposed to do.
> >>
> > 
> > For a user initiated balance a cgroups io controller would work well, yes.
> 
> Hmmm. I might give this a try. On my main mail server balance takes a
> long time and a lot of IO, which is why I created my "balance_slowly"
> script which shuts down mail (and some other services) then runs balance
> for 20 mins, then cancels the balance and allows mail to run for 10
> minutes, then resumes the balance for 20 mins, etc. Using this each
> month, a balance takes over 24 hours but at least the only problem is
> short mail delays for 1 day a month, not timeouts, users seeing mail
> error reports, etc.
> 
> Before I did this, the impact was horrible: btrfs spent all its time
> doing backref searches and any process which touched the filesystem (for
> example to deliver a tiny email) could be stuck for over an hour.
> 
> I am wondering whether the cgroups io controller would help, or whether
> it would cause a priority inversion because the backrefs couldn't do the
> IO they needed so the delays to other processes locked out would get
> even **longer**. Any thoughts?

Do you do a full balance? On a mail server where files get created and
deleted the space could become quite fragmented, so a balance from time
to time would make the space more compact. You can also run balance in
smaller batches, e.g. using the limit=N filter.

I haven't fully tested the cgroup io limiting; I had no success setting
it up with raw cgroups, but the systemd unit files have support for it,
so maybe I'm doing it the wrong way.
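
For example, something along these lines should attach io controller
limits to a one-off balance via systemd (untested sketch; the device
path, numbers and mount point are placeholders, and cgroup v2 with the
io controller enabled is assumed):

  systemd-run --scope \
          -p "IOReadBandwidthMax=/dev/sdx 30M" \
          -p "IOWriteBandwidthMax=/dev/sdx 30M" \
          btrfs balance start -dusage=50 /mnt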

The priority inversion could be an issue; relocation needs to commit
the changes, so in general the metadata operations should not be
throttled.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-20  7:43 ` Geert Uytterhoeven
@ 2021-05-20 12:55   ` David Sterba
  2021-05-20 13:04     ` Geert Uytterhoeven
  2021-05-20 13:14   ` Arnd Bergmann
  1 sibling, 1 reply; 20+ messages in thread
From: David Sterba @ 2021-05-20 12:55 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: David Sterba, linux-btrfs, linux-kernel, Arnd Bergmann

On Thu, May 20, 2021 at 09:43:10AM +0200, Geert Uytterhoeven wrote:
> > - values written to the file accept suffixes like K, M
> > - file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
> > - 0 means use default priority of IO
> >
> > The scheduler is a simple deadline one and the accuracy is up to nearest
> > 128K.
> >
> > Signed-off-by: David Sterba <dsterba@suse.com>
> 
> Thanks for your patch, which is now commit b4a9f4bee31449bc ("btrfs:
> scrub: per-device bandwidth control") in linux-next.
> 
> noreply@ellerman.id.au reported the following failures for e.g.
> m68k/defconfig:
> 
> ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!

I'll fix it, thanks for the report.

> > +static void scrub_throttle(struct scrub_ctx *sctx)
> > +{
> > +	const int time_slice = 1000;
> > +	struct scrub_bio *sbio;
> > +	struct btrfs_device *device;
> > +	s64 delta;
> > +	ktime_t now;
> > +	u32 div;
> > +	u64 bwlimit;
> > +
> > +	sbio = sctx->bios[sctx->curr];
> > +	device = sbio->dev;
> > +	bwlimit = READ_ONCE(device->scrub_speed_max);
> > +	if (bwlimit == 0)
> > +		return;
> > +
> > +	/*
> > +	 * Slice is divided into intervals when the IO is submitted, adjust by
> > +	 * bwlimit and maximum of 64 intervals.
> > +	 */
> > +	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
> > +	div = min_t(u32, 64, div);
> > +
> > +	/* Start new epoch, set deadline */
> > +	now = ktime_get();
> > +	if (sctx->throttle_deadline == 0) {
> > +		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
> 
> ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> 
> div_u64(bwlimit, div)
> 
> > +		sctx->throttle_sent = 0;
> > +	}
> > +
> > +	/* Still in the time to send? */
> > +	if (ktime_before(now, sctx->throttle_deadline)) {
> > +		/* If current bio is within the limit, send it */
> > +		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> > +		if (sctx->throttle_sent <= bwlimit / div)
> > +			return;
> > +
> > +		/* We're over the limit, sleep until the rest of the slice */
> > +		delta = ktime_ms_delta(sctx->throttle_deadline, now);
> > +	} else {
> > +		/* New request after deadline, start new epoch */
> > +		delta = 0;
> > +	}
> > +
> > +	if (delta)
> > +		schedule_timeout_interruptible(delta * HZ / 1000);
> 
> ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!
> 
> I'm a bit surprised gcc doesn't emit code for the division by the
> constant 1000, but emits a call to __divdi3().  So this has to become
> div_u64(), too.
> 
> > +	/* Next call will start the deadline period */
> > +	sctx->throttle_deadline = 0;
> > +}
> 
> BTW, any chance you can start adding lore Link: tags to your commits, to
> make it easier to find the email thread to reply to when reporting a
> regression?

Well, no, I'm not going to do that, sorry. It should be easy enough to
paste the patch subject into the search field on lore.kernel.org and
click the link leading to the mail; I do that all the time. Making sure
that patches have all the tags and information already takes time, so
I'm not too keen to spend more time on adding links.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-20 12:55   ` David Sterba
@ 2021-05-20 13:04     ` Geert Uytterhoeven
  0 siblings, 0 replies; 20+ messages in thread
From: Geert Uytterhoeven @ 2021-05-20 13:04 UTC (permalink / raw)
  To: David Sterba
  Cc: David Sterba, linux-btrfs, Linux Kernel Mailing List, Arnd Bergmann

Hi David,

On Thu, May 20, 2021 at 2:57 PM David Sterba <dsterba@suse.cz> wrote:
> On Thu, May 20, 2021 at 09:43:10AM +0200, Geert Uytterhoeven wrote:
> > > - values written to the file accept suffixes like K, M
> > > - file is in the per-device directory /sys/fs/btrfs/FSID/devinfo/DEVID/scrub_speed_max
> > > - 0 means use default priority of IO
> > >
> > > The scheduler is a simple deadline one and the accuracy is up to nearest
> > > 128K.
> > >
> > > Signed-off-by: David Sterba <dsterba@suse.com>
> >
> > Thanks for your patch, which is now commit b4a9f4bee31449bc ("btrfs:
> > scrub: per-device bandwidth control") in linux-next.
> >
> > noreply@ellerman.id.au reported the following failures for e.g.
> > m68k/defconfig:
> >
> > ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> > ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!
>
> I'll fix it, thanks for the report.

Thanks!

> > BTW, any chance you can start adding lore Link: tags to your commits, to
> > make it easier to find the email thread to reply to when reporting a
> > regression?
>
> Well, no I'm not going to do that, sorry. It should be easy enough to
> paste the patch subject to the search field on lore.k.org and click the
> link leading to the mail, I do that all the time. Making sure that

There's no global search field on lore.kernel.org (yet), so you still have
to guess the mailing list.  In this case that was obvious, but that's not always
the case.

> patches have all the tags and information takes time already so I'm not
> too keen to spend time on adding links.

The link can be added automatically by a git hook.  Hence if you use b4
to get the series with all tags, you'll get the Link: for free!

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-20  7:43 ` Geert Uytterhoeven
  2021-05-20 12:55   ` David Sterba
@ 2021-05-20 13:14   ` Arnd Bergmann
  2021-05-20 13:26     ` Geert Uytterhoeven
  2021-05-21 15:16     ` David Sterba
  1 sibling, 2 replies; 20+ messages in thread
From: Arnd Bergmann @ 2021-05-20 13:14 UTC (permalink / raw)
  To: Geert Uytterhoeven; +Cc: David Sterba, linux-btrfs, Linux Kernel Mailing List

On Thu, May 20, 2021 at 9:43 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> On Tue, 18 May 2021, David Sterba wrote:

> > --- a/fs/btrfs/scrub.c
> > +++ b/fs/btrfs/scrub.c
> > @@ -1988,6 +1993,60 @@ static void scrub_page_put(struct scrub_page *spage)
> >       }
> > }
> >
> > +/*
> > + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
> > + * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
> > + */
> > +static void scrub_throttle(struct scrub_ctx *sctx)
> > +{
> > +     const int time_slice = 1000;
> > +     struct scrub_bio *sbio;
> > +     struct btrfs_device *device;
> > +     s64 delta;
> > +     ktime_t now;
> > +     u32 div;
> > +     u64 bwlimit;
> > +
> > +     sbio = sctx->bios[sctx->curr];
> > +     device = sbio->dev;
> > +     bwlimit = READ_ONCE(device->scrub_speed_max);
> > +     if (bwlimit == 0)
> > +             return;
> > +
> > +     /*
> > +      * Slice is divided into intervals when the IO is submitted, adjust by
> > +      * bwlimit and maximum of 64 intervals.
> > +      */
> > +     div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
> > +     div = min_t(u32, 64, div);
> > +
> > +     /* Start new epoch, set deadline */
> > +     now = ktime_get();
> > +     if (sctx->throttle_deadline == 0) {
> > +             sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
>
> ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
>
> div_u64(bwlimit, div)

If 'time_slice' is in nanoseconds, the best interface to use
is ktime_divns().

> > +             sctx->throttle_sent = 0;
> > +     }
> > +
> > +     /* Still in the time to send? */
> > +     if (ktime_before(now, sctx->throttle_deadline)) {
> > +             /* If current bio is within the limit, send it */
> > +             sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> > +             if (sctx->throttle_sent <= bwlimit / div)
> > +                     return;

Doesn't this also need to be changed?

> > +             /* We're over the limit, sleep until the rest of the slice */
> > +             delta = ktime_ms_delta(sctx->throttle_deadline, now);
> > +     } else {
> > +             /* New request after deadline, start new epoch */
> > +             delta = 0;
> > +     }
> > +
> > +     if (delta)
> > +             schedule_timeout_interruptible(delta * HZ / 1000);
>
> ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!
>
> I'm a bit surprised gcc doesn't emit code for the division by the
> constant 1000, but emits a call to __divdi3().  So this has to become
> div_u64(), too.

There is schedule_hrtimeout(), which takes a ktime_t directly
but has slightly different behavior. There is also an msecs_to_jiffies
helper that should produce a fast division.

       Arnd

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-20 13:14   ` Arnd Bergmann
@ 2021-05-20 13:26     ` Geert Uytterhoeven
  2021-05-21 15:16     ` David Sterba
  1 sibling, 0 replies; 20+ messages in thread
From: Geert Uytterhoeven @ 2021-05-20 13:26 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: David Sterba, linux-btrfs, Linux Kernel Mailing List

Hi Arnd,

On Thu, May 20, 2021 at 3:15 PM Arnd Bergmann <arnd@arndb.de> wrote:
> On Thu, May 20, 2021 at 9:43 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > On Tue, 18 May 2021, David Sterba wrote:
> > > --- a/fs/btrfs/scrub.c
> > > +++ b/fs/btrfs/scrub.c
> > > @@ -1988,6 +1993,60 @@ static void scrub_page_put(struct scrub_page *spage)
> > >       }
> > > }
> > >
> > > +/*
> > > + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
> > > + * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
> > > + */
> > > +static void scrub_throttle(struct scrub_ctx *sctx)
> > > +{
> > > +     const int time_slice = 1000;
> > > +     struct scrub_bio *sbio;
> > > +     struct btrfs_device *device;
> > > +     s64 delta;
> > > +     ktime_t now;
> > > +     u32 div;
> > > +     u64 bwlimit;
> > > +
> > > +     sbio = sctx->bios[sctx->curr];
> > > +     device = sbio->dev;
> > > +     bwlimit = READ_ONCE(device->scrub_speed_max);
> > > +     if (bwlimit == 0)
> > > +             return;
> > > +
> > > +     /*
> > > +      * Slice is divided into intervals when the IO is submitted, adjust by
> > > +      * bwlimit and maximum of 64 intervals.
> > > +      */
> > > +     div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
> > > +     div = min_t(u32, 64, div);
> > > +
> > > +     /* Start new epoch, set deadline */
> > > +     now = ktime_get();
> > > +     if (sctx->throttle_deadline == 0) {
> > > +             sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
> >
> > ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> >
> > div_u64(bwlimit, div)
>
> If 'time_slice' is in nanoseconds, the best interface to use
> is ktime_divns().

Actually this one is not a problem...

>
> > > +             sctx->throttle_sent = 0;
> > > +     }
> > > +
> > > +     /* Still in the time to send? */
> > > +     if (ktime_before(now, sctx->throttle_deadline)) {
> > > +             /* If current bio is within the limit, send it */
> > > +             sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> > > +             if (sctx->throttle_sent <= bwlimit / div)
> > > +                     return;
>
> Doesn't this also need to be changed?

... but this is.

Sorry. I added the annotation to the wrong line, after working out
which code caused the issue by looking at the generated assembly ;-)

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-19 16:20       ` Graham Cobb
  2021-05-20 12:41         ` David Sterba
@ 2021-05-21  7:18         ` Zygo Blaxell
  2021-05-21  9:55           ` Graham Cobb
  1 sibling, 1 reply; 20+ messages in thread
From: Zygo Blaxell @ 2021-05-21  7:18 UTC (permalink / raw)
  To: Graham Cobb; +Cc: Johannes Thumshirn, dsterba, linux-btrfs

On Wed, May 19, 2021 at 05:20:50PM +0100, Graham Cobb wrote:
> On 19/05/2021 16:32, Johannes Thumshirn wrote:
> > On 19/05/2021 16:28, David Sterba wrote:
> >> On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
> >>> On 18/05/2021 16:52, David Sterba wrote:
> >>> I wonder if this interface would make sense for limiting balance
> >>> bandwidth as well?
> >>
> >> Balance is not contained to one device, so this makes the scrub case
> >> easy. For balance there are data and metadata involved, both read and
> >> write accross several threads so this is really something that the
> >> cgroups io controler is supposed to do.
> >>
> > 
> > For a user initiated balance a cgroups io controller would work well, yes.

Don't throttle balance.  You can only make _everything_ slower with
throttling.  You can't be selective, e.g. making balance slower than
the mail server.

> Hmmm. I might give this a try. On my main mail server balance takes a
> long time and a lot of IO, which is why I created my "balance_slowly"
> script which shuts down mail (and some other services) then runs balance
> for 20 mins, then cancels the balance and allows mail to run for 10
> minutes, then resumes the balance for 20 mins, etc. 
> Using this each month, a balance takes over 24 hours

My question here is:  wait?  What?  Are you running full balances
every...ever?  Why?

If you have unallocated space and are not in danger of running out,
you don't need to balance anything.  Even if you are running low on
unallocated space, balance -dlimit=1 will usually suffice to get a single
GB of unallocated space--the minimum required to avoid metadata ENOSPC
gotchas.

Never balance metadata.  Maintenance metadata balances are extremely slow,
do nothing useful, and can exacerbate ENOSPC gotchas.

Metadata balance must be used to convert to a different raid profile,
or to permanently remove a drive from a filesystem--but only because
there is no better way to do those things in btrfs.

> but at least the only problem is
> short mail delays for 1 day a month, not timeouts, users seeing mail
> error reports, etc.

I run btrfs on some mail servers with crappy spinning drives.

Balancing one block group every day--on days when balance runs at
all--only introduces a one-time 90-second latency.  Not enough to kill
an SMTP transaction.

Backup snapshot deletes generate more latency (2-4 minutes once a day).
That is long enough to kill an SMTP transaction, but pretty much every
non-spammer sender will retry sooner than the next backup.

> Before I did this, the impact was horrible: btrfs spent all its time
> doing backref searches and any process which touched the filesystem (for
> example to deliver a tiny email) could be stuck for over an hour.
> 
> I am wondering whether the cgroups io controller would help, or whether
> it would cause a priority inversion because the backrefs couldn't do the
> IO they needed so the delays to other processes locked out would get
> even **longer**. Any thoughts?

Yes that is pretty much exactly what happens.

Balance spends a tiny fraction of its IO cost moving data blocks around.
Each data extent that is relocated triggers a reference update,
which goes into a queue for the current btrfs transaction.  On its
way through that queue, each ref update cascades into updates on other
tree pages for parent nodes, csum tree items, extent tree items, and
(if using space_cache=v2) free space tree items.  These are all small
random writes that have performance costs even on non-rotating media,
much more expensive than the data reads and writes which are mostly
sequential and consecutive.  Worst-case write multipliers are 3-digit
numbers for each of the trees.  It is not impossible for one block group
relocation--balance less than 1GB of data--to run for _days_.

On a filesystem with lots of tiny extents (like a mail server), the
data blocks will be less than 1% of the balance IO.  The other 99%+ are
metadata updates.  If there is throttling on those, any thread trying
to write to the filesystem stops dead in the next transaction commit,
and stays blocked until the throttled IO completes.

If other threads are writing to the filesystem, it gets even worse:
the running time of delayed ref flushes is bounded only by available
disk space, because only running out of disk space can make btrfs stop
queueing up more work for itself in transaction commit.

Even threads that aren't writing to the throttled filesystem can get
blocked on malloc() because Linux MM shares the same pool of pages for
malloc() and disk writes, and will block memory allocations when dirty
limits are exceeded anywhere.  This causes most applications (i.e. those
which call malloc()) to stop dead until IO bandwidth becomes available
to btrfs, even if the processes never touch any btrfs filesystem.
Add in VFS locks, and even reading threads block.

The best currently available approach is to minimize balancing.  Don't do
it at all if you can avoid it, and do only the bare minimum if you can't.

On the other hand, a lot of these problems can maybe be reduced or
eliminated by limiting the number of extents balance processes each time
it goes through its extent iteration loop.  Right now, balance tries to
relocate an entire block group in one shot, but maybe that's too much.
Instead, balance could move a maximum of 100 extents (or some number
chosen to generate about a second's worth of IO), then do a transaction
commit to flush out the delayed refs queue while it's still relatively
small, then repeat.  This would be very crude throttling since we'd have
to guess how many backrefs each extent has, but it will work far better
for reducing latency than any throttling based on block IO.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-21  7:18         ` Zygo Blaxell
@ 2021-05-21  9:55           ` Graham Cobb
  0 siblings, 0 replies; 20+ messages in thread
From: Graham Cobb @ 2021-05-21  9:55 UTC (permalink / raw)
  To: Zygo Blaxell; +Cc: Johannes Thumshirn, dsterba, linux-btrfs

On 21/05/2021 08:18, Zygo Blaxell wrote:
> On Wed, May 19, 2021 at 05:20:50PM +0100, Graham Cobb wrote:
>> On 19/05/2021 16:32, Johannes Thumshirn wrote:
>>> On 19/05/2021 16:28, David Sterba wrote:
>>>> On Wed, May 19, 2021 at 06:53:54AM +0000, Johannes Thumshirn wrote:
>>>>> On 18/05/2021 16:52, David Sterba wrote:
>>>>> I wonder if this interface would make sense for limiting balance
>>>>> bandwidth as well?
>>>>
>>>> Balance is not contained to one device, so this makes the scrub case
>>>> easy. For balance there are data and metadata involved, both read and
>>>> write accross several threads so this is really something that the
>>>> cgroups io controler is supposed to do.
>>>>
>>>
>>> For a user initiated balance a cgroups io controller would work well, yes.
> 
> Don't throttle balance.  You can only make _everything_ slower with
> throttling.  You can't be selective, e.g. making balance slower than
> the mail server.
> 
>> Hmmm. I might give this a try. On my main mail server balance takes a
>> long time and a lot of IO, which is why I created my "balance_slowly"
>> script which shuts down mail (and some other services) then runs balance
>> for 20 mins, then cancels the balance and allows mail to run for 10
>> minutes, then resumes the balance for 20 mins, etc. 
>> Using this each month, a balance takes over 24 hours
> 
> My question here is:  wait?  What?  Are you running full balances
> every...ever?  Why?

No, not any more - that was how it worked years ago but nowadays my
scripts are based on btrfs-balance-least-used and end up only balancing
empty or almost empty block groups, data only. It still sometimes slows
disk access for other processes down quite a lot, though. Probably
because I still have some on-disk snapshots causing extra work (although
most are now moved onto a separate disk after about 24 hours).

And I was wrong - my slow balances don't take 24 hours (that is scrub) -
they take a couple of hours.

...

> I run btrfs on some mail servers with crappy spinning drives.
> 
> One balance block group every day--on days when balance runs at all--only
> introduces a one-time 90-second latency.  Not enough to kill a SMTP
> transaction.

Although it happens, the *much* bigger problem than SMTP timeouts is
the error messages that dovecot mail delivery (with or without lmtp)
generates when things get very slow. Sometimes these make it into an
error report, confusing the sender significantly! I prefer to just
shut mail down while doing the balance. The script uses a timeout to
stop sending block groups to be balanced once a time limit has been
reached; when the in-progress block group has finished, it restarts
mail for a while.

> Backup snapshot deletes generate more latency (2-4 minutes once a day).
> That is long enough to kill a SMTP transaction, but pretty much every
> non-spammer sender will retry sooner than the next backup.
> 
>> Before I did this, the impact was horrible: btrfs spent all its time
>> doing backref searches and any process which touched the filesystem (for
>> example to deliver a tiny email) could be stuck for over an hour.
>>
>> I am wondering whether the cgroups io controller would help, or whether
>> it would cause a priority inversion because the backrefs couldn't do the
>> IO they needed so the delays to other processes locked out would get
>> even **longer**. Any thoughts?
> 
> Yes that is pretty much exactly what happens.
> 
> Balance spends a tiny fraction of its IO cost moving data blocks around.
> Each data extent that is relocated triggers a reference update,
> which goes into a queue for the current btrfs transaction.  On its
> way through that queue, each ref update cascades into updates on other
> tree pages for parent nodes, csum tree items, extent tree items, and
> (if using space_cache=v2) free space tree items.  These are all small
> random writes that have performance costs even on non-rotating media,
> much more expensive than the data reads and writes which are mostly
> sequential and consecutive.  Worst-case write multipliers are 3-digit
> numbers for each of the trees.  It is not impossible for one block group
> relocation--balance less than 1GB of data--to run for _days_.
> 
> On a filesystem with lots of tiny extents (like a mail server), the
> data blocks will be less than 1% of the balance IO.  The other 99%+ are
> metadata updates.  If there is throttling on those, any thread trying
> to write to the filesystem stops dead in the next transaction commit,
> and stays blocked until the throttled IO completes.
> 
> If other threads are writing to the filesystem, it gets even worse:
> the running time of delayed ref flushes is bounded only by available
> disk space, because only running out of disk space can make btrfs stop
> queueing up more work for itself in transaction commit.
> 
> Even threads that aren't writing to the throttled filesystem can get
> blocked on malloc() because Linux MM shares the same pool of pages for
> malloc() and disk writes, and will block memory allocations when dirty
> limits are exceeded anywhere.  This causes most applications (i.e. those
> which call malloc()) to stop dead until IO bandwidth becomes available
> to btrfs, even if the processes never touch any btrfs filesystem.
> Add in VFS locks, and even reading threads block.
> 
> The best currently available approach is to minimize balancing.  Don't do
> it at all if you can avoid it, and do only the bare minimum if you can't.
> 
> On the other hand, a lot of these problems can maybe be reduced or
> eliminated by limiting the number of extents balance processes each time
> it goes through its extent iteration loop.  Right now, balance tries to
> relocate an entire block group in one shot, but maybe that's too much.
> Instead, balance could move a maximum of 100 extents (or some number
> chosen to generate about a second's worth of IO), then do a transaction
> commit to flush out the delayed refs queue while it's still relatively
> small, then repeat.  This would be very crude throttling since we'd have
> to guess how many backrefs each extent has, but it will work far better
> for reducing latency than any throttling based on block IO.
> 

Thanks for the useful analysis, Zygo. I think I will stick with my
current approach, for balance at least. I look forward to playing with
the new controls for scrub, though (where I currently use a similar script).
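
To put rough numbers on Zygo's point: a 1GiB block group made of, say,
16KiB extents holds about 65,000 extents, each of which triggers reference
updates in several trees, so the metadata writes dwarf the data actually
moved. His batching idea could look roughly like the sketch below. This is
hypothetical code, not anything in btrfs: relocate_next_extent(),
commit_current_transaction() and struct balance_ctx are made-up names, and
only the shape of the loop (a bounded batch of extents, then an early
transaction commit) is meant seriously.

    /* Hypothetical sketch only; none of these helpers exist under these names. */
    #define RELOC_BATCH_EXTENTS 100   /* aim for roughly one second of IO */

    static int relocate_block_group_batched(struct balance_ctx *bctx)
    {
            int batched = 0;
            int ret;

            /* >0 means "one more extent relocated", 0 means the block group is done */
            while ((ret = relocate_next_extent(bctx)) > 0) {
                    if (++batched < RELOC_BATCH_EXTENTS)
                            continue;
                    /*
                     * Commit while the delayed ref queue is still small so
                     * other writers are not stalled behind one huge flush.
                     */
                    ret = commit_current_transaction(bctx);
                    if (ret < 0)
                            break;
                    batched = 0;
            }
            return ret;
    }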



* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-20 13:14   ` Arnd Bergmann
  2021-05-20 13:26     ` Geert Uytterhoeven
@ 2021-05-21 15:16     ` David Sterba
  2021-05-21 15:38       ` Geert Uytterhoeven
  1 sibling, 1 reply; 20+ messages in thread
From: David Sterba @ 2021-05-21 15:16 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Geert Uytterhoeven, David Sterba, linux-btrfs, Linux Kernel Mailing List

On Thu, May 20, 2021 at 03:14:03PM +0200, Arnd Bergmann wrote:
> On Thu, May 20, 2021 at 9:43 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > On Tue, 18 May 2021, David Sterba wrote:
> > > +     /* Start new epoch, set deadline */
> > > +     now = ktime_get();
> > > +     if (sctx->throttle_deadline == 0) {
> > > +             sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
> >
> > ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> >
> > div_u64(bwlimit, div)
> 
> If 'time_slice' is in nanoseconds, the best interface to use
> is ktime_divns().

It's in milliseconds and the division above is int/int; the problematic
one is below.
> 
> > > +             sctx->throttle_sent = 0;
> > > +     }
> > > +
> > > +     /* Still in the time to send? */
> > > +     if (ktime_before(now, sctx->throttle_deadline)) {
> > > +             /* If current bio is within the limit, send it */
> > > +             sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> > > +             if (sctx->throttle_sent <= bwlimit / div)
> > > +                     return;
> 
> Doesn't this also need to be changed?
> 
> > > +             /* We're over the limit, sleep until the rest of the slice */
> > > +             delta = ktime_ms_delta(sctx->throttle_deadline, now);
> > > +     } else {
> > > +             /* New request after deadline, start new epoch */
> > > +             delta = 0;
> > > +     }
> > > +
> > > +     if (delta)
> > > +             schedule_timeout_interruptible(delta * HZ / 1000);
> >
> > ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!
> >
> > I'm a bit surprised gcc doesn't emit code for the division by the
> > constant 1000, but emits a call to __divdi3().  So this has to become
> > div_u64(), too.
> 
> There is schedule_hrtimeout(), which takes a ktime_t directly
> but has slightly different behavior. There is also an msecs_to_jiffies
> helper that should produce a fast division.

I'll use msecs_to_jiffies, thanks. If 'hr' in schedule_hrtimeout stands
for high resolution, it's not necessary here.
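
Not the actual follow-up patch, but the tail of scrub_throttle() would
presumably end up looking something like this, assuming bwlimit is a u64
(as the __udivdi3 error suggests) and div stays a u32: the open-coded
division goes through div_u64() and the milliseconds-to-jiffies conversion
through msecs_to_jiffies(); everything else is as in the quoted code.

    /* Still in the time to send? */
    if (ktime_before(now, sctx->throttle_deadline)) {
            /* If current bio is within the limit, send it */
            sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
            if (sctx->throttle_sent <= div_u64(bwlimit, div))
                    return;

            /* We're over the limit, sleep until the rest of the slice */
            delta = ktime_ms_delta(sctx->throttle_deadline, now);
    } else {
            /* New request after deadline, start new epoch */
            delta = 0;
    }

    if (delta)
            schedule_timeout_interruptible(msecs_to_jiffies(delta));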


* Re: [PATCH] btrfs: scrub: per-device bandwidth control
  2021-05-21 15:16     ` David Sterba
@ 2021-05-21 15:38       ` Geert Uytterhoeven
  0 siblings, 0 replies; 20+ messages in thread
From: Geert Uytterhoeven @ 2021-05-21 15:38 UTC (permalink / raw)
  To: David Sterba, Arnd Bergmann, David Sterba, linux-btrfs,
	Linux Kernel Mailing List

Hi David,

On Fri, May 21, 2021 at 5:18 PM David Sterba <dsterba@suse.cz> wrote:
> On Thu, May 20, 2021 at 03:14:03PM +0200, Arnd Bergmann wrote:
> > On Thu, May 20, 2021 at 9:43 AM Geert Uytterhoeven <geert@linux-m68k.org> wrote:
> > > On Tue, 18 May 2021, David Sterba wrote:
> > > > +     /* Start new epoch, set deadline */
> > > > +     now = ktime_get();
> > > > +     if (sctx->throttle_deadline == 0) {
> > > > +             sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
> > >
> > > ERROR: modpost: "__udivdi3" [fs/btrfs/btrfs.ko] undefined!
> > >
> > > div_u64(bwlimit, div)
> >
> > If 'time_slice' is in nanoseconds, the best interface to use
> > is ktime_divns().
>
> It's in milliseconds and the division above is int/int; the problematic
> one is below.

Yep, sorry for the wrong pointer.

> >
> > > > +             sctx->throttle_sent = 0;
> > > > +     }
> > > > +
> > > > +     /* Still in the time to send? */
> > > > +     if (ktime_before(now, sctx->throttle_deadline)) {
> > > > +             /* If current bio is within the limit, send it */
> > > > +             sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
> > > > +             if (sctx->throttle_sent <= bwlimit / div)
> > > > +                     return;
> >
> > Doesn't this also need to be changed?
> >
> > > > +             /* We're over the limit, sleep until the rest of the slice */
> > > > +             delta = ktime_ms_delta(sctx->throttle_deadline, now);
> > > > +     } else {
> > > > +             /* New request after deadline, start new epoch */
> > > > +             delta = 0;
> > > > +     }
> > > > +
> > > > +     if (delta)
> > > > +             schedule_timeout_interruptible(delta * HZ / 1000);
> > >
> > > ERROR: modpost: "__divdi3" [fs/btrfs/btrfs.ko] undefined!
> > >
> > > I'm a bit surprised gcc doesn't emit code for the division by the
> > > constant 1000, but emits a call to __divdi3().  So this has to become
> > > div_u64(), too.
> >
> > There is schedule_hrtimeout(), which takes a ktime_t directly
> > but has slightly different behavior. There is also an msecs_to_jiffies
> > helper that should produce a fast division.
>
> I'll use msecs_to_jiffies, thanks. If 'hr' in schedule_hrtimeout stands
> for high resolution, it's not necessary here.

msecs_to_jiffies() takes (32-bit) "unsigned int", while delta is "s64".
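
Since delta is the unspent remainder of the current slice, it should never
exceed time_slice (1000 ms), so the narrowing is safe in practice; an
explicit clamp would make that obvious at the call site. A possible sketch,
resting on that assumption:

    /*
     * delta is bounded by the 1000 ms time slice; clamp it anyway so the
     * implicit conversion to msecs_to_jiffies()'s unsigned int argument
     * is visibly harmless.
     */
    if (delta)
            schedule_timeout_interruptible(
                    msecs_to_jiffies(min_t(s64, delta, time_slice)));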

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds


end of thread, newest message: 2021-05-21 15:38 UTC

Thread overview: 20+ messages
2021-05-18 14:49 [PATCH] btrfs: scrub: per-device bandwidth control David Sterba
2021-05-18 16:52 ` Holger Hoffstätte
2021-05-18 20:46   ` David Sterba
2021-05-18 20:15 ` waxhead
2021-05-18 21:06   ` David Sterba
2021-05-19  6:53 ` Johannes Thumshirn
2021-05-19 14:26   ` David Sterba
2021-05-19 15:32     ` Johannes Thumshirn
2021-05-19 16:20       ` Graham Cobb
2021-05-20 12:41         ` David Sterba
2021-05-21  7:18         ` Zygo Blaxell
2021-05-21  9:55           ` Graham Cobb
2021-05-20 12:28       ` David Sterba
2021-05-20  7:43 ` Geert Uytterhoeven
2021-05-20 12:55   ` David Sterba
2021-05-20 13:04     ` Geert Uytterhoeven
2021-05-20 13:14   ` Arnd Bergmann
2021-05-20 13:26     ` Geert Uytterhoeven
2021-05-21 15:16     ` David Sterba
2021-05-21 15:38       ` Geert Uytterhoeven
