* [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info
2022-03-21 16:14 [PATCH 0/5] btrfs: rework background block group relocation Johannes Thumshirn
@ 2022-03-21 16:14 ` Johannes Thumshirn
2022-03-22 17:32 ` Josef Bacik
2022-03-21 16:14 ` [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es Johannes Thumshirn
` (3 subsequent siblings)
4 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-21 16:14 UTC (permalink / raw)
To: David Sterba; +Cc: Josef Bacik, linux-btrfs, Pankaj Raghav, Johannes Thumshirn
From: Josef Bacik <josef@toxicpanda.com>
For !zoned file systems it's useful to have the auto reclaim feature,
however there are different use cases for !zoned, for example we may not
want to reclaim metadata chunks ever, only data chunks. Move this sysfs
flag to per-space_info. This won't affect current users because this
tunable only ever did anything for zoned, and that is currently hidden
behind BTRFS_CONFIG_DEBUG.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ jth restore global bg_reclaim_threshold ]
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/free-space-cache.c | 7 +++++--
fs/btrfs/space-info.c | 9 +++++++++
fs/btrfs/space-info.h | 6 ++++++
fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++
fs/btrfs/zoned.h | 6 +-----
5 files changed, 58 insertions(+), 7 deletions(-)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01a408db5683..ef84bc5030cd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2630,16 +2630,19 @@ int __btrfs_add_free_space(struct btrfs_block_group *block_group,
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
- struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *sinfo = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ int bg_reclaim_threshold = 0;
bool initial = (size == block_group->length);
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
+ if (!initial)
+ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index b87931a458eb..60d0a58c4644 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH 75
+
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index d841fed73492..0c45f539e3cf 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -24,6 +24,12 @@ struct btrfs_space_info {
the space info if we had an ENOSPC in the
allocator. */
+ /*
+ * Once a block group drops below this threshold we'll schedule it for
+ * reclaim.
+ */
+ int bg_reclaim_threshold;
+
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 17389a42a3ab..90da1ea0cae0 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -722,6 +722,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
+
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+
+ return ret;
+}
+
+static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh != 0 && (thresh <= 50 || thresh > 100))
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
+ btrfs_sinfo_bg_reclaim_threshold_show,
+ btrfs_sinfo_bg_reclaim_threshold_store);
+
/*
* Allocation information about block group types.
*
@@ -738,6 +774,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
+ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
NULL,
};
ATTRIBUTE_GROUPS(space_info);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..c489c08d7fd5 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -10,11 +10,7 @@
#include "block-group.h"
#include "btrfs_inode.h"
-/*
- * Block groups with more than this value (percents) of unusable space will be
- * scheduled for background reclaim.
- */
-#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+#define BTRFS_DEFAULT_RECLAIM_THRESH 75
struct btrfs_zoned_device_info {
/*
--
2.35.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info
2022-03-21 16:14 ` [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info Johannes Thumshirn
@ 2022-03-22 17:32 ` Josef Bacik
2022-03-22 17:34 ` Johannes Thumshirn
0 siblings, 1 reply; 19+ messages in thread
From: Josef Bacik @ 2022-03-22 17:32 UTC (permalink / raw)
To: Johannes Thumshirn; +Cc: David Sterba, linux-btrfs, Pankaj Raghav
On Mon, Mar 21, 2022 at 09:14:10AM -0700, Johannes Thumshirn wrote:
> From: Josef Bacik <josef@toxicpanda.com>
>
> For !zoned file systems it's useful to have the auto reclaim feature,
> however there are different use cases for !zoned, for example we may not
> want to reclaim metadata chunks ever, only data chunks. Move this sysfs
> flag to per-space_info. This won't affect current users because this
> tunable only ever did anything for zoned, and that is currently hidden
> behind BTRFS_CONFIG_DEBUG.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> [ jth restore global bg_reclaim_threshold ]
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
> fs/btrfs/free-space-cache.c | 7 +++++--
> fs/btrfs/space-info.c | 9 +++++++++
> fs/btrfs/space-info.h | 6 ++++++
> fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++
> fs/btrfs/zoned.h | 6 +-----
> 5 files changed, 58 insertions(+), 7 deletions(-)
>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 01a408db5683..ef84bc5030cd 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2630,16 +2630,19 @@ int __btrfs_add_free_space(struct btrfs_block_group *block_group,
> static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
> u64 bytenr, u64 size, bool used)
> {
> - struct btrfs_fs_info *fs_info = block_group->fs_info;
> + struct btrfs_space_info *sinfo = block_group->space_info;
> struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> u64 offset = bytenr - block_group->start;
> u64 to_free, to_unusable;
> - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
> + int bg_reclaim_threshold = 0;
> bool initial = (size == block_group->length);
> u64 reclaimable_unusable;
>
> WARN_ON(!initial && offset + size > block_group->zone_capacity);
>
> + if (!initial)
> + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
> +
> spin_lock(&ctl->tree_lock);
> if (!used)
> to_free = size;
> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
> index b87931a458eb..60d0a58c4644 100644
> --- a/fs/btrfs/space-info.c
> +++ b/fs/btrfs/space-info.c
> @@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
> found->full = 0;
> }
>
> +/*
> + * Block groups with more than this value (percents) of unusable space will be
> + * scheduled for background reclaim.
> + */
> +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH 75
> +
> static int create_space_info(struct btrfs_fs_info *info, u64 flags)
> {
>
> @@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
> INIT_LIST_HEAD(&space_info->priority_tickets);
> space_info->clamp = 1;
>
> + if (btrfs_is_zoned(info))
> + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
> +
> ret = btrfs_sysfs_add_space_info_type(info, space_info);
> if (ret)
> return ret;
> diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
> index d841fed73492..0c45f539e3cf 100644
> --- a/fs/btrfs/space-info.h
> +++ b/fs/btrfs/space-info.h
> @@ -24,6 +24,12 @@ struct btrfs_space_info {
> the space info if we had an ENOSPC in the
> allocator. */
>
> + /*
> + * Once a block group drops below this threshold we'll schedule it for
> + * reclaim.
> + */
> + int bg_reclaim_threshold;
> +
> int clamp; /* Used to scale our threshold for preemptive
> flushing. The value is >> clamp, so turns
> out to be a 2^clamp divisor. */
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index 17389a42a3ab..90da1ea0cae0 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -722,6 +722,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
> SPACE_INFO_ATTR(disk_used);
> SPACE_INFO_ATTR(disk_total);
>
> +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
> + struct kobj_attribute *a,
> + char *buf)
> +{
> + struct btrfs_space_info *space_info = to_space_info(kobj);
> + ssize_t ret;
> +
> + ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
> +
> + return ret;
> +}
> +
> +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
> + struct kobj_attribute *a,
> + const char *buf, size_t len)
> +{
> + struct btrfs_space_info *space_info = to_space_info(kobj);
> + int thresh;
> + int ret;
> +
> + ret = kstrtoint(buf, 10, &thresh);
> + if (ret)
> + return ret;
> +
> + if (thresh != 0 && (thresh <= 50 || thresh > 100))
> + return -EINVAL;
> +
> + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
> +
> + return len;
> +}
> +
> +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
> + btrfs_sinfo_bg_reclaim_threshold_show,
> + btrfs_sinfo_bg_reclaim_threshold_store);
> +
> /*
> * Allocation information about block group types.
> *
> @@ -738,6 +774,7 @@ static struct attribute *space_info_attrs[] = {
> BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
> BTRFS_ATTR_PTR(space_info, disk_used),
> BTRFS_ATTR_PTR(space_info, disk_total),
> + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
> NULL,
> };
> ATTRIBUTE_GROUPS(space_info);
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index cbf016a7bb5d..c489c08d7fd5 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -10,11 +10,7 @@
> #include "block-group.h"
> #include "btrfs_inode.h"
>
> -/*
> - * Block groups with more than this value (percents) of unusable space will be
> - * scheduled for background reclaim.
> - */
> -#define BTRFS_DEFAULT_RECLAIM_THRESH 75
> +#define BTRFS_DEFAULT_RECLAIM_THRESH 75
>
Looks like you added this back by accident?
Josef
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info
2022-03-22 17:32 ` Josef Bacik
@ 2022-03-22 17:34 ` Johannes Thumshirn
0 siblings, 0 replies; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-22 17:34 UTC (permalink / raw)
To: Josef Bacik; +Cc: David Sterba, linux-btrfs, Pankaj Raghav
On 22/03/2022 18:32, Josef Bacik wrote:
>>
>> -/*
>> - * Block groups with more than this value (percents) of unusable space will be
>> - * scheduled for background reclaim.
>> - */
>> -#define BTRFS_DEFAULT_RECLAIM_THRESH 75
>> +#define BTRFS_DEFAULT_RECLAIM_THRESH 75
>>
>
> Looks like you added this back by accident?
Nope I've added this back on purpose but apparently screwed up doing so.
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es
2022-03-21 16:14 [PATCH 0/5] btrfs: rework background block group relocation Johannes Thumshirn
2022-03-21 16:14 ` [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info Johannes Thumshirn
@ 2022-03-21 16:14 ` Johannes Thumshirn
2022-03-22 17:38 ` Josef Bacik
2022-03-21 16:14 ` [PATCH 3/5] btrfs: change the bg_reclaim_threshold valid region from 0 to 100 Johannes Thumshirn
` (2 subsequent siblings)
4 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-21 16:14 UTC (permalink / raw)
To: David Sterba; +Cc: Josef Bacik, linux-btrfs, Pankaj Raghav
From: Josef Bacik <josef@toxicpanda.com>
We have found this feature invaluable at Facebook due to how our
workload interacts with the allocator. We have been using this in
production for months with only a single problem that has already been
fixed. This will allow us to set a threshold for block groups to be
automatically relocated even if we don't have zoned devices.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 59f18a10fd5f..628741ecb97b 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -3208,6 +3208,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
+static inline bool should_reclaim_block_group(struct btrfs_block_group *block_group,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = block_group->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = block_group->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(block_group->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@@ -3230,6 +3255,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
+ bool reclaim;
+
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@@ -3275,6 +3302,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -3301,6 +3330,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
--
2.35.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es
2022-03-21 16:14 ` [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es Johannes Thumshirn
@ 2022-03-22 17:38 ` Josef Bacik
2022-03-22 17:40 ` Johannes Thumshirn
0 siblings, 1 reply; 19+ messages in thread
From: Josef Bacik @ 2022-03-22 17:38 UTC (permalink / raw)
To: Johannes Thumshirn; +Cc: David Sterba, linux-btrfs, Pankaj Raghav
On Mon, Mar 21, 2022 at 09:14:11AM -0700, Johannes Thumshirn wrote:
> From: Josef Bacik <josef@toxicpanda.com>
>
> We have found this feature invaluable at Facebook due to how our
> workload interacts with the allocator. We have been using this in
> production for months with only a single problem that has already been
> fixed. This will allow us to set a threshold for block groups to be
> automatically relocated even if we don't have zoned devices.
>
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
> fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++++++++
> 1 file changed, 31 insertions(+)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 59f18a10fd5f..628741ecb97b 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -3208,6 +3208,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
> return ret;
> }
>
> +static inline bool should_reclaim_block_group(struct btrfs_block_group *block_group,
> + u64 bytes_freed)
> +{
> + const struct btrfs_space_info *space_info = block_group->space_info;
> + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
> + const u64 new_val = block_group->used;
> + const u64 old_val = new_val + bytes_freed;
> + u64 thresh;
> +
Actually do we want to do a
if (btrfs_zoned())
return false;
here and leave the auto reclaim zoned behavior the way it is? Or do you want to
delete your stuff and rely on this as the way that we setup zoned block groups
for relocation? If we use this then you could make the
fs_info->bg_reclaim_threshold also apply here. Thanks,
Josef
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es
2022-03-22 17:38 ` Josef Bacik
@ 2022-03-22 17:40 ` Johannes Thumshirn
0 siblings, 0 replies; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-22 17:40 UTC (permalink / raw)
To: Josef Bacik; +Cc: David Sterba, linux-btrfs, Pankaj Raghav
On 22/03/2022 18:38, Josef Bacik wrote:
>> +static inline bool should_reclaim_block_group(struct btrfs_block_group *block_group,
>> + u64 bytes_freed)
>> +{
>> + const struct btrfs_space_info *space_info = block_group->space_info;
>> + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
>> + const u64 new_val = block_group->used;
>> + const u64 old_val = new_val + bytes_freed;
>> + u64 thresh;
>> +
>
> Actually do we want to do a
>
> if (btrfs_zoned())
> return false;
>
> here and leave the auto reclaim zoned behavior the way it is? Or do you want to
> delete your stuff and rely on this as the way that we setup zoned block groups
> for relocation? If we use this then you could make the
> fs_info->bg_reclaim_threshold also apply here. Thanks,
>
>
I actually like the per-space info knob for the individual block groups and then
the global one to kick in reclaim.
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 3/5] btrfs: change the bg_reclaim_threshold valid region from 0 to 100
2022-03-21 16:14 [PATCH 0/5] btrfs: rework background block group relocation Johannes Thumshirn
2022-03-21 16:14 ` [PATCH 1/5] btrfs: make the bg_reclaim_threshold per-space info Johannes Thumshirn
2022-03-21 16:14 ` [PATCH 2/5] btrfs: allow block group background reclaim for !zoned fs'es Johannes Thumshirn
@ 2022-03-21 16:14 ` Johannes Thumshirn
2022-03-21 16:14 ` [PATCH 4/5] btrfs: make calc_available_free_space available outside of space-info Johannes Thumshirn
2022-03-21 16:14 ` [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive Johannes Thumshirn
4 siblings, 0 replies; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-21 16:14 UTC (permalink / raw)
To: David Sterba; +Cc: Josef Bacik, linux-btrfs, Pankaj Raghav
From: Josef Bacik <josef@toxicpanda.com>
For the !zoned case we may want to set the threshold for reclaim to
something below 50%. Change the acceptable threshold from 50-100 to
0-100.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/btrfs/sysfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 90da1ea0cae0..fdf9bf789528 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -746,7 +746,7 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
if (ret)
return ret;
- if (thresh != 0 && (thresh <= 50 || thresh > 100))
+ if (thresh < 0 || thresh > 100)
return -EINVAL;
WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
--
2.35.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 4/5] btrfs: make calc_available_free_space available outside of space-info
2022-03-21 16:14 [PATCH 0/5] btrfs: rework background block group relocation Johannes Thumshirn
` (2 preceding siblings ...)
2022-03-21 16:14 ` [PATCH 3/5] btrfs: change the bg_reclaim_threshold valid region from 0 to 100 Johannes Thumshirn
@ 2022-03-21 16:14 ` Johannes Thumshirn
2022-03-22 17:34 ` Josef Bacik
2022-03-21 16:14 ` [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive Johannes Thumshirn
4 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-21 16:14 UTC (permalink / raw)
To: David Sterba; +Cc: Johannes Thumshirn, Josef Bacik, linux-btrfs, Pankaj Raghav
Make calc_available_free_space available outside of space-info.c and
rename to btrfs_calc_available_free_space.
This is a preparation for the next commit and does not impose any
functional changes.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/space-info.c | 8 ++++----
fs/btrfs/space-info.h | 4 ++++
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 60d0a58c4644..e1fd1b1681fd 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -301,7 +301,7 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
return NULL;
}
-static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+u64 btrfs_calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
@@ -349,7 +349,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- avail = calc_available_free_space(fs_info, space_info, flush);
+ avail = btrfs_calc_available_free_space(fs_info, space_info, flush);
if (used + bytes < space_info->total_bytes + avail)
return 1;
@@ -722,7 +722,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&space_info->lock);
- avail = calc_available_free_space(fs_info, space_info,
+ avail = btrfs_calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
used = btrfs_space_info_used(space_info, true);
@@ -803,7 +803,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* much delalloc we need for the background flusher to kick in.
*/
- thresh = calc_available_free_space(fs_info, space_info,
+ thresh = btrfs_calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_readonly + global_rsv_size;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 0c45f539e3cf..a957d84d8ff0 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -151,4 +151,8 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+
+u64 btrfs_calc_available_free_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
--
2.35.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-21 16:14 [PATCH 0/5] btrfs: rework background block group relocation Johannes Thumshirn
` (3 preceding siblings ...)
2022-03-21 16:14 ` [PATCH 4/5] btrfs: make calc_available_free_space available outside of space-info Johannes Thumshirn
@ 2022-03-21 16:14 ` Johannes Thumshirn
2022-03-23 9:08 ` Pankaj Raghav
4 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-21 16:14 UTC (permalink / raw)
To: David Sterba; +Cc: Johannes Thumshirn, Josef Bacik, linux-btrfs, Pankaj Raghav
The current auto-reclaim algorithm starts reclaiming all block-group's
with a zone_unusable value above a configured threshold. This is causing a
lot of reclaim IO even if there would be enough free zones on the device.
Instead of only accounting a block-group's zone_unusable value, also take
the number of empty zones into account.
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
Changes since v3:
* remove unusedd include (Filipe)
* Only call btrfs_zoned_should_reclaim() for zoned FSes (Josef)
* Use btrfs_calc_available_free_space() (Josef)
Changes since v2:
* Use div_u64()
Changes since RFC:
* Fix logic error
* Skip unavailable devices
* Use different metric working for non-zoned devices as well
---
fs/btrfs/block-group.c | 10 ++++++++++
fs/btrfs/zoned.c | 23 +++++++++++++++++++++++
fs/btrfs/zoned.h | 6 ++++++
3 files changed, 39 insertions(+)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 628741ecb97b..12454304bb85 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1512,6 +1512,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1522,6 +1529,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1b1b310c3c51..f2a412427921 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2079,3 +2079,26 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
}
+
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *sinfo;
+ u64 used = 0;
+ u64 total = 0;
+ u64 factor;
+
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ if (!fs_info->bg_reclaim_threshold)
+ return false;
+
+ list_for_each_entry(sinfo, &fs_info->space_info, list) {
+ total += sinfo->total_bytes;
+ used += btrfs_calc_available_free_space(fs_info, sinfo,
+ BTRFS_RESERVE_NO_FLUSH);
+ }
+
+ factor = div_u64(used * 100, total);
+ return factor >= fs_info->bg_reclaim_threshold;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index c489c08d7fd5..f2d16395087f 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -74,6 +74,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -232,6 +233,11 @@ static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ return false;
+}
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
--
2.35.1
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-21 16:14 ` [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive Johannes Thumshirn
@ 2022-03-23 9:08 ` Pankaj Raghav
2022-03-23 9:11 ` Johannes Thumshirn
0 siblings, 1 reply; 19+ messages in thread
From: Pankaj Raghav @ 2022-03-23 9:08 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
Hi Johannes,
I tried this patchset and I am noticing a weird behaviour wrt the value
of factor in btrfs_zoned_should_reclaim function.
Here is my setup in QEMU:
size 12800M
zoned=true,zoned.zone_capacity=128M,zoned.zone_size=128M
btrfs mkfs:
mkfs.btrfs -d single -m single <znsdev>; mount -t auto <znsdev>
/mnt/real_scratch
I added a print statement in btrfs_zoned_should_reclaim to get the
values for the factor, used and total params.
When I run the btrfs/237 xfstest, I am noticing the following values:
[ 54.829309] btrfs: factor: 4850 used: 19528679424, total: 402653184
The used > total, thereby making the factor greater than 100. This will
force a reclaim even though the drive is almost empty:
start: 0x000000000, len 0x040000, cap 0x040000, wptr 0x000078 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000040000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000080000, len 0x040000, cap 0x040000, wptr 0x0001e0 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x0000c0000, len 0x040000, cap 0x040000, wptr 0x000d80 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000100000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000140000, len 0x040000, cap 0x040000, wptr 0x008520 reset:0
non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000180000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x0001c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000200000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000240000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x000280000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
start: 0x0002c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
.....
.....
I am also noticing the same behaviour for ZNS drive with size 1280M:
[ 86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
Is something going wrong with the calculation? Or am I missing something
here?
On 2022-03-21 17:14, Johannes Thumshirn wrote:
> index 1b1b310c3c51..f2a412427921 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -2079,3 +2079,26 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
> }
> mutex_unlock(&fs_devices->device_list_mutex);
> }
> +
> +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_space_info *sinfo;
> + u64 used = 0;
> + u64 total = 0;
> + u64 factor;
> +
> + if (!btrfs_is_zoned(fs_info))
> + return false;
> +
> + if (!fs_info->bg_reclaim_threshold)
> + return false;
> +
> + list_for_each_entry(sinfo, &fs_info->space_info, list) {
> + total += sinfo->total_bytes;
> + used += btrfs_calc_available_free_space(fs_info, sinfo,
> + BTRFS_RESERVE_NO_FLUSH);
> + }
> +
> + factor = div_u64(used * 100, total);
+ pr_info("btrfs: factor: %lld used: %lld, total: %lld ", factor,
used, total);
> + return factor >= fs_info->bg_reclaim_threshold;
> +}
> static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
--
Regards,
Pankaj
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 9:08 ` Pankaj Raghav
@ 2022-03-23 9:11 ` Johannes Thumshirn
2022-03-23 9:14 ` Pankaj Raghav
0 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-23 9:11 UTC (permalink / raw)
To: Pankaj Raghav, David Sterba
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 23/03/2022 10:09, Pankaj Raghav wrote:
> Hi Johannes,
>
> I tried this patchset and I am noticing a weird behaviour wrt the value
> of factor in btrfs_zoned_should_reclaim function.
>
> Here is my setup in QEMU:
> size 12800M
> zoned=true,zoned.zone_capacity=128M,zoned.zone_size=128M
>
> btrfs mkfs:
> mkfs.btrfs -d single -m single <znsdev>; mount -t auto <znsdev>
> /mnt/real_scratch
>
> I added a print statement in btrfs_zoned_should_reclaim to get the
> values for the factor, used and total params.
>
> When I run the btrfs/237 xfstest, I am noticing the following values:
> [ 54.829309] btrfs: factor: 4850 used: 19528679424, total: 402653184
>
> The used > total, thereby making the factor greater than 100. This will
> force a reclaim even though the drive is almost empty:
>
> start: 0x000000000, len 0x040000, cap 0x040000, wptr 0x000078 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000040000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000080000, len 0x040000, cap 0x040000, wptr 0x0001e0 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x0000c0000, len 0x040000, cap 0x040000, wptr 0x000d80 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000100000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000140000, len 0x040000, cap 0x040000, wptr 0x008520 reset:0
> non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000180000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x0001c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000200000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000240000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x000280000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> start: 0x0002c0000, len 0x040000, cap 0x040000, wptr 0x000000 reset:0
> non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
> .....
> .....
>
> I am also noticing the same behaviour for ZNS drive with size 1280M:
>
> [ 86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
>
> Is something going wrong with the calculation? Or am I missing something
> here?
>
Apparently I'm either too dumb for basic maths, or
btrfs_calc_available_free_space() doesn't give us the values we're expecting.
I'll recheck.
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 9:11 ` Johannes Thumshirn
@ 2022-03-23 9:14 ` Pankaj Raghav
2022-03-23 10:39 ` Johannes Thumshirn
0 siblings, 1 reply; 19+ messages in thread
From: Pankaj Raghav @ 2022-03-23 9:14 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 2022-03-23 10:11, Johannes Thumshirn wrote:
> On 23/03/2022 10:09, Pankaj Raghav wrote:
>> I am also noticing the same behaviour for ZNS drive with size 1280M:
>>
>> [ 86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
>>
>> Is something going wrong with the calculation? Or am I missing something
>> here?
>>
>
> Apparently I'm either too dumb for basic maths, or
> btrfs_calc_available_free_space() doesn't give us the values we're expecting.
>
I would say the latter :)
> I'll recheck.
Let me know if you can also reproduce the results.
--
Regards,
Pankaj
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 9:14 ` Pankaj Raghav
@ 2022-03-23 10:39 ` Johannes Thumshirn
2022-03-23 11:24 ` Pankaj Raghav
0 siblings, 1 reply; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-23 10:39 UTC (permalink / raw)
To: Pankaj Raghav, David Sterba
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 23/03/2022 10:14, Pankaj Raghav wrote:
>
>
> On 2022-03-23 10:11, Johannes Thumshirn wrote:
>> On 23/03/2022 10:09, Pankaj Raghav wrote:
>>> I am also noticing the same behaviour for ZNS drive with size 1280M:
>>>
>>> [ 86.276409] btrfs: factor: 350 used: 1409286144, total: 402653184
>>>
>>> Is something going wrong with the calculation? Or am I missing something
>>> here?
>>>
>>
>> Apparently I'm either too dumb for basic maths, or
>> btrfs_calc_available_free_space() doesn't give us the values we're expecting.
>>
> I would say the latter :)
>> I'll recheck.
> Let me know if you can also reproduce the results.
>
It looks like we can't use btrfs_calc_available_free_space(), can
you try this one on top:
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f2a412427921..4a6c1f1a7223 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
- struct btrfs_space_info *sinfo;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
u64 used = 0;
u64 total = 0;
u64 factor;
- if (!btrfs_is_zoned(fs_info))
- return false;
-
if (!fs_info->bg_reclaim_threshold)
return false;
- list_for_each_entry(sinfo, &fs_info->space_info, list) {
- total += sinfo->total_bytes;
- used += btrfs_calc_available_free_space(fs_info, sinfo,
- BTRFS_RESERVE_NO_FLUSH);
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (!device->bdev)
+ continue;
+
+ total += device->disk_total_bytes;
+ used += device->bytes_used;
+
}
+ mutex_unlock(&fs_devices->device_list_mutex);
- factor = div_u64(used * 100, total);
+ factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
^ permalink raw reply related [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 10:39 ` Johannes Thumshirn
@ 2022-03-23 11:24 ` Pankaj Raghav
2022-03-23 11:52 ` Johannes Thumshirn
0 siblings, 1 reply; 19+ messages in thread
From: Pankaj Raghav @ 2022-03-23 11:24 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 2022-03-23 11:39, Johannes Thumshirn wrote:
>
> It looks like we can't use btrfs_calc_available_free_space(), can
> you try this one on top:
>
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index f2a412427921..4a6c1f1a7223 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>
> bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
> {
> - struct btrfs_space_info *sinfo;
> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> + struct btrfs_device *device;
> u64 used = 0;
> u64 total = 0;
> u64 factor;
>
> - if (!btrfs_is_zoned(fs_info))
> - return false;
> -
> if (!fs_info->bg_reclaim_threshold)
> return false;
>
> - list_for_each_entry(sinfo, &fs_info->space_info, list) {
> - total += sinfo->total_bytes;
> - used += btrfs_calc_available_free_space(fs_info, sinfo,
> - BTRFS_RESERVE_NO_FLUSH);
> +
> + mutex_lock(&fs_devices->device_list_mutex);
> + list_for_each_entry(device, &fs_devices->devices, dev_list) {
> + if (!device->bdev)
> + continue;
> +
> + total += device->disk_total_bytes;
> + used += device->bytes_used;
> +
> }
> + mutex_unlock(&fs_devices->device_list_mutex);
>
> - factor = div_u64(used * 100, total);
> + factor = div64_u64(used * 100, total);
> return factor >= fs_info->bg_reclaim_threshold;
> }
>
size 1280M:
[ 47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
[ 48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
[ 49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
size: 12800M:
[ 33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
[ 34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
[ 35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
size: 12800M, zcap=96M zsze=128M:
[ 64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
[ 65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
[ 66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800
This looks good. And the test btrfs/237 is failing, as it should be
because of the change in reclaim condition. Are you planning to update
this test in fstests later?
I still have one more question: shouldn't we use the usable disk bytes
(zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
calculate the `total` variable? The `used` value is a part of the usable
disk space so I feel it makes more sense to calculate the `factor` with
the usable disk bytes instead of the disk_total_bytes.
--
Regards,
Pankaj
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 11:24 ` Pankaj Raghav
@ 2022-03-23 11:52 ` Johannes Thumshirn
2022-03-23 19:37 ` Pankaj Raghav
2022-03-24 0:06 ` Damien Le Moal
0 siblings, 2 replies; 19+ messages in thread
From: Johannes Thumshirn @ 2022-03-23 11:52 UTC (permalink / raw)
To: Pankaj Raghav, David Sterba, Naohiro Aota
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 23/03/2022 12:24, Pankaj Raghav wrote:
>
>
> On 2022-03-23 11:39, Johannes Thumshirn wrote:
>>
>> It looks like we can't use btrfs_calc_available_free_space(), can
>> you try this one on top:
>>
>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>> index f2a412427921..4a6c1f1a7223 100644
>> --- a/fs/btrfs/zoned.c
>> +++ b/fs/btrfs/zoned.c
>> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>>
>> bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>> {
>> - struct btrfs_space_info *sinfo;
>> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> + struct btrfs_device *device;
>> u64 used = 0;
>> u64 total = 0;
>> u64 factor;
>>
>> - if (!btrfs_is_zoned(fs_info))
>> - return false;
>> -
>> if (!fs_info->bg_reclaim_threshold)
>> return false;
>>
>> - list_for_each_entry(sinfo, &fs_info->space_info, list) {
>> - total += sinfo->total_bytes;
>> - used += btrfs_calc_available_free_space(fs_info, sinfo,
>> - BTRFS_RESERVE_NO_FLUSH);
>> +
>> + mutex_lock(&fs_devices->device_list_mutex);
>> + list_for_each_entry(device, &fs_devices->devices, dev_list) {
>> + if (!device->bdev)
>> + continue;
>> +
>> + total += device->disk_total_bytes;
>> + used += device->bytes_used;
>> +
>> }
>> + mutex_unlock(&fs_devices->device_list_mutex);
>>
>> - factor = div_u64(used * 100, total);
>> + factor = div64_u64(used * 100, total);
>> return factor >= fs_info->bg_reclaim_threshold;
>> }
>>
> size 1280M:
> [ 47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
> [ 48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
> [ 49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
> size: 12800M:
> [ 33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
> [ 34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
> [ 35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
> size: 12800M, zcap=96M zsze=128M:
> [ 64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
> [ 65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
> [ 66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800
>
> This looks good. And the test btrfs/237 is failing, as it should be
> because of the change in reclaim condition. Are you planning to update
> this test in fstests later?
Yes, once I have an idea how to do. Probably just fill the FS until
~75% of the drive is filled and then use the original logic.
> I still have one more question: shouldn't we use the usable disk bytes
> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
> calculate the `total` variable? The `used` value is a part of the usable
> disk space so I feel it makes more sense to calculate the `factor` with
> the usable disk bytes instead of the disk_total_bytes.
>
disk_total_bytes comes from the value the underlying device driver set
for the gendisk's capacity via set_capacity().
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 11:52 ` Johannes Thumshirn
@ 2022-03-23 19:37 ` Pankaj Raghav
2022-03-24 0:06 ` Damien Le Moal
1 sibling, 0 replies; 19+ messages in thread
From: Pankaj Raghav @ 2022-03-23 19:37 UTC (permalink / raw)
To: Johannes Thumshirn, David Sterba, Naohiro Aota
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 2022-03-23 12:52, Johannes Thumshirn wrote:
>> This looks good. And the test btrfs/237 is failing, as it should be
>> because of the change in reclaim condition. Are you planning to update
>> this test in fstests later?
>
> Yes, once I have an idea how to do. Probably just fill the FS until
> ~75% of the drive is filled and then use the original logic.
>
Perfect.
>> I still have one more question: shouldn't we use the usable disk bytes
>> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
>> calculate the `total` variable? The `used` value is a part of the usable
>> disk space so I feel it makes more sense to calculate the `factor` with
>> the usable disk bytes instead of the disk_total_bytes.
>>
>
> disk_total_bytes comes from the value the underlying device driver set
> for the gendisk's capacity via set_capacity().
Yes, I understand that part. My comment was more about how pedantic we
need to be for reclaim as set capacity via the device driver will
include the unusable holes (lbas between zcap and zsze).
For e.g., zns drive with 96M zone capacity and 128M zone size with 10
zones will have a total disk capacity of 1280M but the usuable capacity
is 960M.
Let us say the `used` value is 128M, then the `factor` value with the
current approach is 128 / 1280 = 10%.
But if we use the usable capacity of a zns drive, then the `factor`
value will be 128 / 960 = 13.3%.
This might not be problem for lower value of `used` but my concern is
when the drive is nearing its capacity.
Let's take the same example where the `used` value is now 900M. Then the
factor with the current approach is 70% (900 / 1280), still below the
default 75 for bg_reclaim_threshold but when used with the usable
capacity, it is 93% (900 / 960).
So essentially we are postponing the reclaim assuming we have enough
room left but in reality it is not.
I still don't have a complete understanding of the full stack with btrfs
on zoned devices so please correct me if I am wrong.
--
Regards,
Pankaj
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/5] btrfs: zoned: make auto-reclaim less aggressive
2022-03-23 11:52 ` Johannes Thumshirn
2022-03-23 19:37 ` Pankaj Raghav
@ 2022-03-24 0:06 ` Damien Le Moal
1 sibling, 0 replies; 19+ messages in thread
From: Damien Le Moal @ 2022-03-24 0:06 UTC (permalink / raw)
To: Johannes Thumshirn, Pankaj Raghav, David Sterba, Naohiro Aota
Cc: Josef Bacik, linux-btrfs, Luis Chamberlain, Javier Gonzalez
On 3/23/22 20:52, Johannes Thumshirn wrote:
> On 23/03/2022 12:24, Pankaj Raghav wrote:
>>
>>
>> On 2022-03-23 11:39, Johannes Thumshirn wrote:
>>>
>>> It looks like we can't use btrfs_calc_available_free_space(), can
>>> you try this one on top:
>>>
>>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>> index f2a412427921..4a6c1f1a7223 100644
>>> --- a/fs/btrfs/zoned.c
>>> +++ b/fs/btrfs/zoned.c
>>> @@ -2082,23 +2082,27 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
>>>
>>> bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
>>> {
>>> - struct btrfs_space_info *sinfo;
>>> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>>> + struct btrfs_device *device;
>>> u64 used = 0;
>>> u64 total = 0;
>>> u64 factor;
>>>
>>> - if (!btrfs_is_zoned(fs_info))
>>> - return false;
>>> -
>>> if (!fs_info->bg_reclaim_threshold)
>>> return false;
>>>
>>> - list_for_each_entry(sinfo, &fs_info->space_info, list) {
>>> - total += sinfo->total_bytes;
>>> - used += btrfs_calc_available_free_space(fs_info, sinfo,
>>> - BTRFS_RESERVE_NO_FLUSH);
>>> +
>>> + mutex_lock(&fs_devices->device_list_mutex);
>>> + list_for_each_entry(device, &fs_devices->devices, dev_list) {
>>> + if (!device->bdev)
>>> + continue;
>>> +
>>> + total += device->disk_total_bytes;
>>> + used += device->bytes_used;
Does bytes_used include all the unusable blocks between zone cap and zone
size for all zones ? If yes, then the calculation will be OK. If not, then
you will get an artificially low factor not reflecting the need for defrag.
>>> +
>>> }
>>> + mutex_unlock(&fs_devices->device_list_mutex);
>>>
>>> - factor = div_u64(used * 100, total);
>>> + factor = div64_u64(used * 100, total);
>>> return factor >= fs_info->bg_reclaim_threshold;
Not sure if the factor variable is really necessary here.
>>> }
>>>
>> size 1280M:
>> [ 47.511871] btrfs: factor: 30 used: 402653184, total: 1342177280
>> [ 48.542981] btrfs: factor: 30 used: 402653184, total: 1342177280
>> [ 49.576005] btrfs: factor: 30 used: 402653184, total: 1342177280
>> size: 12800M:
>> [ 33.971009] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [ 34.978602] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [ 35.991784] btrfs: factor: 3 used: 402653184, total: 13421772800
>> size: 12800M, zcap=96M zsze=128M:
>> [ 64.639103] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [ 65.643778] btrfs: factor: 3 used: 402653184, total: 13421772800
>> [ 66.661920] btrfs: factor: 3 used: 402653184, total: 13421772800
>>
>> This looks good. And the test btrfs/237 is failing, as it should be
>> because of the change in reclaim condition. Are you planning to update
>> this test in fstests later?
>
> Yes, once I have an idea how to do. Probably just fill the FS until
> ~75% of the drive is filled and then use the original logic.
>
>> I still have one more question: shouldn't we use the usable disk bytes
>> (zcap * nr_zones) instead of disk_total_bytes (zsze * nr_zones) to
>> calculate the `total` variable? The `used` value is a part of the usable
>> disk space so I feel it makes more sense to calculate the `factor` with
>> the usable disk bytes instead of the disk_total_bytes.
>>
>
> disk_total_bytes comes from the value the underlying device driver set
> for the gendisk's capacity via set_capacity().
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 19+ messages in thread