All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] btrfs: block group size class load fixes
@ 2023-01-25 20:50 Boris Burkov
  2023-01-25 20:50 ` [PATCH 1/2] btrfs: fix size class loading logic Boris Burkov
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Boris Burkov @ 2023-01-25 20:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

The original size class loading logic was totally broken.

Patch 1 fixes it.
Patch 2 adds sysfs visibility to size classes so that we don't mess up
this badly in the future (and can test it in xfstests now).

Boris Burkov (2):
  btrfs: fix size class loading logic
  btrfs: add size class stats to sysfs

 fs/btrfs/block-group.c | 56 ++++++++++++++++++++++++++++--------------
 fs/btrfs/sysfs.c       | 39 +++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 19 deletions(-)

-- 
2.38.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/2] btrfs: fix size class loading logic
  2023-01-25 20:50 [PATCH 0/2] btrfs: block group size class load fixes Boris Burkov
@ 2023-01-25 20:50 ` Boris Burkov
  2023-02-14 21:47   ` Filipe Manana
  2023-01-25 20:50 ` [PATCH 2/2] btrfs: add size class stats to sysfs Boris Burkov
  2023-01-25 22:05 ` [PATCH 0/2] btrfs: block group size class load fixes David Sterba
  2 siblings, 1 reply; 9+ messages in thread
From: Boris Burkov @ 2023-01-25 20:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

The original implementation was completely incorrect. It used
btrfs_search_slot to make an inexact match, which simply returned >0 to
indicate not finding the key.

Change it to using search_forward with no transid to actually walk the
leaves looking for extent items. Some small tweaks to the key space
condition checking in the iteration were also necessary.

Finally, since the sampling lookups are of fixed complexity, move them
into the main, blocking part of caching a block group, not as a
best-effort thing after. This has no effect on total block group caching
throughput as there is only one thread anyway, but makes it simpler and
reduces weird races where we change the size class simultaneously from
an allocation and loading.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/block-group.c | 56 ++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 73e1270b3904..45ccb25c5b1f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -555,7 +555,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
  * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
  * error code on error.
  */
-static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
+static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
+					  struct btrfs_block_group *block_group,
 					  int index, int max_index,
 					  struct btrfs_key *key)
 {
@@ -563,17 +564,19 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
 	struct btrfs_root *extent_root;
 	int ret = 0;
 	u64 search_offset;
+	u64 search_end = block_group->start + block_group->length;
 	struct btrfs_path *path;
 
 	ASSERT(index >= 0);
 	ASSERT(index <= max_index);
 	ASSERT(max_index > 0);
+	lockdep_assert_held(&caching_ctl->mutex);
+	lockdep_assert_held_read(&fs_info->commit_root_sem);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	down_read(&fs_info->commit_root_sem);
 	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
 						       BTRFS_SUPER_INFO_OFFSET));
 
@@ -586,21 +589,36 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
 	key->type = BTRFS_EXTENT_ITEM_KEY;
 	key->offset = 0;
 
-	ret = btrfs_search_slot(NULL, extent_root, key, path, 0, 0);
-	if (ret != 0)
-		goto out;
-	if (key->objectid < block_group->start ||
-	    key->objectid > block_group->start + block_group->length) {
-		ret = 1;
-		goto out;
-	}
-	if (key->type != BTRFS_EXTENT_ITEM_KEY) {
-		ret = 1;
-		goto out;
+	while (1) {
+		ret = btrfs_search_forward(extent_root, key, path, 0);
+		if (ret != 0)
+			goto out;
+		/* Success; sampled an extent item in the block group */
+		if (key->type == BTRFS_EXTENT_ITEM_KEY &&
+		    key->objectid >= block_group->start &&
+		    key->objectid + key->offset <= search_end)
+			goto out;
+
+		/* We can't possibly find a valid extent item anymore */
+		if (key->objectid >= search_end) {
+			ret = 1;
+			break;
+		}
+		if (key->type < BTRFS_EXTENT_ITEM_KEY)
+			key->type = BTRFS_EXTENT_ITEM_KEY;
+		else
+			key->objectid++;
+		btrfs_release_path(path);
+		up_read(&fs_info->commit_root_sem);
+		mutex_unlock(&caching_ctl->mutex);
+		cond_resched();
+		mutex_lock(&caching_ctl->mutex);
+		down_read(&fs_info->commit_root_sem);
 	}
 out:
+	lockdep_assert_held(&caching_ctl->mutex);
+	lockdep_assert_held_read(&fs_info->commit_root_sem);
 	btrfs_free_path(path);
-	up_read(&fs_info->commit_root_sem);
 	return ret;
 }
 
@@ -638,7 +656,8 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
  *
  * Returns: 0 on success, negative error code on error.
  */
-static int load_block_group_size_class(struct btrfs_block_group *block_group)
+static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
+				       struct btrfs_block_group *block_group)
 {
 	struct btrfs_key key;
 	int i;
@@ -646,11 +665,11 @@ static int load_block_group_size_class(struct btrfs_block_group *block_group)
 	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
 	int ret;
 
-	if (btrfs_block_group_should_use_size_class(block_group))
+	if (!btrfs_block_group_should_use_size_class(block_group))
 		return 0;
 
 	for (i = 0; i < 5; ++i) {
-		ret = sample_block_group_extent_item(block_group, i, 5, &key);
+		ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
 		if (ret < 0)
 			goto out;
 		if (ret > 0)
@@ -812,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 	mutex_lock(&caching_ctl->mutex);
 	down_read(&fs_info->commit_root_sem);
 
+	load_block_group_size_class(caching_ctl, block_group);
 	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 		ret = load_free_space_cache(block_group);
 		if (ret == 1) {
@@ -867,8 +887,6 @@ static noinline void caching_thread(struct btrfs_work *work)
 
 	wake_up(&caching_ctl->wait);
 
-	load_block_group_size_class(block_group);
-
 	btrfs_put_caching_control(caching_ctl);
 	btrfs_put_block_group(block_group);
 }
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/2] btrfs: add size class stats to sysfs
  2023-01-25 20:50 [PATCH 0/2] btrfs: block group size class load fixes Boris Burkov
  2023-01-25 20:50 ` [PATCH 1/2] btrfs: fix size class loading logic Boris Burkov
@ 2023-01-25 20:50 ` Boris Burkov
  2023-01-27 13:23   ` David Sterba
  2023-01-25 22:05 ` [PATCH 0/2] btrfs: block group size class load fixes David Sterba
  2 siblings, 1 reply; 9+ messages in thread
From: Boris Burkov @ 2023-01-25 20:50 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Make it possible to see the distribution of size classes for block
groups. Helpful for testing and debugging the allocator w.r.t. size
classes.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 108aa3876186..e1ae4d2323d6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,6 +9,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/bug.h>
+#include <linux/list.h>
 #include <crypto/hash.h>
 #include "messages.h"
 #include "ctree.h"
@@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
 	return len;
 }
 
+static ssize_t btrfs_size_classes_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj);
+	struct btrfs_block_group *bg;
+	int none = 0;
+	int small = 0;
+	int medium = 0;
+	int large = 0;
+	int i;
+
+	down_read(&sinfo->groups_sem);
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
+		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+			if (!btrfs_block_group_should_use_size_class(bg))
+				continue;
+			switch (bg->size_class) {
+			case BTRFS_BG_SZ_NONE:
+				none++;
+				break;
+			case BTRFS_BG_SZ_SMALL:
+				small++;
+				break;
+			case BTRFS_BG_SZ_MEDIUM:
+				medium++;
+				break;
+			case BTRFS_BG_SZ_LARGE:
+				large++;
+				break;
+			}
+		}
+	}
+	up_read(&sinfo->groups_sem);
+	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 /*
  * Request chunk allocation with current chunk size.
@@ -835,6 +872,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
 SPACE_INFO_ATTR(disk_used);
 SPACE_INFO_ATTR(disk_total);
 BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
 
 static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
 						     struct kobj_attribute *a,
@@ -887,6 +925,7 @@ static struct attribute *space_info_attrs[] = {
 	BTRFS_ATTR_PTR(space_info, disk_total),
 	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
 	BTRFS_ATTR_PTR(space_info, chunk_size),
+	BTRFS_ATTR_PTR(space_info, size_classes),
 #ifdef CONFIG_BTRFS_DEBUG
 	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
 #endif
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/2] btrfs: block group size class load fixes
  2023-01-25 20:50 [PATCH 0/2] btrfs: block group size class load fixes Boris Burkov
  2023-01-25 20:50 ` [PATCH 1/2] btrfs: fix size class loading logic Boris Burkov
  2023-01-25 20:50 ` [PATCH 2/2] btrfs: add size class stats to sysfs Boris Burkov
@ 2023-01-25 22:05 ` David Sterba
  2 siblings, 0 replies; 9+ messages in thread
From: David Sterba @ 2023-01-25 22:05 UTC (permalink / raw)
  To: Boris Burkov; +Cc: linux-btrfs, kernel-team

On Wed, Jan 25, 2023 at 12:50:31PM -0800, Boris Burkov wrote:
> The original size class loading logic was totally broken.
> 
> Patch 1 fixes it.
> Patch 2 adds sysfs visibility to size classes so that we don't mess up
> this badly in the future (and can test it in xfstests now).
> 
> Boris Burkov (2):
>   btrfs: fix size class loading logic
>   btrfs: add size class stats to sysfs

There's still time to replace the original patches, you can send me
incremental diffs or resend the whole series.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] btrfs: add size class stats to sysfs
  2023-01-25 20:50 ` [PATCH 2/2] btrfs: add size class stats to sysfs Boris Burkov
@ 2023-01-27 13:23   ` David Sterba
  2023-01-27 21:43     ` Boris Burkov
  0 siblings, 1 reply; 9+ messages in thread
From: David Sterba @ 2023-01-27 13:23 UTC (permalink / raw)
  To: Boris Burkov; +Cc: linux-btrfs, kernel-team

On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> Make it possible to see the distribution of size classes for block
> groups. Helpful for testing and debugging the allocator w.r.t. to size
> classes.

Please note the sysfs file path.

> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index 108aa3876186..e1ae4d2323d6 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -9,6 +9,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/completion.h>
>  #include <linux/bug.h>
> +#include <linux/list.h>
>  #include <crypto/hash.h>
>  #include "messages.h"
>  #include "ctree.h"
> @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
>  	return len;
>  }
>  
> +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> +				       struct kobj_attribute *a, char *buf)
> +{
> +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> +	struct btrfs_block_group *bg;
> +	int none = 0;
> +	int small = 0;
> +	int medium = 0;
> +	int large = 0;

For simple counters please use unsigned types.

> +	int i;
> +
> +	down_read(&sinfo->groups_sem);

This is a big lock and reading the sysfs repeatedly could block space
reservations. I think RCU works for the block group list and the
size_class is a simple read so the synchronization can be lightweight.

> +	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {

	for (int i = 0; ...)

> +		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> +			if (!btrfs_block_group_should_use_size_class(bg))
> +				continue;
> +			switch (bg->size_class) {
> +			case BTRFS_BG_SZ_NONE:
> +				none++;
> +				break;
> +			case BTRFS_BG_SZ_SMALL:
> +				small++;
> +				break;
> +			case BTRFS_BG_SZ_MEDIUM:
> +				medium++;
> +				break;
> +			case BTRFS_BG_SZ_LARGE:
> +				large++;
> +				break;
> +			}
> +		}
> +	}
> +	up_read(&sinfo->groups_sem);
> +	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);

This lacks the types in the output, so this should be like

	"none %u\n"
	"small %u\n"
	...

For stats we can group the values in one file.

> +}
> +
>  #ifdef CONFIG_BTRFS_DEBUG
>  /*
>   * Request chunk allocation with current chunk size.
> @@ -835,6 +872,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
>  SPACE_INFO_ATTR(disk_used);
>  SPACE_INFO_ATTR(disk_total);
>  BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
> +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
>  
>  static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
>  						     struct kobj_attribute *a,
> @@ -887,6 +925,7 @@ static struct attribute *space_info_attrs[] = {
>  	BTRFS_ATTR_PTR(space_info, disk_total),
>  	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
>  	BTRFS_ATTR_PTR(space_info, chunk_size),
> +	BTRFS_ATTR_PTR(space_info, size_classes),
>  #ifdef CONFIG_BTRFS_DEBUG
>  	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
>  #endif
> -- 
> 2.38.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] btrfs: add size class stats to sysfs
  2023-01-27 13:23   ` David Sterba
@ 2023-01-27 21:43     ` Boris Burkov
  2023-02-07 15:08       ` David Sterba
  0 siblings, 1 reply; 9+ messages in thread
From: Boris Burkov @ 2023-01-27 21:43 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, kernel-team

On Fri, Jan 27, 2023 at 02:23:45PM +0100, David Sterba wrote:
> On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> > Make it possible to see the distribution of size classes for block
> > groups. Helpful for testing and debugging the allocator w.r.t. to size
> > classes.
> 
> Please note the sysfs file path.
> 
> > Signed-off-by: Boris Burkov <boris@bur.io>
> > ---
> >  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 39 insertions(+)
> > 
> > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> > index 108aa3876186..e1ae4d2323d6 100644
> > --- a/fs/btrfs/sysfs.c
> > +++ b/fs/btrfs/sysfs.c
> > @@ -9,6 +9,7 @@
> >  #include <linux/spinlock.h>
> >  #include <linux/completion.h>
> >  #include <linux/bug.h>
> > +#include <linux/list.h>
> >  #include <crypto/hash.h>
> >  #include "messages.h"
> >  #include "ctree.h"
> > @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
> >  	return len;
> >  }
> >  
> > +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> > +				       struct kobj_attribute *a, char *buf)
> > +{
> > +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> > +	struct btrfs_block_group *bg;
> > +	int none = 0;
> > +	int small = 0;
> > +	int medium = 0;
> > +	int large = 0;
> 
> For simple counters please use unsigned types.
> 
> > +	int i;
> > +
> > +	down_read(&sinfo->groups_sem);
> 
> This is a big lock and reading the sysfs repeatedly could block space
> reservations. I think RCU works for the block group list and the
> size_class is a simple read so the synchronization can be lightweight.

I believe space reservations only hold the read lock. The write lock is
needed only to remove or add block groups, so this shouldn't slow down
reservations. Also, FWIW, raid_bytes_show() uses the same
locking/iteration pattern.

I am not sure how to definitely safely concurrently iterate the block
groups without taking the lock. Are you suggesting I should just drop
the locking, and it won't crash but might be inaccurate? Or is there
some other RCU trick I am missing? I don't believe we use any RCU
specific methods when deleting from the list.

Sending a v3 with the rest of your review changes.

Thanks,
Boris

> 
> > +	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
> 
> 	for (int = 0; ...)
> 
> > +		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> > +			if (!btrfs_block_group_should_use_size_class(bg))
> > +				continue;
> > +			switch (bg->size_class) {
> > +			case BTRFS_BG_SZ_NONE:
> > +				none++;
> > +				break;
> > +			case BTRFS_BG_SZ_SMALL:
> > +				small++;
> > +				break;
> > +			case BTRFS_BG_SZ_MEDIUM:
> > +				medium++;
> > +				break;
> > +			case BTRFS_BG_SZ_LARGE:
> > +				large++;
> > +				break;
> > +			}
> > +		}
> > +	}
> > +	up_read(&sinfo->groups_sem);
> > +	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);
> 
> This is lacks the types in the output, so this should be like
> 
> 	"none %u\n"
> 	"small %u\n"
> 	...
> 
> For stats we can group the values in one file.
> 
> > +}
> > +
> >  #ifdef CONFIG_BTRFS_DEBUG
> >  /*
> >   * Request chunk allocation with current chunk size.
> > @@ -835,6 +872,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
> >  SPACE_INFO_ATTR(disk_used);
> >  SPACE_INFO_ATTR(disk_total);
> >  BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
> > +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
> >  
> >  static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
> >  						     struct kobj_attribute *a,
> > @@ -887,6 +925,7 @@ static struct attribute *space_info_attrs[] = {
> >  	BTRFS_ATTR_PTR(space_info, disk_total),
> >  	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
> >  	BTRFS_ATTR_PTR(space_info, chunk_size),
> > +	BTRFS_ATTR_PTR(space_info, size_classes),
> >  #ifdef CONFIG_BTRFS_DEBUG
> >  	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
> >  #endif
> > -- 
> > 2.38.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] btrfs: add size class stats to sysfs
  2023-01-27 21:43     ` Boris Burkov
@ 2023-02-07 15:08       ` David Sterba
  0 siblings, 0 replies; 9+ messages in thread
From: David Sterba @ 2023-02-07 15:08 UTC (permalink / raw)
  To: Boris Burkov; +Cc: David Sterba, linux-btrfs, kernel-team

On Fri, Jan 27, 2023 at 01:43:19PM -0800, Boris Burkov wrote:
> On Fri, Jan 27, 2023 at 02:23:45PM +0100, David Sterba wrote:
> > On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> > > Make it possible to see the distribution of size classes for block
> > > groups. Helpful for testing and debugging the allocator w.r.t. to size
> > > classes.
> > 
> > Please note the sysfs file path.
> > 
> > > Signed-off-by: Boris Burkov <boris@bur.io>
> > > ---
> > >  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 39 insertions(+)
> > > 
> > > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> > > index 108aa3876186..e1ae4d2323d6 100644
> > > --- a/fs/btrfs/sysfs.c
> > > +++ b/fs/btrfs/sysfs.c
> > > @@ -9,6 +9,7 @@
> > >  #include <linux/spinlock.h>
> > >  #include <linux/completion.h>
> > >  #include <linux/bug.h>
> > > +#include <linux/list.h>
> > >  #include <crypto/hash.h>
> > >  #include "messages.h"
> > >  #include "ctree.h"
> > > @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
> > >  	return len;
> > >  }
> > >  
> > > +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> > > +				       struct kobj_attribute *a, char *buf)
> > > +{
> > > +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> > > +	struct btrfs_block_group *bg;
> > > +	int none = 0;
> > > +	int small = 0;
> > > +	int medium = 0;
> > > +	int large = 0;
> > 
> > For simple counters please use unsigned types.
> > 
> > > +	int i;
> > > +
> > > +	down_read(&sinfo->groups_sem);
> > 
> > This is a big lock and reading the sysfs repeatedly could block space
> > reservations. I think RCU works for the block group list and the
> > size_class is a simple read so the synchronization can be lightweight.
> 
> I believe space reservations only hold the read lock. The write lock is
> needed only to remove or add block groups, so this shouldn't slow down
> reservations. Also, FWIW, raid_bytes_show() uses the same
> locking/iteration pattern.

It does use the same lock but it does not lock all space infos, only the
one it's supposed to print the numbers.

> I am not sure how to definitely safely concurrently iterate the block
> groups without taking the lock. Are you suggesting I should just drop
> the locking, and it won't crash but might be inaccurate? Or is there
> some other RCU trick I am missing? I don't believe we use any RCU
> specific methods when deleting from the list.

I don't see the RCU for block groups so some kind of locking needs to be
done, if we don't insist on accurate numbers the mutex can be taken but
eventually yielded (rwsem_is_contended or need_resched).

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] btrfs: fix size class loading logic
  2023-01-25 20:50 ` [PATCH 1/2] btrfs: fix size class loading logic Boris Burkov
@ 2023-02-14 21:47   ` Filipe Manana
  2023-02-14 22:12     ` Boris Burkov
  0 siblings, 1 reply; 9+ messages in thread
From: Filipe Manana @ 2023-02-14 21:47 UTC (permalink / raw)
  To: Boris Burkov; +Cc: linux-btrfs, kernel-team

On Wed, Jan 25, 2023 at 9:33 PM Boris Burkov <boris@bur.io> wrote:
>
> The original implementation was completely incorrect. It used
> btrfs_search_slot to make an inexact match, which simply returned >0 to
> indicate not finding the key.
>
> Change it to using search_forward with no transid to actually walk the
> leaves looking for extent items. Some small tweaks to the key space
> condition checking in the iteration were also necessary.
>
> Finally, since the sampling lookups are of fixed complexity, move them
> into the main, blocking part of caching a block group, not as a
> best-effort thing after. This has no effect on total block group caching
> throughput as there is only one thread anyway, but makes it simpler and
> reduces weird races where we change the size class simultaneously from
> an allocation and loading.
>
> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/block-group.c | 56 ++++++++++++++++++++++++++++--------------
>  1 file changed, 37 insertions(+), 19 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 73e1270b3904..45ccb25c5b1f 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -555,7 +555,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
>   * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
>   * error code on error.
>   */
> -static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
> +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
> +                                         struct btrfs_block_group *block_group,
>                                           int index, int max_index,
>                                           struct btrfs_key *key)
>  {
> @@ -563,17 +564,19 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
>         struct btrfs_root *extent_root;
>         int ret = 0;
>         u64 search_offset;
> +       u64 search_end = block_group->start + block_group->length;
>         struct btrfs_path *path;
>
>         ASSERT(index >= 0);
>         ASSERT(index <= max_index);
>         ASSERT(max_index > 0);
> +       lockdep_assert_held(&caching_ctl->mutex);
> +       lockdep_assert_held_read(&fs_info->commit_root_sem);
>
>         path = btrfs_alloc_path();
>         if (!path)
>                 return -ENOMEM;
>
> -       down_read(&fs_info->commit_root_sem);
>         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
>                                                        BTRFS_SUPER_INFO_OFFSET));
>
> @@ -586,21 +589,36 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
>         key->type = BTRFS_EXTENT_ITEM_KEY;
>         key->offset = 0;
>
> -       ret = btrfs_search_slot(NULL, extent_root, key, path, 0, 0);
> -       if (ret != 0)
> -               goto out;
> -       if (key->objectid < block_group->start ||
> -           key->objectid > block_group->start + block_group->length) {
> -               ret = 1;
> -               goto out;
> -       }
> -       if (key->type != BTRFS_EXTENT_ITEM_KEY) {
> -               ret = 1;
> -               goto out;
> +       while (1) {
> +               ret = btrfs_search_forward(extent_root, key, path, 0);

Boris, this is broken and can result in a deadlock.

btrfs_search_forward() will always lock the root node, despite the
path having ->skip_locking and ->search_commit_root set to true (1).
It's not meant to be used for commit roots, so it always needs to do locking.

So if another task is COWing a child node of the same root node and
then needs to wait for block group caching to complete when trying to
allocate a metadata extent, it deadlocks.
For example:

[539604.239315] sysrq: Show Blocked State
[539604.240133] task:kworker/u16:6   state:D stack:0     pid:2119594
ppid:2      flags:0x00004000
[539604.241613] Workqueue: btrfs-cache btrfs_work_helper [btrfs]
[539604.242673] Call Trace:
[539604.243129]  <TASK>
[539604.243925]  __schedule+0x41d/0xee0
[539604.244797]  ? rcu_read_lock_sched_held+0x12/0x70
[539604.245399]  ? rwsem_down_read_slowpath+0x185/0x490
[539604.246111]  schedule+0x5d/0xf0
[539604.246593]  rwsem_down_read_slowpath+0x2da/0x490
[539604.247290]  ? rcu_barrier_tasks_trace+0x10/0x20
[539604.248090]  __down_read_common+0x3d/0x150
[539604.248702]  down_read_nested+0xc3/0x140
[539604.249280]  __btrfs_tree_read_lock+0x24/0x100 [btrfs]
[539604.250097]  btrfs_read_lock_root_node+0x48/0x60 [btrfs]
[539604.250915]  btrfs_search_forward+0x59/0x460 [btrfs]
[539604.251781]  ? btrfs_global_root+0x50/0x70 [btrfs]
[539604.252476]  caching_thread+0x1be/0x920 [btrfs]
[539604.253167]  btrfs_work_helper+0xf6/0x400 [btrfs]
[539604.253848]  process_one_work+0x24f/0x5a0
[539604.254476]  worker_thread+0x52/0x3b0
[539604.255166]  ? __pfx_worker_thread+0x10/0x10
[539604.256047]  kthread+0xf0/0x120
[539604.256591]  ? __pfx_kthread+0x10/0x10
[539604.257212]  ret_from_fork+0x29/0x50
[539604.257822]  </TASK>
[539604.258233] task:btrfs-transacti state:D stack:0     pid:2236474
ppid:2      flags:0x00004000
[539604.259802] Call Trace:
[539604.260243]  <TASK>
[539604.260615]  __schedule+0x41d/0xee0
[539604.261205]  ? rcu_read_lock_sched_held+0x12/0x70
[539604.262000]  ? rwsem_down_read_slowpath+0x185/0x490
[539604.262822]  schedule+0x5d/0xf0
[539604.263374]  rwsem_down_read_slowpath+0x2da/0x490
[539604.266228]  ? lock_acquire+0x160/0x310
[539604.266917]  ? rcu_read_lock_sched_held+0x12/0x70
[539604.267996]  ? lock_contended+0x19e/0x500
[539604.268720]  __down_read_common+0x3d/0x150
[539604.269400]  down_read_nested+0xc3/0x140
[539604.270057]  __btrfs_tree_read_lock+0x24/0x100 [btrfs]
[539604.271129]  btrfs_read_lock_root_node+0x48/0x60 [btrfs]
[539604.272372]  btrfs_search_slot+0x143/0xf70 [btrfs]
[539604.273295]  update_block_group_item+0x9e/0x190 [btrfs]
[539604.274282]  btrfs_start_dirty_block_groups+0x1c4/0x4f0 [btrfs]
[539604.275381]  ? __mutex_unlock_slowpath+0x45/0x280
[539604.276390]  btrfs_commit_transaction+0xee/0xed0 [btrfs]
[539604.277391]  ? lock_acquire+0x1a4/0x310
[539604.278080]  ? start_transaction+0xcb/0x6c0 [btrfs]
[539604.279099]  transaction_kthread+0x142/0x1c0 [btrfs]
[539604.279996]  ? __pfx_transaction_kthread+0x10/0x10 [btrfs]
[539604.280673]  kthread+0xf0/0x120
[539604.281050]  ? __pfx_kthread+0x10/0x10
[539604.281496]  ret_from_fork+0x29/0x50
[539604.281966]  </TASK>
[539604.282255] task:fsstress        state:D stack:0     pid:2236483
ppid:1      flags:0x00004006
[539604.283897] Call Trace:
[539604.284700]  <TASK>
[539604.285088]  __schedule+0x41d/0xee0
[539604.285660]  schedule+0x5d/0xf0
[539604.286175]  btrfs_wait_block_group_cache_progress+0xf2/0x170 [btrfs]
[539604.287342]  ? __pfx_autoremove_wake_function+0x10/0x10
[539604.288450]  find_free_extent+0xd93/0x1750 [btrfs]
[539604.289256]  ? _raw_spin_unlock+0x29/0x50
[539604.289911]  ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs]
[539604.290843]  btrfs_reserve_extent+0x147/0x290 [btrfs]
[539604.291943]  btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs]
[539604.292903]  __btrfs_cow_block+0x138/0x580 [btrfs]
[539604.293773]  btrfs_cow_block+0x10e/0x240 [btrfs]
[539604.294595]  btrfs_search_slot+0x7f3/0xf70 [btrfs]
[539604.295585]  btrfs_update_device+0x71/0x1b0 [btrfs]
[539604.296459]  btrfs_chunk_alloc_add_chunk_item+0xe0/0x340 [btrfs]
[539604.297489]  btrfs_chunk_alloc+0x1bf/0x490 [btrfs]
[539604.298335]  find_free_extent+0x6fa/0x1750 [btrfs]
[539604.299174]  ? _raw_spin_unlock+0x29/0x50
[539604.299950]  ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs]
[539604.300918]  btrfs_reserve_extent+0x147/0x290 [btrfs]
[539604.301797]  btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs]
[539604.303017]  ? lock_release+0x224/0x4a0
[539604.303855]  __btrfs_cow_block+0x138/0x580 [btrfs]
[539604.304789]  btrfs_cow_block+0x10e/0x240 [btrfs]
[539604.305611]  btrfs_search_slot+0x7f3/0xf70 [btrfs]
[539604.306682]  ? btrfs_global_root+0x50/0x70 [btrfs]
[539604.308198]  lookup_inline_extent_backref+0x17b/0x7a0 [btrfs]
[539604.309254]  lookup_extent_backref+0x43/0xd0 [btrfs]
[539604.310122]  __btrfs_free_extent+0xf8/0x810 [btrfs]
[539604.310874]  ? lock_release+0x224/0x4a0
[539604.311724]  ? btrfs_merge_delayed_refs+0x17b/0x1d0 [btrfs]
[539604.313023]  __btrfs_run_delayed_refs+0x2ba/0x1260 [btrfs]
[539604.314271]  btrfs_run_delayed_refs+0x8f/0x1c0 [btrfs]
[539604.315445]  ? rcu_read_lock_sched_held+0x12/0x70
[539604.316706]  btrfs_commit_transaction+0xa2/0xed0 [btrfs]
[539604.317855]  ? do_raw_spin_unlock+0x4b/0xa0
[539604.318544]  ? _raw_spin_unlock+0x29/0x50
[539604.319240]  create_subvol+0x53d/0x6e0 [btrfs]
[539604.320283]  btrfs_mksubvol+0x4f5/0x590 [btrfs]
[539604.321220]  __btrfs_ioctl_snap_create+0x11b/0x180 [btrfs]
[539604.322307]  btrfs_ioctl_snap_create_v2+0xc6/0x150 [btrfs]
[539604.323295]  btrfs_ioctl+0x9f7/0x33e0 [btrfs]
[539604.324331]  ? rcu_read_lock_sched_held+0x12/0x70
[539604.325137]  ? lock_release+0x224/0x4a0
[539604.325808]  ? __x64_sys_ioctl+0x87/0xc0
[539604.326467]  __x64_sys_ioctl+0x87/0xc0
[539604.327109]  do_syscall_64+0x38/0x90
[539604.327875]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[539604.328792] RIP: 0033:0x7f05a7babaeb
[539604.329378] RSP: 002b:00007ffd0fecc480 EFLAGS: 00000246 ORIG_RAX:
0000000000000010
[539604.330561] RAX: ffffffffffffffda RBX: 00007ffd0fecd520 RCX:
00007f05a7babaeb
[539604.335194] RDX: 00007ffd0fecc4e0 RSI: 0000000050009418 RDI:
0000000000000004
[539604.336583] RBP: 0000000000000002 R08: 0000000000000000 R09:
000055a66e107480
[539604.337685] R10: 00007f05a7ac5358 R11: 0000000000000246 R12:
0000000000000004
[539604.339220] R13: 00007ffd0fecc4e0 R14: 0000000000000004 R15:
000055a66c4be590
[539604.341137]  </TASK>

This needs to use regular btrfs_search_slot() with some skip and stop logic.

Thanks.


> +               if (ret != 0)
> +                       goto out;
> +               /* Success; sampled an extent item in the block group */
> +               if (key->type == BTRFS_EXTENT_ITEM_KEY &&
> +                   key->objectid >= block_group->start &&
> +                   key->objectid + key->offset <= search_end)
> +                       goto out;
> +
> +               /* We can't possibly find a valid extent item anymore */
> +               if (key->objectid >= search_end) {
> +                       ret = 1;
> +                       break;
> +               }
> +               if (key->type < BTRFS_EXTENT_ITEM_KEY)
> +                       key->type = BTRFS_EXTENT_ITEM_KEY;
> +               else
> +                       key->objectid++;
> +               btrfs_release_path(path);
> +               up_read(&fs_info->commit_root_sem);
> +               mutex_unlock(&caching_ctl->mutex);
> +               cond_resched();
> +               mutex_lock(&caching_ctl->mutex);
> +               down_read(&fs_info->commit_root_sem);
>         }
>  out:
> +       lockdep_assert_held(&caching_ctl->mutex);
> +       lockdep_assert_held_read(&fs_info->commit_root_sem);
>         btrfs_free_path(path);
> -       up_read(&fs_info->commit_root_sem);
>         return ret;
>  }
>
> @@ -638,7 +656,8 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
>   *
>   * Returns: 0 on success, negative error code on error.
>   */
> -static int load_block_group_size_class(struct btrfs_block_group *block_group)
> +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
> +                                      struct btrfs_block_group *block_group)
>  {
>         struct btrfs_key key;
>         int i;
> @@ -646,11 +665,11 @@ static int load_block_group_size_class(struct btrfs_block_group *block_group)
>         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
>         int ret;
>
> -       if (btrfs_block_group_should_use_size_class(block_group))
> +       if (!btrfs_block_group_should_use_size_class(block_group))
>                 return 0;
>
>         for (i = 0; i < 5; ++i) {
> -               ret = sample_block_group_extent_item(block_group, i, 5, &key);
> +               ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
>                 if (ret < 0)
>                         goto out;
>                 if (ret > 0)
> @@ -812,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work)
>         mutex_lock(&caching_ctl->mutex);
>         down_read(&fs_info->commit_root_sem);
>
> +       load_block_group_size_class(caching_ctl, block_group);
>         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
>                 ret = load_free_space_cache(block_group);
>                 if (ret == 1) {
> @@ -867,8 +887,6 @@ static noinline void caching_thread(struct btrfs_work *work)
>
>         wake_up(&caching_ctl->wait);
>
> -       load_block_group_size_class(block_group);
> -
>         btrfs_put_caching_control(caching_ctl);
>         btrfs_put_block_group(block_group);
>  }
> --
> 2.38.1
>


-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] btrfs: fix size class loading logic
  2023-02-14 21:47   ` Filipe Manana
@ 2023-02-14 22:12     ` Boris Burkov
  0 siblings, 0 replies; 9+ messages in thread
From: Boris Burkov @ 2023-02-14 22:12 UTC (permalink / raw)
  To: Filipe Manana; +Cc: linux-btrfs, kernel-team

On Tue, Feb 14, 2023 at 09:47:54PM +0000, Filipe Manana wrote:
> On Wed, Jan 25, 2023 at 9:33 PM Boris Burkov <boris@bur.io> wrote:
> >
> > The original implementation was completely incorrect. It used
> > btrfs_search_slot to make an inexact match, which simply returned >0 to
> > indicate not finding the key.
> >
> > Change it to using search_forward with no transid to actually walk the
> > leaves looking for extent items. Some small tweaks to the key space
> > condition checking in the iteration were also necessary.
> >
> > Finally, since the sampling lookups are of fixed complexity, move them
> > into the main, blocking part of caching a block group, not as a
> > best-effort thing after. This has no effect on total block group caching
> > throughput as there is only one thread anyway, but makes it simpler and
> > reduces weird races where we change the size class simultaneously from
> > an allocation and loading.
> >
> > Signed-off-by: Boris Burkov <boris@bur.io>
> > ---
> >  fs/btrfs/block-group.c | 56 ++++++++++++++++++++++++++++--------------
> >  1 file changed, 37 insertions(+), 19 deletions(-)
> >
> > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > index 73e1270b3904..45ccb25c5b1f 100644
> > --- a/fs/btrfs/block-group.c
> > +++ b/fs/btrfs/block-group.c
> > @@ -555,7 +555,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
> >   * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
> >   * error code on error.
> >   */
> > -static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
> > +static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
> > +                                         struct btrfs_block_group *block_group,
> >                                           int index, int max_index,
> >                                           struct btrfs_key *key)
> >  {
> > @@ -563,17 +564,19 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
> >         struct btrfs_root *extent_root;
> >         int ret = 0;
> >         u64 search_offset;
> > +       u64 search_end = block_group->start + block_group->length;
> >         struct btrfs_path *path;
> >
> >         ASSERT(index >= 0);
> >         ASSERT(index <= max_index);
> >         ASSERT(max_index > 0);
> > +       lockdep_assert_held(&caching_ctl->mutex);
> > +       lockdep_assert_held_read(&fs_info->commit_root_sem);
> >
> >         path = btrfs_alloc_path();
> >         if (!path)
> >                 return -ENOMEM;
> >
> > -       down_read(&fs_info->commit_root_sem);
> >         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
> >                                                        BTRFS_SUPER_INFO_OFFSET));
> >
> > @@ -586,21 +589,36 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
> >         key->type = BTRFS_EXTENT_ITEM_KEY;
> >         key->offset = 0;
> >
> > -       ret = btrfs_search_slot(NULL, extent_root, key, path, 0, 0);
> > -       if (ret != 0)
> > -               goto out;
> > -       if (key->objectid < block_group->start ||
> > -           key->objectid > block_group->start + block_group->length) {
> > -               ret = 1;
> > -               goto out;
> > -       }
> > -       if (key->type != BTRFS_EXTENT_ITEM_KEY) {
> > -               ret = 1;
> > -               goto out;
> > +       while (1) {
> > +               ret = btrfs_search_forward(extent_root, key, path, 0);
> 
> Boris, this is broken and can result in a deadlock.
> 
> btrfs_search_forward() will always lock the root node, despite the
> path having ->skip_locking and ->search_commit_root set to true (1).
> It's not meant to be used for commit roots, so it always needs to do locking.

Thanks for the catch and explanation. Working on a fix!

> 
> So if another task is COWing a child node of the same root node and
> then needs to wait for block group caching to complete when trying to
> allocate a metadata extent, it deadlocks.
> For example:
> 
> [539604.239315] sysrq: Show Blocked State
> [539604.240133] task:kworker/u16:6   state:D stack:0     pid:2119594
> ppid:2      flags:0x00004000
> [539604.241613] Workqueue: btrfs-cache btrfs_work_helper [btrfs]
> [539604.242673] Call Trace:
> [539604.243129]  <TASK>
> [539604.243925]  __schedule+0x41d/0xee0
> [539604.244797]  ? rcu_read_lock_sched_held+0x12/0x70
> [539604.245399]  ? rwsem_down_read_slowpath+0x185/0x490
> [539604.246111]  schedule+0x5d/0xf0
> [539604.246593]  rwsem_down_read_slowpath+0x2da/0x490
> [539604.247290]  ? rcu_barrier_tasks_trace+0x10/0x20
> [539604.248090]  __down_read_common+0x3d/0x150
> [539604.248702]  down_read_nested+0xc3/0x140
> [539604.249280]  __btrfs_tree_read_lock+0x24/0x100 [btrfs]
> [539604.250097]  btrfs_read_lock_root_node+0x48/0x60 [btrfs]
> [539604.250915]  btrfs_search_forward+0x59/0x460 [btrfs]
> [539604.251781]  ? btrfs_global_root+0x50/0x70 [btrfs]
> [539604.252476]  caching_thread+0x1be/0x920 [btrfs]
> [539604.253167]  btrfs_work_helper+0xf6/0x400 [btrfs]
> [539604.253848]  process_one_work+0x24f/0x5a0
> [539604.254476]  worker_thread+0x52/0x3b0
> [539604.255166]  ? __pfx_worker_thread+0x10/0x10
> [539604.256047]  kthread+0xf0/0x120
> [539604.256591]  ? __pfx_kthread+0x10/0x10
> [539604.257212]  ret_from_fork+0x29/0x50
> [539604.257822]  </TASK>
> [539604.258233] task:btrfs-transacti state:D stack:0     pid:2236474
> ppid:2      flags:0x00004000
> [539604.259802] Call Trace:
> [539604.260243]  <TASK>
> [539604.260615]  __schedule+0x41d/0xee0
> [539604.261205]  ? rcu_read_lock_sched_held+0x12/0x70
> [539604.262000]  ? rwsem_down_read_slowpath+0x185/0x490
> [539604.262822]  schedule+0x5d/0xf0
> [539604.263374]  rwsem_down_read_slowpath+0x2da/0x490
> [539604.266228]  ? lock_acquire+0x160/0x310
> [539604.266917]  ? rcu_read_lock_sched_held+0x12/0x70
> [539604.267996]  ? lock_contended+0x19e/0x500
> [539604.268720]  __down_read_common+0x3d/0x150
> [539604.269400]  down_read_nested+0xc3/0x140
> [539604.270057]  __btrfs_tree_read_lock+0x24/0x100 [btrfs]
> [539604.271129]  btrfs_read_lock_root_node+0x48/0x60 [btrfs]
> [539604.272372]  btrfs_search_slot+0x143/0xf70 [btrfs]
> [539604.273295]  update_block_group_item+0x9e/0x190 [btrfs]
> [539604.274282]  btrfs_start_dirty_block_groups+0x1c4/0x4f0 [btrfs]
> [539604.275381]  ? __mutex_unlock_slowpath+0x45/0x280
> [539604.276390]  btrfs_commit_transaction+0xee/0xed0 [btrfs]
> [539604.277391]  ? lock_acquire+0x1a4/0x310
> [539604.278080]  ? start_transaction+0xcb/0x6c0 [btrfs]
> [539604.279099]  transaction_kthread+0x142/0x1c0 [btrfs]
> [539604.279996]  ? __pfx_transaction_kthread+0x10/0x10 [btrfs]
> [539604.280673]  kthread+0xf0/0x120
> [539604.281050]  ? __pfx_kthread+0x10/0x10
> [539604.281496]  ret_from_fork+0x29/0x50
> [539604.281966]  </TASK>
> [539604.282255] task:fsstress        state:D stack:0     pid:2236483
> ppid:1      flags:0x00004006
> [539604.283897] Call Trace:
> [539604.284700]  <TASK>
> [539604.285088]  __schedule+0x41d/0xee0
> [539604.285660]  schedule+0x5d/0xf0
> [539604.286175]  btrfs_wait_block_group_cache_progress+0xf2/0x170 [btrfs]
> [539604.287342]  ? __pfx_autoremove_wake_function+0x10/0x10
> [539604.288450]  find_free_extent+0xd93/0x1750 [btrfs]
> [539604.289256]  ? _raw_spin_unlock+0x29/0x50
> [539604.289911]  ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs]
> [539604.290843]  btrfs_reserve_extent+0x147/0x290 [btrfs]
> [539604.291943]  btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs]
> [539604.292903]  __btrfs_cow_block+0x138/0x580 [btrfs]
> [539604.293773]  btrfs_cow_block+0x10e/0x240 [btrfs]
> [539604.294595]  btrfs_search_slot+0x7f3/0xf70 [btrfs]
> [539604.295585]  btrfs_update_device+0x71/0x1b0 [btrfs]
> [539604.296459]  btrfs_chunk_alloc_add_chunk_item+0xe0/0x340 [btrfs]
> [539604.297489]  btrfs_chunk_alloc+0x1bf/0x490 [btrfs]
> [539604.298335]  find_free_extent+0x6fa/0x1750 [btrfs]
> [539604.299174]  ? _raw_spin_unlock+0x29/0x50
> [539604.299950]  ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs]
> [539604.300918]  btrfs_reserve_extent+0x147/0x290 [btrfs]
> [539604.301797]  btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs]
> [539604.303017]  ? lock_release+0x224/0x4a0
> [539604.303855]  __btrfs_cow_block+0x138/0x580 [btrfs]
> [539604.304789]  btrfs_cow_block+0x10e/0x240 [btrfs]
> [539604.305611]  btrfs_search_slot+0x7f3/0xf70 [btrfs]
> [539604.306682]  ? btrfs_global_root+0x50/0x70 [btrfs]
> [539604.308198]  lookup_inline_extent_backref+0x17b/0x7a0 [btrfs]
> [539604.309254]  lookup_extent_backref+0x43/0xd0 [btrfs]
> [539604.310122]  __btrfs_free_extent+0xf8/0x810 [btrfs]
> [539604.310874]  ? lock_release+0x224/0x4a0
> [539604.311724]  ? btrfs_merge_delayed_refs+0x17b/0x1d0 [btrfs]
> [539604.313023]  __btrfs_run_delayed_refs+0x2ba/0x1260 [btrfs]
> [539604.314271]  btrfs_run_delayed_refs+0x8f/0x1c0 [btrfs]
> [539604.315445]  ? rcu_read_lock_sched_held+0x12/0x70
> [539604.316706]  btrfs_commit_transaction+0xa2/0xed0 [btrfs]
> [539604.317855]  ? do_raw_spin_unlock+0x4b/0xa0
> [539604.318544]  ? _raw_spin_unlock+0x29/0x50
> [539604.319240]  create_subvol+0x53d/0x6e0 [btrfs]
> [539604.320283]  btrfs_mksubvol+0x4f5/0x590 [btrfs]
> [539604.321220]  __btrfs_ioctl_snap_create+0x11b/0x180 [btrfs]
> [539604.322307]  btrfs_ioctl_snap_create_v2+0xc6/0x150 [btrfs]
> [539604.323295]  btrfs_ioctl+0x9f7/0x33e0 [btrfs]
> [539604.324331]  ? rcu_read_lock_sched_held+0x12/0x70
> [539604.325137]  ? lock_release+0x224/0x4a0
> [539604.325808]  ? __x64_sys_ioctl+0x87/0xc0
> [539604.326467]  __x64_sys_ioctl+0x87/0xc0
> [539604.327109]  do_syscall_64+0x38/0x90
> [539604.327875]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
> [539604.328792] RIP: 0033:0x7f05a7babaeb
> [539604.329378] RSP: 002b:00007ffd0fecc480 EFLAGS: 00000246 ORIG_RAX:
> 0000000000000010
> [539604.330561] RAX: ffffffffffffffda RBX: 00007ffd0fecd520 RCX:
> 00007f05a7babaeb
> [539604.335194] RDX: 00007ffd0fecc4e0 RSI: 0000000050009418 RDI:
> 0000000000000004
> [539604.336583] RBP: 0000000000000002 R08: 0000000000000000 R09:
> 000055a66e107480
> [539604.337685] R10: 00007f05a7ac5358 R11: 0000000000000246 R12:
> 0000000000000004
> [539604.339220] R13: 00007ffd0fecc4e0 R14: 0000000000000004 R15:
> 000055a66c4be590
> [539604.341137]  </TASK>
> 
> This needs to use regular btrfs_search_slot() with some skip and stop logic.
> 
> Thanks.
> 
> 
> > +               if (ret != 0)
> > +                       goto out;
> > +               /* Success; sampled an extent item in the block group */
> > +               if (key->type == BTRFS_EXTENT_ITEM_KEY &&
> > +                   key->objectid >= block_group->start &&
> > +                   key->objectid + key->offset <= search_end)
> > +                       goto out;
> > +
> > +               /* We can't possibly find a valid extent item anymore */
> > +               if (key->objectid >= search_end) {
> > +                       ret = 1;
> > +                       break;
> > +               }
> > +               if (key->type < BTRFS_EXTENT_ITEM_KEY)
> > +                       key->type = BTRFS_EXTENT_ITEM_KEY;
> > +               else
> > +                       key->objectid++;
> > +               btrfs_release_path(path);
> > +               up_read(&fs_info->commit_root_sem);
> > +               mutex_unlock(&caching_ctl->mutex);
> > +               cond_resched();
> > +               mutex_lock(&caching_ctl->mutex);
> > +               down_read(&fs_info->commit_root_sem);
> >         }
> >  out:
> > +       lockdep_assert_held(&caching_ctl->mutex);
> > +       lockdep_assert_held_read(&fs_info->commit_root_sem);
> >         btrfs_free_path(path);
> > -       up_read(&fs_info->commit_root_sem);
> >         return ret;
> >  }
> >
> > @@ -638,7 +656,8 @@ static int sample_block_group_extent_item(struct btrfs_block_group *block_group,
> >   *
> >   * Returns: 0 on success, negative error code on error.
> >   */
> > -static int load_block_group_size_class(struct btrfs_block_group *block_group)
> > +static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
> > +                                      struct btrfs_block_group *block_group)
> >  {
> >         struct btrfs_key key;
> >         int i;
> > @@ -646,11 +665,11 @@ static int load_block_group_size_class(struct btrfs_block_group *block_group)
> >         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
> >         int ret;
> >
> > -       if (btrfs_block_group_should_use_size_class(block_group))
> > +       if (!btrfs_block_group_should_use_size_class(block_group))
> >                 return 0;
> >
> >         for (i = 0; i < 5; ++i) {
> > -               ret = sample_block_group_extent_item(block_group, i, 5, &key);
> > +               ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
> >                 if (ret < 0)
> >                         goto out;
> >                 if (ret > 0)
> > @@ -812,6 +831,7 @@ static noinline void caching_thread(struct btrfs_work *work)
> >         mutex_lock(&caching_ctl->mutex);
> >         down_read(&fs_info->commit_root_sem);
> >
> > +       load_block_group_size_class(caching_ctl, block_group);
> >         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
> >                 ret = load_free_space_cache(block_group);
> >                 if (ret == 1) {
> > @@ -867,8 +887,6 @@ static noinline void caching_thread(struct btrfs_work *work)
> >
> >         wake_up(&caching_ctl->wait);
> >
> > -       load_block_group_size_class(block_group);
> > -
> >         btrfs_put_caching_control(caching_ctl);
> >         btrfs_put_block_group(block_group);
> >  }
> > --
> > 2.38.1
> >
> 
> 
> -- 
> Filipe David Manana,
> 
> “Whether you think you can, or you think you can't — you're right.”

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-02-14 22:13 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-25 20:50 [PATCH 0/2] btrfs: block group size class load fixes Boris Burkov
2023-01-25 20:50 ` [PATCH 1/2] btrfs: fix size class loading logic Boris Burkov
2023-02-14 21:47   ` Filipe Manana
2023-02-14 22:12     ` Boris Burkov
2023-01-25 20:50 ` [PATCH 2/2] btrfs: add size class stats to sysfs Boris Burkov
2023-01-27 13:23   ` David Sterba
2023-01-27 21:43     ` Boris Burkov
2023-02-07 15:08       ` David Sterba
2023-01-25 22:05 ` [PATCH 0/2] btrfs: block group size class load fixes David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.