Linux-BTRFS Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly
@ 2019-05-24 23:32 Qu Wenruo
  2019-06-14 15:57 ` David Sterba
  2019-06-19  6:32 ` Nikolay Borisov
  0 siblings, 2 replies; 4+ messages in thread
From: Qu Wenruo @ 2019-05-24 23:32 UTC (permalink / raw)
  To: linux-btrfs

[BUG]
There is a bug report of unexpected ENOSPC from btrfs-convert.
https://github.com/kdave/btrfs-progs/issues/123#

After some debugging, we found that even when there is enough unallocated
space, we still hit ENOSPC at btrfs_reserve_extent().

[CAUSE]
Btrfs-progs relies on chunk preallocator to make enough space for
data/metadata.

However, after the introduction of delayed refs, it's no longer reliable
to rely on btrfs_space_info::bytes_used and
btrfs_space_info::bytes_pinned to calculate used metadata space.

For a running transaction with a lot of allocated tree blocks,
btrfs_space_info::bytes_used stays at its original value, and will only
be updated when running delayed refs.

This makes btrfs-progs chunk preallocator completely useless. And for
btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough
metadata to fill a metadata block group in one transaction, we will hit
ENOSPC no matter whether we have enough unallocated space.

[FIX]
This patch introduces btrfs_space_info::bytes_reserved to track how
much space we have reserved but not yet committed to the extent tree.

To support this change, this commit also introduces the following
modifications:
- More comments on btrfs_space_info::bytes_*
  To make the code a little easier to read

- Export update_space_info() to preallocate empty data/metadata space
  info for mkfs.
  For mkfs, we only have a temporary fs image with SYSTEM chunk only.
  Export update_space_info() so that we can preallocate empty
  data/metadata space info before we start a transaction.

- Proper btrfs_space_info::bytes_reserved update
  The timing is the same as in the kernel (except we don't need to update
  bytes_reserved for data extents)
  * Increase bytes_reserved in alloc_tree_block(), which is when the
    space for the extent is reserved
  * Decrease bytes_reserved when running delayed refs
    (alloc_reserved_tree_block()), and in cleanup_ref_head() with the
    help of head->must_insert_reserved to determine whether we need to
    decrease.

Issue: #123
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 ctree.h       | 24 ++++++++++++++++++++++++
 extent-tree.c | 43 +++++++++++++++++++++++++++++++++++++------
 mkfs/main.c   | 11 +++++++++++
 transaction.c |  8 ++++++++
 4 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/ctree.h b/ctree.h
index 76f52b1c9b08..93f96a578f2c 100644
--- a/ctree.h
+++ b/ctree.h
@@ -1060,8 +1060,29 @@ struct btrfs_qgroup_limit_item {
 struct btrfs_space_info {
 	u64 flags;
 	u64 total_bytes;
+	/*
+	 * Space already used.
+	 * Only accounting space in current extent tree, thus delayed ref
+	 * won't be accounted here.
+	 */
 	u64 bytes_used;
+
+	/*
+	 * Space being pinned down.
+	 * So extent allocator will not try to allocate space from them.
+	 *
+	 * For cases like extents being freed in current transaction, or
+	 * manually pinned bytes for re-initializing certain trees.
+	 */
 	u64 bytes_pinned;
+
+	/*
+	 * Space being reserved.
+	 * Space has already been reserved but has not yet reached extent tree.
+	 *
+	 * New tree blocks allocated in current transaction go here.
+	 */
+	u64 bytes_reserved;
 	int full;
 	struct list_head list;
 };
@@ -2528,6 +2549,9 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    u64 root_objectid, u64 ref_generation,
 			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
+int update_space_info(struct btrfs_fs_info *info, u64 flags,
+		      u64 total_bytes, u64 bytes_used,
+		      struct btrfs_space_info **space_info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 struct btrfs_block_group_cache *
diff --git a/extent-tree.c b/extent-tree.c
index e62ee8c2ba13..c7ca49bccd8b 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1786,9 +1786,9 @@ static int free_space_info(struct btrfs_fs_info *fs_info, u64 flags,
 	return 0;
 }
 
-static int update_space_info(struct btrfs_fs_info *info, u64 flags,
-			     u64 total_bytes, u64 bytes_used,
-			     struct btrfs_space_info **space_info)
+int update_space_info(struct btrfs_fs_info *info, u64 flags,
+		      u64 total_bytes, u64 bytes_used,
+		      struct btrfs_space_info **space_info)
 {
 	struct btrfs_space_info *found;
 
@@ -1814,6 +1814,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
 	found->bytes_pinned = 0;
+	found->bytes_reserved = 0;
 	found->full = 0;
 	*space_info = found;
 	return 0;
@@ -1859,8 +1860,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		return 0;
 
 	thresh = div_factor(space_info->total_bytes, 7);
-	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
-	    thresh)
+	if ((space_info->bytes_used + space_info->bytes_pinned +
+	     space_info->bytes_reserved + alloc_bytes) < thresh)
 		return 0;
 
 	/*
@@ -2538,6 +2539,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_extent_item *extent_item;
 	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_space_info *sinfo;
 	struct extent_buffer *leaf;
 	struct btrfs_path *path;
 	struct btrfs_key ins;
@@ -2545,6 +2547,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	u64 start, end;
 	int ret;
 
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	ASSERT(sinfo);
+
 	ins.objectid = node->bytenr;
 	if (skinny_metadata) {
 		ins.offset = ref->level;
@@ -2605,6 +2610,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 
 	ret = update_block_group(fs_info, ins.objectid, fs_info->nodesize, 1,
 				 0);
+	if (sinfo) {
+		if (fs_info->nodesize > sinfo->bytes_reserved) {
+			WARN_ON(1);
+			sinfo->bytes_reserved = 0;
+		} else {
+			sinfo->bytes_reserved -= fs_info->nodesize;
+		}
+	}
 
 	if (ref->root == BTRFS_EXTENT_TREE_OBJECTID) {
 		clear_extent_bits(&trans->fs_info->extent_ins, start, end,
@@ -2624,6 +2637,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 extent_size;
 	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_space_info *sinfo;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
 						 SKINNY_METADATA);
 
@@ -2631,6 +2646,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	if (!extent_op)
 		return -ENOMEM;
 
+	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+	ASSERT(sinfo);
 	ret = btrfs_reserve_extent(trans, root, num_bytes, empty_size,
 				   hint_byte, search_end, ins, 0);
 	if (ret < 0)
@@ -2663,6 +2680,7 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 	}
 
+	sinfo->bytes_reserved += extent_size;
 	ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins->objectid,
 					 extent_size, 0, root_objectid,
 					 level, BTRFS_ADD_DELAYED_EXTENT,
@@ -3000,6 +3018,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		sinfo = list_entry(info->space_info.next,
 				   struct btrfs_space_info, list);
 		list_del_init(&sinfo->list);
+		if (sinfo->bytes_reserved)
+			warning(
+		"reserved space leaked, flag=0x%llx bytes_reserved=%llu",
+				sinfo->flags, sinfo->bytes_reserved);
 		kfree(sinfo);
 	}
 	return 0;
@@ -4106,8 +4128,17 @@ int cleanup_ref_head(struct btrfs_trans_handle *trans,
 	rb_erase(&head->href_node, &delayed_refs->href_root);
 	RB_CLEAR_NODE(&head->href_node);
 
-	if (head->must_insert_reserved)
+	if (head->must_insert_reserved) {
 		btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes);
+		if (!head->is_data) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(trans->fs_info,
+					BTRFS_BLOCK_GROUP_METADATA);
+			ASSERT(sinfo);
+			sinfo->bytes_reserved -= head->num_bytes;
+		}
+	}
 
 	btrfs_put_delayed_ref_head(head);
 	return 0;
diff --git a/mkfs/main.c b/mkfs/main.c
index b442e6e40c37..1d03ec52ddd6 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -58,11 +58,22 @@ static int create_metadata_block_groups(struct btrfs_root *root, int mixed,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_space_info *sinfo;
 	u64 bytes_used;
 	u64 chunk_start = 0;
 	u64 chunk_size = 0;
 	int ret;
 
+	/* Create needed space info to trace extents reservation */
+	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA,
+				0, 0, &sinfo);
+	if (ret < 0)
+		return ret;
+	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA,
+				0, 0, &sinfo);
+	if (ret < 0)
+		return ret;
+
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(IS_ERR(trans));
 	bytes_used = btrfs_super_bytes_used(fs_info->super_copy);
diff --git a/transaction.c b/transaction.c
index 138e10f0d6cc..d2c7f4829eda 100644
--- a/transaction.c
+++ b/transaction.c
@@ -158,6 +158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	u64 transid = trans->transid;
 	int ret = 0;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_space_info *sinfo;
 
 	if (trans->fs_info->transaction_aborted)
 		return -EROFS;
@@ -209,6 +210,13 @@ commit_tree:
 	root->commit_root = NULL;
 	fs_info->running_transaction = NULL;
 	fs_info->last_trans_committed = transid;
+	list_for_each_entry(sinfo, &fs_info->space_info, list) {
+		if (sinfo->bytes_reserved) {
+			warning(
+	"reserved space leaked, transid=%llu flag=0x%llx bytes_reserved=%llu",
+				transid, sinfo->flags, sinfo->bytes_reserved);
+		}
+	}
 	return ret;
 error:
 	btrfs_destroy_delayed_refs(trans);
-- 
2.21.0


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly
  2019-05-24 23:32 [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly Qu Wenruo
@ 2019-06-14 15:57 ` David Sterba
  2019-06-19  6:32 ` Nikolay Borisov
  1 sibling, 0 replies; 4+ messages in thread
From: David Sterba @ 2019-06-14 15:57 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Sat, May 25, 2019 at 07:32:43AM +0800, Qu Wenruo wrote:
> [BUG]
> There is a bug report of unexpected ENOSPC from btrfs-convert.
> https://github.com/kdave/btrfs-progs/issues/123#
> 
> After some debug, even when we have enough unallocated space, we still
> hit ENOSPC at btrfs_reserve_extent().
> 
> [CAUSE]
> Btrfs-progs relies on chunk preallocator to make enough space for
> data/metadata.
> 
> However after the introduction of delayed-ref, it's no longer reliable
> to relie on btrfs_space_info::bytes_used and
> btrfs_space_info::bytes_pinned to calculate used metadata space.
> 
> For a running transaction with a lot of allocated tree blocks,
> btrfs_space_info::bytes_used stays its original value, and will only be
> updated when running delayed ref.
> 
> This makes btrfs-progs chunk preallocator completely useless. And for
> btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough
> metadata to fill a metadata block group in one transaction, we will hit
> ENOSPC no matter whether we have enough unallocated space.
> 
> [FIX]
> This patch will introduce btrfs_space_info::bytes_reserved to trace how
> many space we have reserved but not yet committed to extent tree.
> 
> To support this change, this commit also introduces the following
> modification:
> - More comment on btrfs_space_info::bytes_*
>   To make code a little easier to read
> 
> - Export update_space_info() to preallocate empty data/metadata space
>   info for mkfs.
>   For mkfs, we only have a temporary fs image with SYSTEM chunk only.
>   Export update_space_info() so that we can preallocate empty
>   data/metadata space info before we start a transaction.
> 
> - Proper btrfs_space_info::bytes_reserved update
>   The timing is the as kernel (except we don't need to update
>   bytes_reserved for data extents)
>   * Increase bytes_reserved when call alloc_reserved_tree_block()
>   * Decrease bytes_reserved when running delayed refs
>     With the help of head->must_insert_reserved to determine whether we
>     need to decrease.
> 
> Issue: #123
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Added to devel, thanks.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly
  2019-05-24 23:32 [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly Qu Wenruo
  2019-06-14 15:57 ` David Sterba
@ 2019-06-19  6:32 ` Nikolay Borisov
  2019-06-19  6:49   ` Qu Wenruo
  1 sibling, 1 reply; 4+ messages in thread
From: Nikolay Borisov @ 2019-06-19  6:32 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs; +Cc: David Sterba



On 25.05.19 г. 2:32 ч., Qu Wenruo wrote:
> [BUG]
> There is a bug report of unexpected ENOSPC from btrfs-convert.
> https://github.com/kdave/btrfs-progs/issues/123#
> 
> After some debug, even when we have enough unallocated space, we still
> hit ENOSPC at btrfs_reserve_extent().
> 
> [CAUSE]
> Btrfs-progs relies on chunk preallocator to make enough space for
> data/metadata.
> 
> However after the introduction of delayed-ref, it's no longer reliable
> to relie on btrfs_space_info::bytes_used and
> btrfs_space_info::bytes_pinned to calculate used metadata space.
> 
> For a running transaction with a lot of allocated tree blocks,
> btrfs_space_info::bytes_used stays its original value, and will only be
> updated when running delayed ref.
> 
> This makes btrfs-progs chunk preallocator completely useless. And for
> btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough
> metadata to fill a metadata block group in one transaction, we will hit
> ENOSPC no matter whether we have enough unallocated space.
> 
> [FIX]
> This patch will introduce btrfs_space_info::bytes_reserved to trace how
> many space we have reserved but not yet committed to extent tree.
> 
> To support this change, this commit also introduces the following
> modification:
> - More comment on btrfs_space_info::bytes_*
>   To make code a little easier to read
> 
> - Export update_space_info() to preallocate empty data/metadata space
>   info for mkfs.
>   For mkfs, we only have a temporary fs image with SYSTEM chunk only.
>   Export update_space_info() so that we can preallocate empty
>   data/metadata space info before we start a transaction.
> 
> - Proper btrfs_space_info::bytes_reserved update
>   The timing is the as kernel (except we don't need to update
>   bytes_reserved for data extents)
>   * Increase bytes_reserved when call alloc_reserved_tree_block()
>   * Decrease bytes_reserved when running delayed refs
>     With the help of head->must_insert_reserved to determine whether we
>     need to decrease.

This text is opposite to what the code is doing. At the time of
alloc_reserved_tree_block we actually decrement bytes_reserved since the
allocated block is going to be added to bytes_used via
update_block_group. This is done when delayed refs are being run.

At alloc_tree_block you increment bytes_reserved since this is the time
when space for the extent is reserved.

> 
> Issue: #123
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  ctree.h       | 24 ++++++++++++++++++++++++
>  extent-tree.c | 43 +++++++++++++++++++++++++++++++++++++------
>  mkfs/main.c   | 11 +++++++++++
>  transaction.c |  8 ++++++++
>  4 files changed, 80 insertions(+), 6 deletions(-)
> 
> diff --git a/ctree.h b/ctree.h
> index 76f52b1c9b08..93f96a578f2c 100644
> --- a/ctree.h
> +++ b/ctree.h
> @@ -1060,8 +1060,29 @@ struct btrfs_qgroup_limit_item {
>  struct btrfs_space_info {
>  	u64 flags;
>  	u64 total_bytes;
> +	/*
> +	 * Space already used.
> +	 * Only accounting space in current extent tree, thus delayed ref
> +	 * won't be accounted here.
> +	 */
>  	u64 bytes_used;
> +
> +	/*
> +	 * Space being pinned down.
> +	 * So extent allocator will not try to allocate space from them.
> +	 *
> +	 * For cases like extents being freed in current transaction, or
> +	 * manually pinned bytes for re-initializing certain trees.
> +	 */
>  	u64 bytes_pinned;
> +
> +	/*
> +	 * Space being reserved.
> +	 * Space has already being reserved but not yet reach extent tree.
> +	 *
> +	 * New tree blocks allocated in current transaction goes here.
> +	 */
> +	u64 bytes_reserved;
>  	int full;
>  	struct list_head list;
>  };
> @@ -2528,6 +2549,9 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
>  			    u64 root_objectid, u64 ref_generation,
>  			    u64 owner_objectid);
>  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
> +int update_space_info(struct btrfs_fs_info *info, u64 flags,
> +		      u64 total_bytes, u64 bytes_used,
> +		      struct btrfs_space_info **space_info);
>  int btrfs_free_block_groups(struct btrfs_fs_info *info);
>  int btrfs_read_block_groups(struct btrfs_root *root);
>  struct btrfs_block_group_cache *
> diff --git a/extent-tree.c b/extent-tree.c
> index e62ee8c2ba13..c7ca49bccd8b 100644
> --- a/extent-tree.c
> +++ b/extent-tree.c
> @@ -1786,9 +1786,9 @@ static int free_space_info(struct btrfs_fs_info *fs_info, u64 flags,
>  	return 0;
>  }
>  
> -static int update_space_info(struct btrfs_fs_info *info, u64 flags,
> -			     u64 total_bytes, u64 bytes_used,
> -			     struct btrfs_space_info **space_info)
> +int update_space_info(struct btrfs_fs_info *info, u64 flags,
> +		      u64 total_bytes, u64 bytes_used,
> +		      struct btrfs_space_info **space_info)
>  {
>  	struct btrfs_space_info *found;
>  
> @@ -1814,6 +1814,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  	found->total_bytes = total_bytes;
>  	found->bytes_used = bytes_used;
>  	found->bytes_pinned = 0;
> +	found->bytes_reserved = 0;
>  	found->full = 0;
>  	*space_info = found;
>  	return 0;
> @@ -1859,8 +1860,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>  		return 0;
>  
>  	thresh = div_factor(space_info->total_bytes, 7);
> -	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
> -	    thresh)
> +	if ((space_info->bytes_used + space_info->bytes_pinned +
> +	     space_info->bytes_reserved + alloc_bytes) < thresh)
>  		return 0;
>  
>  	/*
> @@ -2538,6 +2539,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>  	struct btrfs_fs_info *fs_info = trans->fs_info;
>  	struct btrfs_extent_item *extent_item;
>  	struct btrfs_extent_inline_ref *iref;
> +	struct btrfs_space_info *sinfo;
>  	struct extent_buffer *leaf;
>  	struct btrfs_path *path;
>  	struct btrfs_key ins;
> @@ -2545,6 +2547,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>  	u64 start, end;
>  	int ret;
>  
> +	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> +	ASSERT(sinfo);
> +
>  	ins.objectid = node->bytenr;
>  	if (skinny_metadata) {
>  		ins.offset = ref->level;
> @@ -2605,6 +2610,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>  
>  	ret = update_block_group(fs_info, ins.objectid, fs_info->nodesize, 1,
>  				 0);
> +	if (sinfo) {
> +		if (fs_info->nodesize > sinfo->bytes_reserved) {
> +			WARN_ON(1);
> +			sinfo->bytes_reserved = 0;
> +		} else {
> +			sinfo->bytes_reserved -= fs_info->nodesize;
> +		}
> +	}
>  
>  	if (ref->root == BTRFS_EXTENT_TREE_OBJECTID) {
>  		clear_extent_bits(&trans->fs_info->extent_ins, start, end,
> @@ -2624,6 +2637,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>  	int ret;
>  	u64 extent_size;
>  	struct btrfs_delayed_extent_op *extent_op;
> +	struct btrfs_space_info *sinfo;
> +	struct btrfs_fs_info *fs_info = root->fs_info;
>  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
>  						 SKINNY_METADATA);
>  
> @@ -2631,6 +2646,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>  	if (!extent_op)
>  		return -ENOMEM;
>  
> +	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> +	ASSERT(sinfo);
>  	ret = btrfs_reserve_extent(trans, root, num_bytes, empty_size,
>  				   hint_byte, search_end, ins, 0);
>  	if (ret < 0)
> @@ -2663,6 +2680,7 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>  		BUG_ON(ret);
>  	}
>  
> +	sinfo->bytes_reserved += extent_size;
>  	ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins->objectid,
>  					 extent_size, 0, root_objectid,
>  					 level, BTRFS_ADD_DELAYED_EXTENT,
> @@ -3000,6 +3018,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
>  		sinfo = list_entry(info->space_info.next,
>  				   struct btrfs_space_info, list);
>  		list_del_init(&sinfo->list);
> +		if (sinfo->bytes_reserved)
> +			warning(
> +		"reserved space leaked, flag=0x%llx bytes_reserved=%llu",
> +				sinfo->flags, sinfo->bytes_reserved);
>  		kfree(sinfo);
>  	}
>  	return 0;
> @@ -4106,8 +4128,17 @@ int cleanup_ref_head(struct btrfs_trans_handle *trans,
>  	rb_erase(&head->href_node, &delayed_refs->href_root);
>  	RB_CLEAR_NODE(&head->href_node);
>  
> -	if (head->must_insert_reserved)
> +	if (head->must_insert_reserved) {
>  		btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes);
> +		if (!head->is_data) {
> +			struct btrfs_space_info *sinfo;
> +
> +			sinfo = __find_space_info(trans->fs_info,
> +					BTRFS_BLOCK_GROUP_METADATA);
> +			ASSERT(sinfo);
> +			sinfo->bytes_reserved -= head->num_bytes;
> +		}
> +	}
>  
>  	btrfs_put_delayed_ref_head(head);
>  	return 0;
> diff --git a/mkfs/main.c b/mkfs/main.c
> index b442e6e40c37..1d03ec52ddd6 100644
> --- a/mkfs/main.c
> +++ b/mkfs/main.c
> @@ -58,11 +58,22 @@ static int create_metadata_block_groups(struct btrfs_root *root, int mixed,
>  {
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	struct btrfs_trans_handle *trans;
> +	struct btrfs_space_info *sinfo;
>  	u64 bytes_used;
>  	u64 chunk_start = 0;
>  	u64 chunk_size = 0;
>  	int ret;
>  
> +	/* Create needed space info to trace extents reservation */
> +	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA,
> +				0, 0, &sinfo);
> +	if (ret < 0)
> +		return ret;
> +	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA,
> +				0, 0, &sinfo);
> +	if (ret < 0)
> +		return ret;
> +
>  	trans = btrfs_start_transaction(root, 1);
>  	BUG_ON(IS_ERR(trans));
>  	bytes_used = btrfs_super_bytes_used(fs_info->super_copy);
> diff --git a/transaction.c b/transaction.c
> index 138e10f0d6cc..d2c7f4829eda 100644
> --- a/transaction.c
> +++ b/transaction.c
> @@ -158,6 +158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>  	u64 transid = trans->transid;
>  	int ret = 0;
>  	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_space_info *sinfo;
>  
>  	if (trans->fs_info->transaction_aborted)
>  		return -EROFS;
> @@ -209,6 +210,13 @@ commit_tree:
>  	root->commit_root = NULL;
>  	fs_info->running_transaction = NULL;
>  	fs_info->last_trans_committed = transid;
> +	list_for_each_entry(sinfo, &fs_info->space_info, list) {
> +		if (sinfo->bytes_reserved) {
> +			warning(
> +	"reserved space leaked, transid=%llu flag=0x%llx bytes_reserved=%llu",
> +				transid, sinfo->flags, sinfo->bytes_reserved);
> +		}
> +	}
>  	return ret;
>  error:
>  	btrfs_destroy_delayed_refs(trans);
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly
  2019-06-19  6:32 ` Nikolay Borisov
@ 2019-06-19  6:49   ` Qu Wenruo
  0 siblings, 0 replies; 4+ messages in thread
From: Qu Wenruo @ 2019-06-19  6:49 UTC (permalink / raw)
  To: Nikolay Borisov, linux-btrfs; +Cc: David Sterba



On 2019/6/19 下午2:32, Nikolay Borisov wrote:
> 
> 
> On 25.05.19 г. 2:32 ч., Qu Wenruo wrote:
>> [BUG]
>> There is a bug report of unexpected ENOSPC from btrfs-convert.
>> https://github.com/kdave/btrfs-progs/issues/123#
>>
>> After some debug, even when we have enough unallocated space, we still
>> hit ENOSPC at btrfs_reserve_extent().
>>
>> [CAUSE]
>> Btrfs-progs relies on chunk preallocator to make enough space for
>> data/metadata.
>>
>> However after the introduction of delayed-ref, it's no longer reliable
>> to relie on btrfs_space_info::bytes_used and
>> btrfs_space_info::bytes_pinned to calculate used metadata space.
>>
>> For a running transaction with a lot of allocated tree blocks,
>> btrfs_space_info::bytes_used stays its original value, and will only be
>> updated when running delayed ref.
>>
>> This makes btrfs-progs chunk preallocator completely useless. And for
>> btrfs-convert/mkfs.btrfs --rootdir, if we're going to have enough
>> metadata to fill a metadata block group in one transaction, we will hit
>> ENOSPC no matter whether we have enough unallocated space.
>>
>> [FIX]
>> This patch will introduce btrfs_space_info::bytes_reserved to trace how
>> many space we have reserved but not yet committed to extent tree.
>>
>> To support this change, this commit also introduces the following
>> modification:
>> - More comment on btrfs_space_info::bytes_*
>>   To make code a little easier to read
>>
>> - Export update_space_info() to preallocate empty data/metadata space
>>   info for mkfs.
>>   For mkfs, we only have a temporary fs image with SYSTEM chunk only.
>>   Export update_space_info() so that we can preallocate empty
>>   data/metadata space info before we start a transaction.
>>
>> - Proper btrfs_space_info::bytes_reserved update
>>   The timing is the as kernel (except we don't need to update
>>   bytes_reserved for data extents)
>>   * Increase bytes_reserved when call alloc_reserved_tree_block()
>>   * Decrease bytes_reserved when running delayed refs
>>     With the help of head->must_insert_reserved to determine whether we
>>     need to decrease.
> 
> This text is opposite to what the code is doing. At the time of
> alloc_reserved_tree_block we actually decrement bytes_reserved since the
> allocated block is going to be added to bytes_used via
> update_block_group. This is done when delayed refs are being run.

My bad. I selected the wrong alloc_* completion item for the increase part.

The increase is at alloc_tree_block().
The decrease part is correct, it's alloc_reserved_tree_block() which gets
called at run delayed ref time.

I skipped several function names in the call chain as I thought they
were obvious at patch writing time, but they are not so obvious when
reviewing this comment.

Thanks,
Qu

> 
> At alloc_tree_block you increment bytes_reserved since this is the time
> when space for the extent is reserved.
> 
>>
>> Issue: #123
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>  ctree.h       | 24 ++++++++++++++++++++++++
>>  extent-tree.c | 43 +++++++++++++++++++++++++++++++++++++------
>>  mkfs/main.c   | 11 +++++++++++
>>  transaction.c |  8 ++++++++
>>  4 files changed, 80 insertions(+), 6 deletions(-)
>>
>> diff --git a/ctree.h b/ctree.h
>> index 76f52b1c9b08..93f96a578f2c 100644
>> --- a/ctree.h
>> +++ b/ctree.h
>> @@ -1060,8 +1060,29 @@ struct btrfs_qgroup_limit_item {
>>  struct btrfs_space_info {
>>  	u64 flags;
>>  	u64 total_bytes;
>> +	/*
>> +	 * Space already used.
>> +	 * Only accounting space in current extent tree, thus delayed ref
>> +	 * won't be accounted here.
>> +	 */
>>  	u64 bytes_used;
>> +
>> +	/*
>> +	 * Space being pinned down.
>> +	 * So extent allocator will not try to allocate space from them.
>> +	 *
>> +	 * For cases like extents being freed in current transaction, or
>> +	 * manually pinned bytes for re-initializing certain trees.
>> +	 */
>>  	u64 bytes_pinned;
>> +
>> +	/*
>> +	 * Space being reserved.
>> +	 * Space has already being reserved but not yet reach extent tree.
>> +	 *
>> +	 * New tree blocks allocated in current transaction goes here.
>> +	 */
>> +	u64 bytes_reserved;
>>  	int full;
>>  	struct list_head list;
>>  };
>> @@ -2528,6 +2549,9 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
>>  			    u64 root_objectid, u64 ref_generation,
>>  			    u64 owner_objectid);
>>  int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
>> +int update_space_info(struct btrfs_fs_info *info, u64 flags,
>> +		      u64 total_bytes, u64 bytes_used,
>> +		      struct btrfs_space_info **space_info);
>>  int btrfs_free_block_groups(struct btrfs_fs_info *info);
>>  int btrfs_read_block_groups(struct btrfs_root *root);
>>  struct btrfs_block_group_cache *
>> diff --git a/extent-tree.c b/extent-tree.c
>> index e62ee8c2ba13..c7ca49bccd8b 100644
>> --- a/extent-tree.c
>> +++ b/extent-tree.c
>> @@ -1786,9 +1786,9 @@ static int free_space_info(struct btrfs_fs_info *fs_info, u64 flags,
>>  	return 0;
>>  }
>>  
>> -static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>> -			     u64 total_bytes, u64 bytes_used,
>> -			     struct btrfs_space_info **space_info)
>> +int update_space_info(struct btrfs_fs_info *info, u64 flags,
>> +		      u64 total_bytes, u64 bytes_used,
>> +		      struct btrfs_space_info **space_info)
>>  {
>>  	struct btrfs_space_info *found;
>>  
>> @@ -1814,6 +1814,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>>  	found->total_bytes = total_bytes;
>>  	found->bytes_used = bytes_used;
>>  	found->bytes_pinned = 0;
>> +	found->bytes_reserved = 0;
>>  	found->full = 0;
>>  	*space_info = found;
>>  	return 0;
>> @@ -1859,8 +1860,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
>>  		return 0;
>>  
>>  	thresh = div_factor(space_info->total_bytes, 7);
>> -	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
>> -	    thresh)
>> +	if ((space_info->bytes_used + space_info->bytes_pinned +
>> +	     space_info->bytes_reserved + alloc_bytes) < thresh)
>>  		return 0;
>>  
>>  	/*
>> @@ -2538,6 +2539,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>>  	struct btrfs_fs_info *fs_info = trans->fs_info;
>>  	struct btrfs_extent_item *extent_item;
>>  	struct btrfs_extent_inline_ref *iref;
>> +	struct btrfs_space_info *sinfo;
>>  	struct extent_buffer *leaf;
>>  	struct btrfs_path *path;
>>  	struct btrfs_key ins;
>> @@ -2545,6 +2547,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>>  	u64 start, end;
>>  	int ret;
>>  
>> +	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>> +	ASSERT(sinfo);
>> +
>>  	ins.objectid = node->bytenr;
>>  	if (skinny_metadata) {
>>  		ins.offset = ref->level;
>> @@ -2605,6 +2610,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
>>  
>>  	ret = update_block_group(fs_info, ins.objectid, fs_info->nodesize, 1,
>>  				 0);
>> +	if (sinfo) {
>> +		if (fs_info->nodesize > sinfo->bytes_reserved) {
>> +			WARN_ON(1);
>> +			sinfo->bytes_reserved = 0;
>> +		} else {
>> +			sinfo->bytes_reserved -= fs_info->nodesize;
>> +		}
>> +	}
>>  
>>  	if (ref->root == BTRFS_EXTENT_TREE_OBJECTID) {
>>  		clear_extent_bits(&trans->fs_info->extent_ins, start, end,
>> @@ -2624,6 +2637,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>>  	int ret;
>>  	u64 extent_size;
>>  	struct btrfs_delayed_extent_op *extent_op;
>> +	struct btrfs_space_info *sinfo;
>> +	struct btrfs_fs_info *fs_info = root->fs_info;
>>  	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
>>  						 SKINNY_METADATA);
>>  
>> @@ -2631,6 +2646,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>>  	if (!extent_op)
>>  		return -ENOMEM;
>>  
>> +	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>> +	ASSERT(sinfo);
>>  	ret = btrfs_reserve_extent(trans, root, num_bytes, empty_size,
>>  				   hint_byte, search_end, ins, 0);
>>  	if (ret < 0)
>> @@ -2663,6 +2680,7 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
>>  		BUG_ON(ret);
>>  	}
>>  
>> +	sinfo->bytes_reserved += extent_size;
>>  	ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, ins->objectid,
>>  					 extent_size, 0, root_objectid,
>>  					 level, BTRFS_ADD_DELAYED_EXTENT,
>> @@ -3000,6 +3018,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
>>  		sinfo = list_entry(info->space_info.next,
>>  				   struct btrfs_space_info, list);
>>  		list_del_init(&sinfo->list);
>> +		if (sinfo->bytes_reserved)
>> +			warning(
>> +		"reserved space leaked, flag=0x%llx bytes_reserved=%llu",
>> +				sinfo->flags, sinfo->bytes_reserved);
>>  		kfree(sinfo);
>>  	}
>>  	return 0;
>> @@ -4106,8 +4128,17 @@ int cleanup_ref_head(struct btrfs_trans_handle *trans,
>>  	rb_erase(&head->href_node, &delayed_refs->href_root);
>>  	RB_CLEAR_NODE(&head->href_node);
>>  
>> -	if (head->must_insert_reserved)
>> +	if (head->must_insert_reserved) {
>>  		btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes);
>> +		if (!head->is_data) {
>> +			struct btrfs_space_info *sinfo;
>> +
>> +			sinfo = __find_space_info(trans->fs_info,
>> +					BTRFS_BLOCK_GROUP_METADATA);
>> +			ASSERT(sinfo);
>> +			sinfo->bytes_reserved -= head->num_bytes;
>> +		}
>> +	}
>>  
>>  	btrfs_put_delayed_ref_head(head);
>>  	return 0;
>> diff --git a/mkfs/main.c b/mkfs/main.c
>> index b442e6e40c37..1d03ec52ddd6 100644
>> --- a/mkfs/main.c
>> +++ b/mkfs/main.c
>> @@ -58,11 +58,22 @@ static int create_metadata_block_groups(struct btrfs_root *root, int mixed,
>>  {
>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>>  	struct btrfs_trans_handle *trans;
>> +	struct btrfs_space_info *sinfo;
>>  	u64 bytes_used;
>>  	u64 chunk_start = 0;
>>  	u64 chunk_size = 0;
>>  	int ret;
>>  
>> +	/* Create needed space info to trace extents reservation */
>> +	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA,
>> +				0, 0, &sinfo);
>> +	if (ret < 0)
>> +		return ret;
>> +	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA,
>> +				0, 0, &sinfo);
>> +	if (ret < 0)
>> +		return ret;
>> +
>>  	trans = btrfs_start_transaction(root, 1);
>>  	BUG_ON(IS_ERR(trans));
>>  	bytes_used = btrfs_super_bytes_used(fs_info->super_copy);
>> diff --git a/transaction.c b/transaction.c
>> index 138e10f0d6cc..d2c7f4829eda 100644
>> --- a/transaction.c
>> +++ b/transaction.c
>> @@ -158,6 +158,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
>>  	u64 transid = trans->transid;
>>  	int ret = 0;
>>  	struct btrfs_fs_info *fs_info = root->fs_info;
>> +	struct btrfs_space_info *sinfo;
>>  
>>  	if (trans->fs_info->transaction_aborted)
>>  		return -EROFS;
>> @@ -209,6 +210,13 @@ commit_tree:
>>  	root->commit_root = NULL;
>>  	fs_info->running_transaction = NULL;
>>  	fs_info->last_trans_committed = transid;
>> +	list_for_each_entry(sinfo, &fs_info->space_info, list) {
>> +		if (sinfo->bytes_reserved) {
>> +			warning(
>> +	"reserved space leaked, transid=%llu flag=0x%llx bytes_reserved=%llu",
>> +				transid, sinfo->flags, sinfo->bytes_reserved);
>> +		}
>> +	}
>>  	return ret;
>>  error:
>>  	btrfs_destroy_delayed_refs(trans);
>>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, back to index

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-24 23:32 [PATCH] btrfs-progs: Fix false ENOSPC alert by tracking used space correctly Qu Wenruo
2019-06-14 15:57 ` David Sterba
2019-06-19  6:32 ` Nikolay Borisov
2019-06-19  6:49   ` Qu Wenruo

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox