Linux-BTRFS Archive on lore.kernel.org
 help / Atom feed
* [PATCH] btrfs: Introduce new mount option to skip block group items scan
@ 2018-12-20  8:01 Qu Wenruo
  2019-01-07 18:59 ` David Sterba
  0 siblings, 1 reply; 3+ messages in thread
From: Qu Wenruo @ 2018-12-20  8:01 UTC (permalink / raw)
  To: linux-btrfs

Btrfs needs to read out all block group (bg) items to fill its bg
caches.

However, such bg caches are only needed for read-write mounts, and make
no sense for RO mounts.

So this patch introduces a new mount option, skip_bg, to skip the block
group items scan.

This new 'skip_bg' mount option can only be used with a TRUE read-only
mount, which has the following requirements:
- RO mount
  Obviously.

- No log tree or notreelog mount option

- No way to remount RW
  Similar to notreelog mount option.

- No strict chunk <-> bg <-> dev extents check

This option should only be used as a kernel equivalent of btrfs-restore.

With this patch, we can even mount a btrfs whose extent root is
completely corrupted.

It can also be used as an option to test whether btrfs_read_block_groups()
is the major cause of slow btrfs mounts.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     | 29 ++++++++++++++++++---
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/super.c       | 20 ++++++++++++++
 fs/btrfs/volumes.c     |  7 +++++
 5 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..371b5e2f6fbe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1353,6 +1353,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
+#define BTRFS_MOUNT_SKIP_BG		(1 << 29)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..5228320030a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2330,11 +2330,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
+		if (!btrfs_test_opt(fs_info, SKIP_BG)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+		fs_info->extent_root = NULL;
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->extent_root = root;
 	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-	fs_info->extent_root = root;
 
 	location.objectid = BTRFS_DEV_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
@@ -2927,6 +2931,23 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
+	/* Skip bg needs RO and no log tree replay */
+	if (btrfs_test_opt(fs_info, SKIP_BG)) {
+		if (!sb_rdonly(sb)) {
+			btrfs_err(fs_info,
+		"skip_bg mount option can only be used with read-only mount");
+			err = -EINVAL;
+			goto fail_alloc;
+		}
+		if (btrfs_super_log_root(disk_super) &&
+		    !btrfs_test_opt(fs_info, NOTREELOG)) {
+			btrfs_err(fs_info,
+	"skip_bg must be used with notreelog mount option for dirty log");
+			err = -EINVAL;
+			goto fail_alloc;
+		}
+	}
+
 	ret = btrfs_init_workqueues(fs_info, fs_devices);
 	if (ret) {
 		err = ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747..051a5a63c2b0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9948,6 +9948,62 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_space_info *space_info;
+	struct rb_node *node;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	for (node = rb_first_cached(&em_tree->map); node;
+	     node = rb_next(node)) {
+		em = rb_entry(node, struct extent_map, rb_node);
+		map = em->map_lookup;
+		cache = btrfs_create_block_group_cache(fs_info, em->start,
+						       em->len);
+		if (!cache) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		/* Fill dummy cache as FULL */
+		cache->flags = map->type;
+		cache->last_byte_to_unpin = (u64)-1;
+		cache->cached = BTRFS_CACHE_FINISHED;
+		btrfs_set_block_group_used(&cache->item, em->len);
+		btrfs_set_block_group_chunk_objectid(&cache->item, em->start);
+		btrfs_set_block_group_flags(&cache->item, map->type);
+		ret = exclude_super_stripes(cache);
+		if (ret) {
+			free_excluded_extents(cache);
+			btrfs_put_block_group(cache);
+			goto out;
+		}
+		free_excluded_extents(cache);
+		ret = btrfs_add_block_group_cache(fs_info, cache);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			btrfs_put_block_group(cache);
+			goto out;
+		}
+		update_space_info(fs_info, cache->flags, em->start, em->len,
+				  cache->bytes_super, &space_info);
+		cache->space_info = space_info;
+		link_block_group(cache);
+
+		set_avail_alloc_bits(fs_info, cache->flags);
+		if (btrfs_chunk_readonly(fs_info, em->start))
+			inc_block_group_ro(cache, 1);
+	}
+out:
+	read_unlock(&em_tree->lock);
+	return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
@@ -9962,6 +10018,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	u64 feature;
 	int mixed;
 
+	if (btrfs_test_opt(info, SKIP_BG))
+		return fill_dummy_bgs(info);
+
 	feature = btrfs_super_incompat_flags(info->super_copy);
 	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b362b45dd757..5bc751819b19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -337,6 +337,7 @@ enum {
 	Opt_check_integrity_including_extent_data,
 	Opt_check_integrity_print_mask,
 	Opt_enospc_debug, Opt_noenospc_debug,
+	Opt_skip_bg,
 #ifdef CONFIG_BTRFS_DEBUG
 	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 #endif
@@ -393,6 +394,7 @@ static const match_table_t tokens = {
 	{Opt_notreelog, "notreelog"},
 	{Opt_usebackuproot, "usebackuproot"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+	{Opt_skip_bg, "skip_bg"},
 
 	/* Deprecated options */
 	{Opt_alloc_start, "alloc_start=%s"},
@@ -664,6 +666,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			btrfs_clear_and_info(info, NOTREELOG,
 					     "enabling tree log");
 			break;
+		case Opt_skip_bg:
+			btrfs_set_and_info(info, SKIP_BG,
+				"skip mount time block groupo searching");
+			break;
 		case Opt_norecovery:
 		case Opt_nologreplay:
 			btrfs_set_and_info(info, NOLOGREPLAY,
@@ -1797,6 +1803,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto restore;
 
+	if (btrfs_test_opt(fs_info, SKIP_BG) !=
+	    (old_opts & BTRFS_MOUNT_SKIP_BG)) {
+		btrfs_err(fs_info,
+			"skip_bg mount option can't be changed during remount");
+		ret = -EINVAL;
+		goto restore;
+	}
 	btrfs_remount_begin(fs_info, old_opts, *flags);
 	btrfs_resize_thread_pool(fs_info,
 		fs_info->thread_pool_size, old_thread_pool_size);
@@ -1858,6 +1871,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore;
 		}
 
+		if (btrfs_test_opt(fs_info, SKIP_BG)) {
+			btrfs_err(fs_info,
+			"remounting read-write with skip_bg is not allowed");
+			ret = -EINVAL;
+			goto restore;
+		}
+
 		ret = btrfs_cleanup_fs_roots(fs_info);
 		if (ret)
 			goto restore;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..d614b2fab652 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7480,6 +7480,13 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 	struct btrfs_key key;
 	int ret = 0;
 
+	/*
+	 * For skip_bg mount option, we're already RO and are salvaging data,
+	 * no need for such restrict check.
+	 */
+	if (btrfs_test_opt(fs_info, SKIP_BG))
+		return 0;
+
 	key.objectid = 1;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	key.offset = 0;
-- 
2.20.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] btrfs: Introduce new mount option to skip block group items scan
  2018-12-20  8:01 [PATCH] btrfs: Introduce new mount option to skip block group items scan Qu Wenruo
@ 2019-01-07 18:59 ` David Sterba
  2019-01-08  1:11   ` Qu Wenruo
  0 siblings, 1 reply; 3+ messages in thread
From: David Sterba @ 2019-01-07 18:59 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Thu, Dec 20, 2018 at 04:01:37PM +0800, Qu Wenruo wrote:
> Btrfs needs to read out all block group (bg) items to fill its bg
> caches.
> 
> However such bg caches are only needed for read-write mount, and makes
> no sense for RO mount.
> 
> So this patch introduce new mount option, skip_bg, to skip block group
> items scan.
> 
> This new 'skip_bg' mount option can only be used with TRUE read-only
> mount, which needs the following dependency:
> - RO mount
>   Obviously.
> 
> - No log tree or notreelog mount option
> 
> - No way to remount RW
>   Similar to notreelog mount option.
> 
> - No chunk <-> bg <-> dev extents restrict check
> 
> This option should only be used as kernel equivalent of btrfs-restore.
> 
> With this patch, we can even mount a btrfs whose extent root is
> completely corrupted.

So it's a last-resort rescue option, I'd suggest to make that more
explicit. Something like rescue=skip-bg. We can add all sorts of other
values that would relax some checks. Adding a separate mount option
would be quite impractical.

This would also align with the constraints you mention above, eg. no way
to remount RW. This is fine for the corrupted extent root. I wonder what
kind of metadata damage support would still make sense. a 'completely
corrupted extent root' means you never know what you get from the
filesystem.

The in-kernel checks and interconnection of the structures would have to
be ready for missing metadata or more sanity checks would need to be
added.

I think that all the restore and rescue functionality is better suited
for userspace where the unpredictable corruptions that cannot be parsed
do not lead to kernel crashes or silent memory overwrites.

> But can also be an option to test if btrfs_read_block_groups() is the
> major cause for slow btrfs mount.

We have a debugging/testing -only mount option 'fragment', so we may
consider adding more.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] btrfs: Introduce new mount option to skip block group items scan
  2019-01-07 18:59 ` David Sterba
@ 2019-01-08  1:11   ` Qu Wenruo
  0 siblings, 0 replies; 3+ messages in thread
From: Qu Wenruo @ 2019-01-08  1:11 UTC (permalink / raw)
  To: dsterba, Qu Wenruo, linux-btrfs

[-- Attachment #1.1: Type: text/plain, Size: 3498 bytes --]



On 2019/1/8 上午2:59, David Sterba wrote:
> On Thu, Dec 20, 2018 at 04:01:37PM +0800, Qu Wenruo wrote:
>> Btrfs needs to read out all block group (bg) items to fill its bg
>> caches.
>>
>> However such bg caches are only needed for read-write mount, and makes
>> no sense for RO mount.
>>
>> So this patch introduce new mount option, skip_bg, to skip block group
>> items scan.
>>
>> This new 'skip_bg' mount option can only be used with TRUE read-only
>> mount, which needs the following dependency:
>> - RO mount
>>   Obviously.
>>
>> - No log tree or notreelog mount option
>>
>> - No way to remount RW
>>   Similar to notreelog mount option.
>>
>> - No chunk <-> bg <-> dev extents restrict check
>>
>> This option should only be used as kernel equivalent of btrfs-restore.
>>
>> With this patch, we can even mount a btrfs whose extent root is
>> completely corrupted.
> 
> So it's a last-resort rescue option, I'd suggest to make that more
> explicit. Something like rescue=skip-bg. We can add all sorts of other
> values that would relax some checks. Adding a separate mount option
> would be quite impractical.

Nice suggestion, I'm also not satisfied with current mount option name.
I'll add new rescue mount option, and convert some existing options to it.

> 
> This would also align with the constraints you mention above, eg. no way
> to remount RW. This is fine for the corrupted extent root. I wonder what
> kind of metadata damage support would still make sense.

E.g. one leaf corrupted while containing the block group item.
Since we're going to read all block group items at mount time, such
corruption will reject mount immediately, no matter what mount option
we're using.

> a 'completely
> corrupted extent root' means you never know what you get from the
> filesystem.

Not exactly.
A corrupted extent root node alone could cause the mount to be rejected,
while the fs tree could be completely fine.

Normally we would fall back to a backup root and hope we could get a good
old extent root.
But with this option, we should be able to access the fs tree without problem.

> 
> The in-kernel checks and interconnection of the structures would have to
> be ready for missing metadata or more sanity checks would need to be
> added.

In fact, as mentioned, the extent tree only affects write operations.

For fs tree read operations, the current code is more or less good enough to
handle corruption, or at least much more robust than it is for extent tree
corruption.

> 
> I think that all the restore and rescue functionality is better suited
> for userspace where the unpredictable corruptions that cannot be parsed
> do not lead to kernel crashes or silent memory overwrites.

That's true.
Although btrfs-restore still can't provide everything, like
snapshot/subvolume structure, so such rescue option may still make sense.

> 
>> But can also be an option to test if btrfs_read_block_groups() is the
>> major cause for slow btrfs mount.
> 
> We have a debugging/testing -only mount option 'fragment', so we may
> consider adding more.

For this part, there is in fact a better way to verify the cause, without
any modification to the kernel.

We could just use ftrace to get the non-inline function execution time,
like:
# perf ftrace -t function_graph -T open_ctree \
	-T btrfs_read_block_groups \
	-T check_chunk_block_group_mappings \
	-T btrfs_read_chunk_tree \
	-T btrfs_verify_dev_extents \
	mount /dev/test/test /mnt

Thanks,
Qu


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, back to index

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-20  8:01 [PATCH] btrfs: Introduce new mount option to skip block group items scan Qu Wenruo
2019-01-07 18:59 ` David Sterba
2019-01-08  1:11   ` Qu Wenruo

Linux-BTRFS Archive on lore.kernel.org

Archives are clonable: git clone --mirror https://lore.kernel.org/linux-btrfs/0 linux-btrfs/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-btrfs linux-btrfs/ https://lore.kernel.org/linux-btrfs \
		linux-btrfs@vger.kernel.org linux-btrfs@archiver.kernel.org
	public-inbox-index linux-btrfs


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-btrfs


AGPL code for this site: git clone https://public-inbox.org/ public-inbox