All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] btrfs: Allow read-only mount with corrupted extent tree
@ 2021-03-17  1:20 Dāvis Mosāns
  2021-03-17  1:29 ` Dāvis Mosāns
  2021-03-21 21:49 ` [PATCH] " Dāvis Mosāns
  0 siblings, 2 replies; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-17  1:20 UTC (permalink / raw)
  To: linux-btrfs
  Cc: clm, josef, dsterba, linux-kernel, ce3g8jdj, Dāvis Mosāns

Currently, if there is any corruption at all in the extent tree
(e.g. even a single bit), mounting fails with:
"failed to read block groups: -5" (-EIO)
This happens because we abort immediately on the first error when
searching the extent tree for block groups.

With this patch, if the `ignorebadroots` option is specified,
we handle this case and continue by removing the already
created block groups and creating dummy block groups.

Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
---
 fs/btrfs/block-group.c | 14 ++++++++++++++
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/disk-io.h     |  2 ++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 48ebc106a606..827a977614b3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	ret = check_chunk_block_group_mappings(info);
 error:
 	btrfs_free_path(path);
+
+	if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
+		btrfs_put_block_group_cache(info);
+		btrfs_stop_all_workers(info);
+		btrfs_free_block_groups(info);
+		ret = btrfs_init_workqueues(info, NULL);
+		if (ret)
+			return ret;
+		ret = btrfs_init_space_info(info);
+		if (ret)
+			return ret;
+		return fill_dummy_bgs(info);
+	}
+
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07a2b4f69b10..dc744f76d075 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1955,7 +1955,7 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
 }
 
 /* helper to cleanup workers */
-static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 {
 	btrfs_destroy_workqueue(fs_info->fixup_workers);
 	btrfs_destroy_workqueue(fs_info->delalloc_workers);
@@ -2122,7 +2122,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 		struct btrfs_fs_devices *fs_devices)
 {
 	u32 max_active = fs_info->thread_pool_size;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e45057c0c016..f9bfcba86a04 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -137,6 +137,8 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
 int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
 int __init btrfs_end_io_wq_init(void);
 void __cold btrfs_end_io_wq_exit(void);
+void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info);
+int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(u64 objectid,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17  1:20 [RFC] btrfs: Allow read-only mount with corrupted extent tree Dāvis Mosāns
@ 2021-03-17  1:29 ` Dāvis Mosāns
  2021-03-17 10:28   ` Qu Wenruo
  2021-03-21 21:49 ` [PATCH] " Dāvis Mosāns
  1 sibling, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-17  1:29 UTC (permalink / raw)
  To: Btrfs BTRFS; +Cc: clm, josef, dsterba, linux-kernel, Zygo Blaxell

trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
(<davispuh@gmail.com>) rakstīja:
>
> Currently if there's any corruption at all in extent tree
> (eg. even single bit) then mounting will fail with:
> "failed to read block groups: -5" (-EIO)
> It happens because we immediately abort on first error when
> searching in extent tree for block groups.
>
> Now with this patch if `ignorebadroots` option is specified
> then we handle such case and continue by removing already
> created block groups and creating dummy block groups.
>
> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> ---
>  fs/btrfs/block-group.c | 14 ++++++++++++++
>  fs/btrfs/disk-io.c     |  4 ++--
>  fs/btrfs/disk-io.h     |  2 ++
>  3 files changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 48ebc106a606..827a977614b3 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>         ret = check_chunk_block_group_mappings(info);
>  error:
>         btrfs_free_path(path);
> +
> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> +               btrfs_put_block_group_cache(info);
> +               btrfs_stop_all_workers(info);
> +               btrfs_free_block_groups(info);
> +               ret = btrfs_init_workqueues(info, NULL);
> +               if (ret)
> +                       return ret;
> +               ret = btrfs_init_space_info(info);
> +               if (ret)
> +                       return ret;
> +               return fill_dummy_bgs(info);

This isn't that nice, but I don't really know how to properly clean up
everything related to the already created block groups, so this was the
easiest way. It seems to work fine.
But it looks like something needs to be done about log replay as well,
because if it's not disabled then the mount fails with:

[ 1397.246869] BTRFS info (device sde): start tree-log replay
[ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
[ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
[ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
errno=-5 IO failure
[ 1398.218828] BTRFS: error (device sde) in
btrfs_run_delayed_refs:2124: errno=-5 IO failure
[ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
errno=-5 IO failure (Failed to recover log tree)
[ 1398.229048] BTRFS error (device sde): open_ctree failed

Ideally it should replay everything except extent refs.


I also noticed that after unmount there is:

[11000.562504] BTRFS warning (device sde): page private not zero on
page 21057098481664
[11000.562510] BTRFS warning (device sde): page private not zero on
page 21057098485760

not sure what it means.


Best regards,
Dāvis

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17  1:29 ` Dāvis Mosāns
@ 2021-03-17 10:28   ` Qu Wenruo
  2021-03-17 21:03     ` Dāvis Mosāns
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2021-03-17 10:28 UTC (permalink / raw)
  To: Dāvis Mosāns, Btrfs BTRFS
  Cc: clm, josef, dsterba, linux-kernel, Zygo Blaxell



On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> (<davispuh@gmail.com>) rakstīja:
>>
>> Currently if there's any corruption at all in extent tree
>> (eg. even single bit) then mounting will fail with:
>> "failed to read block groups: -5" (-EIO)
>> It happens because we immediately abort on first error when
>> searching in extent tree for block groups.
>>
>> Now with this patch if `ignorebadroots` option is specified
>> then we handle such case and continue by removing already
>> created block groups and creating dummy block groups.
>>
>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
>> ---
>>   fs/btrfs/block-group.c | 14 ++++++++++++++
>>   fs/btrfs/disk-io.c     |  4 ++--
>>   fs/btrfs/disk-io.h     |  2 ++
>>   3 files changed, 18 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 48ebc106a606..827a977614b3 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>          ret = check_chunk_block_group_mappings(info);
>>   error:
>>          btrfs_free_path(path);
>> +
>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
>> +               btrfs_put_block_group_cache(info);
>> +               btrfs_stop_all_workers(info);
>> +               btrfs_free_block_groups(info);
>> +               ret = btrfs_init_workqueues(info, NULL);
>> +               if (ret)
>> +                       return ret;
>> +               ret = btrfs_init_space_info(info);
>> +               if (ret)
>> +                       return ret;
>> +               return fill_dummy_bgs(info);

When we hit bad things in the extent tree, we should ensure we're mounting
the fs RO, or we can't continue.

And we should also refuse to mount back to RW if we hit such a case, so
that we don't need anything complex: just ignore the whole extent tree
and create the dummy block groups.

>
> This isn't that nice, but I don't really know how to properly clean up
> everything related to already created block groups so this was easiest
> way. It seems to work fine.
> But looks like need to do something about replay log aswell because if
> it's not disabled then it fails with:
>
> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> errno=-5 IO failure
> [ 1398.218828] BTRFS: error (device sde) in
> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> errno=-5 IO failure (Failed to recover log tree)
> [ 1398.229048] BTRFS error (device sde): open_ctree failed

This is because we shouldn't allow any writes to the fs if there is
anything wrong in the extent tree.

Thanks,
Qu
>
> Ideally it should replay everything except extent refs.
>
>
> I also noticed that after unmount there is:
>
> [11000.562504] BTRFS warning (device sde): page private not zero on
> page 21057098481664
> [11000.562510] BTRFS warning (device sde): page private not zero on
> page 21057098485760
>
> not sure what it means.
>
>
> Best regards,
> Dāvis
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17 10:28   ` Qu Wenruo
@ 2021-03-17 21:03     ` Dāvis Mosāns
  2021-03-17 23:49       ` Qu Wenruo
  0 siblings, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-17 21:03 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, linux-kernel, Zygo Blaxell

trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
(<quwenruo.btrfs@gmx.com>) rakstīja:
>
>
>
> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> > trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> > (<davispuh@gmail.com>) rakstīja:
> >>
> >> Currently if there's any corruption at all in extent tree
> >> (eg. even single bit) then mounting will fail with:
> >> "failed to read block groups: -5" (-EIO)
> >> It happens because we immediately abort on first error when
> >> searching in extent tree for block groups.
> >>
> >> Now with this patch if `ignorebadroots` option is specified
> >> then we handle such case and continue by removing already
> >> created block groups and creating dummy block groups.
> >>
> >> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> >> ---
> >>   fs/btrfs/block-group.c | 14 ++++++++++++++
> >>   fs/btrfs/disk-io.c     |  4 ++--
> >>   fs/btrfs/disk-io.h     |  2 ++
> >>   3 files changed, 18 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> >> index 48ebc106a606..827a977614b3 100644
> >> --- a/fs/btrfs/block-group.c
> >> +++ b/fs/btrfs/block-group.c
> >> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> >>          ret = check_chunk_block_group_mappings(info);
> >>   error:
> >>          btrfs_free_path(path);
> >> +
> >> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> >> +               btrfs_put_block_group_cache(info);
> >> +               btrfs_stop_all_workers(info);
> >> +               btrfs_free_block_groups(info);
> >> +               ret = btrfs_init_workqueues(info, NULL);
> >> +               if (ret)
> >> +                       return ret;
> >> +               ret = btrfs_init_space_info(info);
> >> +               if (ret)
> >> +                       return ret;
> >> +               return fill_dummy_bgs(info);
>
> When we hit bad things in extent tree, we should ensure we're mounting
> the fs RO, or we can't continue.
>
> And we should also refuse to mount back to RW if we hit such case, so
> that we don't need anything complex, just ignore the whole extent tree
> and create the dummy block groups.
>

That's what we're doing here: `ignorebadroots` implies an RO mount, and
without specifying it the filesystem doesn't mount at all.

> >
> > This isn't that nice, but I don't really know how to properly clean up
> > everything related to already created block groups so this was easiest
> > way. It seems to work fine.
> > But looks like need to do something about replay log aswell because if
> > it's not disabled then it fails with:
> >
> > [ 1397.246869] BTRFS info (device sde): start tree-log replay
> > [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> > on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> > [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> > on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> > [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> > errno=-5 IO failure
> > [ 1398.218828] BTRFS: error (device sde) in
> > btrfs_run_delayed_refs:2124: errno=-5 IO failure
> > [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> > errno=-5 IO failure (Failed to recover log tree)
> > [ 1398.229048] BTRFS error (device sde): open_ctree failed
>
> This is because we shouldn't allow to do anything write to the fs if we
> have anything wrong in extent tree.
>

This is happening when mounting read-only. My assumption is that it
only tries to replay in memory without writing anything to disk.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17 21:03     ` Dāvis Mosāns
@ 2021-03-17 23:49       ` Qu Wenruo
  2021-03-19 15:34         ` Dāvis Mosāns
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2021-03-17 23:49 UTC (permalink / raw)
  To: Dāvis Mosāns
  Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, linux-kernel, Zygo Blaxell



On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>
>>
>>
>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
>>> (<davispuh@gmail.com>) rakstīja:
>>>>
>>>> Currently if there's any corruption at all in extent tree
>>>> (eg. even single bit) then mounting will fail with:
>>>> "failed to read block groups: -5" (-EIO)
>>>> It happens because we immediately abort on first error when
>>>> searching in extent tree for block groups.
>>>>
>>>> Now with this patch if `ignorebadroots` option is specified
>>>> then we handle such case and continue by removing already
>>>> created block groups and creating dummy block groups.
>>>>
>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
>>>> ---
>>>>    fs/btrfs/block-group.c | 14 ++++++++++++++
>>>>    fs/btrfs/disk-io.c     |  4 ++--
>>>>    fs/btrfs/disk-io.h     |  2 ++
>>>>    3 files changed, 18 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>>> index 48ebc106a606..827a977614b3 100644
>>>> --- a/fs/btrfs/block-group.c
>>>> +++ b/fs/btrfs/block-group.c
>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>>           ret = check_chunk_block_group_mappings(info);
>>>>    error:
>>>>           btrfs_free_path(path);
>>>> +
>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
>>>> +               btrfs_put_block_group_cache(info);
>>>> +               btrfs_stop_all_workers(info);
>>>> +               btrfs_free_block_groups(info);
>>>> +               ret = btrfs_init_workqueues(info, NULL);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +               ret = btrfs_init_space_info(info);
>>>> +               if (ret)
>>>> +                       return ret;
>>>> +               return fill_dummy_bgs(info);
>>
>> When we hit bad things in extent tree, we should ensure we're mounting
>> the fs RO, or we can't continue.
>>
>> And we should also refuse to mount back to RW if we hit such case, so
>> that we don't need anything complex, just ignore the whole extent tree
>> and create the dummy block groups.
>>
>
> That's what we're doing here, `ignorebadroots` implies RO mount and
> without specifying it doesn't mount at all.
>
>>>
>>> This isn't that nice, but I don't really know how to properly clean up
>>> everything related to already created block groups so this was easiest
>>> way. It seems to work fine.
>>> But looks like need to do something about replay log aswell because if
>>> it's not disabled then it fails with:
>>>
>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
>>> errno=-5 IO failure
>>> [ 1398.218828] BTRFS: error (device sde) in
>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
>>> errno=-5 IO failure (Failed to recover log tree)
>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
>>
>> This is because we shouldn't allow to do anything write to the fs if we
>> have anything wrong in extent tree.
>>
>
> This is happening when mounting read-only. My assumption is that it
> only tries to replay in memory without writing anything to disk.
>

We lack the check on the log tree.

Normally, for such a forced RO mount, log replay is not allowed.

We should output a warning prompting the user to use nologreplay, and
reject the mount.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17 23:49       ` Qu Wenruo
@ 2021-03-19 15:34         ` Dāvis Mosāns
  2021-03-20  0:34           ` Qu Wenruo
  0 siblings, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-19 15:34 UTC (permalink / raw)
  To: Qu Wenruo
  Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, linux-kernel, Zygo Blaxell

ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
(<quwenruo.btrfs@gmx.com>) rakstīja:
>
>
>
> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> > trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> > (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>
> >>
> >>
> >> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> >>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> >>> (<davispuh@gmail.com>) rakstīja:
> >>>>
> >>>> Currently if there's any corruption at all in extent tree
> >>>> (eg. even single bit) then mounting will fail with:
> >>>> "failed to read block groups: -5" (-EIO)
> >>>> It happens because we immediately abort on first error when
> >>>> searching in extent tree for block groups.
> >>>>
> >>>> Now with this patch if `ignorebadroots` option is specified
> >>>> then we handle such case and continue by removing already
> >>>> created block groups and creating dummy block groups.
> >>>>
> >>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> >>>> ---
> >>>>    fs/btrfs/block-group.c | 14 ++++++++++++++
> >>>>    fs/btrfs/disk-io.c     |  4 ++--
> >>>>    fs/btrfs/disk-io.h     |  2 ++
> >>>>    3 files changed, 18 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> >>>> index 48ebc106a606..827a977614b3 100644
> >>>> --- a/fs/btrfs/block-group.c
> >>>> +++ b/fs/btrfs/block-group.c
> >>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> >>>>           ret = check_chunk_block_group_mappings(info);
> >>>>    error:
> >>>>           btrfs_free_path(path);
> >>>> +
> >>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> >>>> +               btrfs_put_block_group_cache(info);
> >>>> +               btrfs_stop_all_workers(info);
> >>>> +               btrfs_free_block_groups(info);
> >>>> +               ret = btrfs_init_workqueues(info, NULL);
> >>>> +               if (ret)
> >>>> +                       return ret;
> >>>> +               ret = btrfs_init_space_info(info);
> >>>> +               if (ret)
> >>>> +                       return ret;
> >>>> +               return fill_dummy_bgs(info);
> >>
> >> When we hit bad things in extent tree, we should ensure we're mounting
> >> the fs RO, or we can't continue.
> >>
> >> And we should also refuse to mount back to RW if we hit such case, so
> >> that we don't need anything complex, just ignore the whole extent tree
> >> and create the dummy block groups.
> >>
> >
> > That's what we're doing here, `ignorebadroots` implies RO mount and
> > without specifying it doesn't mount at all.
> >
> >>>
> >>> This isn't that nice, but I don't really know how to properly clean up
> >>> everything related to already created block groups so this was easiest
> >>> way. It seems to work fine.
> >>> But looks like need to do something about replay log aswell because if
> >>> it's not disabled then it fails with:
> >>>
> >>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> >>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> >>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> >>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> >>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> >>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> >>> errno=-5 IO failure
> >>> [ 1398.218828] BTRFS: error (device sde) in
> >>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> >>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> >>> errno=-5 IO failure (Failed to recover log tree)
> >>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
> >>
> >> This is because we shouldn't allow to do anything write to the fs if we
> >> have anything wrong in extent tree.
> >>
> >
> > This is happening when mounting read-only. My assumption is that it
> > only tries to replay in memory without writing anything to disk.
> >
>
> We lacks the check on log tree.
>
> Normally for such forced RO mount, log replay is not allowed.
>
> We should output a warning to prompt user to use nologreplay, and reject
> the mount.
>

I'm not familiar with log replay but couldn't there be something
useful (ignoring ref counts) that would still be worth replaying in
memory?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-19 15:34         ` Dāvis Mosāns
@ 2021-03-20  0:34           ` Qu Wenruo
  2021-03-21 21:54             ` Dāvis Mosāns
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2021-03-20  0:34 UTC (permalink / raw)
  To: Dāvis Mosāns
  Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, linux-kernel, Zygo Blaxell



On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
> ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>
>>
>>
>> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
>>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>>>
>>>>
>>>>
>>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
>>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
>>>>> (<davispuh@gmail.com>) rakstīja:
>>>>>>
>>>>>> Currently if there's any corruption at all in extent tree
>>>>>> (eg. even single bit) then mounting will fail with:
>>>>>> "failed to read block groups: -5" (-EIO)
>>>>>> It happens because we immediately abort on first error when
>>>>>> searching in extent tree for block groups.
>>>>>>
>>>>>> Now with this patch if `ignorebadroots` option is specified
>>>>>> then we handle such case and continue by removing already
>>>>>> created block groups and creating dummy block groups.
>>>>>>
>>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
>>>>>> ---
>>>>>>     fs/btrfs/block-group.c | 14 ++++++++++++++
>>>>>>     fs/btrfs/disk-io.c     |  4 ++--
>>>>>>     fs/btrfs/disk-io.h     |  2 ++
>>>>>>     3 files changed, 18 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>>>>> index 48ebc106a606..827a977614b3 100644
>>>>>> --- a/fs/btrfs/block-group.c
>>>>>> +++ b/fs/btrfs/block-group.c
>>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>>>>            ret = check_chunk_block_group_mappings(info);
>>>>>>     error:
>>>>>>            btrfs_free_path(path);
>>>>>> +
>>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
>>>>>> +               btrfs_put_block_group_cache(info);
>>>>>> +               btrfs_stop_all_workers(info);
>>>>>> +               btrfs_free_block_groups(info);
>>>>>> +               ret = btrfs_init_workqueues(info, NULL);
>>>>>> +               if (ret)
>>>>>> +                       return ret;
>>>>>> +               ret = btrfs_init_space_info(info);
>>>>>> +               if (ret)
>>>>>> +                       return ret;
>>>>>> +               return fill_dummy_bgs(info);
>>>>
>>>> When we hit bad things in extent tree, we should ensure we're mounting
>>>> the fs RO, or we can't continue.
>>>>
>>>> And we should also refuse to mount back to RW if we hit such case, so
>>>> that we don't need anything complex, just ignore the whole extent tree
>>>> and create the dummy block groups.
>>>>
>>>
>>> That's what we're doing here, `ignorebadroots` implies RO mount and
>>> without specifying it doesn't mount at all.
>>>
>>>>>
>>>>> This isn't that nice, but I don't really know how to properly clean up
>>>>> everything related to already created block groups so this was easiest
>>>>> way. It seems to work fine.
>>>>> But looks like need to do something about replay log aswell because if
>>>>> it's not disabled then it fails with:
>>>>>
>>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
>>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
>>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
>>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
>>>>> errno=-5 IO failure
>>>>> [ 1398.218828] BTRFS: error (device sde) in
>>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
>>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
>>>>> errno=-5 IO failure (Failed to recover log tree)
>>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
>>>>
>>>> This is because we shouldn't allow to do anything write to the fs if we
>>>> have anything wrong in extent tree.
>>>>
>>>
>>> This is happening when mounting read-only. My assumption is that it
>>> only tries to replay in memory without writing anything to disk.
>>>
>>
>> We lacks the check on log tree.
>>
>> Normally for such forced RO mount, log replay is not allowed.
>>
>> We should output a warning to prompt user to use nologreplay, and reject
>> the mount.
>>
>
> I'm not familiar with log replay but couldn't there be something
> useful (ignoring ref counts) that would still be worth replaying in
> memory?
>
Log replay means metadata writes.

Any write needs a valid extent tree to find free space for new
metadata/data.

So no, we can't do anything but completely ignore the log.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-17  1:20 [RFC] btrfs: Allow read-only mount with corrupted extent tree Dāvis Mosāns
  2021-03-17  1:29 ` Dāvis Mosāns
@ 2021-03-21 21:49 ` Dāvis Mosāns
  2021-04-21 16:00   ` Dāvis Mosāns
  1 sibling, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-21 21:49 UTC (permalink / raw)
  To: linux-btrfs
  Cc: clm, josef, dsterba, linux-kernel, ce3g8jdj, Dāvis Mosāns

Currently, if there is any corruption at all in the extent tree
(e.g. even a single bit), mounting fails with:
"failed to read block groups: -5" (-EIO)
This happens because we abort immediately on the first error when
searching the extent tree for block groups.

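With this patch, if the `ignorebadroots` option is specified,
we handle this case and continue by removing the already
created block groups and creating dummy block groups.
Any pending tree-log is ignored (with a warning) in that case, because
replaying it would require a valid extent tree.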

Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
---
 fs/btrfs/block-group.c | 20 ++++++++++++++++++++
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/disk-io.h     |  2 ++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 48ebc106a606..f485cf14c2f8 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2048,6 +2048,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	ret = check_chunk_block_group_mappings(info);
 error:
 	btrfs_free_path(path);
+
+	if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
+
+		if (btrfs_super_log_root(info->super_copy) != 0) {
+			btrfs_warn(info, "Ignoring tree-log replay due to extent tree corruption!");
+			btrfs_set_super_log_root(info->super_copy, 0);
+		}
+
+		btrfs_put_block_group_cache(info);
+		btrfs_stop_all_workers(info);
+		btrfs_free_block_groups(info);
+		ret = btrfs_init_workqueues(info, NULL);
+		if (ret)
+			return ret;
+		ret = btrfs_init_space_info(info);
+		if (ret)
+			return ret;
+		return fill_dummy_bgs(info);
+	}
+
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07a2b4f69b10..dc744f76d075 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1955,7 +1955,7 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
 }
 
 /* helper to cleanup workers */
-static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 {
 	btrfs_destroy_workqueue(fs_info->fixup_workers);
 	btrfs_destroy_workqueue(fs_info->delalloc_workers);
@@ -2122,7 +2122,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
-static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
 		struct btrfs_fs_devices *fs_devices)
 {
 	u32 max_active = fs_info->thread_pool_size;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e45057c0c016..f9bfcba86a04 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -137,6 +137,8 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
 int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
 int __init btrfs_end_io_wq_init(void);
 void __cold btrfs_end_io_wq_exit(void);
+void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info);
+int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(u64 objectid,
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-20  0:34           ` Qu Wenruo
@ 2021-03-21 21:54             ` Dāvis Mosāns
  2021-03-22  0:25               ` Qu Wenruo
  0 siblings, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-21 21:54 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, Zygo Blaxell

sestd., 2021. g. 20. marts, plkst. 02:34 — lietotājs Qu Wenruo
(<quwenruo.btrfs@gmx.com>) rakstīja:
>
>
>
> On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
> > ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
> > (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>
> >>
> >>
> >> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> >>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> >>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>>>
> >>>>
> >>>>
> >>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> >>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> >>>>> (<davispuh@gmail.com>) rakstīja:
> >>>>>>
> >>>>>> Currently if there's any corruption at all in extent tree
> >>>>>> (eg. even single bit) then mounting will fail with:
> >>>>>> "failed to read block groups: -5" (-EIO)
> >>>>>> It happens because we immediately abort on first error when
> >>>>>> searching in extent tree for block groups.
> >>>>>>
> >>>>>> Now with this patch if `ignorebadroots` option is specified
> >>>>>> then we handle such case and continue by removing already
> >>>>>> created block groups and creating dummy block groups.
> >>>>>>
> >>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> >>>>>> ---
> >>>>>>     fs/btrfs/block-group.c | 14 ++++++++++++++
> >>>>>>     fs/btrfs/disk-io.c     |  4 ++--
> >>>>>>     fs/btrfs/disk-io.h     |  2 ++
> >>>>>>     3 files changed, 18 insertions(+), 2 deletions(-)
> >>>>>>
> >>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> >>>>>> index 48ebc106a606..827a977614b3 100644
> >>>>>> --- a/fs/btrfs/block-group.c
> >>>>>> +++ b/fs/btrfs/block-group.c
> >>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> >>>>>>            ret = check_chunk_block_group_mappings(info);
> >>>>>>     error:
> >>>>>>            btrfs_free_path(path);
> >>>>>> +
> >>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> >>>>>> +               btrfs_put_block_group_cache(info);
> >>>>>> +               btrfs_stop_all_workers(info);
> >>>>>> +               btrfs_free_block_groups(info);
> >>>>>> +               ret = btrfs_init_workqueues(info, NULL);
> >>>>>> +               if (ret)
> >>>>>> +                       return ret;
> >>>>>> +               ret = btrfs_init_space_info(info);
> >>>>>> +               if (ret)
> >>>>>> +                       return ret;
> >>>>>> +               return fill_dummy_bgs(info);
> >>>>
> >>>> When we hit bad things in extent tree, we should ensure we're mounting
> >>>> the fs RO, or we can't continue.
> >>>>
> >>>> And we should also refuse to mount back to RW if we hit such case, so
> >>>> that we don't need anything complex, just ignore the whole extent tree
> >>>> and create the dummy block groups.
> >>>>
> >>>
> >>> That's what we're doing here, `ignorebadroots` implies RO mount and
> >>> without specifying it doesn't mount at all.
> >>>
> >>>>>
> >>>>> This isn't that nice, but I don't really know how to properly clean up
> >>>>> everything related to already created block groups so this was easiest
> >>>>> way. It seems to work fine.
> >>>>> But looks like need to do something about replay log aswell because if
> >>>>> it's not disabled then it fails with:
> >>>>>
> >>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> >>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> >>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> >>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> >>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> >>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> >>>>> errno=-5 IO failure
> >>>>> [ 1398.218828] BTRFS: error (device sde) in
> >>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> >>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> >>>>> errno=-5 IO failure (Failed to recover log tree)
> >>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
> >>>>
> >>>> This is because we shouldn't allow to do anything write to the fs if we
> >>>> have anything wrong in extent tree.
> >>>>
> >>>
> >>> This is happening when mounting read-only. My assumption is that it
> >>> only tries to replay in memory without writing anything to disk.
> >>>
> >>
> >> We lacks the check on log tree.
> >>
> >> Normally for such forced RO mount, log replay is not allowed.
> >>
> >> We should output a warning to prompt user to use nologreplay, and reject
> >> the mount.
> >>
> >
> > I'm not familiar with log replay but couldn't there be something
> > useful (ignoring ref counts) that would still be worth replaying in
> > memory?
> >
> Log replay means metadata write.
>
> Any write needs a valid extent tree to find out free space for new
> metadata/data.
>
> So no, we can't do anything but completely ignoring the log.
>

I see, I've updated the patch. But even then it seems it could be possible
to add a new ramdisk and make allocations there (e.g. create a new extent
tree there), thus allowing replay. I guess that's way too much work.
Anyway, thanks for the feedback!

Best regards,
Dāvis

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-21 21:54             ` Dāvis Mosāns
@ 2021-03-22  0:25               ` Qu Wenruo
  2021-03-22  3:13                 ` Dāvis Mosāns
  0 siblings, 1 reply; 14+ messages in thread
From: Qu Wenruo @ 2021-03-22  0:25 UTC (permalink / raw)
  To: Dāvis Mosāns
  Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, Zygo Blaxell



On 2021/3/22 上午5:54, Dāvis Mosāns wrote:
> sestd., 2021. g. 20. marts, plkst. 02:34 — lietotājs Qu Wenruo
> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>
>>
>>
>> On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
>>> ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>>>
>>>>
>>>>
>>>> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
>>>>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
>>>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
>>>>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
>>>>>>> (<davispuh@gmail.com>) rakstīja:
>>>>>>>>
>>>>>>>> Currently if there's any corruption at all in extent tree
>>>>>>>> (eg. even single bit) then mounting will fail with:
>>>>>>>> "failed to read block groups: -5" (-EIO)
>>>>>>>> It happens because we immediately abort on first error when
>>>>>>>> searching in extent tree for block groups.
>>>>>>>>
>>>>>>>> Now with this patch if `ignorebadroots` option is specified
>>>>>>>> then we handle such case and continue by removing already
>>>>>>>> created block groups and creating dummy block groups.
>>>>>>>>
>>>>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
>>>>>>>> ---
>>>>>>>>      fs/btrfs/block-group.c | 14 ++++++++++++++
>>>>>>>>      fs/btrfs/disk-io.c     |  4 ++--
>>>>>>>>      fs/btrfs/disk-io.h     |  2 ++
>>>>>>>>      3 files changed, 18 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>>>>>>> index 48ebc106a606..827a977614b3 100644
>>>>>>>> --- a/fs/btrfs/block-group.c
>>>>>>>> +++ b/fs/btrfs/block-group.c
>>>>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>>>>>>>>             ret = check_chunk_block_group_mappings(info);
>>>>>>>>      error:
>>>>>>>>             btrfs_free_path(path);
>>>>>>>> +
>>>>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
>>>>>>>> +               btrfs_put_block_group_cache(info);
>>>>>>>> +               btrfs_stop_all_workers(info);
>>>>>>>> +               btrfs_free_block_groups(info);
>>>>>>>> +               ret = btrfs_init_workqueues(info, NULL);
>>>>>>>> +               if (ret)
>>>>>>>> +                       return ret;
>>>>>>>> +               ret = btrfs_init_space_info(info);
>>>>>>>> +               if (ret)
>>>>>>>> +                       return ret;
>>>>>>>> +               return fill_dummy_bgs(info);
>>>>>>
>>>>>> When we hit bad things in extent tree, we should ensure we're mounting
>>>>>> the fs RO, or we can't continue.
>>>>>>
>>>>>> And we should also refuse to mount back to RW if we hit such case, so
>>>>>> that we don't need anything complex, just ignore the whole extent tree
>>>>>> and create the dummy block groups.
>>>>>>
>>>>>
>>>>> That's what we're doing here, `ignorebadroots` implies RO mount and
>>>>> without specifying it doesn't mount at all.
>>>>>
>>>>>>>
>>>>>>> This isn't that nice, but I don't really know how to properly clean up
>>>>>>> everything related to already created block groups so this was easiest
>>>>>>> way. It seems to work fine.
>>>>>>> But looks like need to do something about replay log aswell because if
>>>>>>> it's not disabled then it fails with:
>>>>>>>
>>>>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
>>>>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
>>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
>>>>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
>>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
>>>>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
>>>>>>> errno=-5 IO failure
>>>>>>> [ 1398.218828] BTRFS: error (device sde) in
>>>>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
>>>>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
>>>>>>> errno=-5 IO failure (Failed to recover log tree)
>>>>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
>>>>>>
>>>>>> This is because we shouldn't allow to do anything write to the fs if we
>>>>>> have anything wrong in extent tree.
>>>>>>
>>>>>
>>>>> This is happening when mounting read-only. My assumption is that it
>>>>> only tries to replay in memory without writing anything to disk.
>>>>>
>>>>
>>>> We lacks the check on log tree.
>>>>
>>>> Normally for such forced RO mount, log replay is not allowed.
>>>>
>>>> We should output a warning to prompt user to use nologreplay, and reject
>>>> the mount.
>>>>
>>>
>>> I'm not familiar with log replay but couldn't there be something
>>> useful (ignoring ref counts) that would still be worth replaying in
>>> memory?
>>>
>> Log replay means metadata write.
>>
>> Any write needs a valid extent tree to find out free space for new
>> metadata/data.
>>
>> So no, we can't do anything but completely ignoring the log.
>>
>
> I see, updated patch. But even then it seems it could be possible to
> add new ramdisk and make allocations there (eg. create new extent tree
> there) thus allowing replay.

The problem here is that, since the extent tree is corrupted, we won't
know which ranges already hold metadata.
Metadata CoW, as its name says, needs to copy on write, which means it
can't write back (even just in memory) to anywhere we already have metadata.

The worst case is that we choose a bytenr for the new (in-memory)
metadata, but it turns out a later read needs to read metadata from
exactly the same location.

> I guess that's way too much work.

And we gain nothing but tons of potential bugs.


BTW, I'm curious what your test case is. It seems you're using
log replay, but if we hit anything wrong with the replayed data, it means
the btrfs kernel module has something wrong.
Did you add extra corruption to the replayed data, or is it some
not-yet-exposed bug?

Thanks,
Qu

> Anyway thanks for feedback!
>
> Best regards,
> Dāvis
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-22  0:25               ` Qu Wenruo
@ 2021-03-22  3:13                 ` Dāvis Mosāns
  2021-03-22  4:48                   ` Zygo Blaxell
  0 siblings, 1 reply; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-22  3:13 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Btrfs BTRFS, clm, Josef Bacik, dsterba, Zygo Blaxell

pirmd., 2021. g. 22. marts, plkst. 02:25 — lietotājs Qu Wenruo
(<quwenruo.btrfs@gmx.com>) rakstīja:
>
>
>
> On 2021/3/22 上午5:54, Dāvis Mosāns wrote:
> > sestd., 2021. g. 20. marts, plkst. 02:34 — lietotājs Qu Wenruo
> > (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>
> >>
> >>
> >> On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
> >>> ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
> >>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>>>
> >>>>
> >>>>
> >>>> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> >>>>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> >>>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> >>>>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> >>>>>>> (<davispuh@gmail.com>) rakstīja:
> >>>>>>>>
> >>>>>>>> Currently if there's any corruption at all in extent tree
> >>>>>>>> (eg. even single bit) then mounting will fail with:
> >>>>>>>> "failed to read block groups: -5" (-EIO)
> >>>>>>>> It happens because we immediately abort on first error when
> >>>>>>>> searching in extent tree for block groups.
> >>>>>>>>
> >>>>>>>> Now with this patch if `ignorebadroots` option is specified
> >>>>>>>> then we handle such case and continue by removing already
> >>>>>>>> created block groups and creating dummy block groups.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> >>>>>>>> ---
> >>>>>>>>      fs/btrfs/block-group.c | 14 ++++++++++++++
> >>>>>>>>      fs/btrfs/disk-io.c     |  4 ++--
> >>>>>>>>      fs/btrfs/disk-io.h     |  2 ++
> >>>>>>>>      3 files changed, 18 insertions(+), 2 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> >>>>>>>> index 48ebc106a606..827a977614b3 100644
> >>>>>>>> --- a/fs/btrfs/block-group.c
> >>>>>>>> +++ b/fs/btrfs/block-group.c
> >>>>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> >>>>>>>>             ret = check_chunk_block_group_mappings(info);
> >>>>>>>>      error:
> >>>>>>>>             btrfs_free_path(path);
> >>>>>>>> +
> >>>>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> >>>>>>>> +               btrfs_put_block_group_cache(info);
> >>>>>>>> +               btrfs_stop_all_workers(info);
> >>>>>>>> +               btrfs_free_block_groups(info);
> >>>>>>>> +               ret = btrfs_init_workqueues(info, NULL);
> >>>>>>>> +               if (ret)
> >>>>>>>> +                       return ret;
> >>>>>>>> +               ret = btrfs_init_space_info(info);
> >>>>>>>> +               if (ret)
> >>>>>>>> +                       return ret;
> >>>>>>>> +               return fill_dummy_bgs(info);
> >>>>>>
> >>>>>> When we hit bad things in extent tree, we should ensure we're mounting
> >>>>>> the fs RO, or we can't continue.
> >>>>>>
> >>>>>> And we should also refuse to mount back to RW if we hit such case, so
> >>>>>> that we don't need anything complex, just ignore the whole extent tree
> >>>>>> and create the dummy block groups.
> >>>>>>
> >>>>>
> >>>>> That's what we're doing here, `ignorebadroots` implies RO mount and
> >>>>> without specifying it doesn't mount at all.
> >>>>>
> >>>>>>>
> >>>>>>> This isn't that nice, but I don't really know how to properly clean up
> >>>>>>> everything related to already created block groups so this was easiest
> >>>>>>> way. It seems to work fine.
> >>>>>>> But looks like need to do something about replay log aswell because if
> >>>>>>> it's not disabled then it fails with:
> >>>>>>>
> >>>>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> >>>>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> >>>>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> >>>>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> >>>>>>> errno=-5 IO failure
> >>>>>>> [ 1398.218828] BTRFS: error (device sde) in
> >>>>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> >>>>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> >>>>>>> errno=-5 IO failure (Failed to recover log tree)
> >>>>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
> >>>>>>
> >>>>>> This is because we shouldn't allow to do anything write to the fs if we
> >>>>>> have anything wrong in extent tree.
> >>>>>>
> >>>>>
> >>>>> This is happening when mounting read-only. My assumption is that it
> >>>>> only tries to replay in memory without writing anything to disk.
> >>>>>
> >>>>
> >>>> We lacks the check on log tree.
> >>>>
> >>>> Normally for such forced RO mount, log replay is not allowed.
> >>>>
> >>>> We should output a warning to prompt user to use nologreplay, and reject
> >>>> the mount.
> >>>>
> >>>
> >>> I'm not familiar with log replay but couldn't there be something
> >>> useful (ignoring ref counts) that would still be worth replaying in
> >>> memory?
> >>>
> >> Log replay means metadata write.
> >>
> >> Any write needs a valid extent tree to find out free space for new
> >> metadata/data.
> >>
> >> So no, we can't do anything but completely ignoring the log.
> >>
> >
> > I see, updated patch. But even then it seems it could be possible to
> > add new ramdisk and make allocations there (eg. create new extent tree
> > there) thus allowing replay.
>
> The problem here is, since the extent tree is corrupted, we won't know
> which range has metadata already.
> While metadata CoW, just like its name, needs to CoW, which means it
> can't writeback (even just in memory) to anywhere we have metadata.
>
> The worst case is, we choose a bytenr for the new metadata to be (in
> memory), but it turns out later read needs to read metadata from the
> exactly same location.
>

The idea is that if we add a new disk, we would put it after the last
bytenr (which isn't mapped to any existing disk), so there wouldn't be
any overlap.

>
> BTW, I'm curious what's your test cases? As it seems you're using
> log-replay but if we hit anything wrong for the replayed data, it means
> btrfs kernel module has something wrong.
> Did you add extra corruption for the replayed data, or it's some bug
> unexposed?

Basically I have a corrupted btrfs due to an HBA card fault, and before I
nuke it I want to copy as much usable data as possible. So I was
wondering whether whatever is in the replay log could be restored. The
replay log tree itself is perfectly fine, with a valid checksum, and there
aren't any issues regarding that. I looked at it with `btrfs inspect
dump-tree` and saw that there isn't anything important, so it's fine to
ignore it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-22  3:13                 ` Dāvis Mosāns
@ 2021-03-22  4:48                   ` Zygo Blaxell
  2021-03-22 16:10                     ` Dāvis Mosāns
  0 siblings, 1 reply; 14+ messages in thread
From: Zygo Blaxell @ 2021-03-22  4:48 UTC (permalink / raw)
  To: Dāvis Mosāns; +Cc: Qu Wenruo, Btrfs BTRFS, clm, Josef Bacik, dsterba

On Mon, Mar 22, 2021 at 05:13:13AM +0200, Dāvis Mosāns wrote:
> pirmd., 2021. g. 22. marts, plkst. 02:25 — lietotājs Qu Wenruo
> (<quwenruo.btrfs@gmx.com>) rakstīja:
> >
> >
> >
> > On 2021/3/22 上午5:54, Dāvis Mosāns wrote:
> > > sestd., 2021. g. 20. marts, plkst. 02:34 — lietotājs Qu Wenruo
> > > (<quwenruo.btrfs@gmx.com>) rakstīja:
> > >>
> > >>
> > >>
> > >> On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
> > >>> ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
> > >>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> > >>>>
> > >>>>
> > >>>>
> > >>>> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> > >>>>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> > >>>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> > >>>>>>
> > >>>>>>
> > >>>>>>
> > >>>>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> > >>>>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> > >>>>>>> (<davispuh@gmail.com>) rakstīja:
> > >>>>>>>>
> > >>>>>>>> Currently if there's any corruption at all in extent tree
> > >>>>>>>> (eg. even single bit) then mounting will fail with:
> > >>>>>>>> "failed to read block groups: -5" (-EIO)
> > >>>>>>>> It happens because we immediately abort on first error when
> > >>>>>>>> searching in extent tree for block groups.
> > >>>>>>>>
> > >>>>>>>> Now with this patch if `ignorebadroots` option is specified
> > >>>>>>>> then we handle such case and continue by removing already
> > >>>>>>>> created block groups and creating dummy block groups.
> > >>>>>>>>
> > >>>>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> > >>>>>>>> ---
> > >>>>>>>>      fs/btrfs/block-group.c | 14 ++++++++++++++
> > >>>>>>>>      fs/btrfs/disk-io.c     |  4 ++--
> > >>>>>>>>      fs/btrfs/disk-io.h     |  2 ++
> > >>>>>>>>      3 files changed, 18 insertions(+), 2 deletions(-)
> > >>>>>>>>
> > >>>>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > >>>>>>>> index 48ebc106a606..827a977614b3 100644
> > >>>>>>>> --- a/fs/btrfs/block-group.c
> > >>>>>>>> +++ b/fs/btrfs/block-group.c
> > >>>>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> > >>>>>>>>             ret = check_chunk_block_group_mappings(info);
> > >>>>>>>>      error:
> > >>>>>>>>             btrfs_free_path(path);
> > >>>>>>>> +
> > >>>>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> > >>>>>>>> +               btrfs_put_block_group_cache(info);
> > >>>>>>>> +               btrfs_stop_all_workers(info);
> > >>>>>>>> +               btrfs_free_block_groups(info);
> > >>>>>>>> +               ret = btrfs_init_workqueues(info, NULL);
> > >>>>>>>> +               if (ret)
> > >>>>>>>> +                       return ret;
> > >>>>>>>> +               ret = btrfs_init_space_info(info);
> > >>>>>>>> +               if (ret)
> > >>>>>>>> +                       return ret;
> > >>>>>>>> +               return fill_dummy_bgs(info);
> > >>>>>>
> > >>>>>> When we hit bad things in extent tree, we should ensure we're mounting
> > >>>>>> the fs RO, or we can't continue.
> > >>>>>>
> > >>>>>> And we should also refuse to mount back to RW if we hit such case, so
> > >>>>>> that we don't need anything complex, just ignore the whole extent tree
> > >>>>>> and create the dummy block groups.
> > >>>>>>
> > >>>>>
> > >>>>> That's what we're doing here, `ignorebadroots` implies RO mount and
> > >>>>> without specifying it doesn't mount at all.
> > >>>>>
> > >>>>>>>
> > >>>>>>> This isn't that nice, but I don't really know how to properly clean up
> > >>>>>>> everything related to already created block groups so this was easiest
> > >>>>>>> way. It seems to work fine.
> > >>>>>>> But looks like need to do something about replay log aswell because if
> > >>>>>>> it's not disabled then it fails with:
> > >>>>>>>
> > >>>>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> > >>>>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> > >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> > >>>>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> > >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> > >>>>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> > >>>>>>> errno=-5 IO failure
> > >>>>>>> [ 1398.218828] BTRFS: error (device sde) in
> > >>>>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> > >>>>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> > >>>>>>> errno=-5 IO failure (Failed to recover log tree)
> > >>>>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
> > >>>>>>
> > >>>>>> This is because we shouldn't allow to do anything write to the fs if we
> > >>>>>> have anything wrong in extent tree.
> > >>>>>>
> > >>>>>
> > >>>>> This is happening when mounting read-only. My assumption is that it
> > >>>>> only tries to replay in memory without writing anything to disk.
> > >>>>>
> > >>>>
> > >>>> We lacks the check on log tree.
> > >>>>
> > >>>> Normally for such forced RO mount, log replay is not allowed.
> > >>>>
> > >>>> We should output a warning to prompt user to use nologreplay, and reject
> > >>>> the mount.
> > >>>>
> > >>>
> > >>> I'm not familiar with log replay but couldn't there be something
> > >>> useful (ignoring ref counts) that would still be worth replaying in
> > >>> memory?
> > >>>
> > >> Log replay means metadata write.
> > >>
> > >> Any write needs a valid extent tree to find out free space for new
> > >> metadata/data.
> > >>
> > >> So no, we can't do anything but completely ignoring the log.
> > >>
> > >
> > > I see, updated patch. But even then it seems it could be possible to
> > > add new ramdisk and make allocations there (eg. create new extent tree
> > > there) thus allowing replay.
> >
> > The problem here is, since the extent tree is corrupted, we won't know
> > which range has metadata already.
> > While metadata CoW, just like its name, needs to CoW, which means it
> > can't writeback (even just in memory) to anywhere we have metadata.
> >
> > The worst case is, we choose a bytenr for the new metadata to be (in
> > memory), but it turns out later read needs to read metadata from the
> > exactly same location.
> >
> 
> The idea is if we add new disk then we would put it after last bytenr
> (which isn't mapped to any existing disks) thus there wouldn't be any
> overlap.

I wonder if that idea can be turned into an online recovery tool.
Rebuild the metadata by more or less reflinking the old filesystem's data
into a new filesystem created in the unallocated spaces between the old
filesystem's block groups (building new subvol trees in the process).
It would need the device and chunk trees to be intact, but you have to
have those for rescue= to work at all, and it's relatively rare for those
to get damaged.  There's a complicated dance that has to be done to flip
a block group from the old filesystem to the new one, but maybe that can
be done by just making the entire old filesystem into a giant file image,
then making ordinary reflinks to it, then finish by deleting the old
image and running defrag to discard the unreferenced blocks that remain.
Think "btrfs-convert" but using a busted btrfs as source instead of ext4.

OK, way out of scope for this thread.

> > BTW, I'm curious what's your test cases? As it seems you're using
> > log-replay but if we hit anything wrong for the replayed data, it means
> > btrfs kernel module has something wrong.
> > Did you add extra corruption for the replayed data, or it's some bug
> > unexposed?
> 
> Basically I've a corrupted btrfs due to HBA card fault and before I
> nuke it I want to copy as much usable data as possible. So I was
> thinking if whatever is in replay log could be restored. The replay
> tree log itself is perfectly fine with valid checksum and there isn't
> any issues regarding that. I looked at it with `btrfs inspect
> dump-tree` and saw that there isn't anything important so it's fine
> ignoring it.

The log tree will only contain things that were fsync()ed after the
last completed transaction commit.  Unless you're hitting a delayed
refs latency issue or have non-default mount options, that's data from
only the last 30 seconds before the filesystem failed.

It might be desirable to replay the log tree if you had very high-value
data there, but it's the last 0.001% of the filesystem that requires
the last 99% of the development effort to recover.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-22  4:48                   ` Zygo Blaxell
@ 2021-03-22 16:10                     ` Dāvis Mosāns
  0 siblings, 0 replies; 14+ messages in thread
From: Dāvis Mosāns @ 2021-03-22 16:10 UTC (permalink / raw)
  To: Zygo Blaxell; +Cc: Qu Wenruo, Btrfs BTRFS, clm, Josef Bacik, dsterba

pirmd., 2021. g. 22. marts, plkst. 06:48 — lietotājs Zygo Blaxell
(<ce3g8jdj@umail.furryterror.org>) rakstīja:
>
> On Mon, Mar 22, 2021 at 05:13:13AM +0200, Dāvis Mosāns wrote:
> > pirmd., 2021. g. 22. marts, plkst. 02:25 — lietotājs Qu Wenruo
> > (<quwenruo.btrfs@gmx.com>) rakstīja:
> > >
> > >
> > >
> > > On 2021/3/22 上午5:54, Dāvis Mosāns wrote:
> > > > sestd., 2021. g. 20. marts, plkst. 02:34 — lietotājs Qu Wenruo
> > > > (<quwenruo.btrfs@gmx.com>) rakstīja:
> > > >>
> > > >>
> > > >>
> > > >> On 2021/3/19 下午11:34, Dāvis Mosāns wrote:
> > > >>> ceturtd., 2021. g. 18. marts, plkst. 01:49 — lietotājs Qu Wenruo
> > > >>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> > > >>>>
> > > >>>>
> > > >>>>
> > > >>>> On 2021/3/18 上午5:03, Dāvis Mosāns wrote:
> > > >>>>> trešd., 2021. g. 17. marts, plkst. 12:28 — lietotājs Qu Wenruo
> > > >>>>> (<quwenruo.btrfs@gmx.com>) rakstīja:
> > > >>>>>>
> > > >>>>>>
> > > >>>>>>
> > > >>>>>> On 2021/3/17 上午9:29, Dāvis Mosāns wrote:
> > > >>>>>>> trešd., 2021. g. 17. marts, plkst. 03:18 — lietotājs Dāvis Mosāns
> > > >>>>>>> (<davispuh@gmail.com>) rakstīja:
> > > >>>>>>>>
> > > >>>>>>>> Currently if there's any corruption at all in extent tree
> > > >>>>>>>> (eg. even single bit) then mounting will fail with:
> > > >>>>>>>> "failed to read block groups: -5" (-EIO)
> > > >>>>>>>> It happens because we immediately abort on first error when
> > > >>>>>>>> searching in extent tree for block groups.
> > > >>>>>>>>
> > > >>>>>>>> Now with this patch if `ignorebadroots` option is specified
> > > >>>>>>>> then we handle such case and continue by removing already
> > > >>>>>>>> created block groups and creating dummy block groups.
> > > >>>>>>>>
> > > >>>>>>>> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> > > >>>>>>>> ---
> > > >>>>>>>>      fs/btrfs/block-group.c | 14 ++++++++++++++
> > > >>>>>>>>      fs/btrfs/disk-io.c     |  4 ++--
> > > >>>>>>>>      fs/btrfs/disk-io.h     |  2 ++
> > > >>>>>>>>      3 files changed, 18 insertions(+), 2 deletions(-)
> > > >>>>>>>>
> > > >>>>>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > > >>>>>>>> index 48ebc106a606..827a977614b3 100644
> > > >>>>>>>> --- a/fs/btrfs/block-group.c
> > > >>>>>>>> +++ b/fs/btrfs/block-group.c
> > > >>>>>>>> @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> > > >>>>>>>>             ret = check_chunk_block_group_mappings(info);
> > > >>>>>>>>      error:
> > > >>>>>>>>             btrfs_free_path(path);
> > > >>>>>>>> +
> > > >>>>>>>> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> > > >>>>>>>> +               btrfs_put_block_group_cache(info);
> > > >>>>>>>> +               btrfs_stop_all_workers(info);
> > > >>>>>>>> +               btrfs_free_block_groups(info);
> > > >>>>>>>> +               ret = btrfs_init_workqueues(info, NULL);
> > > >>>>>>>> +               if (ret)
> > > >>>>>>>> +                       return ret;
> > > >>>>>>>> +               ret = btrfs_init_space_info(info);
> > > >>>>>>>> +               if (ret)
> > > >>>>>>>> +                       return ret;
> > > >>>>>>>> +               return fill_dummy_bgs(info);
> > > >>>>>>
> > > >>>>>> When we hit bad things in extent tree, we should ensure we're mounting
> > > >>>>>> the fs RO, or we can't continue.
> > > >>>>>>
> > > >>>>>> And we should also refuse to mount back to RW if we hit such case, so
> > > >>>>>> that we don't need anything complex, just ignore the whole extent tree
> > > >>>>>> and create the dummy block groups.
> > > >>>>>>
> > > >>>>>
> > > >>>>> That's what we're doing here, `ignorebadroots` implies RO mount and
> > > >>>>> without specifying it doesn't mount at all.
> > > >>>>>
> > > >>>>>>>
> > > >>>>>>> This isn't that nice, but I don't really know how to properly clean up
> > > >>>>>>> everything related to already created block groups so this was easiest
> > > >>>>>>> way. It seems to work fine.
> > > >>>>>>> But looks like need to do something about replay log aswell because if
> > > >>>>>>> it's not disabled then it fails with:
> > > >>>>>>>
> > > >>>>>>> [ 1397.246869] BTRFS info (device sde): start tree-log replay
> > > >>>>>>> [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed
> > > >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0
> > > >>>>>>> [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed
> > > >>>>>>> on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0
> > > >>>>>>> [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054:
> > > >>>>>>> errno=-5 IO failure
> > > >>>>>>> [ 1398.218828] BTRFS: error (device sde) in
> > > >>>>>>> btrfs_run_delayed_refs:2124: errno=-5 IO failure
> > > >>>>>>> [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254:
> > > >>>>>>> errno=-5 IO failure (Failed to recover log tree)
> > > >>>>>>> [ 1398.229048] BTRFS error (device sde): open_ctree failed
> > > >>>>>>
> > > >>>>>> This is because we shouldn't allow to do anything write to the fs if we
> > > >>>>>> have anything wrong in extent tree.
> > > >>>>>>
> > > >>>>>
> > > >>>>> This is happening when mounting read-only. My assumption is that it
> > > >>>>> only tries to replay in memory without writing anything to disk.
> > > >>>>>
> > > >>>>
> > > >>>> We lacks the check on log tree.
> > > >>>>
> > > >>>> Normally for such forced RO mount, log replay is not allowed.
> > > >>>>
> > > >>>> We should output a warning to prompt user to use nologreplay, and reject
> > > >>>> the mount.
> > > >>>>
> > > >>>
> > > >>> I'm not familiar with log replay but couldn't there be something
> > > >>> useful (ignoring ref counts) that would still be worth replaying in
> > > >>> memory?
> > > >>>
> > > >> Log replay means metadata write.
> > > >>
> > > >> Any write needs a valid extent tree to find out free space for new
> > > >> metadata/data.
> > > >>
> > > >> So no, we can't do anything but completely ignoring the log.
> > > >>
> > > >
> > > > I see, updated patch. But even then it seems it could be possible to
> > > > add new ramdisk and make allocations there (eg. create new extent tree
> > > > there) thus allowing replay.
> > >
> > > The problem here is, since the extent tree is corrupted, we won't know
> > > which range has metadata already.
> > > While metadata CoW, just like its name, needs to CoW, which means it
> > > can't writeback (even just in memory) to anywhere we have metadata.
> > >
> > > The worst case is, we choose a bytenr for the new metadata to be (in
> > > memory), but it turns out later read needs to read metadata from the
> > > exactly same location.
> > >
> >
> > The idea is if we add new disk then we would put it after last bytenr
> > (which isn't mapped to any existing disks) thus there wouldn't be any
> > overlap.
>
> I wonder if that idea can be turned into an online recovery tool.
> Rebuild the metadata by more or less reflinking the old filesystem's data
> into a new filesystem created in the unallocated spaces between the old
> filesystem's block groups (building new subvol trees in the process).
> It would need the device and chunk trees to be intact, but you have to
> have those for rescue= to work at all, and it's relatively rarer for those
> to get damaged.  There's a complicated dance that has to be done to flip
> a block group from the old filesystem to the new one, but maybe that can
> be done by just making the entire old filesystem into a giant file image,
> then making ordinary reflinks to it, then finish by deleting the old
> image and running defrag to discard the unreferenced blocks that remain.
> Think "btrfs-convert" but using a busted btrfs as source instead of ext4.
>

That does sound pretty cool and could be doable, but it's way beyond
my knowledge of all this to even attempt anything like it.

>
> The log tree will only contain things that were fsync()ed after the
> last completed transaction commit.  Unless you're hitting a delayed
> refs latency issue or have non-default mount options, that's data from
> only the last 30 seconds before the filesystem failed.
>
> It might be desirable to replay the log tree if you had very high-value
> data there, but it's the last 0.001% of the filesystem that requires
> the last 99% of the development effort to recover.

Yeah, for me there were only some XATTR_ITEMs and a single INODE_ITEM, so
nothing that useful. Also, I think an easier way would be to parse it
offline and then apply those changes to the data that's copied to the new
filesystem. That doesn't seem that complicated if someone really needs
it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] btrfs: Allow read-only mount with corrupted extent tree
  2021-03-21 21:49 ` [PATCH] " Dāvis Mosāns
@ 2021-04-21 16:00   ` Dāvis Mosāns
  0 siblings, 0 replies; 14+ messages in thread
From: Dāvis Mosāns @ 2021-04-21 16:00 UTC (permalink / raw)
  To: Btrfs BTRFS; +Cc: clm, Josef Bacik, dsterba, linux-kernel, Zygo Blaxell

svētd., 2021. g. 21. marts, plkst. 23:46 — lietotājs Dāvis Mosāns
(<davispuh@gmail.com>) rakstīja:
>
> Currently if there's any corruption at all in extent tree
> (eg. even single bit) then mounting will fail with:
> "failed to read block groups: -5" (-EIO)
> It happens because we immediately abort on first error when
> searching in extent tree for block groups.
>
> Now with this patch if `ignorebadroots` option is specified
> then we handle such case and continue by removing already
> created block groups and creating dummy block groups.
>
> Signed-off-by: Dāvis Mosāns <davispuh@gmail.com>
> ---
>  fs/btrfs/block-group.c | 20 ++++++++++++++++++++
>  fs/btrfs/disk-io.c     |  4 ++--
>  fs/btrfs/disk-io.h     |  2 ++
>  3 files changed, 24 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 48ebc106a606..f485cf14c2f8 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -2048,6 +2048,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
>         ret = check_chunk_block_group_mappings(info);
>  error:
>         btrfs_free_path(path);
> +
> +       if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) {
> +
> +               if (btrfs_super_log_root(info->super_copy) != 0) {
> +                       btrfs_warn(info, "Ignoring tree-log replay due to extent tree corruption!");
> +                       btrfs_set_super_log_root(info->super_copy, 0);
> +               }
> +
> +               btrfs_put_block_group_cache(info);
> +               btrfs_stop_all_workers(info);
> +               btrfs_free_block_groups(info);
> +               ret = btrfs_init_workqueues(info, NULL);
> +               if (ret)
> +                       return ret;
> +               ret = btrfs_init_space_info(info);
> +               if (ret)
> +                       return ret;
> +               return fill_dummy_bgs(info);
> +       }
> +
>         return ret;
>  }
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 07a2b4f69b10..dc744f76d075 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1955,7 +1955,7 @@ static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
>  }
>
>  /* helper to cleanup workers */
> -static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
> +void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
>  {
>         btrfs_destroy_workqueue(fs_info->fixup_workers);
>         btrfs_destroy_workqueue(fs_info->delalloc_workers);
> @@ -2122,7 +2122,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
>         mutex_init(&fs_info->qgroup_rescan_lock);
>  }
>
> -static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
> +int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
>                 struct btrfs_fs_devices *fs_devices)
>  {
>         u32 max_active = fs_info->thread_pool_size;
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index e45057c0c016..f9bfcba86a04 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -137,6 +137,8 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
>  int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
>  int __init btrfs_end_io_wq_init(void);
>  void __cold btrfs_end_io_wq_exit(void);
> +void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info);
> +int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices);
>
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  void btrfs_set_buffer_lockdep_class(u64 objectid,
> --
> 2.30.2
>

Ping? Could anyone take a look?

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-04-21 16:01 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17  1:20 [RFC] btrfs: Allow read-only mount with corrupted extent tree Dāvis Mosāns
2021-03-17  1:29 ` Dāvis Mosāns
2021-03-17 10:28   ` Qu Wenruo
2021-03-17 21:03     ` Dāvis Mosāns
2021-03-17 23:49       ` Qu Wenruo
2021-03-19 15:34         ` Dāvis Mosāns
2021-03-20  0:34           ` Qu Wenruo
2021-03-21 21:54             ` Dāvis Mosāns
2021-03-22  0:25               ` Qu Wenruo
2021-03-22  3:13                 ` Dāvis Mosāns
2021-03-22  4:48                   ` Zygo Blaxell
2021-03-22 16:10                     ` Dāvis Mosāns
2021-03-21 21:49 ` [PATCH] " Dāvis Mosāns
2021-04-21 16:00   ` Dāvis Mosāns

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.