linux-btrfs.vger.kernel.org archive mirror
* [PATCH] btrfs: speedup mount time with force readahead chunk tree
@ 2020-07-01  9:29 robbieko
  2020-07-06  6:13 ` Robbie Ko
  0 siblings, 1 reply; 10+ messages in thread
From: robbieko @ 2020-07-01  9:29 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Robbie Ko

From: Robbie Ko <robbieko@synology.com>

When mounting, we always need to read the whole chunk tree; when
there are too many chunk items, most of the mount time is spent in
btrfs_read_chunk_tree(), because we only read one leaf at a time.

Fix this by adding a new readahead mode, READA_FORWARD_FORCE, which
reads ahead all the leaves after the key in a node whenever a level 1
node is read.

Signed-off-by: Robbie Ko <robbieko@synology.com>
---
 fs/btrfs/ctree.c   | 7 +++++--
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/volumes.c | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3a7648bff42c..abb9108e2d7d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2194,7 +2194,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 			if (nr == 0)
 				break;
 			nr--;
-		} else if (path->reada == READA_FORWARD) {
+		} else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_FORCE) {
 			nr++;
 			if (nr >= nritems)
 				break;
@@ -2205,12 +2205,15 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search <= target && target - search <= 65536) ||
+		if ((path->reada == READA_FORWARD_FORCE) ||
+		    (search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(fs_info, search);
 			nread += blocksize;
 		}
 		nscan++;
+		if (path->reada == READA_FORWARD_FORCE)
+			continue;
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d404cce8ae40..808bcbdc9530 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -353,7 +353,7 @@ struct btrfs_node {
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-enum { READA_NONE, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..78fd65abff69 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = READA_FORWARD_FORCE;
 
 	/*
 	 * uuid_mutex is needed only if we are mounting a sprout FS
-- 
2.17.1



* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-01  9:29 [PATCH] btrfs: speedup mount time with force readahead chunk tree robbieko
@ 2020-07-06  6:13 ` Robbie Ko
  2020-07-06  6:16   ` Qu Wenruo
  0 siblings, 1 reply; 10+ messages in thread
From: Robbie Ko @ 2020-07-06  6:13 UTC (permalink / raw)
  To: linux-btrfs

Does anyone have any suggestions?

On 2020/7/1 5:29 PM, robbieko wrote:
> From: Robbie Ko <robbieko@synology.com>
>
> When mounting, we always need to read the whole chunk tree; when
> there are too many chunk items, most of the mount time is spent in
> btrfs_read_chunk_tree(), because we only read one leaf at a time.
>
> Fix this by adding a new readahead mode, READA_FORWARD_FORCE, which
> reads ahead all the leaves after the key in a node whenever a level 1
> node is read.
>
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> ---
<snip>


* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-06  6:13 ` Robbie Ko
@ 2020-07-06  6:16   ` Qu Wenruo
  2020-07-06  8:05     ` Robbie Ko
  0 siblings, 1 reply; 10+ messages in thread
From: Qu Wenruo @ 2020-07-06  6:16 UTC (permalink / raw)
  To: Robbie Ko, linux-btrfs





On 2020/7/6 2:13 PM, Robbie Ko wrote:
> Does anyone have any suggestions?

I believe David's suggestion of using regular readahead is already
good enough for the chunk tree.

Especially since the chunk tree is not really the main cause of slow
mounts.

Thanks,
Qu

<snip>




* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-06  6:16   ` Qu Wenruo
@ 2020-07-06  8:05     ` Robbie Ko
  2020-07-06  8:28       ` Qu Wenruo
  0 siblings, 1 reply; 10+ messages in thread
From: Robbie Ko @ 2020-07-06  8:05 UTC (permalink / raw)
  To: Qu Wenruo, linux-btrfs, dsterba


I've known about btrfs_read_block_groups for a long time; we can use
the BG_TREE feature to speed it up:

https://lwn.net/Articles/801990/

But reading the chunk tree also takes some time, and we can speed it
up with the readahead mechanism.

Why not just use the regular forward readahead?
- Because regular forward readahead only reads blocks whose logical
  address is within 64K of the target, and the logical address of the
  next leaf may be more than 64K away (see the simplified excerpt
  below).
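For reference, this is roughly the check in reada_for_search() that
imposes the 64K window, pulled out into a standalone helper purely
for illustration:

static bool reada_within_window(u64 search, u64 target)
{
	/*
	 * A sibling block is read ahead only when it sits within 64K
	 * of the target on disk, so chunk tree leaves scattered far
	 * apart (see the node dump below) are never read ahead.
	 */
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}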

I have a test environment as follows:

200TB btrfs volume: used 192TB

Data, single: total=192.00TiB, used=192.00TiB
System, DUP: total=40.00MiB, used=19.91MiB
Metadata, DUP: total=63.00GiB, used=46.46GiB
GlobalReserve, single: total=2.00GiB, used=0.00B

chunk tree level: 2
chunk tree:
   nodes: 4
   leaves: 1270
   total: 1274
chunk tree size: 19.9 MB
SYSTEM chunks count: 2 (8MB, 32MB)

Time spent in btrfs_read_chunk_tree:

before: 1.89s
patched: 0.27s

That is about an 86% reduction in time, i.e. roughly 7x faster.

Consecutive chunk tree leaves can end up in different SYSTEM chunks;
the more frequently chunks are allocated and removed, the larger the
gap between the leaves.

e.g. chunk tree node:
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85020014280704) block 79866020003840 (4874635010) gen 12963
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85185370521600) block 28999680 (1770) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85351800504320) block 79866020347904 (4874635031) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85518230487040) block 79866020102144 (4874635016) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85684660469760) block 79866020118528 (4874635017) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 85851090452480) block 79866020134912 (4874635018) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86017520435200) block 29261824 (1786) gen 12964
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86183950417920) block 79866020397056 (4874635034) gen 12965
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86350380400640) block 79866020151296 (4874635019) gen 12965
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86516810383360) block 79866020167680 (4874635020) gen 12965
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86683240366080) block 79866020184064 (4874635021) gen 12965
    key (FIRST_CHUNK_TREE CHUNK_ITEM 86849670348800) block 79866020200448 (4874635022) gen 12965
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87016100331520) block 29065216 (1774) gen 12966
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87182530314240) block 79866020315136 (4874635029) gen 12966
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87348960296960) block 79866020331520 (4874635030) gen 12966
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87515390279680) block 79866020413440 (4874635035) gen 12966
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87681820262400) block 79866020429824 (4874635036) gen 12966
    key (FIRST_CHUNK_TREE CHUNK_ITEM 87848250245120) block 29294592 (1788) gen 12968
    key (FIRST_CHUNK_TREE CHUNK_ITEM 88014680227840) block 79866020544512 (4874635043) gen 12968


With a 1PB btrfs volume we will have even more SYSTEM chunks, the
chunk tree leaves will be more fragmented, and the time difference
will be larger.


On 2020/7/6 2:16 PM, Qu Wenruo wrote:
>
> On 2020/7/6 2:13 PM, Robbie Ko wrote:
>> Does anyone have any suggestions?
> I believe David's suggestion of using regular readahead is already
> good enough for the chunk tree.
>
> Especially since the chunk tree is not really the main cause of slow
> mounts.
>
> Thanks,
> Qu
<snip>


* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-06  8:05     ` Robbie Ko
@ 2020-07-06  8:28       ` Qu Wenruo
  2020-07-06  8:37         ` Nikolay Borisov
  0 siblings, 1 reply; 10+ messages in thread
From: Qu Wenruo @ 2020-07-06  8:28 UTC (permalink / raw)
  To: Robbie Ko, linux-btrfs, dsterba





On 2020/7/6 4:05 PM, Robbie Ko wrote:
> 
> I've known about btrfs_read_block_groups for a long time; we can use
> the BG_TREE feature to speed it up:
>
> https://lwn.net/Articles/801990/
>
> But reading the chunk tree also takes some time, and we can speed it
> up with the readahead mechanism.
>
> Why not just use the regular forward readahead?
> - Because regular forward readahead only reads blocks whose logical
>   address is within 64K of the target, and the logical address of the
>   next leaf may be more than 64K away.

Great, please add these to the changelog to justify the need for
READA_FORWARD_FORCE.

But on the other hand, it looks like we could change READA_FORWARD
itself without introducing the _FORCE version, as the _FORCE behaviour
is what we really want anyway.

That logical bytenr limit does not look reasonable to me; it makes
READA_FORWARD nearly useless.

Although going in that direction, we would also need to verify that
all existing READA_FORWARD users still behave correctly.

Personally I prefer changing the existing READA_FORWARD and reviewing
all existing callers, e.g. something like the untested sketch below.
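A sketch of that direction, as the body of the reada_for_search()
loop (untested; the budget values are placeholders I made up, not
tuned numbers):

		search = btrfs_node_blockptr(node, nr);
		/* no 64K logical-distance window: always read ahead */
		readahead_tree_block(fs_info, search);
		nread += blocksize;
		nscan++;
		/* keep a budget so one huge node can't queue unbounded I/O */
		if (nread > SZ_1M || nscan > 128)
			break;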

> 
> I have a test environment as follows:
> 
> 200TB btrfs volume: used 192TB
> 
> Data, single: total=192.00TiB, used=192.00TiB
> System, DUP: total=40.00MiB, used=19.91MiB
> Metadata, DUP: total=63.00GiB, used=46.46GiB
> GlobalReserve, single: total=2.00GiB, used=0.00B
> 
> chunk tree level: 2
> chunk tree:
>   nodes: 4
>   leaves: 1270
>   total: 1274
> chunk tree size: 19.9 MB
> SYSTEM chunks count: 2 (8MB, 32MB)
>
> Time spent in btrfs_read_chunk_tree:
>
> before: 1.89s
> patched: 0.27s

That's great, although 2s is already fast enough for me :)

Thanks,
Qu

<snip>




* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-06  8:28       ` Qu Wenruo
@ 2020-07-06  8:37         ` Nikolay Borisov
  2020-07-06 10:07           ` Robbie Ko
  0 siblings, 1 reply; 10+ messages in thread
From: Nikolay Borisov @ 2020-07-06  8:37 UTC (permalink / raw)
  To: Qu Wenruo, Robbie Ko, linux-btrfs, dsterba



On 6.07.20 11:28, Qu Wenruo wrote:
> 
> 
> On 2020/7/6 4:05 PM, Robbie Ko wrote:
>>
>> I've known about btrfs_read_block_groups for a long time; we can use
>> the BG_TREE feature to speed it up:
>>
>> https://lwn.net/Articles/801990/
>>
>> But reading the chunk tree also takes some time, and we can speed it
>> up with the readahead mechanism.
>>
>> Why not just use the regular forward readahead?
>> - Because regular forward readahead only reads blocks whose logical
>>   address is within 64K of the target, and the logical address of the
>>   next leaf may be more than 64K away.
> 
> Great, please add these to the changelog to justify the need for
> READA_FORWARD_FORCE.

<nod> Performance patches should come with data backing their
performance improvement claims.

> 
> But on the other hand, it looks like we could change READA_FORWARD
> itself without introducing the _FORCE version, as the _FORCE behaviour
> is what we really want anyway.
>
> That logical bytenr limit does not look reasonable to me; it makes
> READA_FORWARD nearly useless.
>
> Although going in that direction, we would also need to verify that
> all existing READA_FORWARD users still behave correctly.
>
> Personally I prefer changing the existing READA_FORWARD and reviewing
> all existing callers.

I agree, we should ideally revise the existing READA_FORWARD and give it
better semantics if the current ones are needlessly rigid. Only if this
proves way too cumbersome, i.e. breaks assumptions that are already
made, should we introduce yet another mode, because another mode is
really just special casing, and we should avoid special cases as much
as possible.


<snip>


* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-06  8:37         ` Nikolay Borisov
@ 2020-07-06 10:07           ` Robbie Ko
  0 siblings, 0 replies; 10+ messages in thread
From: Robbie Ko @ 2020-07-06 10:07 UTC (permalink / raw)
  To: Nikolay Borisov, Qu Wenruo, linux-btrfs, dsterba


On 2020/7/6 4:37 PM, Nikolay Borisov wrote:
>
> On 6.07.20 11:28, Qu Wenruo wrote:
>>
>> On 2020/7/6 4:05 PM, Robbie Ko wrote:
>>> I've known about btrfs_read_block_groups for a long time; we can use
>>> the BG_TREE feature to speed it up:
>>>
>>> https://lwn.net/Articles/801990/
>>>
>>> But reading the chunk tree also takes some time, and we can speed it
>>> up with the readahead mechanism.
>>>
>>> Why not just use the regular forward readahead?
>>> - Because regular forward readahead only reads blocks whose logical
>>>   address is within 64K of the target, and the logical address of the
>>>   next leaf may be more than 64K away.
>> Great, please add these to the changelog to justify the need for
>> READA_FORWARD_FORCE.
> <nod> Performance patches should come with data backing their
> performance improvement claims.

OK, I will add these to the changelog.


>
>> But on the other hand, it looks like we could change READA_FORWARD
>> itself without introducing the _FORCE version, as the _FORCE behaviour
>> is what we really want anyway.
>>
>> That logical bytenr limit does not look reasonable to me; it makes
>> READA_FORWARD nearly useless.
>>
>> Although going in that direction, we would also need to verify that
>> all existing READA_FORWARD users still behave correctly.
>>
>> Personally I prefer changing the existing READA_FORWARD and reviewing
>> all existing callers.
> I agree, we should ideally revise the existing READA_FORWARD and give it
> better semantics if the current ones are needlessly rigid. Only if this
> proves way too cumbersome, i.e. breaks assumptions that are already
> made, should we introduce yet another mode, because another mode is
> really just special casing, and we should avoid special cases as much
> as possible.


OK, I'll modify the existing READA_FORWARD mechanism to make it better
and send a v2 patch.


>
>
> <snip>


* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-01 10:58 ` Qu Wenruo
@ 2020-07-01 16:05   ` David Sterba
  0 siblings, 0 replies; 10+ messages in thread
From: David Sterba @ 2020-07-01 16:05 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: robbieko, fstests, linux-btrfs

On Wed, Jul 01, 2020 at 06:58:55PM +0800, Qu Wenruo wrote:
> 
> 
> On 2020/7/1 5:24 PM, robbieko wrote:
> > From: Robbie Ko <robbieko@synology.com>
> > 
> > When mounting, we always need to read the whole chunk tree; when
> > there are too many chunk items, most of the mount time is spent in
> > btrfs_read_chunk_tree(), because we only read one leaf at a time.
> 
> Well, in most cases it would be btrfs_read_block_groups(), unless all
> data chunks are very compact with just a few large data extents.

I've checked the chunk tree on some filesystems:

- 1T, 40% used, chunk tree size 80K, 1 node, the rest are leaves
- 1T, 93% used, chunk tree size 112K, 1 node, the rest are leaves

so yeah, readahead of the chunk tree is not the part that takes long.
For many-terabyte filesystems it would still be in the range of
megabytes, and the chunk tree is not scattered.

We could do readahead of the block group items; it could speed up some
things and may be worth trying. We have the async readahead API, i.e.
start readahead on a given key range and forget about it. Either the
blocks will be in cache by the time we read them, or the proper read
will get there first. Roughly like the sketch below.
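A rough sketch of what I mean (untested; it assumes the
btrfs_reada_add()/btrfs_reada_wait() interface from fs/btrfs/reada.c,
and since block group items are interleaved with extent items the key
range effectively spans the whole extent tree):

static void *reada_block_groups_start(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key start = {
		.objectid = 0,
		.type = BTRFS_BLOCK_GROUP_ITEM_KEY,
		.offset = 0,
	};
	struct btrfs_key end = {
		.objectid = (u64)-1,
		.type = BTRFS_BLOCK_GROUP_ITEM_KEY,
		.offset = (u64)-1,
	};

	/* start async readahead; the caller can btrfs_reada_wait() later */
	return btrfs_reada_add(fs_info->extent_root, &start, &end);
}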

> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -353,7 +353,7 @@ struct btrfs_node {
> >   * The slots array records the index of the item or block pointer
> >   * used while walking the tree.
> >   */
> > -enum { READA_NONE, READA_BACK, READA_FORWARD };
> > +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
> >  struct btrfs_path {
> >  	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
> >  	int slots[BTRFS_MAX_LEVEL];
> > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> > index 0d6e785bcb98..78fd65abff69 100644
> > --- a/fs/btrfs/volumes.c
> > +++ b/fs/btrfs/volumes.c
> > @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
> >  	path = btrfs_alloc_path();
> >  	if (!path)
> >  		return -ENOMEM;
> > +	path->reada = READA_FORWARD_FORCE;
> 
> Why not just use regular forward readahead?
> 
> Mind sharing the reason here? Just to force readahead for all tree leaves?

Maybe the current readahead is a good idea to do here anyway: we know
we'll need to read the whole chunk tree, so it's not wasteful. E.g.
the sketch below.
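I.e. simply set the plain mode on the search path in
btrfs_read_chunk_tree(), something like:

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* we iterate the whole chunk tree, so readahead is never wasted */
	path->reada = READA_FORWARD;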


* Re: [PATCH] btrfs: speedup mount time with force readahead chunk tree
  2020-07-01  9:24 robbieko
@ 2020-07-01 10:58 ` Qu Wenruo
  2020-07-01 16:05   ` David Sterba
  0 siblings, 1 reply; 10+ messages in thread
From: Qu Wenruo @ 2020-07-01 10:58 UTC (permalink / raw)
  To: robbieko, fstests; +Cc: linux-btrfs





On 2020/7/1 5:24 PM, robbieko wrote:
> From: Robbie Ko <robbieko@synology.com>
> 
> When mounting, we always need to read the whole chunk tree; when
> there are too many chunk items, most of the mount time is spent in
> btrfs_read_chunk_tree(), because we only read one leaf at a time.

Well, in most cases it would be btrfs_read_block_groups(), unless all
data chunks are very compact with just a few large data extents.

> 
> Fix this by adding a new readahead mode, READA_FORWARD_FORCE, which
> reads ahead all the leaves after the key in a node whenever a level 1
> node is read.
> 
> Signed-off-by: Robbie Ko <robbieko@synology.com>
> ---
>  fs/btrfs/ctree.c   | 7 +++++--
>  fs/btrfs/ctree.h   | 2 +-
>  fs/btrfs/volumes.c | 1 +
>  3 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 3a7648bff42c..abb9108e2d7d 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -2194,7 +2194,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
>  			if (nr == 0)
>  				break;
>  			nr--;
> -		} else if (path->reada == READA_FORWARD) {
> +		} else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_FORCE) {
>  			nr++;
>  			if (nr >= nritems)
>  				break;
> @@ -2205,12 +2205,15 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
>  				break;
>  		}
>  		search = btrfs_node_blockptr(node, nr);
> -		if ((search <= target && target - search <= 65536) ||
> +		if ((path->reada == READA_FORWARD_FORCE) ||
> +		    (search <= target && target - search <= 65536) ||
>  		    (search > target && search - target <= 65536)) {
>  			readahead_tree_block(fs_info, search);
>  			nread += blocksize;
>  		}
>  		nscan++;
> +		if (path->reada == READA_FORWARD_FORCE)
> +			continue;
>  		if ((nread > 65536 || nscan > 32))
>  			break;
>  	}
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index d404cce8ae40..808bcbdc9530 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -353,7 +353,7 @@ struct btrfs_node {
>   * The slots array records the index of the item or block pointer
>   * used while walking the tree.
>   */
> -enum { READA_NONE, READA_BACK, READA_FORWARD };
> +enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
>  struct btrfs_path {
>  	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
>  	int slots[BTRFS_MAX_LEVEL];
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0d6e785bcb98..78fd65abff69 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
>  	path = btrfs_alloc_path();
>  	if (!path)
>  		return -ENOMEM;
> +	path->reada = READA_FORWARD_FORCE;

Why not just use regular forward readahead?

Mind sharing the reason here? Just to force readahead for all tree leaves?

Thanks,
Qu

>  
>  	/*
>  	 * uuid_mutex is needed only if we are mounting a sprout FS
> 




* [PATCH] btrfs: speedup mount time with force readahead chunk tree
@ 2020-07-01  9:24 robbieko
  2020-07-01 10:58 ` Qu Wenruo
  0 siblings, 1 reply; 10+ messages in thread
From: robbieko @ 2020-07-01  9:24 UTC (permalink / raw)
  To: fstests; +Cc: linux-btrfs, Robbie Ko

From: Robbie Ko <robbieko@synology.com>

When mounting, we always need to read the whole chunk tree; when
there are too many chunk items, most of the mount time is spent in
btrfs_read_chunk_tree(), because we only read one leaf at a time.

Fix this by adding a new readahead mode, READA_FORWARD_FORCE, which
reads ahead all the leaves after the key in a node whenever a level 1
node is read.

Signed-off-by: Robbie Ko <robbieko@synology.com>
---
 fs/btrfs/ctree.c   | 7 +++++--
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/volumes.c | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3a7648bff42c..abb9108e2d7d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2194,7 +2194,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 			if (nr == 0)
 				break;
 			nr--;
-		} else if (path->reada == READA_FORWARD) {
+		} else if (path->reada == READA_FORWARD || path->reada == READA_FORWARD_FORCE) {
 			nr++;
 			if (nr >= nritems)
 				break;
@@ -2205,12 +2205,15 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search <= target && target - search <= 65536) ||
+		if ((path->reada == READA_FORWARD_FORCE) ||
+		    (search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(fs_info, search);
 			nread += blocksize;
 		}
 		nscan++;
+		if (path->reada == READA_FORWARD_FORCE)
+			continue;
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d404cce8ae40..808bcbdc9530 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -353,7 +353,7 @@ struct btrfs_node {
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-enum { READA_NONE, READA_BACK, READA_FORWARD };
+enum { READA_NONE, READA_BACK, READA_FORWARD, READA_FORWARD_FORCE };
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0d6e785bcb98..78fd65abff69 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7043,6 +7043,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = READA_FORWARD_FORCE;
 
 	/*
 	 * uuid_mutex is needed only if we are mounting a sprout FS
-- 
2.17.1




Thread overview: 10+ messages
2020-07-01  9:29 [PATCH] btrfs: speedup mount time with force readahead chunk tree robbieko
2020-07-06  6:13 ` Robbie Ko
2020-07-06  6:16   ` Qu Wenruo
2020-07-06  8:05     ` Robbie Ko
2020-07-06  8:28       ` Qu Wenruo
2020-07-06  8:37         ` Nikolay Borisov
2020-07-06 10:07           ` Robbie Ko
  -- strict thread matches above, loose matches on Subject: below --
2020-07-01  9:24 robbieko
2020-07-01 10:58 ` Qu Wenruo
2020-07-01 16:05   ` David Sterba
