Re: [PATCH v4 04/18] btrfs: make attach_extent_buffer_page() to handle subpage case

From: Qu Wenruo <quwenruo.btrfs@gmx.com>
To: Josef Bacik <josef@toxicpanda.com>, Qu Wenruo <wqu@suse.com>,
	linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v4 04/18] btrfs: make attach_extent_buffer_page() to handle subpage case
Date: Wed, 20 Jan 2021 08:27:17 +0800	[thread overview]
Message-ID: <a58c8366-f3b5-a152-92be-c7252891a7c6@gmx.com> (raw)
In-Reply-To: <5a6223fc-9937-3bd6-ecd0-d6c5939f59a7@toxicpanda.com>

On 2021/1/20 上午5:54, Josef Bacik wrote:
> On 1/16/21 2:15 AM, Qu Wenruo wrote:
>> For subpage case, we need to allocate new memory for each metadata page.
>>
>> So we need to:
>> - Allow attach_extent_buffer_page() to return int
>>    To indicate allocation failure
>>
>> - Prealloc btrfs_subpage structure for alloc_extent_buffer()
>>    We don't want to allocate memory with a spinlock held, so
>>    do the preallocation before we acquire mapping->private_lock.
>>
>> - Handle subpage and regular case differently in
>>    attach_extent_buffer_page()
>>    For regular case, just do the usual thing.
>>    For subpage case, allocate new memory or use the preallocated memory.
>>
>> For future subpage metadata, we will make more use of the radix tree to
>> grab extent buffers.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/extent_io.c | 75 ++++++++++++++++++++++++++++++++++++++------
>>   fs/btrfs/subpage.h   | 17 ++++++++++
>>   2 files changed, 82 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index a816ba4a8537..320731487ac0 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -24,6 +24,7 @@
>>   #include "rcu-string.h"
>>   #include "backref.h"
>>   #include "disk-io.h"
>> +#include "subpage.h"
>>   static struct kmem_cache *extent_state_cache;
>>   static struct kmem_cache *extent_buffer_cache;
>> @@ -3140,9 +3141,13 @@ static int submit_extent_page(unsigned int opf,
>>       return ret;
>>   }
>> -static void attach_extent_buffer_page(struct extent_buffer *eb,
>> -                      struct page *page)
>> +static int attach_extent_buffer_page(struct extent_buffer *eb,
>> +                      struct page *page,
>> +                      struct btrfs_subpage *prealloc)
>>   {
>> +    struct btrfs_fs_info *fs_info = eb->fs_info;
>> +    int ret;
> 
> int ret = 0;
> 
>> +
>>       /*
>>        * If the page is mapped to btree inode, we should hold the private
>>        * lock to prevent race.
>> @@ -3152,10 +3157,32 @@ static void attach_extent_buffer_page(struct 
>> extent_buffer *eb,
>>       if (page->mapping)
>>           lockdep_assert_held(&page->mapping->private_lock);
>> -    if (!PagePrivate(page))
>> -        attach_page_private(page, eb);
>> -    else
>> -        WARN_ON(page->private != (unsigned long)eb);
>> +    if (fs_info->sectorsize == PAGE_SIZE) {
>> +        if (!PagePrivate(page))
>> +            attach_page_private(page, eb);
>> +        else
>> +            WARN_ON(page->private != (unsigned long)eb);
>> +        return 0;
>> +    }
>> +
>> +    /* Already mapped, just free prealloc */
>> +    if (PagePrivate(page)) {
>> +        kfree(prealloc);
>> +        return 0;
>> +    }
>> +
>> +    if (prealloc) {
>> +        /* Has preallocated memory for subpage */
>> +        spin_lock_init(&prealloc->lock);
>> +        attach_page_private(page, prealloc);
>> +    } else {
>> +        /* Do new allocation to attach subpage */
>> +        ret = btrfs_attach_subpage(fs_info, page);
>> +        if (ret < 0)
>> +            return ret;
> 
> Delete the above 2 lines.
> 
>> +    }
>> +
>> +    return 0;
> 
> return ret;
> 
>>   }
>>   void set_page_extent_mapped(struct page *page)
>> @@ -5062,21 +5089,29 @@ struct extent_buffer 
>> *btrfs_clone_extent_buffer(const struct extent_buffer *src)
>>       if (new == NULL)
>>           return NULL;
>> +    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>> +    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>> +
> 
> Why are you doing this here?  It seems unrelated?  Looking at the code 
> it appears there's a reason for this later, but I had to go look to make 
> sure I wasn't crazy, so at the very least it needs to be done in a more 
> relevant patch.

This is to handle the case where we allocated a page but failed to allocate
the subpage structure.

In that case, btrfs_release_extent_buffer() will take a different route to
free the eb.

Without the UNMAPPED bit, it just goes wrong, not knowing that it's an unmapped eb.

This change is mostly due to the extra failure pattern introduced by the 
subpage memory allocation.

> 
>>       for (i = 0; i < num_pages; i++) {
>> +        int ret;
>> +
>>           p = alloc_page(GFP_NOFS);
>>           if (!p) {
>>               btrfs_release_extent_buffer(new);
>>               return NULL;
>>           }
>> -        attach_extent_buffer_page(new, p);
>> +        ret = attach_extent_buffer_page(new, p, NULL);
>> +        if (ret < 0) {
>> +            put_page(p);
>> +            btrfs_release_extent_buffer(new);
>> +            return NULL;
>> +        }
>>           WARN_ON(PageDirty(p));
>>           SetPageUptodate(p);
>>           new->pages[i] = p;
>>           copy_page(page_address(p), page_address(src->pages[i]));
>>       }
>> -    set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
>> -    set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
>>       return new;
>>   }
>> @@ -5308,12 +5343,28 @@ struct extent_buffer 
>> *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>       num_pages = num_extent_pages(eb);
>>       for (i = 0; i < num_pages; i++, index++) {
>> +        struct btrfs_subpage *prealloc = NULL;
>> +
>>           p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
>>           if (!p) {
>>               exists = ERR_PTR(-ENOMEM);
>>               goto free_eb;
>>           }
>> +        /*
>> +         * Preallocate page->private for subpage case, so that
>> +         * we won't allocate memory with private_lock hold.
>> +         * The memory will be freed by attach_extent_buffer_page() or
>> +         * freed manually if exit earlier.
>> +         */
>> +        ret = btrfs_alloc_subpage(fs_info, &prealloc);
>> +        if (ret < 0) {
>> +            unlock_page(p);
>> +            put_page(p);
>> +            exists = ERR_PTR(ret);
>> +            goto free_eb;
>> +        }
>> +
> 
> I realize that for subpage sectorsize we'll only have 1 page, but I'd 
> still rather see this outside of the for loop, just for clarity sake.

This is a trade-off.
Either we do everything separately, sharing a minimal amount of code (and
needing an extra for loop for future 16K pages), or we use the same loop and
sacrifice a little readability.

Here I'd say sharing more code is not that big a deal.

> 
>>           spin_lock(&mapping->private_lock);
>>           exists = grab_extent_buffer(p);
>>           if (exists) {
>> @@ -5321,10 +5372,14 @@ struct extent_buffer 
>> *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>               unlock_page(p);
>>               put_page(p);
>>               mark_extent_buffer_accessed(exists, p);
>> +            kfree(prealloc);
>>               goto free_eb;
>>           }
>> -        attach_extent_buffer_page(eb, p);
>> +        /* Should not fail, as we have preallocated the memory */
>> +        ret = attach_extent_buffer_page(eb, p, prealloc);
>> +        ASSERT(!ret);
>>           spin_unlock(&mapping->private_lock);
>> +
>>           WARN_ON(PageDirty(p));
>>           eb->pages[i] = p;
>>           if (!PageUptodate(p))
>> diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
>> index 96f3b226913e..f701256dd1e2 100644
>> --- a/fs/btrfs/subpage.h
>> +++ b/fs/btrfs/subpage.h
>> @@ -23,8 +23,25 @@
>>   struct btrfs_subpage {
>>       /* Common members for both data and metadata pages */
>>       spinlock_t lock;
>> +    union {
>> +        /* Structures only used by metadata */
>> +        /* Structures only used by data */
>> +    };
>>   };
>> +/* For rare cases where we need to pre-allocate a btrfs_subpage 
>> structure */
>> +static inline int btrfs_alloc_subpage(struct btrfs_fs_info *fs_info,
>> +                      struct btrfs_subpage **ret)
>> +{
>> +    if (fs_info->sectorsize == PAGE_SIZE)
>> +        return 0;
>> +
>> +    *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
>> +    if (!*ret)
>> +        return -ENOMEM;
>> +    return 0;
>> +}
> 
> We're allocating these for every metadata page, that deserves a 
> dedicated kmem_cache.  Thanks,

That makes sense, especially since it will be used for both data and metadata in the subpage case.

Thanks,
Qu

> 
> Josef