From: Filipe Manana <fdmanana@kernel.org>
To: Nikolay Borisov <nborisov@suse.com>
Cc: "linux-btrfs@vger.kernel.org" <linux-btrfs@vger.kernel.org>
Subject: Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
Date: Fri, 3 Nov 2017 10:45:10 +0000 [thread overview]
Message-ID: <CAL3q7H7=4RupqV-GAZzm_X_7OCdw=05DkY3Ws_aYCC6vh7dA8A@mail.gmail.com> (raw)
In-Reply-To: <CAL3q7H6tPZzvyrQf0gtLKYMFjYRAQzRsATJtnd==1=4c4K1maQ@mail.gmail.com>
On Fri, Nov 3, 2017 at 10:29 AM, Filipe Manana <fdmanana@kernel.org> wrote:
> On Fri, Nov 3, 2017 at 9:30 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>>
>>
>> On 25.10.2017 17:59, fdmanana@kernel.org wrote:
>>> From: Filipe Manana <fdmanana@suse.com>
>>>
>>> This implements support the zero range operation of fallocate. For now
>>> at least it's as simple as possible while reusing most of the existing
>>> fallocate and hole punching infrastructure.
>>>
>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>> ---
>>>
>>> V2: Removed double inode unlock on error path from failure to lock range.
>>>
>>> fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>> 1 file changed, 290 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>>> index aafcc785f840..e0d15c0d1641 100644
>>> --- a/fs/btrfs/file.c
>>> +++ b/fs/btrfs/file.c
>>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>> return ret;
>>> }
>>>
>>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>>> + const u64 lockstart,
>>> + const u64 lockend,
>>> + struct extent_state **cached_state)
>>> +{
>>> + while (1) {
>>> + struct btrfs_ordered_extent *ordered;
>>> + int ret;
>>> +
>>> + truncate_pagecache_range(inode, lockstart, lockend);
>>> +
>>> + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> + cached_state);
>>> + ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> +
>>> + /*
>>> + * We need to make sure we have no ordered extents in this range
>>> + * and nobody raced in and read a page in this range, if we did
>>> + * we need to try again.
>>> + */
>>> + if ((!ordered ||
>>> + (ordered->file_offset + ordered->len <= lockstart ||
>>> + ordered->file_offset > lockend)) &&
>>> + !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> + if (ordered)
>>> + btrfs_put_ordered_extent(ordered);
>>> + break;
>>> + }
>>> + if (ordered)
>>> + btrfs_put_ordered_extent(ordered);
>>> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> + lockend, cached_state, GFP_NOFS);
>>> + ret = btrfs_wait_ordered_range(inode, lockstart,
>>> + lockend - lockstart + 1);
>>> + if (ret)
>>> + return ret;
>>> + }
>>> + return 0;
>>> +}
>>> +
>>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>>> + bool lock_inode)
>>> {
>>> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>> struct btrfs_root *root = BTRFS_I(inode)->root;
>>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> if (ret)
>>> return ret;
>>>
>>> - inode_lock(inode);
>>> + if (lock_inode)
>>> + inode_lock(inode);
>>> ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>> ret = find_first_non_hole(inode, &offset, &len);
>>> if (ret < 0)
>>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> truncated_block = true;
>>> ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> if (ret) {
>>> - inode_unlock(inode);
>>> + if (lock_inode)
>>> + inode_unlock(inode);
>>> return ret;
>>> }
>>> }
>>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> goto out_only_mutex;
>>> }
>>>
>>> - while (1) {
>>> - struct btrfs_ordered_extent *ordered;
>>> -
>>> - truncate_pagecache_range(inode, lockstart, lockend);
>>> -
>>> - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> - &cached_state);
>>> - ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> -
>>> - /*
>>> - * We need to make sure we have no ordered extents in this range
>>> - * and nobody raced in and read a page in this range, if we did
>>> - * we need to try again.
>>> - */
>>> - if ((!ordered ||
>>> - (ordered->file_offset + ordered->len <= lockstart ||
>>> - ordered->file_offset > lockend)) &&
>>> - !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> - if (ordered)
>>> - btrfs_put_ordered_extent(ordered);
>>> - break;
>>> - }
>>> - if (ordered)
>>> - btrfs_put_ordered_extent(ordered);
>>> - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> - lockend, &cached_state, GFP_NOFS);
>>> - ret = btrfs_wait_ordered_range(inode, lockstart,
>>> - lockend - lockstart + 1);
>>> - if (ret) {
>>> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> + &cached_state);
>>> + if (ret) {
>>> + if (lock_inode)
>>> inode_unlock(inode);
>>> - return ret;
>>> - }
>>> + return ret;
>>> }
>>>
>>> path = btrfs_alloc_path();
>>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> ret = btrfs_end_transaction(trans);
>>> }
>>> }
>>> - inode_unlock(inode);
>>> + if (lock_inode)
>>> + inode_unlock(inode);
>>> if (ret && !err)
>>> err = ret;
>>> return err;
>>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>> return 0;
>>> }
>>>
>>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>>> + const loff_t offset,
>>> + const loff_t len,
>>> + const int mode)
>>> +{
>>> + struct btrfs_root *root = BTRFS_I(inode)->root;
>>> + struct btrfs_trans_handle *trans;
>>> + const u64 end = offset + len;
>>> + int ret;
>>> +
>>> + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>>> + return 0;
>>> +
>>> + i_size_write(inode, end);
>>> + btrfs_ordered_update_i_size(inode, end, NULL);
>>> + trans = btrfs_start_transaction(root, 1);
>>> + if (IS_ERR(trans)) {
>>> + ret = PTR_ERR(trans);
>>> + } else {
>>> + int err;
>>> +
>>> + ret = btrfs_update_inode(trans, root, inode);
>>> + err = btrfs_end_transaction(trans);
>>> + ret = ret ? ret : err;
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>>> + u64 offset)
>>> +{
>>> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> + struct extent_map *em = NULL;
>>> + int ret = 0;
>>> +
>>> + offset = round_down(offset, sectorsize);
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>>> + if (IS_ERR(em))
>>> + return PTR_ERR(em);
>>> +
>>> + if (em->block_start == EXTENT_MAP_HOLE)
>>> + ret = 1;
>>> +
>>> + free_extent_map(em);
>>> + return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range(struct inode *inode,
>>> + loff_t offset,
>>> + loff_t len,
>>> + const int mode)
>>> +{
>>> + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>>> + struct extent_map *em;
>>> + struct extent_changeset *data_reserved = NULL;
>>> + int ret;
>>> + u64 alloc_hint = 0;
>>> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> + u64 alloc_start = round_down(offset, sectorsize);
>>> + u64 alloc_end = round_up(offset + len, sectorsize);
>>> + u64 bytes_to_reserve = 0;
>>> + bool space_reserved = false;
>>> + bool punch_hole = false;
>>> +
>>> + inode_dio_wait(inode);
>>> +
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> + alloc_start, alloc_end - alloc_start, 0);
>>> + if (IS_ERR(em)) {
>>> + ret = PTR_ERR(em);
>>> + goto out;
>>> + }
>>> +
>>> + /*
>>> + * Avoid hole punching and extent allocation for some cases. More cases
>>> + * could be considered, but these are unlikely common and we keep things
>>> + * as simple as possible for now. Also, intentionally, if the target
>>> + * range contains one or more prealloc extents together with regular
>>> + * extents and holes, we drop all the existing extents and allocate a
>>> + * new prealloc extent, so that we get a larger contiguous disk extent.
>>> + */
>>> + if (em->start <= alloc_start &&
>>> + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> + const u64 em_end = em->start + em->len;
>>> +
>>> + if (em_end >= offset + len) {
>>> + /*
>>> + * The whole range is already a prealloc extent,
>>> + * do nothing except updating the inode's i_size if
>>> + * needed.
>>> + */
>>> + free_extent_map(em);
>>> + ret = btrfs_zero_range_update_isize(inode, offset,
>>> + len, mode);
>>> + goto out;
>>> + }
>>> + /*
>>> + * Part of the range is already a prealloc extent, so operate
>>> + * only on the remaining part of the range.
>>> + */
>>> + alloc_start = em_end;
>>> + ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>>> + len = offset + len - alloc_start;
>>> + offset = alloc_start;
>>> + alloc_hint = em->block_start + em->len;
>>> + }
>>> + free_extent_map(em);
>>> +
>>> + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>>> + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> + alloc_start, sectorsize, 0);
>>> + if (IS_ERR(em)) {
>>> + ret = PTR_ERR(em);
>>> + goto out;
>>> + }
>>> +
>>> + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> + free_extent_map(em);
>>> + ret = btrfs_zero_range_update_isize(inode, offset,
>>> + len, mode);
>>> + goto out;
>>> + }
>>> + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>>> + punch_hole = true;
>>> + free_extent_map(em);
>>> + if (punch_hole)
>>> + goto punch_hole;
>>
>> This here is correct for a very non-obvious reason. If punch_hole is
>> true this means we are only ever going to execute the partial truncate
>> code in btrfs_punch_hole and not punch a hole at all, this is very
>> convoluted way of invoking truncation!
>
> Well, it might be non-obvious for people not experienced with fs
> development, but I don't think it's that terrible as you picture it.
> Every fs developer knows that punching into a range smaller then
> sector size means zeroing part of a block.
> It's not this sort of things that makes people unable to contribute or
> understand things.
>
>>
>> Instead, I propose something similar to the attached diff which just
>> calls btrfs_truncate_block directly. This allows to remove one of the
>> labels and simplifies the code flow. I if this check triggers:
>> (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
>> guaranteed that we are within the inode boundaries so there is no need
>> to update the inode size,
>
> There isn't???
> There is of course, if the file size is not sector size aligned and
> the target range affects the last block and goes beyond the current
> i_size.
Plus your diff introduces another bug when alloc_start == alloc_end,
in which case
we shouldn't do anything other than the previous truncation of boundary pages.
As you left it, it results in locking a range with an end smaller than
its start, plus
some other unpredictable behaviour.
>
> hence I've omitted it, though I'm not 100%
>> sure, perhaps we want to update the inode's ctime ?
>>
>> This passes generic/008 and generic/009
>
> Well keep in mind those tests don't cover all possible scenarios.
>
> I've integrated those cleanups and will send a v3 later after some
> stress testing.
>
>
>>
>>> + alloc_start = round_down(offset, sectorsize);
>>> + alloc_end = alloc_start + sectorsize;
>>> + goto reserve_space;
>>> + }
>>> +
>>> + alloc_start = round_up(offset, sectorsize);
>>> + alloc_end = round_down(offset + len, sectorsize);
>>> +
>>> + /*
>>> + * For unaligned ranges, check the pages at the boundaries, they might
>>> + * map to an extent, in which case we need to partially zero them, or
>>> + * they might map to a hole, in which case we need our allocation range
>>> + * to cover them.
>>> + */
>>> + if (!IS_ALIGNED(offset, sectorsize)) {
>>> + ret = btrfs_zero_range_check_range_boundary(inode, offset);
>>> + if (ret < 0)
>>> + goto out;
>>> + if (ret) {
>>> + alloc_start = round_down(offset, sectorsize);
>>> + ret = 0;
>>> + } else {
>>> + ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> + }
>>> +
>>> + if (!IS_ALIGNED(offset + len, sectorsize)) {
>>> + ret = btrfs_zero_range_check_range_boundary(inode,
>>> + offset + len);
>>> + if (ret < 0)
>>> + goto out;
>>> + if (ret) {
>>> + alloc_end = round_up(offset + len, sectorsize);
>>> + ret = 0;
>>> + } else {
>>> + ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> + }
>>> +
>>> +reserve_space:
>>> + if (alloc_start < alloc_end)
>>> + bytes_to_reserve += alloc_end - alloc_start;
>>> +
>>> + if (!punch_hole && bytes_to_reserve > 0) {
>>> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> + bytes_to_reserve);
>>> + if (ret < 0)
>>> + goto out;
>>> + space_reserved = true;
>>> + ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>>> + alloc_start, bytes_to_reserve);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> +
>>> +punch_hole:
>>> + if (punch_hole) {
>>> + ret = btrfs_punch_hole(inode, offset, len, false);
>>> + if (ret)
>>> + goto out;
>>> + ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>>> + } else {
>>> + struct extent_state *cached_state = NULL;
>>> + const u64 lockstart = alloc_start;
>>> + const u64 lockend = alloc_end - 1;
>>> +
>>> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> + &cached_state);
>>> + if (ret)
>>> + goto out;
>>> + ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>>> + alloc_end - alloc_start,
>>> + i_blocksize(inode),
>>> + offset + len, &alloc_hint);
>>> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> + lockend, &cached_state, GFP_KERNEL);
>>> + /* btrfs_prealloc_file_range releases reserved space on error */
>>> + if (ret)
>>> + space_reserved = false;
>>> + }
>>> + out:
>>> + if (ret && space_reserved)
>>> + btrfs_free_reserved_data_space(inode, data_reserved,
>>> + alloc_start, bytes_to_reserve);
>>> + extent_changeset_free(data_reserved);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> static long btrfs_fallocate(struct file *file, int mode,
>>> loff_t offset, loff_t len)
>>> {
>>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> cur_offset = alloc_start;
>>>
>>> /* Make sure we aren't being give some crap mode */
>>> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>>> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>>> + FALLOC_FL_ZERO_RANGE))
>>> return -EOPNOTSUPP;
>>>
>>> if (mode & FALLOC_FL_PUNCH_HOLE)
>>> - return btrfs_punch_hole(inode, offset, len);
>>> + return btrfs_punch_hole(inode, offset, len, true);
>>>
>>> /*
>>> * Only trigger disk allocation, don't trigger qgroup reserve
>>> *
>>> * For qgroup space, it will be checked later.
>>> */
>>> - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> - alloc_end - alloc_start);
>>> - if (ret < 0)
>>> - return ret;
>>> + if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>>> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> + alloc_end - alloc_start);
>>> + if (ret < 0)
>>> + return ret;
>>> + }
>>>
>>> inode_lock(inode);
>>>
>>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> if (ret)
>>> goto out;
>>>
>>> + if (mode & FALLOC_FL_ZERO_RANGE) {
>>> + ret = btrfs_zero_range(inode, offset, len, mode);
>>> + inode_unlock(inode);
>>> + return ret;
>>> + }
>>> +
>>> locked_end = alloc_end - 1;
>>> while (1) {
>>> struct btrfs_ordered_extent *ordered;
>>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> out:
>>> inode_unlock(inode);
>>> /* Let go of our reservation. */
>>> - if (ret != 0)
>>> + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>> btrfs_free_reserved_data_space(inode, data_reserved,
>>> alloc_start, alloc_end - cur_offset);
>>> extent_changeset_free(data_reserved);
>>>
next prev parent reply other threads:[~2017-11-03 10:45 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
2017-10-25 14:59 ` [PATCH v2] " fdmanana
2017-10-30 14:57 ` David Sterba
2017-11-01 10:34 ` Nikolay Borisov
2017-11-01 10:59 ` Filipe Manana
2017-11-02 8:33 ` Nikolay Borisov
2017-11-03 9:30 ` Nikolay Borisov
2017-11-03 10:29 ` Filipe Manana
2017-11-03 10:45 ` Filipe Manana [this message]
2017-11-03 17:20 ` [PATCH v3] " fdmanana
2017-11-03 20:59 ` Edmund Nadolski
2017-11-04 4:07 ` [PATCH v4] " fdmanana
2017-11-10 16:43 ` Nikolay Borisov
2018-01-05 16:49 ` David Sterba
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAL3q7H7=4RupqV-GAZzm_X_7OCdw=05DkY3Ws_aYCC6vh7dA8A@mail.gmail.com' \
--to=fdmanana@kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=nborisov@suse.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).