From: Filipe Manana <fdmanana@kernel.org>
To: Nikolay Borisov <nborisov@suse.com>
Cc: "linux-btrfs@vger.kernel.org" <linux-btrfs@vger.kernel.org>
Subject: Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
Date: Fri, 3 Nov 2017 10:45:10 +0000 [thread overview]
Message-ID: <CAL3q7H7=4RupqV-GAZzm_X_7OCdw=05DkY3Ws_aYCC6vh7dA8A@mail.gmail.com> (raw)
In-Reply-To: <CAL3q7H6tPZzvyrQf0gtLKYMFjYRAQzRsATJtnd==1=4c4K1maQ@mail.gmail.com>
On Fri, Nov 3, 2017 at 10:29 AM, Filipe Manana <fdmanana@kernel.org> wrote:
> On Fri, Nov 3, 2017 at 9:30 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>>
>>
>> On 25.10.2017 17:59, fdmanana@kernel.org wrote:
>>> From: Filipe Manana <fdmanana@suse.com>
>>>
>>> This implements support the zero range operation of fallocate. For now
>>> at least it's as simple as possible while reusing most of the existing
>>> fallocate and hole punching infrastructure.
>>>
>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>> ---
>>>
>>> V2: Removed double inode unlock on error path from failure to lock range.
>>>
>>> fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>> 1 file changed, 290 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>>> index aafcc785f840..e0d15c0d1641 100644
>>> --- a/fs/btrfs/file.c
>>> +++ b/fs/btrfs/file.c
>>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>> return ret;
>>> }
>>>
>>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>>> + const u64 lockstart,
>>> + const u64 lockend,
>>> + struct extent_state **cached_state)
>>> +{
>>> + while (1) {
>>> + struct btrfs_ordered_extent *ordered;
>>> + int ret;
>>> +
>>> + truncate_pagecache_range(inode, lockstart, lockend);
>>> +
>>> + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> + cached_state);
>>> + ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> +
>>> + /*
>>> + * We need to make sure we have no ordered extents in this range
>>> + * and nobody raced in and read a page in this range, if we did
>>> + * we need to try again.
>>> + */
>>> + if ((!ordered ||
>>> + (ordered->file_offset + ordered->len <= lockstart ||
>>> + ordered->file_offset > lockend)) &&
>>> + !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> + if (ordered)
>>> + btrfs_put_ordered_extent(ordered);
>>> + break;
>>> + }
>>> + if (ordered)
>>> + btrfs_put_ordered_extent(ordered);
>>> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> + lockend, cached_state, GFP_NOFS);
>>> + ret = btrfs_wait_ordered_range(inode, lockstart,
>>> + lockend - lockstart + 1);
>>> + if (ret)
>>> + return ret;
>>> + }
>>> + return 0;
>>> +}
>>> +
>>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>>> + bool lock_inode)
>>> {
>>> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>> struct btrfs_root *root = BTRFS_I(inode)->root;
>>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> if (ret)
>>> return ret;
>>>
>>> - inode_lock(inode);
>>> + if (lock_inode)
>>> + inode_lock(inode);
>>> ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>> ret = find_first_non_hole(inode, &offset, &len);
>>> if (ret < 0)
>>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> truncated_block = true;
>>> ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> if (ret) {
>>> - inode_unlock(inode);
>>> + if (lock_inode)
>>> + inode_unlock(inode);
>>> return ret;
>>> }
>>> }
>>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> goto out_only_mutex;
>>> }
>>>
>>> - while (1) {
>>> - struct btrfs_ordered_extent *ordered;
>>> -
>>> - truncate_pagecache_range(inode, lockstart, lockend);
>>> -
>>> - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> - &cached_state);
>>> - ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> -
>>> - /*
>>> - * We need to make sure we have no ordered extents in this range
>>> - * and nobody raced in and read a page in this range, if we did
>>> - * we need to try again.
>>> - */
>>> - if ((!ordered ||
>>> - (ordered->file_offset + ordered->len <= lockstart ||
>>> - ordered->file_offset > lockend)) &&
>>> - !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> - if (ordered)
>>> - btrfs_put_ordered_extent(ordered);
>>> - break;
>>> - }
>>> - if (ordered)
>>> - btrfs_put_ordered_extent(ordered);
>>> - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> - lockend, &cached_state, GFP_NOFS);
>>> - ret = btrfs_wait_ordered_range(inode, lockstart,
>>> - lockend - lockstart + 1);
>>> - if (ret) {
>>> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> + &cached_state);
>>> + if (ret) {
>>> + if (lock_inode)
>>> inode_unlock(inode);
>>> - return ret;
>>> - }
>>> + return ret;
>>> }
>>>
>>> path = btrfs_alloc_path();
>>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> ret = btrfs_end_transaction(trans);
>>> }
>>> }
>>> - inode_unlock(inode);
>>> + if (lock_inode)
>>> + inode_unlock(inode);
>>> if (ret && !err)
>>> err = ret;
>>> return err;
>>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>> return 0;
>>> }
>>>
>>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>>> + const loff_t offset,
>>> + const loff_t len,
>>> + const int mode)
>>> +{
>>> + struct btrfs_root *root = BTRFS_I(inode)->root;
>>> + struct btrfs_trans_handle *trans;
>>> + const u64 end = offset + len;
>>> + int ret;
>>> +
>>> + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>>> + return 0;
>>> +
>>> + i_size_write(inode, end);
>>> + btrfs_ordered_update_i_size(inode, end, NULL);
>>> + trans = btrfs_start_transaction(root, 1);
>>> + if (IS_ERR(trans)) {
>>> + ret = PTR_ERR(trans);
>>> + } else {
>>> + int err;
>>> +
>>> + ret = btrfs_update_inode(trans, root, inode);
>>> + err = btrfs_end_transaction(trans);
>>> + ret = ret ? ret : err;
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>>> + u64 offset)
>>> +{
>>> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> + struct extent_map *em = NULL;
>>> + int ret = 0;
>>> +
>>> + offset = round_down(offset, sectorsize);
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>>> + if (IS_ERR(em))
>>> + return PTR_ERR(em);
>>> +
>>> + if (em->block_start == EXTENT_MAP_HOLE)
>>> + ret = 1;
>>> +
>>> + free_extent_map(em);
>>> + return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range(struct inode *inode,
>>> + loff_t offset,
>>> + loff_t len,
>>> + const int mode)
>>> +{
>>> + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>>> + struct extent_map *em;
>>> + struct extent_changeset *data_reserved = NULL;
>>> + int ret;
>>> + u64 alloc_hint = 0;
>>> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> + u64 alloc_start = round_down(offset, sectorsize);
>>> + u64 alloc_end = round_up(offset + len, sectorsize);
>>> + u64 bytes_to_reserve = 0;
>>> + bool space_reserved = false;
>>> + bool punch_hole = false;
>>> +
>>> + inode_dio_wait(inode);
>>> +
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> + alloc_start, alloc_end - alloc_start, 0);
>>> + if (IS_ERR(em)) {
>>> + ret = PTR_ERR(em);
>>> + goto out;
>>> + }
>>> +
>>> + /*
>>> + * Avoid hole punching and extent allocation for some cases. More cases
>>> + * could be considered, but these are unlikely common and we keep things
>>> + * as simple as possible for now. Also, intentionally, if the target
>>> + * range contains one or more prealloc extents together with regular
>>> + * extents and holes, we drop all the existing extents and allocate a
>>> + * new prealloc extent, so that we get a larger contiguous disk extent.
>>> + */
>>> + if (em->start <= alloc_start &&
>>> + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> + const u64 em_end = em->start + em->len;
>>> +
>>> + if (em_end >= offset + len) {
>>> + /*
>>> + * The whole range is already a prealloc extent,
>>> + * do nothing except updating the inode's i_size if
>>> + * needed.
>>> + */
>>> + free_extent_map(em);
>>> + ret = btrfs_zero_range_update_isize(inode, offset,
>>> + len, mode);
>>> + goto out;
>>> + }
>>> + /*
>>> + * Part of the range is already a prealloc extent, so operate
>>> + * only on the remaining part of the range.
>>> + */
>>> + alloc_start = em_end;
>>> + ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>>> + len = offset + len - alloc_start;
>>> + offset = alloc_start;
>>> + alloc_hint = em->block_start + em->len;
>>> + }
>>> + free_extent_map(em);
>>> +
>>> + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>>> + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>>> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> + alloc_start, sectorsize, 0);
>>> + if (IS_ERR(em)) {
>>> + ret = PTR_ERR(em);
>>> + goto out;
>>> + }
>>> +
>>> + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> + free_extent_map(em);
>>> + ret = btrfs_zero_range_update_isize(inode, offset,
>>> + len, mode);
>>> + goto out;
>>> + }
>>> + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>>> + punch_hole = true;
>>> + free_extent_map(em);
>>> + if (punch_hole)
>>> + goto punch_hole;
>>
>> This here is correct for a very non-obvious reason. If punch_hole is
>> true this means we are only ever going to execute the partial truncate
>> code in btrfs_punch_hole and not punch a hole at all, this is very
>> convoluted way of invoking truncation!
>
> Well, it might be non-obvious for people not experienced with fs
> development, but I don't think it's that terrible as you picture it.
> Every fs developer knows that punching into a range smaller then
> sector size means zeroing part of a block.
> It's not this sort of things that makes people unable to contribute or
> understand things.
>
>>
>> Instead, I propose something similar to the attached diff which just
>> calls btrfs_truncate_block directly. This allows to remove one of the
>> labels and simplifies the code flow. I if this check triggers:
>> (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
>> guaranteed that we are within the inode boundaries so there is no need
>> to update the inode size,
>
> There isn't???
> There is of course, if the file size is not sector size aligned and
> the target range affects the last block and goes beyond the current
> i_size.
Plus your diff introduces another bug when alloc_start == alloc_end,
in which case
we shouldn't do anything other than the previous truncation of boundary pages.
As you left it, it results in locking a range with an end smaller than
its start, plus
some other unpredictable behaviour.
>
> hence I've omitted it, though I'm not 100%
>> sure, perhaps we want to update the inode's ctime ?
>>
>> This passes generic/008 and generic/009
>
> Well keep in mind those tests don't cover all possible scenarios.
>
> I've integrated those cleanups and will send a v3 later after some
> stress testing.
>
>
>>
>>> + alloc_start = round_down(offset, sectorsize);
>>> + alloc_end = alloc_start + sectorsize;
>>> + goto reserve_space;
>>> + }
>>> +
>>> + alloc_start = round_up(offset, sectorsize);
>>> + alloc_end = round_down(offset + len, sectorsize);
>>> +
>>> + /*
>>> + * For unaligned ranges, check the pages at the boundaries, they might
>>> + * map to an extent, in which case we need to partially zero them, or
>>> + * they might map to a hole, in which case we need our allocation range
>>> + * to cover them.
>>> + */
>>> + if (!IS_ALIGNED(offset, sectorsize)) {
>>> + ret = btrfs_zero_range_check_range_boundary(inode, offset);
>>> + if (ret < 0)
>>> + goto out;
>>> + if (ret) {
>>> + alloc_start = round_down(offset, sectorsize);
>>> + ret = 0;
>>> + } else {
>>> + ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> + }
>>> +
>>> + if (!IS_ALIGNED(offset + len, sectorsize)) {
>>> + ret = btrfs_zero_range_check_range_boundary(inode,
>>> + offset + len);
>>> + if (ret < 0)
>>> + goto out;
>>> + if (ret) {
>>> + alloc_end = round_up(offset + len, sectorsize);
>>> + ret = 0;
>>> + } else {
>>> + ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> + }
>>> +
>>> +reserve_space:
>>> + if (alloc_start < alloc_end)
>>> + bytes_to_reserve += alloc_end - alloc_start;
>>> +
>>> + if (!punch_hole && bytes_to_reserve > 0) {
>>> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> + bytes_to_reserve);
>>> + if (ret < 0)
>>> + goto out;
>>> + space_reserved = true;
>>> + ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>>> + alloc_start, bytes_to_reserve);
>>> + if (ret)
>>> + goto out;
>>> + }
>>> +
>>> +punch_hole:
>>> + if (punch_hole) {
>>> + ret = btrfs_punch_hole(inode, offset, len, false);
>>> + if (ret)
>>> + goto out;
>>> + ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>>> + } else {
>>> + struct extent_state *cached_state = NULL;
>>> + const u64 lockstart = alloc_start;
>>> + const u64 lockend = alloc_end - 1;
>>> +
>>> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> + &cached_state);
>>> + if (ret)
>>> + goto out;
>>> + ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>>> + alloc_end - alloc_start,
>>> + i_blocksize(inode),
>>> + offset + len, &alloc_hint);
>>> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> + lockend, &cached_state, GFP_KERNEL);
>>> + /* btrfs_prealloc_file_range releases reserved space on error */
>>> + if (ret)
>>> + space_reserved = false;
>>> + }
>>> + out:
>>> + if (ret && space_reserved)
>>> + btrfs_free_reserved_data_space(inode, data_reserved,
>>> + alloc_start, bytes_to_reserve);
>>> + extent_changeset_free(data_reserved);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> static long btrfs_fallocate(struct file *file, int mode,
>>> loff_t offset, loff_t len)
>>> {
>>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> cur_offset = alloc_start;
>>>
>>> /* Make sure we aren't being give some crap mode */
>>> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>>> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>>> + FALLOC_FL_ZERO_RANGE))
>>> return -EOPNOTSUPP;
>>>
>>> if (mode & FALLOC_FL_PUNCH_HOLE)
>>> - return btrfs_punch_hole(inode, offset, len);
>>> + return btrfs_punch_hole(inode, offset, len, true);
>>>
>>> /*
>>> * Only trigger disk allocation, don't trigger qgroup reserve
>>> *
>>> * For qgroup space, it will be checked later.
>>> */
>>> - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> - alloc_end - alloc_start);
>>> - if (ret < 0)
>>> - return ret;
>>> + if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>>> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> + alloc_end - alloc_start);
>>> + if (ret < 0)
>>> + return ret;
>>> + }
>>>
>>> inode_lock(inode);
>>>
>>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> if (ret)
>>> goto out;
>>>
>>> + if (mode & FALLOC_FL_ZERO_RANGE) {
>>> + ret = btrfs_zero_range(inode, offset, len, mode);
>>> + inode_unlock(inode);
>>> + return ret;
>>> + }
>>> +
>>> locked_end = alloc_end - 1;
>>> while (1) {
>>> struct btrfs_ordered_extent *ordered;
>>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>> out:
>>> inode_unlock(inode);
>>> /* Let go of our reservation. */
>>> - if (ret != 0)
>>> + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>> btrfs_free_reserved_data_space(inode, data_reserved,
>>> alloc_start, alloc_end - cur_offset);
>>> extent_changeset_free(data_reserved);
>>>
next prev parent reply other threads:[~2017-11-03 10:45 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
2017-10-25 14:59 ` [PATCH v2] " fdmanana
2017-10-30 14:57 ` David Sterba
2017-11-01 10:34 ` Nikolay Borisov
2017-11-01 10:59 ` Filipe Manana
2017-11-02 8:33 ` Nikolay Borisov
2017-11-03 9:30 ` Nikolay Borisov
2017-11-03 10:29 ` Filipe Manana
2017-11-03 10:45 ` Filipe Manana [this message]
2017-11-03 17:20 ` [PATCH v3] " fdmanana
2017-11-03 20:59 ` Edmund Nadolski
2017-11-04 4:07 ` [PATCH v4] " fdmanana
2017-11-10 16:43 ` Nikolay Borisov
2018-01-05 16:49 ` David Sterba
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAL3q7H7=4RupqV-GAZzm_X_7OCdw=05DkY3Ws_aYCC6vh7dA8A@mail.gmail.com' \
--to=fdmanana@kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=nborisov@suse.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).