From: Edmund Nadolski <enadolski@suse.de>
To: fdmanana@kernel.org, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v3] Btrfs: add support for fallocate's zero range operation
Date: Fri, 3 Nov 2017 14:59:20 -0600
Message-ID: <f40ef6d3-9b5d-8edf-c161-fe138670252e@suse.de>
In-Reply-To: <20171103172037.7107-1-fdmanana@kernel.org>
On 11/03/2017 11:20 AM, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
>
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
>
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>
> V2: Removed double inode unlock on error path from failure to lock range.
> V3: Factored common code to update isize and inode item into a helper
> function, plus some minor cleanup.
>
> fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
> 1 file changed, 285 insertions(+), 66 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..2cc1aed1c564 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
> return ret;
> }
>
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> + const u64 lockstart,
> + const u64 lockend,
> + struct extent_state **cached_state)
> +{
> + while (1) {
> + struct btrfs_ordered_extent *ordered;
> + int ret;
> +
> + truncate_pagecache_range(inode, lockstart, lockend);
> +
> + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> + cached_state);
> + ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> + /*
> + * We need to make sure we have no ordered extents in this range
> + * and nobody raced in and read a page in this range, if we did
> + * we need to try again.
> + */
> + if ((!ordered ||
> + (ordered->file_offset + ordered->len <= lockstart ||
> + ordered->file_offset > lockend)) &&
> + !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> + if (ordered)
> + btrfs_put_ordered_extent(ordered);
> + break;
> + }
> + if (ordered)
> + btrfs_put_ordered_extent(ordered);
> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> + lockend, cached_state, GFP_NOFS);
> + ret = btrfs_wait_ordered_range(inode, lockstart,
> + lockend - lockstart + 1);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> + bool lock_inode)
The lock_inode parameter may no longer be needed, since it looks to always
be true in this version of the patch.
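For example (untested, just to illustrate the suggestion, and assuming no
future caller ever needs to skip the lock), the parameter could simply be
dropped again:

-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
-			    bool lock_inode)
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)

with the "if (lock_inode)" checks reverting to unconditional inode_lock()
and inode_unlock() calls.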
Ed
> {
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> if (ret)
> return ret;
>
> - inode_lock(inode);
> + if (lock_inode)
> + inode_lock(inode);
> ino_size = round_up(inode->i_size, fs_info->sectorsize);
> ret = find_first_non_hole(inode, &offset, &len);
> if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> truncated_block = true;
> ret = btrfs_truncate_block(inode, offset, 0, 0);
> if (ret) {
> - inode_unlock(inode);
> + if (lock_inode)
> + inode_unlock(inode);
> return ret;
> }
> }
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> goto out_only_mutex;
> }
>
> - while (1) {
> - struct btrfs_ordered_extent *ordered;
> -
> - truncate_pagecache_range(inode, lockstart, lockend);
> -
> - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> - &cached_state);
> - ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> - /*
> - * We need to make sure we have no ordered extents in this range
> - * and nobody raced in and read a page in this range, if we did
> - * we need to try again.
> - */
> - if ((!ordered ||
> - (ordered->file_offset + ordered->len <= lockstart ||
> - ordered->file_offset > lockend)) &&
> - !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> - if (ordered)
> - btrfs_put_ordered_extent(ordered);
> - break;
> - }
> - if (ordered)
> - btrfs_put_ordered_extent(ordered);
> - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> - lockend, &cached_state, GFP_NOFS);
> - ret = btrfs_wait_ordered_range(inode, lockstart,
> - lockend - lockstart + 1);
> - if (ret) {
> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> + &cached_state);
> + if (ret) {
> + if (lock_inode)
> inode_unlock(inode);
> - return ret;
> - }
> + return ret;
> }
>
> path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> ret = btrfs_end_transaction(trans);
> }
> }
> - inode_unlock(inode);
> + if (lock_inode)
> + inode_unlock(inode);
> if (ret && !err)
> err = ret;
> return err;
> @@ -2804,6 +2822,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
> return 0;
> }
>
> +static int btrfs_fallocate_update_isize(struct inode *inode,
> + const u64 end,
> + const int mode)
> +{
> + struct btrfs_trans_handle *trans;
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> + int ret;
> + int ret2;
> +
> + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> + return 0;
> +
> + trans = btrfs_start_transaction(root, 1);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
> +
> + inode->i_ctime = current_time(inode);
> + i_size_write(inode, end);
> + btrfs_ordered_update_i_size(inode, end, NULL);
> + ret = btrfs_update_inode(trans, root, inode);
> + ret2 = btrfs_end_transaction(trans);
> +
> + return ret ? ret : ret2;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> + u64 offset)
> +{
> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
> + struct extent_map *em = NULL;
> + int ret = 0;
> +
> + offset = round_down(offset, sectorsize);
> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> + if (IS_ERR(em))
> + return PTR_ERR(em);
> +
> + if (em->block_start == EXTENT_MAP_HOLE)
> + ret = 1;
> +
> + free_extent_map(em);
> + return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> + loff_t offset,
> + loff_t len,
> + const int mode)
> +{
> + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> + struct extent_map *em;
> + struct extent_changeset *data_reserved = NULL;
> + int ret;
> + u64 alloc_hint = 0;
> + const u64 sectorsize = btrfs_inode_sectorsize(inode);
> + u64 alloc_start = round_down(offset, sectorsize);
> + u64 alloc_end = round_up(offset + len, sectorsize);
> + u64 bytes_to_reserve = 0;
> + bool space_reserved = false;
> +
> + inode_dio_wait(inode);
> +
> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> + alloc_start, alloc_end - alloc_start, 0);
> + if (IS_ERR(em)) {
> + ret = PTR_ERR(em);
> + goto out;
> + }
> +
> + /*
> + * Avoid hole punching and extent allocation for some cases. More cases
> + * could be considered, but these are unlikely common and we keep things
> + * as simple as possible for now. Also, intentionally, if the target
> + * range contains one or more prealloc extents together with regular
> + * extents and holes, we drop all the existing extents and allocate a
> + * new prealloc extent, so that we get a larger contiguous disk extent.
> + */
> + if (em->start <= alloc_start &&
> + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> + const u64 em_end = em->start + em->len;
> +
> + if (em_end >= offset + len) {
> + /*
> + * The whole range is already a prealloc extent,
> + * do nothing except updating the inode's i_size if
> + * needed.
> + */
> + free_extent_map(em);
> + ret = btrfs_fallocate_update_isize(inode, offset + len,
> + mode);
> + goto out;
> + }
> + /*
> + * Part of the range is already a prealloc extent, so operate
> + * only on the remaining part of the range.
> + */
> + alloc_start = em_end;
> + ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> + len = offset + len - alloc_start;
> + offset = alloc_start;
> + alloc_hint = em->block_start + em->len;
> + }
> + free_extent_map(em);
> +
> + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> + alloc_start, sectorsize, 0);
> + if (IS_ERR(em)) {
> + ret = PTR_ERR(em);
> + goto out;
> + }
> +
> + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> + free_extent_map(em);
> + ret = btrfs_fallocate_update_isize(inode, offset + len,
> + mode);
> + goto out;
> + }
> + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
> + free_extent_map(em);
> + ret = btrfs_truncate_block(inode, offset, len, 0);
> + if (!ret)
> + ret = btrfs_fallocate_update_isize(inode,
> + offset + len,
> + mode);
> + return ret;
> + }
> + free_extent_map(em);
> + alloc_start = round_down(offset, sectorsize);
> + alloc_end = alloc_start + sectorsize;
> + goto reserve_space;
> + }
> +
> + alloc_start = round_up(offset, sectorsize);
> + alloc_end = round_down(offset + len, sectorsize);
> +
> + /*
> + * For unaligned ranges, check the pages at the boundaries, they might
> + * map to an extent, in which case we need to partially zero them, or
> + * they might map to a hole, in which case we need our allocation range
> + * to cover them.
> + */
> + if (!IS_ALIGNED(offset, sectorsize)) {
> + ret = btrfs_zero_range_check_range_boundary(inode, offset);
> + if (ret < 0)
> + goto out;
> + if (ret) {
> + alloc_start = round_down(offset, sectorsize);
> + ret = 0;
> + } else {
> + ret = btrfs_truncate_block(inode, offset, 0, 0);
> + if (ret)
> + goto out;
> + }
> + }
> +
> + if (!IS_ALIGNED(offset + len, sectorsize)) {
> + ret = btrfs_zero_range_check_range_boundary(inode,
> + offset + len);
> + if (ret < 0)
> + goto out;
> + if (ret) {
> + alloc_end = round_up(offset + len, sectorsize);
> + ret = 0;
> + } else {
> + ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> + if (ret)
> + goto out;
> + }
> + }
> +
> +reserve_space:
> + if (alloc_start < alloc_end) {
> + struct extent_state *cached_state = NULL;
> + const u64 lockstart = alloc_start;
> + const u64 lockend = alloc_end - 1;
> +
> + bytes_to_reserve = alloc_end - alloc_start;
> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> + bytes_to_reserve);
> + if (ret < 0)
> + goto out;
> + space_reserved = true;
> + ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> + alloc_start, bytes_to_reserve);
> + if (ret)
> + goto out;
> + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> + &cached_state);
> + if (ret)
> + goto out;
> + ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> + alloc_end - alloc_start,
> + i_blocksize(inode),
> + offset + len, &alloc_hint);
> + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> + lockend, &cached_state, GFP_KERNEL);
> + /* btrfs_prealloc_file_range releases reserved space on error */
> + if (ret)
> + space_reserved = false;
> + }
> + out:
> + if (ret && space_reserved)
> + btrfs_free_reserved_data_space(inode, data_reserved,
> + alloc_start, bytes_to_reserve);
> + extent_changeset_free(data_reserved);
> +
> + return ret;
> +}
> +
> static long btrfs_fallocate(struct file *file, int mode,
> loff_t offset, loff_t len)
> {
> @@ -2829,21 +3058,24 @@ static long btrfs_fallocate(struct file *file, int mode,
> cur_offset = alloc_start;
>
> /* Make sure we aren't being give some crap mode */
> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> + FALLOC_FL_ZERO_RANGE))
> return -EOPNOTSUPP;
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> - return btrfs_punch_hole(inode, offset, len);
> + return btrfs_punch_hole(inode, offset, len, true);
>
> /*
> * Only trigger disk allocation, don't trigger qgroup reserve
> *
> * For qgroup space, it will be checked later.
> */
> - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> - alloc_end - alloc_start);
> - if (ret < 0)
> - return ret;
> + if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> + alloc_end - alloc_start);
> + if (ret < 0)
> + return ret;
> + }
>
> inode_lock(inode);
>
> @@ -2885,6 +3117,12 @@ static long btrfs_fallocate(struct file *file, int mode,
> if (ret)
> goto out;
>
> + if (mode & FALLOC_FL_ZERO_RANGE) {
> + ret = btrfs_zero_range(inode, offset, len, mode);
> + inode_unlock(inode);
> + return ret;
> + }
> +
> locked_end = alloc_end - 1;
> while (1) {
> struct btrfs_ordered_extent *ordered;
> @@ -2980,37 +3218,18 @@ static long btrfs_fallocate(struct file *file, int mode,
> if (ret < 0)
> goto out_unlock;
>
> - if (actual_end > inode->i_size &&
> - !(mode & FALLOC_FL_KEEP_SIZE)) {
> - struct btrfs_trans_handle *trans;
> - struct btrfs_root *root = BTRFS_I(inode)->root;
> -
> - /*
> - * We didn't need to allocate any more space, but we
> - * still extended the size of the file so we need to
> - * update i_size and the inode item.
> - */
> - trans = btrfs_start_transaction(root, 1);
> - if (IS_ERR(trans)) {
> - ret = PTR_ERR(trans);
> - } else {
> - inode->i_ctime = current_time(inode);
> - i_size_write(inode, actual_end);
> - btrfs_ordered_update_i_size(inode, actual_end, NULL);
> - ret = btrfs_update_inode(trans, root, inode);
> - if (ret)
> - btrfs_end_transaction(trans);
> - else
> - ret = btrfs_end_transaction(trans);
> - }
> - }
> + /*
> + * We didn't need to allocate any more space, but we still extended the
> + * size of the file so we need to update i_size and the inode item.
> + */
> + ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
> out_unlock:
> unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
> &cached_state, GFP_KERNEL);
> out:
> inode_unlock(inode);
> /* Let go of our reservation. */
> - if (ret != 0)
> + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
> btrfs_free_reserved_data_space(inode, data_reserved,
> alloc_start, alloc_end - cur_offset);
> extent_changeset_free(data_reserved);
>
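As an aside, for anyone wanting to exercise the new code path from
userspace, a minimal test program would look something like the sketch
below (assumes a kernel with this patch applied; without it, btrfs
returns EOPNOTSUPP for this mode):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Zero bytes [4096, 8192) of the file, allocating them if needed. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 4096) < 0) {
		perror("fallocate");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}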
Thread overview: 14+ messages
2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
2017-10-25 14:59 ` [PATCH v2] " fdmanana
2017-10-30 14:57 ` David Sterba
2017-11-01 10:34 ` Nikolay Borisov
2017-11-01 10:59 ` Filipe Manana
2017-11-02 8:33 ` Nikolay Borisov
2017-11-03 9:30 ` Nikolay Borisov
2017-11-03 10:29 ` Filipe Manana
2017-11-03 10:45 ` Filipe Manana
2017-11-03 17:20 ` [PATCH v3] " fdmanana
2017-11-03 20:59 ` Edmund Nadolski [this message]
2017-11-04 4:07 ` [PATCH v4] " fdmanana
2017-11-10 16:43 ` Nikolay Borisov
2018-01-05 16:49 ` David Sterba