Subject: Re: [PATCH v3] Btrfs: add support for fallocate's zero range operation
To: fdmanana@kernel.org, linux-btrfs@vger.kernel.org
References: <20171025125344.2111-1-fdmanana@kernel.org>
 <20171103172037.7107-1-fdmanana@kernel.org>
From: Edmund Nadolski
Message-ID:
Date: Fri, 3 Nov 2017 14:59:20 -0600
MIME-Version: 1.0
In-Reply-To: <20171103172037.7107-1-fdmanana@kernel.org>
Content-Type: text/plain; charset=windows-1252
Sender: linux-btrfs-owner@vger.kernel.org
List-ID:

On 11/03/2017 11:20 AM, fdmanana@kernel.org wrote:
> From: Filipe Manana
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> V3: Factored common code to update isize and inode item into a helper
>     function, plus some minor cleanup.
> 
>  fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 285 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..2cc1aed1c564 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>          return ret;
>  }
> 
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +                                       const u64 lockstart,
> +                                       const u64 lockend,
> +                                       struct extent_state **cached_state)
> +{
> +        while (1) {
> +                struct btrfs_ordered_extent *ordered;
> +                int ret;
> +
> +                truncate_pagecache_range(inode, lockstart, lockend);
> +
> +                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +                                 cached_state);
> +                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +                /*
> +                 * We need to make sure we have no ordered extents in this range
> +                 * and nobody raced in and read a page in this range, if we did
> +                 * we need to try again.
> +                 */
> +                if ((!ordered ||
> +                     (ordered->file_offset + ordered->len <= lockstart ||
> +                      ordered->file_offset > lockend)) &&
> +                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +                        if (ordered)
> +                                btrfs_put_ordered_extent(ordered);
> +                        break;
> +                }
> +                if (ordered)
> +                        btrfs_put_ordered_extent(ordered);
> +                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +                                     lockend, cached_state, GFP_NOFS);
> +                ret = btrfs_wait_ordered_range(inode, lockstart,
> +                                               lockend - lockstart + 1);
> +                if (ret)
> +                        return ret;
> +        }
> +        return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +                            bool lock_inode)

The lock_inode parameter may no longer be needed, since it appears to
always be true in this version of the patch.
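
(Purely illustrative and untested: if nothing later in the series ends up
passing lock_inode == false, the parameter and the new conditionals could
simply be dropped again, roughly:

    static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
    {
            ...
            inode_lock(inode);              /* unconditional again */
            ...
            inode_unlock(inode);            /* likewise on the error/exit paths */
            ...
    }

with the call site in btrfs_fallocate() staying as
"return btrfs_punch_hole(inode, offset, len);". Just an observation, not a
blocker for this patch.)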

Ed

>  {
>          struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>          struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>          if (ret)
>                  return ret;
> 
> -        inode_lock(inode);
> +        if (lock_inode)
> +                inode_lock(inode);
>          ino_size = round_up(inode->i_size, fs_info->sectorsize);
>          ret = find_first_non_hole(inode, &offset, &len);
>          if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>                  truncated_block = true;
>                  ret = btrfs_truncate_block(inode, offset, 0, 0);
>                  if (ret) {
> -                        inode_unlock(inode);
> +                        if (lock_inode)
> +                                inode_unlock(inode);
>                          return ret;
>                  }
>          }
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>                  goto out_only_mutex;
>          }
> 
> -        while (1) {
> -                struct btrfs_ordered_extent *ordered;
> -
> -                truncate_pagecache_range(inode, lockstart, lockend);
> -
> -                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -                                 &cached_state);
> -                ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -                /*
> -                 * We need to make sure we have no ordered extents in this range
> -                 * and nobody raced in and read a page in this range, if we did
> -                 * we need to try again.
> -                 */
> -                if ((!ordered ||
> -                     (ordered->file_offset + ordered->len <= lockstart ||
> -                      ordered->file_offset > lockend)) &&
> -                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -                        if (ordered)
> -                                btrfs_put_ordered_extent(ordered);
> -                        break;
> -                }
> -                if (ordered)
> -                        btrfs_put_ordered_extent(ordered);
> -                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -                                     lockend, &cached_state, GFP_NOFS);
> -                ret = btrfs_wait_ordered_range(inode, lockstart,
> -                                               lockend - lockstart + 1);
> -                if (ret) {
> +        ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +                                          &cached_state);
> +        if (ret) {
> +                if (lock_inode)
>                          inode_unlock(inode);
> -                        return ret;
> -                }
> +                return ret;
>          }
> 
>          path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>                          ret = btrfs_end_transaction(trans);
>                  }
>          }
> -        inode_unlock(inode);
> +        if (lock_inode)
> +                inode_unlock(inode);
>          if (ret && !err)
>                  err = ret;
>          return err;
> @@ -2804,6 +2822,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>          return 0;
>  }
> 
> +static int btrfs_fallocate_update_isize(struct inode *inode,
> +                                        const u64 end,
> +                                        const int mode)
> +{
> +        struct btrfs_trans_handle *trans;
> +        struct btrfs_root *root = BTRFS_I(inode)->root;
> +        int ret;
> +        int ret2;
> +
> +        if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +                return 0;
> +
> +        trans = btrfs_start_transaction(root, 1);
> +        if (IS_ERR(trans))
> +                return PTR_ERR(trans);
> +
> +        inode->i_ctime = current_time(inode);
> +        i_size_write(inode, end);
> +        btrfs_ordered_update_i_size(inode, end, NULL);
> +        ret = btrfs_update_inode(trans, root, inode);
> +        ret2 = btrfs_end_transaction(trans);
> +
> +        return ret ? ret : ret2;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +                                                 u64 offset)
> +{
> +        const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +        struct extent_map *em = NULL;
> +        int ret = 0;
> +
> +        offset = round_down(offset, sectorsize);
> +        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +        if (IS_ERR(em))
> +                return PTR_ERR(em);
> +
> +        if (em->block_start == EXTENT_MAP_HOLE)
> +                ret = 1;
> +
> +        free_extent_map(em);
> +        return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +                            loff_t offset,
> +                            loff_t len,
> +                            const int mode)
> +{
> +        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +        struct extent_map *em;
> +        struct extent_changeset *data_reserved = NULL;
> +        int ret;
> +        u64 alloc_hint = 0;
> +        const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +        u64 alloc_start = round_down(offset, sectorsize);
> +        u64 alloc_end = round_up(offset + len, sectorsize);
> +        u64 bytes_to_reserve = 0;
> +        bool space_reserved = false;
> +
> +        inode_dio_wait(inode);
> +
> +        em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +                              alloc_start, alloc_end - alloc_start, 0);
> +        if (IS_ERR(em)) {
> +                ret = PTR_ERR(em);
> +                goto out;
> +        }
> +
> +        /*
> +         * Avoid hole punching and extent allocation for some cases. More cases
> +         * could be considered, but these are unlikely common and we keep things
> +         * as simple as possible for now. Also, intentionally, if the target
> +         * range contains one or more prealloc extents together with regular
> +         * extents and holes, we drop all the existing extents and allocate a
> +         * new prealloc extent, so that we get a larger contiguous disk extent.
> +         */
> +        if (em->start <= alloc_start &&
> +            test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +                const u64 em_end = em->start + em->len;
> +
> +                if (em_end >= offset + len) {
> +                        /*
> +                         * The whole range is already a prealloc extent,
> +                         * do nothing except updating the inode's i_size if
> +                         * needed.
> +                         */
> +                        free_extent_map(em);
> +                        ret = btrfs_fallocate_update_isize(inode, offset + len,
> +                                                           mode);
> +                        goto out;
> +                }
> +                /*
> +                 * Part of the range is already a prealloc extent, so operate
> +                 * only on the remaining part of the range.
> +                 */
> +                alloc_start = em_end;
> +                ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +                len = offset + len - alloc_start;
> +                offset = alloc_start;
> +                alloc_hint = em->block_start + em->len;
> +        }
> +        free_extent_map(em);
> +
> +        if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +            BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +                em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +                                      alloc_start, sectorsize, 0);
> +                if (IS_ERR(em)) {
> +                        ret = PTR_ERR(em);
> +                        goto out;
> +                }
> +
> +                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +                        free_extent_map(em);
> +                        ret = btrfs_fallocate_update_isize(inode, offset + len,
> +                                                           mode);
> +                        goto out;
> +                }
> +                if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
> +                        free_extent_map(em);
> +                        ret = btrfs_truncate_block(inode, offset, len, 0);
> +                        if (!ret)
> +                                ret = btrfs_fallocate_update_isize(inode,
> +                                                                   offset + len,
> +                                                                   mode);
> +                        return ret;
> +                }
> +                free_extent_map(em);
> +                alloc_start = round_down(offset, sectorsize);
> +                alloc_end = alloc_start + sectorsize;
> +                goto reserve_space;
> +        }
> +
> +        alloc_start = round_up(offset, sectorsize);
> +        alloc_end = round_down(offset + len, sectorsize);
> +
> +        /*
> +         * For unaligned ranges, check the pages at the boundaries, they might
> +         * map to an extent, in which case we need to partially zero them, or
> +         * they might map to a hole, in which case we need our allocation range
> +         * to cover them.
> +         */
> +        if (!IS_ALIGNED(offset, sectorsize)) {
> +                ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +                if (ret < 0)
> +                        goto out;
> +                if (ret) {
> +                        alloc_start = round_down(offset, sectorsize);
> +                        ret = 0;
> +                } else {
> +                        ret = btrfs_truncate_block(inode, offset, 0, 0);
> +                        if (ret)
> +                                goto out;
> +                }
> +        }
> +
> +        if (!IS_ALIGNED(offset + len, sectorsize)) {
> +                ret = btrfs_zero_range_check_range_boundary(inode,
> +                                                            offset + len);
> +                if (ret < 0)
> +                        goto out;
> +                if (ret) {
> +                        alloc_end = round_up(offset + len, sectorsize);
> +                        ret = 0;
> +                } else {
> +                        ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +                        if (ret)
> +                                goto out;
> +                }
> +        }
> +
> +reserve_space:
> +        if (alloc_start < alloc_end) {
> +                struct extent_state *cached_state = NULL;
> +                const u64 lockstart = alloc_start;
> +                const u64 lockend = alloc_end - 1;
> +
> +                bytes_to_reserve = alloc_end - alloc_start;
> +                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +                                                      bytes_to_reserve);
> +                if (ret < 0)
> +                        goto out;
> +                space_reserved = true;
> +                ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +                                                alloc_start, bytes_to_reserve);
> +                if (ret)
> +                        goto out;
> +                ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +                                                  &cached_state);
> +                if (ret)
> +                        goto out;
> +                ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +                                                alloc_end - alloc_start,
> +                                                i_blocksize(inode),
> +                                                offset + len, &alloc_hint);
> +                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +                                     lockend, &cached_state, GFP_KERNEL);
> +                /* btrfs_prealloc_file_range releases reserved space on error */
> +                if (ret)
> +                        space_reserved = false;
> +        }
> + out:
> +        if (ret && space_reserved)
> +                btrfs_free_reserved_data_space(inode, data_reserved,
> +                                               alloc_start, bytes_to_reserve);
> +        extent_changeset_free(data_reserved);
> +
> +        return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>                              loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3058,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>          cur_offset = alloc_start;
> 
>          /* Make sure we aren't being give some crap mode */
> -        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +                     FALLOC_FL_ZERO_RANGE))
>                  return -EOPNOTSUPP;
> 
>          if (mode & FALLOC_FL_PUNCH_HOLE)
> -                return btrfs_punch_hole(inode, offset, len);
> +                return btrfs_punch_hole(inode, offset, len, true);
> 
>          /*
>           * Only trigger disk allocation, don't trigger qgroup reserve
>           *
>           * For qgroup space, it will be checked later.
>           */
> -        ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -                        alloc_end - alloc_start);
> -        if (ret < 0)
> -                return ret;
> +        if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +                ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +                                                      alloc_end - alloc_start);
> +                if (ret < 0)
> +                        return ret;
> +        }
> 
>          inode_lock(inode);
> 
> @@ -2885,6 +3117,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>          if (ret)
>                  goto out;
> 
> +        if (mode & FALLOC_FL_ZERO_RANGE) {
> +                ret = btrfs_zero_range(inode, offset, len, mode);
> +                inode_unlock(inode);
> +                return ret;
> +        }
> +
>          locked_end = alloc_end - 1;
>          while (1) {
>                  struct btrfs_ordered_extent *ordered;
> @@ -2980,37 +3218,18 @@ static long btrfs_fallocate(struct file *file, int mode,
>          if (ret < 0)
>                  goto out_unlock;
> 
> -        if (actual_end > inode->i_size &&
> -            !(mode & FALLOC_FL_KEEP_SIZE)) {
> -                struct btrfs_trans_handle *trans;
> -                struct btrfs_root *root = BTRFS_I(inode)->root;
> -
> -                /*
> -                 * We didn't need to allocate any more space, but we
> -                 * still extended the size of the file so we need to
> -                 * update i_size and the inode item.
> -                 */
> -                trans = btrfs_start_transaction(root, 1);
> -                if (IS_ERR(trans)) {
> -                        ret = PTR_ERR(trans);
> -                } else {
> -                        inode->i_ctime = current_time(inode);
> -                        i_size_write(inode, actual_end);
> -                        btrfs_ordered_update_i_size(inode, actual_end, NULL);
> -                        ret = btrfs_update_inode(trans, root, inode);
> -                        if (ret)
> -                                btrfs_end_transaction(trans);
> -                        else
> -                                ret = btrfs_end_transaction(trans);
> -                }
> -        }
> +        /*
> +         * We didn't need to allocate any more space, but we still extended the
> +         * size of the file so we need to update i_size and the inode item.
> +         */
> +        ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
>  out_unlock:
>          unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
>                               &cached_state, GFP_KERNEL);
>  out:
>          inode_unlock(inode);
>          /* Let go of our reservation. */
> -        if (ret != 0)
> +        if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>                  btrfs_free_reserved_data_space(inode, data_reserved,
>                                  alloc_start, alloc_end - cur_offset);
>          extent_changeset_free(data_reserved);
> 
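
In case it's useful to anyone following along, the new mode is exercised
from user space like any other fallocate flag. A minimal test program
would look something like the following (illustrative only, untested here;
assumes kernel headers/glibc recent enough to define FALLOC_FL_ZERO_RANGE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Zero a byte range of an existing file using fallocate(2). */
int main(int argc, char *argv[])
{
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <file> <offset> <len>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Without FALLOC_FL_KEEP_SIZE this may extend i_size, which is the
	 * case the new btrfs_fallocate_update_isize() helper handles.
	 */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE, atoll(argv[2]),
		      atoll(argv[3])) < 0) {
		perror("fallocate");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}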