From: Nikolay Borisov <nborisov@suse.com>
To: fdmanana@kernel.org, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
Date: Fri, 3 Nov 2017 11:30:04 +0200	[thread overview]
Message-ID: <5501c193-9351-8915-4058-14acd109d822@suse.com> (raw)
In-Reply-To: <20171025145908.7311-1-fdmanana@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 14059 bytes --]



On 25.10.2017 17:59, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For
> now at least it's as simple as possible while reusing most of the
> existing fallocate and hole punching infrastructure.
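
(For context: from userspace this is the FALLOC_FL_ZERO_RANGE mode of
fallocate(2), which zeroes a byte range, ideally by converting it to
unwritten/prealloc extents rather than writing zeroes. A minimal sketch,
with the file path purely an illustration:

    #include <fcntl.h>
    #include <linux/falloc.h>

    int fd = open("/mnt/btrfs/foo", O_RDWR);
    /* Zero [64K, 64K + 128K) without changing i_size. */
    fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
              64 * 1024, 128 * 1024);

Before this patch btrfs rejects that mode with EOPNOTSUPP.)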
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> 
>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 290 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..e0d15c0d1641 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +				       const u64 lockstart,
> +				       const u64 lockend,
> +				       struct extent_state **cached_state)
> +{
> +	while (1) {
> +		struct btrfs_ordered_extent *ordered;
> +		int ret;
> +
> +		truncate_pagecache_range(inode, lockstart, lockend);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +				 cached_state);
> +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +		/*
> +		 * We need to make sure we have no ordered extents in this range
> +		 * and nobody raced in and read a page in this range, if we did
> +		 * we need to try again.
> +		 */
> +		if ((!ordered ||
> +		    (ordered->file_offset + ordered->len <= lockstart ||
> +		     ordered->file_offset > lockend)) &&
> +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +			if (ordered)
> +				btrfs_put_ordered_extent(ordered);
> +			break;
> +		}
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, cached_state, GFP_NOFS);
> +		ret = btrfs_wait_ordered_range(inode, lockstart,
> +					       lockend - lockstart + 1);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +			    bool lock_inode)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	if (ret)
>  		return ret;
>  
> -	inode_lock(inode);
> +	if (lock_inode)
> +		inode_lock(inode);
>  	ino_size = round_up(inode->i_size, fs_info->sectorsize);
>  	ret = find_first_non_hole(inode, &offset, &len);
>  	if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		truncated_block = true;
>  		ret = btrfs_truncate_block(inode, offset, 0, 0);
>  		if (ret) {
> -			inode_unlock(inode);
> +			if (lock_inode)
> +				inode_unlock(inode);
>  			return ret;
>  		}
>  	}
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		goto out_only_mutex;
>  	}
>  
> -	while (1) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		truncate_pagecache_range(inode, lockstart, lockend);
> -
> -		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -				 &cached_state);
> -		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -		/*
> -		 * We need to make sure we have no ordered extents in this range
> -		 * and nobody raced in and read a page in this range, if we did
> -		 * we need to try again.
> -		 */
> -		if ((!ordered ||
> -		    (ordered->file_offset + ordered->len <= lockstart ||
> -		     ordered->file_offset > lockend)) &&
> -		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -			if (ordered)
> -				btrfs_put_ordered_extent(ordered);
> -			break;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -				     lockend, &cached_state, GFP_NOFS);
> -		ret = btrfs_wait_ordered_range(inode, lockstart,
> -					       lockend - lockstart + 1);
> -		if (ret) {
> +	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +					  &cached_state);
> +	if (ret) {
> +		if (lock_inode)
>  			inode_unlock(inode);
> -			return ret;
> -		}
> +		return ret;
>  	}
>  
>  	path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  			ret = btrfs_end_transaction(trans);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (lock_inode)
> +		inode_unlock(inode);
>  	if (ret && !err)
>  		err = ret;
>  	return err;
> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>  	return 0;
>  }
>  
> +static int btrfs_zero_range_update_isize(struct inode *inode,
> +					 const loff_t offset,
> +					 const loff_t len,
> +					 const int mode)
> +{
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_trans_handle *trans;
> +	const u64 end = offset + len;
> +	int ret;
> +
> +	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +		return 0;
> +
> +	i_size_write(inode, end);
> +	btrfs_ordered_update_i_size(inode, end, NULL);
> +	trans = btrfs_start_transaction(root, 1);
> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
> +	} else {
> +		int err;
> +
> +		ret = btrfs_update_inode(trans, root, inode);
> +		err = btrfs_end_transaction(trans);
> +		ret = ret ? ret : err;
> +	}
> +	return ret;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +						 u64 offset)
> +{
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	struct extent_map *em = NULL;
> +	int ret = 0;
> +
> +	offset = round_down(offset, sectorsize);
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +	if (IS_ERR(em))
> +		return PTR_ERR(em);
> +
> +	if (em->block_start == EXTENT_MAP_HOLE)
> +		ret = 1;
> +
> +	free_extent_map(em);
> +	return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +			    loff_t offset,
> +			    loff_t len,
> +			    const int mode)
> +{
> +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +	struct extent_map *em;
> +	struct extent_changeset *data_reserved = NULL;
> +	int ret;
> +	u64 alloc_hint = 0;
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	u64 alloc_start = round_down(offset, sectorsize);
> +	u64 alloc_end = round_up(offset + len, sectorsize);
> +	u64 bytes_to_reserve = 0;
> +	bool space_reserved = false;
> +	bool punch_hole = false;
> +
> +	inode_dio_wait(inode);
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +			      alloc_start, alloc_end - alloc_start, 0);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Avoid hole punching and extent allocation for some cases. More cases
> +	 * could be considered, but these are unlikely common and we keep things
> +	 * as simple as possible for now. Also, intentionally, if the target
> +	 * range contains one or more prealloc extents together with regular
> +	 * extents and holes, we drop all the existing extents and allocate a
> +	 * new prealloc extent, so that we get a larger contiguous disk extent.
> +	 */
> +	if (em->start <= alloc_start &&
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		const u64 em_end = em->start + em->len;
> +
> +		if (em_end >= offset + len) {
> +			/*
> +			 * The whole range is already a prealloc extent,
> +			 * do nothing except updating the inode's i_size if
> +			 * needed.
> +			 */
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		/*
> +		 * Part of the range is already a prealloc extent, so operate
> +		 * only on the remaining part of the range.
> +		 */
> +		alloc_start = em_end;
> +		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +		len = offset + len - alloc_start;
> +		offset = alloc_start;
> +		alloc_hint = em->block_start + em->len;
> +	}
> +	free_extent_map(em);
> +
> +	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +				      alloc_start, sectorsize, 0);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +
> +		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
> +			punch_hole = true;
> +		free_extent_map(em);
> +		if (punch_hole)
> +			goto punch_hole;

This is correct, but for a very non-obvious reason. If punch_hole is
true, we are only ever going to execute the partial truncate code in
btrfs_punch_hole and never actually punch a hole, so this is a very
convoluted way of invoking truncation!
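
Concretely, with punch_hole == true the call chain reduces to roughly
the following (a sketch of the existing code paths, not a proposed
change):

    /* btrfs_zero_range(), len < sectorsize, block backed by an extent: */
    ret = btrfs_punch_hole(inode, offset, len, false);

    /* ...which inside btrfs_punch_hole() takes the same-block shortcut: */
    if (same_block && len < fs_info->sectorsize) {
            if (offset < ino_size) {
                    truncated_block = true;
                    ret = btrfs_truncate_block(inode, offset, len, 0);
            }
            goto out_only_mutex;    /* no hole is ever punched */
    }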

Instead, I propose something similar to the attached diff, which just
calls btrfs_truncate_block directly. This allows removing one of the
labels and simplifies the code flow. If this check triggers:
(len < sectorsize && em->block_start != EXTENT_MAP_HOLE), then we are
guaranteed to be within the inode boundaries, so there is no need to
update the inode size; hence I've omitted it. Though I'm not 100% sure,
perhaps we want to update the inode's ctime?

This passes xfstests generic/008 and generic/009.
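
For reference, the corner case under discussion can be reproduced from
userspace with something like the following sketch (the path, the data
pattern and the 4K sector size are assumptions for illustration):

    char buf[4096];
    int fd = open("/mnt/btrfs/bar", O_RDWR | O_CREAT, 0644);

    memset(buf, 0xaa, sizeof(buf));
    pwrite(fd, buf, sizeof(buf), 0);    /* back the range with an extent */
    fsync(fd);
    /* len < sectorsize and the target block is not a hole: */
    fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, 100, 50);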

> +		alloc_start = round_down(offset, sectorsize);
> +		alloc_end = alloc_start + sectorsize;
> +		goto reserve_space;
> +	}
> +
> +	alloc_start = round_up(offset, sectorsize);
> +	alloc_end = round_down(offset + len, sectorsize);
> +
> +	/*
> +	 * For unaligned ranges, check the pages at the boundaries, they might
> +	 * map to an extent, in which case we need to partially zero them, or
> +	 * they might map to a hole, in which case we need our allocation range
> +	 * to cover them.
> +	 */
> +	if (!IS_ALIGNED(offset, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_start = round_down(offset, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset, 0, 0);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	if (!IS_ALIGNED(offset + len, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode,
> +							    offset + len);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_end = round_up(offset + len, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +reserve_space:
> +	if (alloc_start < alloc_end)
> +		bytes_to_reserve += alloc_end - alloc_start;
> +
> +	if (!punch_hole && bytes_to_reserve > 0) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      bytes_to_reserve);
> +		if (ret < 0)
> +			goto out;
> +		space_reserved = true;
> +		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +						alloc_start, bytes_to_reserve);
> +		if (ret)
> +			goto out;
> +	}
> +
> +punch_hole:
> +	if (punch_hole) {
> +		ret = btrfs_punch_hole(inode, offset, len, false);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
> +	} else {
> +		struct extent_state *cached_state = NULL;
> +		const u64 lockstart = alloc_start;
> +		const u64 lockend = alloc_end - 1;
> +
> +		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +						  &cached_state);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +						alloc_end - alloc_start,
> +						i_blocksize(inode),
> +						offset + len, &alloc_hint);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, &cached_state, GFP_KERNEL);
> +		/* btrfs_prealloc_file_range releases reserved space on error */
> +		if (ret)
> +			space_reserved = false;
> +	}
> + out:
> +	if (ret && space_reserved)
> +		btrfs_free_reserved_data_space(inode, data_reserved,
> +					       alloc_start, bytes_to_reserve);
> +	extent_changeset_free(data_reserved);
> +
> +	return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>  			    loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	cur_offset = alloc_start;
>  
>  	/* Make sure we aren't being give some crap mode */
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_ZERO_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
> -		return btrfs_punch_hole(inode, offset, len);
> +		return btrfs_punch_hole(inode, offset, len, true);
>  
>  	/*
>  	 * Only trigger disk allocation, don't trigger qgroup reserve
>  	 *
>  	 * For qgroup space, it will be checked later.
>  	 */
> -	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -			alloc_end - alloc_start);
> -	if (ret < 0)
> -		return ret;
> +	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      alloc_end - alloc_start);
> +		if (ret < 0)
> +			return ret;
> +	}
>  
>  	inode_lock(inode);
>  
> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	if (ret)
>  		goto out;
>  
> +	if (mode & FALLOC_FL_ZERO_RANGE) {
> +		ret = btrfs_zero_range(inode, offset, len, mode);
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
>  	locked_end = alloc_end - 1;
>  	while (1) {
>  		struct btrfs_ordered_extent *ordered;
> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>  out:
>  	inode_unlock(inode);
>  	/* Let go of our reservation. */
> -	if (ret != 0)
> +	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>  		btrfs_free_reserved_data_space(inode, data_reserved,
>  				alloc_start, alloc_end - cur_offset);
>  	extent_changeset_free(data_reserved);
> 

[-- Attachment #2: zero-range-simplified.patch --]
[-- Type: text/x-patch, Size: 2918 bytes --]

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2a18a5f9c68e..00db8e10222a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2884,7 +2884,9 @@ static int btrfs_zero_range(struct inode *inode,
 	u64 alloc_end = round_up(offset + len, sectorsize);
 	u64 bytes_to_reserve = 0;
 	bool space_reserved = false;
-	bool punch_hole = false;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart;
+	u64 lockend;
 
 	inode_dio_wait(inode);
 
@@ -2945,11 +2947,15 @@ static int btrfs_zero_range(struct inode *inode,
 							    len, mode);
 			goto out;
 		}
-		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
-			punch_hole = true;
+
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			free_extent_map(em);
+			return ret;
+		}
+
 		free_extent_map(em);
-		if (punch_hole)
-			goto punch_hole;
+
 		alloc_start = round_down(offset, sectorsize);
 		alloc_end = alloc_start + sectorsize;
 		goto reserve_space;
@@ -2994,10 +3000,9 @@ static int btrfs_zero_range(struct inode *inode,
 	}
 
 reserve_space:
-	if (alloc_start < alloc_end)
-		bytes_to_reserve += alloc_end - alloc_start;
+	if (alloc_start < alloc_end) {
+		bytes_to_reserve = alloc_end - alloc_start;
 
-	if (!punch_hole && bytes_to_reserve > 0) {
 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
 						      bytes_to_reserve);
 		if (ret < 0)
@@ -3009,31 +3014,22 @@ static int btrfs_zero_range(struct inode *inode,
 			goto out;
 	}
 
-punch_hole:
-	if (punch_hole) {
-		ret = btrfs_punch_hole(inode, offset, len, false);
-		if (ret)
-			goto out;
-		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
-	} else {
-		struct extent_state *cached_state = NULL;
-		const u64 lockstart = alloc_start;
-		const u64 lockend = alloc_end - 1;
+	lockstart = alloc_start;
+	lockend = alloc_end - 1;
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret)
+		goto out;
+	ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+					alloc_end - alloc_start,
+					i_blocksize(inode),
+					offset + len, &alloc_hint);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+			     lockend, &cached_state, GFP_KERNEL);
+	/* btrfs_prealloc_file_range releases reserved space on error */
+	if (ret)
+		space_reserved = false;
 
-		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
-						  &cached_state);
-		if (ret)
-			goto out;
-		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
-						alloc_end - alloc_start,
-						i_blocksize(inode),
-						offset + len, &alloc_hint);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_KERNEL);
-		/* btrfs_prealloc_file_range releases reserved space on error */
-		if (ret)
-			space_reserved = false;
-	}
  out:
 	if (ret && space_reserved)
 		btrfs_free_reserved_data_space(inode, data_reserved,
