* [PATCH] Btrfs: add support for fallocate's zero range operation
@ 2017-10-25 12:53 fdmanana
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
  2017-11-03 17:20 ` [PATCH v3] " fdmanana
  0 siblings, 2 replies; 14+ messages in thread
From: fdmanana @ 2017-10-25 12:53 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

This implements support for the zero range operation of fallocate. For
now, at least, it's as simple as possible while reusing most of the
existing fallocate and hole punching infrastructure.
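
For reference, a minimal userspace sketch of how the new operation gets
exercised (purely illustrative, not part of the patch; the offset and
length are arbitrary examples):

  #define _GNU_SOURCE
  #include <fcntl.h>          /* fallocate() */
  #include <linux/falloc.h>   /* FALLOC_FL_* flags */

  /* Zero out [4096, 4096 + 1M) of an already open file, keeping the
   * current file size (FALLOC_FL_KEEP_SIZE). Returns 0 on success. */
  int zero_range(int fd)
  {
          return fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                           4096, 1024 * 1024);
  }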

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/file.c | 333 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 290 insertions(+), 43 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..8cf7172f1b63 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
+			    bool lock_inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	inode_lock(inode);
+	if (lock_inode)
+		inode_lock(inode);
 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
 	ret = find_first_non_hole(inode, &offset, &len);
 	if (ret < 0)
@@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncated_block = true;
 		ret = btrfs_truncate_block(inode, offset, 0, 0);
 		if (ret) {
-			inode_unlock(inode);
+			if (lock_inode)
+				inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2564,38 +2607,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
-			inode_unlock(inode);
-			return ret;
-		}
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		inode_unlock(inode);
+		return ret;
 	}
 
 	path = btrfs_alloc_path();
@@ -2758,7 +2774,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			ret = btrfs_end_transaction(trans);
 		}
 	}
-	inode_unlock(inode);
+	if (lock_inode)
+		inode_unlock(inode);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -2804,6 +2821,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 	return 0;
 }
 
+static int btrfs_zero_range_update_isize(struct inode *inode,
+					 const loff_t offset,
+					 const loff_t len,
+					 const int mode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	const u64 end = offset + len;
+	int ret;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+	} else {
+		int err;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		err = btrfs_end_transaction(trans);
+		ret = ret ? ret : err;
+	}
+	return ret;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em = NULL;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+	bool punch_hole = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_zero_range_update_isize(inode, offset,
+							    len, mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_zero_range_update_isize(inode, offset,
+							    len, mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
+			punch_hole = true;
+		free_extent_map(em);
+		if (punch_hole)
+			goto punch_hole;
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end)
+		bytes_to_reserve += alloc_end - alloc_start;
+
+	if (!punch_hole && bytes_to_reserve > 0) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+	}
+
+punch_hole:
+	if (punch_hole) {
+		ret = btrfs_punch_hole(inode, offset, len, false);
+		if (ret)
+			goto out;
+		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
+	} else {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_KERNEL);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2829,21 +3067,24 @@ static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return btrfs_punch_hole(inode, offset, len);
+		return btrfs_punch_hole(inode, offset, len, true);
 
 	/*
 	 * Only trigger disk allocation, don't trigger qgroup reserve
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2885,6 +3126,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -3010,7 +3257,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
-- 
2.11.0


* [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
@ 2017-10-25 14:59 ` fdmanana
  2017-10-30 14:57   ` David Sterba
                     ` (3 more replies)
  2017-11-03 17:20 ` [PATCH v3] " fdmanana
  1 sibling, 4 replies; 14+ messages in thread
From: fdmanana @ 2017-10-25 14:59 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

This implements support for the zero range operation of fallocate. For
now, at least, it's as simple as possible while reusing most of the
existing fallocate and hole punching infrastructure.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Removed double inode unlock on error path from failure to lock range.

 fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 290 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..e0d15c0d1641 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
+			    bool lock_inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	inode_lock(inode);
+	if (lock_inode)
+		inode_lock(inode);
 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
 	ret = find_first_non_hole(inode, &offset, &len);
 	if (ret < 0)
@@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncated_block = true;
 		ret = btrfs_truncate_block(inode, offset, 0, 0);
 		if (ret) {
-			inode_unlock(inode);
+			if (lock_inode)
+				inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		if (lock_inode)
 			inode_unlock(inode);
-			return ret;
-		}
+		return ret;
 	}
 
 	path = btrfs_alloc_path();
@@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			ret = btrfs_end_transaction(trans);
 		}
 	}
-	inode_unlock(inode);
+	if (lock_inode)
+		inode_unlock(inode);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 	return 0;
 }
 
+static int btrfs_zero_range_update_isize(struct inode *inode,
+					 const loff_t offset,
+					 const loff_t len,
+					 const int mode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	const u64 end = offset + len;
+	int ret;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+	} else {
+		int err;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		err = btrfs_end_transaction(trans);
+		ret = ret ? ret : err;
+	}
+	return ret;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em = NULL;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+	bool punch_hole = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_zero_range_update_isize(inode, offset,
+							    len, mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_zero_range_update_isize(inode, offset,
+							    len, mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
+			punch_hole = true;
+		free_extent_map(em);
+		if (punch_hole)
+			goto punch_hole;
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end)
+		bytes_to_reserve += alloc_end - alloc_start;
+
+	if (!punch_hole && bytes_to_reserve > 0) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+	}
+
+punch_hole:
+	if (punch_hole) {
+		ret = btrfs_punch_hole(inode, offset, len, false);
+		if (ret)
+			goto out;
+		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
+	} else {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_KERNEL);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return btrfs_punch_hole(inode, offset, len);
+		return btrfs_punch_hole(inode, offset, len, true);
 
 	/*
 	 * Only trigger disk allocation, don't trigger qgroup reserve
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
-- 
2.11.0


* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
@ 2017-10-30 14:57   ` David Sterba
  2017-11-01 10:34   ` Nikolay Borisov
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: David Sterba @ 2017-10-30 14:57 UTC (permalink / raw)
  To: fdmanana; +Cc: linux-btrfs

On Wed, Oct 25, 2017 at 03:59:08PM +0100, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For
> now, at least, it's as simple as possible while reusing most of the
> existing fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---

I'll add this to for-next so we can get more testing coverage; review is
open.

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
  2017-10-30 14:57   ` David Sterba
@ 2017-11-01 10:34   ` Nikolay Borisov
  2017-11-01 10:59     ` Filipe Manana
  2017-11-02  8:33   ` Nikolay Borisov
  2017-11-03  9:30   ` Nikolay Borisov
  3 siblings, 1 reply; 14+ messages in thread
From: Nikolay Borisov @ 2017-11-01 10:34 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 25.10.2017 17:59, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For
> now, at least, it's as simple as possible while reusing most of the
> existing fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> 
>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 290 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..e0d15c0d1641 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +				       const u64 lockstart,
> +				       const u64 lockend,
> +				       struct extent_state **cached_state)
> +{
> +	while (1) {
> +		struct btrfs_ordered_extent *ordered;
> +		int ret;
> +
> +		truncate_pagecache_range(inode, lockstart, lockend);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +				 cached_state);
> +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +		/*
> +		 * We need to make sure we have no ordered extents in this range
> +		 * and nobody raced in and read a page in this range, if we did
> +		 * we need to try again.
> +		 */
> +		if ((!ordered ||
> +		    (ordered->file_offset + ordered->len <= lockstart ||
> +		     ordered->file_offset > lockend)) &&
> +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +			if (ordered)
> +				btrfs_put_ordered_extent(ordered);
> +			break;
> +		}
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, cached_state, GFP_NOFS);
> +		ret = btrfs_wait_ordered_range(inode, lockstart,
> +					       lockend - lockstart + 1);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +			    bool lock_inode)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	if (ret)
>  		return ret;
>  
> -	inode_lock(inode);
> +	if (lock_inode)
> +		inode_lock(inode);
>  	ino_size = round_up(inode->i_size, fs_info->sectorsize);
>  	ret = find_first_non_hole(inode, &offset, &len);
>  	if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		truncated_block = true;
>  		ret = btrfs_truncate_block(inode, offset, 0, 0);
>  		if (ret) {
> -			inode_unlock(inode);
> +			if (lock_inode)
> +				inode_unlock(inode);
>  			return ret;
>  		}
>  	}
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		goto out_only_mutex;
>  	}
>  
> -	while (1) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		truncate_pagecache_range(inode, lockstart, lockend);
> -
> -		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -				 &cached_state);
> -		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -		/*
> -		 * We need to make sure we have no ordered extents in this range
> -		 * and nobody raced in and read a page in this range, if we did
> -		 * we need to try again.
> -		 */
> -		if ((!ordered ||
> -		    (ordered->file_offset + ordered->len <= lockstart ||
> -		     ordered->file_offset > lockend)) &&
> -		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -			if (ordered)
> -				btrfs_put_ordered_extent(ordered);
> -			break;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -				     lockend, &cached_state, GFP_NOFS);
> -		ret = btrfs_wait_ordered_range(inode, lockstart,
> -					       lockend - lockstart + 1);
> -		if (ret) {
> +	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +					  &cached_state);
> +	if (ret) {
> +		if (lock_inode)
>  			inode_unlock(inode);
> -			return ret;
> -		}
> +		return ret;
>  	}
>  
>  	path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  			ret = btrfs_end_transaction(trans);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (lock_inode)
> +		inode_unlock(inode);
>  	if (ret && !err)
>  		err = ret;
>  	return err;
> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>  	return 0;
>  }
>  
> +static int btrfs_zero_range_update_isize(struct inode *inode,
> +					 const loff_t offset,
> +					 const loff_t len,
> +					 const int mode)
> +{
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_trans_handle *trans;
> +	const u64 end = offset + len;
> +	int ret;
> +
> +	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +		return 0;
> +
> +	i_size_write(inode, end);
> +	btrfs_ordered_update_i_size(inode, end, NULL);
> +	trans = btrfs_start_transaction(root, 1);
> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
> +	} else {
> +		int err;
> +
> +		ret = btrfs_update_inode(trans, root, inode);
> +		err = btrfs_end_transaction(trans);
> +		ret = ret ? ret : err;
> +	}
> +	return ret;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +						 u64 offset)
> +{
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	struct extent_map *em = NULL;
> +	int ret = 0;
> +
> +	offset = round_down(offset, sectorsize);
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +	if (IS_ERR(em))
> +		return PTR_ERR(em);
> +
> +	if (em->block_start == EXTENT_MAP_HOLE)
> +		ret = 1;
> +
> +	free_extent_map(em);
> +	return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +			    loff_t offset,
> +			    loff_t len,
> +			    const int mode)
> +{
> +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +	struct extent_map *em;
> +	struct extent_changeset *data_reserved = NULL;
> +	int ret;
> +	u64 alloc_hint = 0;
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	u64 alloc_start = round_down(offset, sectorsize);
> +	u64 alloc_end = round_up(offset + len, sectorsize);
> +	u64 bytes_to_reserve = 0;
> +	bool space_reserved = false;
> +	bool punch_hole = false;
> +
> +	inode_dio_wait(inode);
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +			      alloc_start, alloc_end - alloc_start, 0);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Avoid hole punching and extent allocation for some cases. More cases
> +	 * could be considered, but these are unlikely common and we keep things
> +	 * as simple as possible for now. Also, intentionally, if the target
> +	 * range contains one or more prealloc extents together with regular
> +	 * extents and holes, we drop all the existing extents and allocate a
> +	 * new prealloc extent, so that we get a larger contiguous disk extent.
> +	 */
> +	if (em->start <= alloc_start &&
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		const u64 em_end = em->start + em->len;
> +
> +		if (em_end >= offset + len) {
> +			/*
> +			 * The whole range is already a prealloc extent,
> +			 * do nothing except updating the inode's i_size if
> +			 * needed.
> +			 */
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		/*
> +		 * Part of the range is already a prealloc extent, so operate
> +		 * only on the remaining part of the range.
> +		 */
> +		alloc_start = em_end;
> +		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +		len = offset + len - alloc_start;
> +		offset = alloc_start;
> +		alloc_hint = em->block_start + em->len;
> +	}
> +	free_extent_map(em);
> +
> +	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +				      alloc_start, sectorsize, 0);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +
> +		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)

Is it really necessary to check if len < sectorsize, since the condition
involving BTRFS_BYTES_TO_BLKS can be true (and this code executing) only
if len is already less than sectorsize?

> +			punch_hole = true;
> +		free_extent_map(em);
> +		if (punch_hole)
> +			goto punch_hole;
> +		alloc_start = round_down(offset, sectorsize);
> +		alloc_end = alloc_start + sectorsize;
> +		goto reserve_space;
> +	}
> +
> +	alloc_start = round_up(offset, sectorsize);
> +	alloc_end = round_down(offset + len, sectorsize);

Shouldn't you be rounding down the start offset and rounding up the end,
just as you are doing at the beginning of the function? Why reverse this
here? Furthermore, aren't those 2 lines really related to the PREALLOC
if-block, in which case they would be better placed there? Otherwise you
discard the alignment you've done at the beginning of the function.


> +
> +	/*
> +	 * For unaligned ranges, check the pages at the boundaries, they might
> +	 * map to an extent, in which case we need to partially zero them, or
> +	 * they might map to a hole, in which case we need our allocation range
> +	 * to cover them.
> +	 */
> +	if (!IS_ALIGNED(offset, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_start = round_down(offset, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset, 0, 0);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	if (!IS_ALIGNED(offset + len, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode,
> +							    offset + len);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_end = round_up(offset + len, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +reserve_space:
> +	if (alloc_start < alloc_end)
> +		bytes_to_reserve += alloc_end - alloc_start;

nit: You are not accumulating here but just assigning so you can use =

> +
> +	if (!punch_hole && bytes_to_reserve > 0) {

alloc_start < alloc_end is equivalent to bytes_to_reserve > 0 so you
could collapse the two if's and remove 'bytes_to_reserve > 0' condition
i.e.

if (alloc_start < alloc_end) {
	bytes_to_reserve += alloc_end - alloc_start;
	if (!punch_hole) {
	.......
	}
}

> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      bytes_to_reserve);
> +		if (ret < 0)
> +			goto out;
> +		space_reserved = true;
> +		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +						alloc_start, bytes_to_reserve);
> +		if (ret)
> +			goto out;
> +	}
> +
> +punch_hole:
> +	if (punch_hole) {
> +		ret = btrfs_punch_hole(inode, offset, len, false);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
> +	} else {
> +		struct extent_state *cached_state = NULL;
> +		const u64 lockstart = alloc_start;
> +		const u64 lockend = alloc_end - 1;
> +
> +		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +						  &cached_state);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +						alloc_end - alloc_start,
> +						i_blocksize(inode),
> +						offset + len, &alloc_hint);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, &cached_state, GFP_KERNEL);
> +		/* btrfs_prealloc_file_range releases reserved space on error */
> +		if (ret)
> +			space_reserved = false;
> +	}
> + out:
> +	if (ret && space_reserved)
> +		btrfs_free_reserved_data_space(inode, data_reserved,
> +					       alloc_start, bytes_to_reserve);
> +	extent_changeset_free(data_reserved);
> +
> +	return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>  			    loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	cur_offset = alloc_start;
>  
>  	/* Make sure we aren't being give some crap mode */
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_ZERO_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
> -		return btrfs_punch_hole(inode, offset, len);
> +		return btrfs_punch_hole(inode, offset, len, true);
>  
>  	/*
>  	 * Only trigger disk allocation, don't trigger qgroup reserve
>  	 *
>  	 * For qgroup space, it will be checked later.
>  	 */
> -	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -			alloc_end - alloc_start);
> -	if (ret < 0)
> -		return ret;
> +	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      alloc_end - alloc_start);
> +		if (ret < 0)
> +			return ret;
> +	}
>  
>  	inode_lock(inode);
>  
> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	if (ret)
>  		goto out;
>  
> +	if (mode & FALLOC_FL_ZERO_RANGE) {
> +		ret = btrfs_zero_range(inode, offset, len, mode);
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
>  	locked_end = alloc_end - 1;
>  	while (1) {
>  		struct btrfs_ordered_extent *ordered;
> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>  out:
>  	inode_unlock(inode);
>  	/* Let go of our reservation. */
> -	if (ret != 0)
> +	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>  		btrfs_free_reserved_data_space(inode, data_reserved,
>  				alloc_start, alloc_end - cur_offset);
>  	extent_changeset_free(data_reserved);
> 

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-11-01 10:34   ` Nikolay Borisov
@ 2017-11-01 10:59     ` Filipe Manana
  0 siblings, 0 replies; 14+ messages in thread
From: Filipe Manana @ 2017-11-01 10:59 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Wed, Nov 1, 2017 at 10:34 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>
>
> On 25.10.2017 17:59, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> This implements support for the zero range operation of fallocate. For
>> now, at least, it's as simple as possible while reusing most of the
>> existing fallocate and hole punching infrastructure.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>
>> V2: Removed double inode unlock on error path from failure to lock range.
>>
>>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 290 insertions(+), 42 deletions(-)
>>
>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>> index aafcc785f840..e0d15c0d1641 100644
>> --- a/fs/btrfs/file.c
>> +++ b/fs/btrfs/file.c
>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>       return ret;
>>  }
>>
>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>> +                                    const u64 lockstart,
>> +                                    const u64 lockend,
>> +                                    struct extent_state **cached_state)
>> +{
>> +     while (1) {
>> +             struct btrfs_ordered_extent *ordered;
>> +             int ret;
>> +
>> +             truncate_pagecache_range(inode, lockstart, lockend);
>> +
>> +             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>> +                              cached_state);
>> +             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>> +
>> +             /*
>> +              * We need to make sure we have no ordered extents in this range
>> +              * and nobody raced in and read a page in this range, if we did
>> +              * we need to try again.
>> +              */
>> +             if ((!ordered ||
>> +                 (ordered->file_offset + ordered->len <= lockstart ||
>> +                  ordered->file_offset > lockend)) &&
>> +                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>> +                     if (ordered)
>> +                             btrfs_put_ordered_extent(ordered);
>> +                     break;
>> +             }
>> +             if (ordered)
>> +                     btrfs_put_ordered_extent(ordered);
>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> +                                  lockend, cached_state, GFP_NOFS);
>> +             ret = btrfs_wait_ordered_range(inode, lockstart,
>> +                                            lockend - lockstart + 1);
>> +             if (ret)
>> +                     return ret;
>> +     }
>> +     return 0;
>> +}
>> +
>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>> +                         bool lock_inode)
>>  {
>>       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>       struct btrfs_root *root = BTRFS_I(inode)->root;
>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>       if (ret)
>>               return ret;
>>
>> -     inode_lock(inode);
>> +     if (lock_inode)
>> +             inode_lock(inode);
>>       ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>       ret = find_first_non_hole(inode, &offset, &len);
>>       if (ret < 0)
>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>               truncated_block = true;
>>               ret = btrfs_truncate_block(inode, offset, 0, 0);
>>               if (ret) {
>> -                     inode_unlock(inode);
>> +                     if (lock_inode)
>> +                             inode_unlock(inode);
>>                       return ret;
>>               }
>>       }
>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>               goto out_only_mutex;
>>       }
>>
>> -     while (1) {
>> -             struct btrfs_ordered_extent *ordered;
>> -
>> -             truncate_pagecache_range(inode, lockstart, lockend);
>> -
>> -             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>> -                              &cached_state);
>> -             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>> -
>> -             /*
>> -              * We need to make sure we have no ordered extents in this range
>> -              * and nobody raced in and read a page in this range, if we did
>> -              * we need to try again.
>> -              */
>> -             if ((!ordered ||
>> -                 (ordered->file_offset + ordered->len <= lockstart ||
>> -                  ordered->file_offset > lockend)) &&
>> -                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>> -                     if (ordered)
>> -                             btrfs_put_ordered_extent(ordered);
>> -                     break;
>> -             }
>> -             if (ordered)
>> -                     btrfs_put_ordered_extent(ordered);
>> -             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> -                                  lockend, &cached_state, GFP_NOFS);
>> -             ret = btrfs_wait_ordered_range(inode, lockstart,
>> -                                            lockend - lockstart + 1);
>> -             if (ret) {
>> +     ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>> +                                       &cached_state);
>> +     if (ret) {
>> +             if (lock_inode)
>>                       inode_unlock(inode);
>> -                     return ret;
>> -             }
>> +             return ret;
>>       }
>>
>>       path = btrfs_alloc_path();
>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>                       ret = btrfs_end_transaction(trans);
>>               }
>>       }
>> -     inode_unlock(inode);
>> +     if (lock_inode)
>> +             inode_unlock(inode);
>>       if (ret && !err)
>>               err = ret;
>>       return err;
>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>       return 0;
>>  }
>>
>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>> +                                      const loff_t offset,
>> +                                      const loff_t len,
>> +                                      const int mode)
>> +{
>> +     struct btrfs_root *root = BTRFS_I(inode)->root;
>> +     struct btrfs_trans_handle *trans;
>> +     const u64 end = offset + len;
>> +     int ret;
>> +
>> +     if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>> +             return 0;
>> +
>> +     i_size_write(inode, end);
>> +     btrfs_ordered_update_i_size(inode, end, NULL);
>> +     trans = btrfs_start_transaction(root, 1);
>> +     if (IS_ERR(trans)) {
>> +             ret = PTR_ERR(trans);
>> +     } else {
>> +             int err;
>> +
>> +             ret = btrfs_update_inode(trans, root, inode);
>> +             err = btrfs_end_transaction(trans);
>> +             ret = ret ? ret : err;
>> +     }
>> +     return ret;
>> +}
>> +
>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>> +                                              u64 offset)
>> +{
>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>> +     struct extent_map *em = NULL;
>> +     int ret = 0;
>> +
>> +     offset = round_down(offset, sectorsize);
>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>> +     if (IS_ERR(em))
>> +             return PTR_ERR(em);
>> +
>> +     if (em->block_start == EXTENT_MAP_HOLE)
>> +             ret = 1;
>> +
>> +     free_extent_map(em);
>> +     return ret;
>> +}
>> +
>> +static int btrfs_zero_range(struct inode *inode,
>> +                         loff_t offset,
>> +                         loff_t len,
>> +                         const int mode)
>> +{
>> +     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>> +     struct extent_map *em;
>> +     struct extent_changeset *data_reserved = NULL;
>> +     int ret;
>> +     u64 alloc_hint = 0;
>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>> +     u64 alloc_start = round_down(offset, sectorsize);
>> +     u64 alloc_end = round_up(offset + len, sectorsize);
>> +     u64 bytes_to_reserve = 0;
>> +     bool space_reserved = false;
>> +     bool punch_hole = false;
>> +
>> +     inode_dio_wait(inode);
>> +
>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>> +                           alloc_start, alloc_end - alloc_start, 0);
>> +     if (IS_ERR(em)) {
>> +             ret = PTR_ERR(em);
>> +             goto out;
>> +     }
>> +
>> +     /*
>> +      * Avoid hole punching and extent allocation for some cases. More cases
>> +      * could be considered, but these are unlikely common and we keep things
>> +      * as simple as possible for now. Also, intentionally, if the target
>> +      * range contains one or more prealloc extents together with regular
>> +      * extents and holes, we drop all the existing extents and allocate a
>> +      * new prealloc extent, so that we get a larger contiguous disk extent.
>> +      */
>> +     if (em->start <= alloc_start &&
>> +         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>> +             const u64 em_end = em->start + em->len;
>> +
>> +             if (em_end >= offset + len) {
>> +                     /*
>> +                      * The whole range is already a prealloc extent,
>> +                      * do nothing except updating the inode's i_size if
>> +                      * needed.
>> +                      */
>> +                     free_extent_map(em);
>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>> +                                                         len, mode);
>> +                     goto out;
>> +             }
>> +             /*
>> +              * Part of the range is already a prealloc extent, so operate
>> +              * only on the remaining part of the range.
>> +              */
>> +             alloc_start = em_end;
>> +             ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>> +             len = offset + len - alloc_start;
>> +             offset = alloc_start;
>> +             alloc_hint = em->block_start + em->len;
>> +     }
>> +     free_extent_map(em);
>> +
>> +     if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>> +         BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>> +             em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>> +                                   alloc_start, sectorsize, 0);
>> +             if (IS_ERR(em)) {
>> +                     ret = PTR_ERR(em);
>> +                     goto out;
>> +             }
>> +
>> +             if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>> +                     free_extent_map(em);
>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>> +                                                         len, mode);
>> +                     goto out;
>> +             }
>> +             if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>
> Is it really necessary to check if len < sectorsize, since the condition
> involving BTRFS_BYTES_TO_BLKS can be true (and this code executing) only
> if len is already less than sectorsize?

Yes, it's necessary. Len can be == sectorsize, in which case we can
not do hole punching.
I suggest you pay closer attention when reading the code.
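
To illustrate with hypothetical numbers, assuming a 4K sectorsize: for
offset == 4096 and len == 4096, both offset and offset + len - 1 == 8191
fall in the same block, so this branch is taken even though len ==
sectorsize; the len < sectorsize check keeps such a fully covered block
on the preallocation path below rather than the hole punching one.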

>
>> +                     punch_hole = true;
>> +             free_extent_map(em);
>> +             if (punch_hole)
>> +                     goto punch_hole;
>> +             alloc_start = round_down(offset, sectorsize);
>> +             alloc_end = alloc_start + sectorsize;
>> +             goto reserve_space;
>> +     }
>> +
>> +     alloc_start = round_up(offset, sectorsize);
>> +     alloc_end = round_down(offset + len, sectorsize);
>
> Shouldn't you be rounding down the start offset and rounding up the end,
> just as you are doing at the beginning of the function?

No. For unaligned boundaries you have to partially zero the respective blocks.
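
As an illustration, with hypothetical numbers and a 4K sectorsize: for
offset == 1000 and len == 10000, alloc_start becomes 4096 and alloc_end
becomes 8192; the unaligned head [1000, 4096) and tail [8192, 11000) are
then either zeroed in place with btrfs_truncate_block() or, if they map
to holes, pulled into the allocation range by the boundary checks that
follow.
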

> Why reverse this
> here? Furthermore, aren't those 2 lines really related to the PREALLOC
> if-block, in which case they would be better placed there? Otherwise you
> discard the alignment you've done at the beginning of the function.

I don't understand your doubts. Which PREALLOC if-block? These assignments
are in the correct place;
you can try moving them around yourself and see if the zero range
fstests still pass.

No, I'm not discarding the alignments done at the beginning, because those
values are used before they are adjusted here.

>
>
>> +
>> +     /*
>> +      * For unaligned ranges, check the pages at the boundaries, they might
>> +      * map to an extent, in which case we need to partially zero them, or
>> +      * they might map to a hole, in which case we need our allocation range
>> +      * to cover them.
>> +      */
>> +     if (!IS_ALIGNED(offset, sectorsize)) {
>> +             ret = btrfs_zero_range_check_range_boundary(inode, offset);
>> +             if (ret < 0)
>> +                     goto out;
>> +             if (ret) {
>> +                     alloc_start = round_down(offset, sectorsize);
>> +                     ret = 0;
>> +             } else {
>> +                     ret = btrfs_truncate_block(inode, offset, 0, 0);
>> +                     if (ret)
>> +                             goto out;
>> +             }
>> +     }
>> +
>> +     if (!IS_ALIGNED(offset + len, sectorsize)) {
>> +             ret = btrfs_zero_range_check_range_boundary(inode,
>> +                                                         offset + len);
>> +             if (ret < 0)
>> +                     goto out;
>> +             if (ret) {
>> +                     alloc_end = round_up(offset + len, sectorsize);
>> +                     ret = 0;
>> +             } else {
>> +                     ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>> +                     if (ret)
>> +                             goto out;
>> +             }
>> +     }
>> +
>> +reserve_space:
>> +     if (alloc_start < alloc_end)
>> +             bytes_to_reserve += alloc_end - alloc_start;
>
> nit: You are not accumulating here but just assigning so you can use =

...

>
>> +
>> +     if (!punch_hole && bytes_to_reserve > 0) {
>
> alloc_start < alloc_end is equivalent to bytes_to_reserve > 0 so you
> could collapse the two if's and remove 'bytes_to_reserve > 0' condition
> i.e.
>
> if (alloc_start < alloc_end) {
>         bytes_to_reserve += alloc_end - alloc_start;
>         if (!punch_hole) {
>         .......
>         }
> }

I prefer to avoid more levels of indentation.

>
>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> +                                                   bytes_to_reserve);
>> +             if (ret < 0)
>> +                     goto out;
>> +             space_reserved = true;
>> +             ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>> +                                             alloc_start, bytes_to_reserve);
>> +             if (ret)
>> +                     goto out;
>> +     }
>> +
>> +punch_hole:
>> +     if (punch_hole) {
>> +             ret = btrfs_punch_hole(inode, offset, len, false);
>> +             if (ret)
>> +                     goto out;
>> +             ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>> +     } else {
>> +             struct extent_state *cached_state = NULL;
>> +             const u64 lockstart = alloc_start;
>> +             const u64 lockend = alloc_end - 1;
>> +
>> +             ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>> +                                               &cached_state);
>> +             if (ret)
>> +                     goto out;
>> +             ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>> +                                             alloc_end - alloc_start,
>> +                                             i_blocksize(inode),
>> +                                             offset + len, &alloc_hint);
>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> +                                  lockend, &cached_state, GFP_KERNEL);
>> +             /* btrfs_prealloc_file_range releases reserved space on error */
>> +             if (ret)
>> +                     space_reserved = false;
>> +     }
>> + out:
>> +     if (ret && space_reserved)
>> +             btrfs_free_reserved_data_space(inode, data_reserved,
>> +                                            alloc_start, bytes_to_reserve);
>> +     extent_changeset_free(data_reserved);
>> +
>> +     return ret;
>> +}
>> +
>>  static long btrfs_fallocate(struct file *file, int mode,
>>                           loff_t offset, loff_t len)
>>  {
>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>       cur_offset = alloc_start;
>>
>>       /* Make sure we aren't being give some crap mode */
>> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> +                  FALLOC_FL_ZERO_RANGE))
>>               return -EOPNOTSUPP;
>>
>>       if (mode & FALLOC_FL_PUNCH_HOLE)
>> -             return btrfs_punch_hole(inode, offset, len);
>> +             return btrfs_punch_hole(inode, offset, len, true);
>>
>>       /*
>>        * Only trigger disk allocation, don't trigger qgroup reserve
>>        *
>>        * For qgroup space, it will be checked later.
>>        */
>> -     ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> -                     alloc_end - alloc_start);
>> -     if (ret < 0)
>> -             return ret;
>> +     if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> +                                                   alloc_end - alloc_start);
>> +             if (ret < 0)
>> +                     return ret;
>> +     }
>>
>>       inode_lock(inode);
>>
>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>       if (ret)
>>               goto out;
>>
>> +     if (mode & FALLOC_FL_ZERO_RANGE) {
>> +             ret = btrfs_zero_range(inode, offset, len, mode);
>> +             inode_unlock(inode);
>> +             return ret;
>> +     }
>> +
>>       locked_end = alloc_end - 1;
>>       while (1) {
>>               struct btrfs_ordered_extent *ordered;
>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>  out:
>>       inode_unlock(inode);
>>       /* Let go of our reservation. */
>> -     if (ret != 0)
>> +     if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>               btrfs_free_reserved_data_space(inode, data_reserved,
>>                               alloc_start, alloc_end - cur_offset);
>>       extent_changeset_free(data_reserved);
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
  2017-10-30 14:57   ` David Sterba
  2017-11-01 10:34   ` Nikolay Borisov
@ 2017-11-02  8:33   ` Nikolay Borisov
  2017-11-03  9:30   ` Nikolay Borisov
  3 siblings, 0 replies; 14+ messages in thread
From: Nikolay Borisov @ 2017-11-02  8:33 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 25.10.2017 17:59, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> 
>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 290 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..e0d15c0d1641 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  

<snip>

>  
> +static int btrfs_zero_range_update_isize(struct inode *inode,
> +					 const loff_t offset,
> +					 const loff_t len,
> +					 const int mode)
> +{
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_trans_handle *trans;
> +	const u64 end = offset + len;
> +	int ret;
> +
> +	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +		return 0;
> +

Don't we also want to add inode->i_ctime = current_time(inode), similar
to what we have when updating the i_size at the end of btrfs_fallocate
and __btrfs_prealloc_file_range?


Also, this function is very similar to the code right before the
'out_unlock' label in btrfs_fallocate. Perhaps this function could be
named btrfs_update_isize and used there as well?
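
For illustration only, a rough sketch of what that shared helper could
look like, folding in the ctime update (just a sketch, untested; the
FALLOC_FL_KEEP_SIZE check would stay with the callers):

static int btrfs_update_isize(struct inode *inode, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	int ret;

	if (end <= i_size_read(inode))
		return 0;

	inode->i_ctime = current_time(inode);
	i_size_write(inode, end);
	btrfs_ordered_update_i_size(inode, end, NULL);
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
	} else {
		int err;

		ret = btrfs_update_inode(trans, root, inode);
		err = btrfs_end_transaction(trans);
		ret = ret ? ret : err;
	}
	return ret;
}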

<snip>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
                     ` (2 preceding siblings ...)
  2017-11-02  8:33   ` Nikolay Borisov
@ 2017-11-03  9:30   ` Nikolay Borisov
  2017-11-03 10:29     ` Filipe Manana
  3 siblings, 1 reply; 14+ messages in thread
From: Nikolay Borisov @ 2017-11-03  9:30 UTC (permalink / raw)
  To: fdmanana, linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 14059 bytes --]



On 25.10.2017 17:59, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> 
>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 290 insertions(+), 42 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..e0d15c0d1641 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +				       const u64 lockstart,
> +				       const u64 lockend,
> +				       struct extent_state **cached_state)
> +{
> +	while (1) {
> +		struct btrfs_ordered_extent *ordered;
> +		int ret;
> +
> +		truncate_pagecache_range(inode, lockstart, lockend);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +				 cached_state);
> +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +		/*
> +		 * We need to make sure we have no ordered extents in this range
> +		 * and nobody raced in and read a page in this range, if we did
> +		 * we need to try again.
> +		 */
> +		if ((!ordered ||
> +		    (ordered->file_offset + ordered->len <= lockstart ||
> +		     ordered->file_offset > lockend)) &&
> +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +			if (ordered)
> +				btrfs_put_ordered_extent(ordered);
> +			break;
> +		}
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, cached_state, GFP_NOFS);
> +		ret = btrfs_wait_ordered_range(inode, lockstart,
> +					       lockend - lockstart + 1);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +			    bool lock_inode)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	if (ret)
>  		return ret;
>  
> -	inode_lock(inode);
> +	if (lock_inode)
> +		inode_lock(inode);
>  	ino_size = round_up(inode->i_size, fs_info->sectorsize);
>  	ret = find_first_non_hole(inode, &offset, &len);
>  	if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		truncated_block = true;
>  		ret = btrfs_truncate_block(inode, offset, 0, 0);
>  		if (ret) {
> -			inode_unlock(inode);
> +			if (lock_inode)
> +				inode_unlock(inode);
>  			return ret;
>  		}
>  	}
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		goto out_only_mutex;
>  	}
>  
> -	while (1) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		truncate_pagecache_range(inode, lockstart, lockend);
> -
> -		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -				 &cached_state);
> -		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -		/*
> -		 * We need to make sure we have no ordered extents in this range
> -		 * and nobody raced in and read a page in this range, if we did
> -		 * we need to try again.
> -		 */
> -		if ((!ordered ||
> -		    (ordered->file_offset + ordered->len <= lockstart ||
> -		     ordered->file_offset > lockend)) &&
> -		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -			if (ordered)
> -				btrfs_put_ordered_extent(ordered);
> -			break;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -				     lockend, &cached_state, GFP_NOFS);
> -		ret = btrfs_wait_ordered_range(inode, lockstart,
> -					       lockend - lockstart + 1);
> -		if (ret) {
> +	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +					  &cached_state);
> +	if (ret) {
> +		if (lock_inode)
>  			inode_unlock(inode);
> -			return ret;
> -		}
> +		return ret;
>  	}
>  
>  	path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  			ret = btrfs_end_transaction(trans);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (lock_inode)
> +		inode_unlock(inode);
>  	if (ret && !err)
>  		err = ret;
>  	return err;
> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>  	return 0;
>  }
>  
> +static int btrfs_zero_range_update_isize(struct inode *inode,
> +					 const loff_t offset,
> +					 const loff_t len,
> +					 const int mode)
> +{
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct btrfs_trans_handle *trans;
> +	const u64 end = offset + len;
> +	int ret;
> +
> +	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +		return 0;
> +
> +	i_size_write(inode, end);
> +	btrfs_ordered_update_i_size(inode, end, NULL);
> +	trans = btrfs_start_transaction(root, 1);
> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
> +	} else {
> +		int err;
> +
> +		ret = btrfs_update_inode(trans, root, inode);
> +		err = btrfs_end_transaction(trans);
> +		ret = ret ? ret : err;
> +	}
> +	return ret;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +						 u64 offset)
> +{
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	struct extent_map *em = NULL;
> +	int ret = 0;
> +
> +	offset = round_down(offset, sectorsize);
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +	if (IS_ERR(em))
> +		return PTR_ERR(em);
> +
> +	if (em->block_start == EXTENT_MAP_HOLE)
> +		ret = 1;
> +
> +	free_extent_map(em);
> +	return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +			    loff_t offset,
> +			    loff_t len,
> +			    const int mode)
> +{
> +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +	struct extent_map *em;
> +	struct extent_changeset *data_reserved = NULL;
> +	int ret;
> +	u64 alloc_hint = 0;
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	u64 alloc_start = round_down(offset, sectorsize);
> +	u64 alloc_end = round_up(offset + len, sectorsize);
> +	u64 bytes_to_reserve = 0;
> +	bool space_reserved = false;
> +	bool punch_hole = false;
> +
> +	inode_dio_wait(inode);
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +			      alloc_start, alloc_end - alloc_start, 0);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Avoid hole punching and extent allocation for some cases. More cases
> +	 * could be considered, but these are unlikely common and we keep things
> +	 * as simple as possible for now. Also, intentionally, if the target
> +	 * range contains one or more prealloc extents together with regular
> +	 * extents and holes, we drop all the existing extents and allocate a
> +	 * new prealloc extent, so that we get a larger contiguous disk extent.
> +	 */
> +	if (em->start <= alloc_start &&
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		const u64 em_end = em->start + em->len;
> +
> +		if (em_end >= offset + len) {
> +			/*
> +			 * The whole range is already a prealloc extent,
> +			 * do nothing except updating the inode's i_size if
> +			 * needed.
> +			 */
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		/*
> +		 * Part of the range is already a prealloc extent, so operate
> +		 * only on the remaining part of the range.
> +		 */
> +		alloc_start = em_end;
> +		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +		len = offset + len - alloc_start;
> +		offset = alloc_start;
> +		alloc_hint = em->block_start + em->len;
> +	}
> +	free_extent_map(em);
> +
> +	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +				      alloc_start, sectorsize, 0);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +
> +		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +			free_extent_map(em);
> +			ret = btrfs_zero_range_update_isize(inode, offset,
> +							    len, mode);
> +			goto out;
> +		}
> +		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
> +			punch_hole = true;
> +		free_extent_map(em);
> +		if (punch_hole)
> +			goto punch_hole;

This here is correct for a very non-obvious reason. If punch_hole is
true, this means we are only ever going to execute the partial truncate
code in btrfs_punch_hole and not punch a hole at all; this is a very
convoluted way of invoking truncation!

Instead, I propose something similar to the attached diff, which just
calls btrfs_truncate_block directly. This allows removing one of the
labels and simplifies the code flow. If this check triggers:
(len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
guaranteed that we are within the inode boundaries, so there is no need
to update the inode size, hence I've omitted it, though I'm not 100%
sure; perhaps we want to update the inode's ctime?

This passes generic/008 and generic/009

> +		alloc_start = round_down(offset, sectorsize);
> +		alloc_end = alloc_start + sectorsize;
> +		goto reserve_space;
> +	}
> +
> +	alloc_start = round_up(offset, sectorsize);
> +	alloc_end = round_down(offset + len, sectorsize);
> +
> +	/*
> +	 * For unaligned ranges, check the pages at the boundaries, they might
> +	 * map to an extent, in which case we need to partially zero them, or
> +	 * they might map to a hole, in which case we need our allocation range
> +	 * to cover them.
> +	 */
> +	if (!IS_ALIGNED(offset, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_start = round_down(offset, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset, 0, 0);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	if (!IS_ALIGNED(offset + len, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode,
> +							    offset + len);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_end = round_up(offset + len, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +reserve_space:
> +	if (alloc_start < alloc_end)
> +		bytes_to_reserve += alloc_end - alloc_start;
> +
> +	if (!punch_hole && bytes_to_reserve > 0) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      bytes_to_reserve);
> +		if (ret < 0)
> +			goto out;
> +		space_reserved = true;
> +		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +						alloc_start, bytes_to_reserve);
> +		if (ret)
> +			goto out;
> +	}
> +
> +punch_hole:
> +	if (punch_hole) {
> +		ret = btrfs_punch_hole(inode, offset, len, false);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
> +	} else {
> +		struct extent_state *cached_state = NULL;
> +		const u64 lockstart = alloc_start;
> +		const u64 lockend = alloc_end - 1;
> +
> +		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +						  &cached_state);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +						alloc_end - alloc_start,
> +						i_blocksize(inode),
> +						offset + len, &alloc_hint);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, &cached_state, GFP_KERNEL);
> +		/* btrfs_prealloc_file_range releases reserved space on error */
> +		if (ret)
> +			space_reserved = false;
> +	}
> + out:
> +	if (ret && space_reserved)
> +		btrfs_free_reserved_data_space(inode, data_reserved,
> +					       alloc_start, bytes_to_reserve);
> +	extent_changeset_free(data_reserved);
> +
> +	return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>  			    loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	cur_offset = alloc_start;
>  
>  	/* Make sure we aren't being give some crap mode */
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_ZERO_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
> -		return btrfs_punch_hole(inode, offset, len);
> +		return btrfs_punch_hole(inode, offset, len, true);
>  
>  	/*
>  	 * Only trigger disk allocation, don't trigger qgroup reserve
>  	 *
>  	 * For qgroup space, it will be checked later.
>  	 */
> -	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -			alloc_end - alloc_start);
> -	if (ret < 0)
> -		return ret;
> +	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      alloc_end - alloc_start);
> +		if (ret < 0)
> +			return ret;
> +	}
>  
>  	inode_lock(inode);
>  
> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	if (ret)
>  		goto out;
>  
> +	if (mode & FALLOC_FL_ZERO_RANGE) {
> +		ret = btrfs_zero_range(inode, offset, len, mode);
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
>  	locked_end = alloc_end - 1;
>  	while (1) {
>  		struct btrfs_ordered_extent *ordered;
> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>  out:
>  	inode_unlock(inode);
>  	/* Let go of our reservation. */
> -	if (ret != 0)
> +	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>  		btrfs_free_reserved_data_space(inode, data_reserved,
>  				alloc_start, alloc_end - cur_offset);
>  	extent_changeset_free(data_reserved);
> 

[-- Attachment #2: zero-range-simplified.patch --]
[-- Type: text/x-patch, Size: 2918 bytes --]

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2a18a5f9c68e..00db8e10222a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2884,7 +2884,9 @@ static int btrfs_zero_range(struct inode *inode,
 	u64 alloc_end = round_up(offset + len, sectorsize);
 	u64 bytes_to_reserve = 0;
 	bool space_reserved = false;
-	bool punch_hole = false;
+	struct extent_state *cached_state = NULL;
+	u64 lockstart;
+	u64 lockend;
 
 	inode_dio_wait(inode);
 
@@ -2945,11 +2947,15 @@ static int btrfs_zero_range(struct inode *inode,
 							    len, mode);
 			goto out;
 		}
-		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
-			punch_hole = true;
+
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			free_extent_map(em);
+			return ret;
+		}
+
 		free_extent_map(em);
-		if (punch_hole)
-			goto punch_hole;
+
 		alloc_start = round_down(offset, sectorsize);
 		alloc_end = alloc_start + sectorsize;
 		goto reserve_space;
@@ -2994,10 +3000,9 @@ static int btrfs_zero_range(struct inode *inode,
 	}
 
 reserve_space:
-	if (alloc_start < alloc_end)
-		bytes_to_reserve += alloc_end - alloc_start;
+	if (alloc_start < alloc_end) {
+		bytes_to_reserve = alloc_end - alloc_start;
 
-	if (!punch_hole && bytes_to_reserve > 0) {
 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
 						      bytes_to_reserve);
 		if (ret < 0)
@@ -3009,31 +3014,22 @@ static int btrfs_zero_range(struct inode *inode,
 			goto out;
 	}
 
-punch_hole:
-	if (punch_hole) {
-		ret = btrfs_punch_hole(inode, offset, len, false);
-		if (ret)
-			goto out;
-		ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
-	} else {
-		struct extent_state *cached_state = NULL;
-		const u64 lockstart = alloc_start;
-		const u64 lockend = alloc_end - 1;
+	lockstart = alloc_start;
+	lockend = alloc_end - 1;
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret)
+		goto out;
+	ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+					alloc_end - alloc_start,
+					i_blocksize(inode),
+					offset + len, &alloc_hint);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+			     lockend, &cached_state, GFP_KERNEL);
+	/* btrfs_prealloc_file_range releases reserved space on error */
+	if (ret)
+		space_reserved = false;
 
-		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
-						  &cached_state);
-		if (ret)
-			goto out;
-		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
-						alloc_end - alloc_start,
-						i_blocksize(inode),
-						offset + len, &alloc_hint);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_KERNEL);
-		/* btrfs_prealloc_file_range releases reserved space on error */
-		if (ret)
-			space_reserved = false;
-	}
  out:
 	if (ret && space_reserved)
 		btrfs_free_reserved_data_space(inode, data_reserved,

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-11-03  9:30   ` Nikolay Borisov
@ 2017-11-03 10:29     ` Filipe Manana
  2017-11-03 10:45       ` Filipe Manana
  0 siblings, 1 reply; 14+ messages in thread
From: Filipe Manana @ 2017-11-03 10:29 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Fri, Nov 3, 2017 at 9:30 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>
>
> On 25.10.2017 17:59, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> This implements support for the zero range operation of fallocate. For now
>> at least it's as simple as possible while reusing most of the existing
>> fallocate and hole punching infrastructure.
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>> ---
>>
>> V2: Removed double inode unlock on error path from failure to lock range.
>>
>>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 290 insertions(+), 42 deletions(-)
>>
>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>> index aafcc785f840..e0d15c0d1641 100644
>> --- a/fs/btrfs/file.c
>> +++ b/fs/btrfs/file.c
>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>       return ret;
>>  }
>>
>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>> +                                    const u64 lockstart,
>> +                                    const u64 lockend,
>> +                                    struct extent_state **cached_state)
>> +{
>> +     while (1) {
>> +             struct btrfs_ordered_extent *ordered;
>> +             int ret;
>> +
>> +             truncate_pagecache_range(inode, lockstart, lockend);
>> +
>> +             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>> +                              cached_state);
>> +             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>> +
>> +             /*
>> +              * We need to make sure we have no ordered extents in this range
>> +              * and nobody raced in and read a page in this range, if we did
>> +              * we need to try again.
>> +              */
>> +             if ((!ordered ||
>> +                 (ordered->file_offset + ordered->len <= lockstart ||
>> +                  ordered->file_offset > lockend)) &&
>> +                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>> +                     if (ordered)
>> +                             btrfs_put_ordered_extent(ordered);
>> +                     break;
>> +             }
>> +             if (ordered)
>> +                     btrfs_put_ordered_extent(ordered);
>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> +                                  lockend, cached_state, GFP_NOFS);
>> +             ret = btrfs_wait_ordered_range(inode, lockstart,
>> +                                            lockend - lockstart + 1);
>> +             if (ret)
>> +                     return ret;
>> +     }
>> +     return 0;
>> +}
>> +
>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>> +                         bool lock_inode)
>>  {
>>       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>       struct btrfs_root *root = BTRFS_I(inode)->root;
>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>       if (ret)
>>               return ret;
>>
>> -     inode_lock(inode);
>> +     if (lock_inode)
>> +             inode_lock(inode);
>>       ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>       ret = find_first_non_hole(inode, &offset, &len);
>>       if (ret < 0)
>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>               truncated_block = true;
>>               ret = btrfs_truncate_block(inode, offset, 0, 0);
>>               if (ret) {
>> -                     inode_unlock(inode);
>> +                     if (lock_inode)
>> +                             inode_unlock(inode);
>>                       return ret;
>>               }
>>       }
>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>               goto out_only_mutex;
>>       }
>>
>> -     while (1) {
>> -             struct btrfs_ordered_extent *ordered;
>> -
>> -             truncate_pagecache_range(inode, lockstart, lockend);
>> -
>> -             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>> -                              &cached_state);
>> -             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>> -
>> -             /*
>> -              * We need to make sure we have no ordered extents in this range
>> -              * and nobody raced in and read a page in this range, if we did
>> -              * we need to try again.
>> -              */
>> -             if ((!ordered ||
>> -                 (ordered->file_offset + ordered->len <= lockstart ||
>> -                  ordered->file_offset > lockend)) &&
>> -                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>> -                     if (ordered)
>> -                             btrfs_put_ordered_extent(ordered);
>> -                     break;
>> -             }
>> -             if (ordered)
>> -                     btrfs_put_ordered_extent(ordered);
>> -             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> -                                  lockend, &cached_state, GFP_NOFS);
>> -             ret = btrfs_wait_ordered_range(inode, lockstart,
>> -                                            lockend - lockstart + 1);
>> -             if (ret) {
>> +     ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>> +                                       &cached_state);
>> +     if (ret) {
>> +             if (lock_inode)
>>                       inode_unlock(inode);
>> -                     return ret;
>> -             }
>> +             return ret;
>>       }
>>
>>       path = btrfs_alloc_path();
>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>                       ret = btrfs_end_transaction(trans);
>>               }
>>       }
>> -     inode_unlock(inode);
>> +     if (lock_inode)
>> +             inode_unlock(inode);
>>       if (ret && !err)
>>               err = ret;
>>       return err;
>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>       return 0;
>>  }
>>
>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>> +                                      const loff_t offset,
>> +                                      const loff_t len,
>> +                                      const int mode)
>> +{
>> +     struct btrfs_root *root = BTRFS_I(inode)->root;
>> +     struct btrfs_trans_handle *trans;
>> +     const u64 end = offset + len;
>> +     int ret;
>> +
>> +     if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>> +             return 0;
>> +
>> +     i_size_write(inode, end);
>> +     btrfs_ordered_update_i_size(inode, end, NULL);
>> +     trans = btrfs_start_transaction(root, 1);
>> +     if (IS_ERR(trans)) {
>> +             ret = PTR_ERR(trans);
>> +     } else {
>> +             int err;
>> +
>> +             ret = btrfs_update_inode(trans, root, inode);
>> +             err = btrfs_end_transaction(trans);
>> +             ret = ret ? ret : err;
>> +     }
>> +     return ret;
>> +}
>> +
>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>> +                                              u64 offset)
>> +{
>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>> +     struct extent_map *em = NULL;
>> +     int ret = 0;
>> +
>> +     offset = round_down(offset, sectorsize);
>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>> +     if (IS_ERR(em))
>> +             return PTR_ERR(em);
>> +
>> +     if (em->block_start == EXTENT_MAP_HOLE)
>> +             ret = 1;
>> +
>> +     free_extent_map(em);
>> +     return ret;
>> +}
>> +
>> +static int btrfs_zero_range(struct inode *inode,
>> +                         loff_t offset,
>> +                         loff_t len,
>> +                         const int mode)
>> +{
>> +     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>> +     struct extent_map *em;
>> +     struct extent_changeset *data_reserved = NULL;
>> +     int ret;
>> +     u64 alloc_hint = 0;
>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>> +     u64 alloc_start = round_down(offset, sectorsize);
>> +     u64 alloc_end = round_up(offset + len, sectorsize);
>> +     u64 bytes_to_reserve = 0;
>> +     bool space_reserved = false;
>> +     bool punch_hole = false;
>> +
>> +     inode_dio_wait(inode);
>> +
>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>> +                           alloc_start, alloc_end - alloc_start, 0);
>> +     if (IS_ERR(em)) {
>> +             ret = PTR_ERR(em);
>> +             goto out;
>> +     }
>> +
>> +     /*
>> +      * Avoid hole punching and extent allocation for some cases. More cases
>> +      * could be considered, but these are unlikely common and we keep things
>> +      * as simple as possible for now. Also, intentionally, if the target
>> +      * range contains one or more prealloc extents together with regular
>> +      * extents and holes, we drop all the existing extents and allocate a
>> +      * new prealloc extent, so that we get a larger contiguous disk extent.
>> +      */
>> +     if (em->start <= alloc_start &&
>> +         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>> +             const u64 em_end = em->start + em->len;
>> +
>> +             if (em_end >= offset + len) {
>> +                     /*
>> +                      * The whole range is already a prealloc extent,
>> +                      * do nothing except updating the inode's i_size if
>> +                      * needed.
>> +                      */
>> +                     free_extent_map(em);
>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>> +                                                         len, mode);
>> +                     goto out;
>> +             }
>> +             /*
>> +              * Part of the range is already a prealloc extent, so operate
>> +              * only on the remaining part of the range.
>> +              */
>> +             alloc_start = em_end;
>> +             ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>> +             len = offset + len - alloc_start;
>> +             offset = alloc_start;
>> +             alloc_hint = em->block_start + em->len;
>> +     }
>> +     free_extent_map(em);
>> +
>> +     if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>> +         BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>> +             em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>> +                                   alloc_start, sectorsize, 0);
>> +             if (IS_ERR(em)) {
>> +                     ret = PTR_ERR(em);
>> +                     goto out;
>> +             }
>> +
>> +             if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>> +                     free_extent_map(em);
>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>> +                                                         len, mode);
>> +                     goto out;
>> +             }
>> +             if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>> +                     punch_hole = true;
>> +             free_extent_map(em);
>> +             if (punch_hole)
>> +                     goto punch_hole;
>
> This here is correct for a very non-obvious reason. If punch_hole is
> true, this means we are only ever going to execute the partial truncate
> code in btrfs_punch_hole and not punch a hole at all; this is a very
> convoluted way of invoking truncation!

Well, it might be non-obvious for people not experienced with fs
development, but I don't think it's as terrible as you picture it.
Every fs developer knows that punching into a range smaller than the
sector size means zeroing part of a block.
It's not this sort of thing that makes people unable to contribute or
understand things.
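
To spell that out with made-up numbers (a userspace sketch, not the
kernel code): with a 4096-byte sector, a 100-byte range starting at
offset 5000 begins and ends inside the same block, so there is no whole
block to drop and the operation reduces to zeroing part of that one
block:

#include <stdio.h>
#include <stdint.h>

#define SECTORSIZE 4096ULL

int main(void)
{
	const uint64_t offset = 5000, len = 100;
	const uint64_t first_blk = offset / SECTORSIZE;
	const uint64_t last_blk = (offset + len - 1) / SECTORSIZE;

	if (first_blk == last_blk)
		printf("[%llu, %llu) lies entirely in block %llu: partial "
		       "block zeroing only\n",
		       (unsigned long long)offset,
		       (unsigned long long)(offset + len),
		       (unsigned long long)first_blk);
	return 0;
}
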

>
> Instead, I propose something similar to the attached diff, which just
> calls btrfs_truncate_block directly. This allows removing one of the
> labels and simplifies the code flow. If this check triggers:
> (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
> guaranteed that we are within the inode boundaries, so there is no need
> to update the inode size,

There isn't???
There is, of course, if the file size is not sector-size aligned and
the target range affects the last block and goes beyond the current
i_size.
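
A concrete (made-up) instance of that case: i_size = 5000 with a
4096-byte sector and a zero range of [4500, 5100). The range stays
within the last block, yet its end is past i_size, so without
FALLOC_FL_KEEP_SIZE the inode size still has to grow:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t i_size = 5000;      /* current size, not block aligned */
	const uint64_t offset = 4500, len = 600;
	const uint64_t end = offset + len; /* 5100, still in the EOF block */

	if (end > i_size)
		printf("i_size must be updated from %llu to %llu\n",
		       (unsigned long long)i_size,
		       (unsigned long long)end);
	return 0;
}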

> hence I've omitted it, though I'm not 100%
> sure; perhaps we want to update the inode's ctime?
>
> This passes generic/008 and generic/009

Well keep in mind those tests don't cover all possible scenarios.

I've integrated those cleanups and will send a v3 later after some
stress testing.


>
>> +             alloc_start = round_down(offset, sectorsize);
>> +             alloc_end = alloc_start + sectorsize;
>> +             goto reserve_space;
>> +     }
>> +
>> +     alloc_start = round_up(offset, sectorsize);
>> +     alloc_end = round_down(offset + len, sectorsize);
>> +
>> +     /*
>> +      * For unaligned ranges, check the pages at the boundaries, they might
>> +      * map to an extent, in which case we need to partially zero them, or
>> +      * they might map to a hole, in which case we need our allocation range
>> +      * to cover them.
>> +      */
>> +     if (!IS_ALIGNED(offset, sectorsize)) {
>> +             ret = btrfs_zero_range_check_range_boundary(inode, offset);
>> +             if (ret < 0)
>> +                     goto out;
>> +             if (ret) {
>> +                     alloc_start = round_down(offset, sectorsize);
>> +                     ret = 0;
>> +             } else {
>> +                     ret = btrfs_truncate_block(inode, offset, 0, 0);
>> +                     if (ret)
>> +                             goto out;
>> +             }
>> +     }
>> +
>> +     if (!IS_ALIGNED(offset + len, sectorsize)) {
>> +             ret = btrfs_zero_range_check_range_boundary(inode,
>> +                                                         offset + len);
>> +             if (ret < 0)
>> +                     goto out;
>> +             if (ret) {
>> +                     alloc_end = round_up(offset + len, sectorsize);
>> +                     ret = 0;
>> +             } else {
>> +                     ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>> +                     if (ret)
>> +                             goto out;
>> +             }
>> +     }
>> +
>> +reserve_space:
>> +     if (alloc_start < alloc_end)
>> +             bytes_to_reserve += alloc_end - alloc_start;
>> +
>> +     if (!punch_hole && bytes_to_reserve > 0) {
>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> +                                                   bytes_to_reserve);
>> +             if (ret < 0)
>> +                     goto out;
>> +             space_reserved = true;
>> +             ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>> +                                             alloc_start, bytes_to_reserve);
>> +             if (ret)
>> +                     goto out;
>> +     }
>> +
>> +punch_hole:
>> +     if (punch_hole) {
>> +             ret = btrfs_punch_hole(inode, offset, len, false);
>> +             if (ret)
>> +                     goto out;
>> +             ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>> +     } else {
>> +             struct extent_state *cached_state = NULL;
>> +             const u64 lockstart = alloc_start;
>> +             const u64 lockend = alloc_end - 1;
>> +
>> +             ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>> +                                               &cached_state);
>> +             if (ret)
>> +                     goto out;
>> +             ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>> +                                             alloc_end - alloc_start,
>> +                                             i_blocksize(inode),
>> +                                             offset + len, &alloc_hint);
>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>> +                                  lockend, &cached_state, GFP_KERNEL);
>> +             /* btrfs_prealloc_file_range releases reserved space on error */
>> +             if (ret)
>> +                     space_reserved = false;
>> +     }
>> + out:
>> +     if (ret && space_reserved)
>> +             btrfs_free_reserved_data_space(inode, data_reserved,
>> +                                            alloc_start, bytes_to_reserve);
>> +     extent_changeset_free(data_reserved);
>> +
>> +     return ret;
>> +}
>> +
>>  static long btrfs_fallocate(struct file *file, int mode,
>>                           loff_t offset, loff_t len)
>>  {
>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>       cur_offset = alloc_start;
>>
>>       /* Make sure we aren't being give some crap mode */
>> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> +                  FALLOC_FL_ZERO_RANGE))
>>               return -EOPNOTSUPP;
>>
>>       if (mode & FALLOC_FL_PUNCH_HOLE)
>> -             return btrfs_punch_hole(inode, offset, len);
>> +             return btrfs_punch_hole(inode, offset, len, true);
>>
>>       /*
>>        * Only trigger disk allocation, don't trigger qgroup reserve
>>        *
>>        * For qgroup space, it will be checked later.
>>        */
>> -     ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> -                     alloc_end - alloc_start);
>> -     if (ret < 0)
>> -             return ret;
>> +     if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>> +                                                   alloc_end - alloc_start);
>> +             if (ret < 0)
>> +                     return ret;
>> +     }
>>
>>       inode_lock(inode);
>>
>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>       if (ret)
>>               goto out;
>>
>> +     if (mode & FALLOC_FL_ZERO_RANGE) {
>> +             ret = btrfs_zero_range(inode, offset, len, mode);
>> +             inode_unlock(inode);
>> +             return ret;
>> +     }
>> +
>>       locked_end = alloc_end - 1;
>>       while (1) {
>>               struct btrfs_ordered_extent *ordered;
>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>  out:
>>       inode_unlock(inode);
>>       /* Let go of our reservation. */
>> -     if (ret != 0)
>> +     if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>               btrfs_free_reserved_data_space(inode, data_reserved,
>>                               alloc_start, alloc_end - cur_offset);
>>       extent_changeset_free(data_reserved);
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] Btrfs: add support for fallocate's zero range operation
  2017-11-03 10:29     ` Filipe Manana
@ 2017-11-03 10:45       ` Filipe Manana
  0 siblings, 0 replies; 14+ messages in thread
From: Filipe Manana @ 2017-11-03 10:45 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Fri, Nov 3, 2017 at 10:29 AM, Filipe Manana <fdmanana@kernel.org> wrote:
> On Fri, Nov 3, 2017 at 9:30 AM, Nikolay Borisov <nborisov@suse.com> wrote:
>>
>>
>> On 25.10.2017 17:59, fdmanana@kernel.org wrote:
>>> From: Filipe Manana <fdmanana@suse.com>
>>>
>>> This implements support for the zero range operation of fallocate. For now
>>> at least it's as simple as possible while reusing most of the existing
>>> fallocate and hole punching infrastructure.
>>>
>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>> ---
>>>
>>> V2: Removed double inode unlock on error path from failure to lock range.
>>>
>>>  fs/btrfs/file.c | 332 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>>>  1 file changed, 290 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>>> index aafcc785f840..e0d15c0d1641 100644
>>> --- a/fs/btrfs/file.c
>>> +++ b/fs/btrfs/file.c
>>> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>>>       return ret;
>>>  }
>>>
>>> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>> +static int btrfs_punch_hole_lock_range(struct inode *inode,
>>> +                                    const u64 lockstart,
>>> +                                    const u64 lockend,
>>> +                                    struct extent_state **cached_state)
>>> +{
>>> +     while (1) {
>>> +             struct btrfs_ordered_extent *ordered;
>>> +             int ret;
>>> +
>>> +             truncate_pagecache_range(inode, lockstart, lockend);
>>> +
>>> +             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> +                              cached_state);
>>> +             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> +
>>> +             /*
>>> +              * We need to make sure we have no ordered extents in this range
>>> +              * and nobody raced in and read a page in this range, if we did
>>> +              * we need to try again.
>>> +              */
>>> +             if ((!ordered ||
>>> +                 (ordered->file_offset + ordered->len <= lockstart ||
>>> +                  ordered->file_offset > lockend)) &&
>>> +                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> +                     if (ordered)
>>> +                             btrfs_put_ordered_extent(ordered);
>>> +                     break;
>>> +             }
>>> +             if (ordered)
>>> +                     btrfs_put_ordered_extent(ordered);
>>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> +                                  lockend, cached_state, GFP_NOFS);
>>> +             ret = btrfs_wait_ordered_range(inode, lockstart,
>>> +                                            lockend - lockstart + 1);
>>> +             if (ret)
>>> +                     return ret;
>>> +     }
>>> +     return 0;
>>> +}
>>> +
>>> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
>>> +                         bool lock_inode)
>>>  {
>>>       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>>       struct btrfs_root *root = BTRFS_I(inode)->root;
>>> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>       if (ret)
>>>               return ret;
>>>
>>> -     inode_lock(inode);
>>> +     if (lock_inode)
>>> +             inode_lock(inode);
>>>       ino_size = round_up(inode->i_size, fs_info->sectorsize);
>>>       ret = find_first_non_hole(inode, &offset, &len);
>>>       if (ret < 0)
>>> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>               truncated_block = true;
>>>               ret = btrfs_truncate_block(inode, offset, 0, 0);
>>>               if (ret) {
>>> -                     inode_unlock(inode);
>>> +                     if (lock_inode)
>>> +                             inode_unlock(inode);
>>>                       return ret;
>>>               }
>>>       }
>>> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>               goto out_only_mutex;
>>>       }
>>>
>>> -     while (1) {
>>> -             struct btrfs_ordered_extent *ordered;
>>> -
>>> -             truncate_pagecache_range(inode, lockstart, lockend);
>>> -
>>> -             lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
>>> -                              &cached_state);
>>> -             ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
>>> -
>>> -             /*
>>> -              * We need to make sure we have no ordered extents in this range
>>> -              * and nobody raced in and read a page in this range, if we did
>>> -              * we need to try again.
>>> -              */
>>> -             if ((!ordered ||
>>> -                 (ordered->file_offset + ordered->len <= lockstart ||
>>> -                  ordered->file_offset > lockend)) &&
>>> -                  !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
>>> -                     if (ordered)
>>> -                             btrfs_put_ordered_extent(ordered);
>>> -                     break;
>>> -             }
>>> -             if (ordered)
>>> -                     btrfs_put_ordered_extent(ordered);
>>> -             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> -                                  lockend, &cached_state, GFP_NOFS);
>>> -             ret = btrfs_wait_ordered_range(inode, lockstart,
>>> -                                            lockend - lockstart + 1);
>>> -             if (ret) {
>>> +     ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> +                                       &cached_state);
>>> +     if (ret) {
>>> +             if (lock_inode)
>>>                       inode_unlock(inode);
>>> -                     return ret;
>>> -             }
>>> +             return ret;
>>>       }
>>>
>>>       path = btrfs_alloc_path();
>>> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>>>                       ret = btrfs_end_transaction(trans);
>>>               }
>>>       }
>>> -     inode_unlock(inode);
>>> +     if (lock_inode)
>>> +             inode_unlock(inode);
>>>       if (ret && !err)
>>>               err = ret;
>>>       return err;
>>> @@ -2804,6 +2822,227 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>>>       return 0;
>>>  }
>>>
>>> +static int btrfs_zero_range_update_isize(struct inode *inode,
>>> +                                      const loff_t offset,
>>> +                                      const loff_t len,
>>> +                                      const int mode)
>>> +{
>>> +     struct btrfs_root *root = BTRFS_I(inode)->root;
>>> +     struct btrfs_trans_handle *trans;
>>> +     const u64 end = offset + len;
>>> +     int ret;
>>> +
>>> +     if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
>>> +             return 0;
>>> +
>>> +     i_size_write(inode, end);
>>> +     btrfs_ordered_update_i_size(inode, end, NULL);
>>> +     trans = btrfs_start_transaction(root, 1);
>>> +     if (IS_ERR(trans)) {
>>> +             ret = PTR_ERR(trans);
>>> +     } else {
>>> +             int err;
>>> +
>>> +             ret = btrfs_update_inode(trans, root, inode);
>>> +             err = btrfs_end_transaction(trans);
>>> +             ret = ret ? ret : err;
>>> +     }
>>> +     return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
>>> +                                              u64 offset)
>>> +{
>>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> +     struct extent_map *em = NULL;
>>> +     int ret = 0;
>>> +
>>> +     offset = round_down(offset, sectorsize);
>>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
>>> +     if (IS_ERR(em))
>>> +             return PTR_ERR(em);
>>> +
>>> +     if (em->block_start == EXTENT_MAP_HOLE)
>>> +             ret = 1;
>>> +
>>> +     free_extent_map(em);
>>> +     return ret;
>>> +}
>>> +
>>> +static int btrfs_zero_range(struct inode *inode,
>>> +                         loff_t offset,
>>> +                         loff_t len,
>>> +                         const int mode)
>>> +{
>>> +     struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
>>> +     struct extent_map *em;
>>> +     struct extent_changeset *data_reserved = NULL;
>>> +     int ret;
>>> +     u64 alloc_hint = 0;
>>> +     const u64 sectorsize = btrfs_inode_sectorsize(inode);
>>> +     u64 alloc_start = round_down(offset, sectorsize);
>>> +     u64 alloc_end = round_up(offset + len, sectorsize);
>>> +     u64 bytes_to_reserve = 0;
>>> +     bool space_reserved = false;
>>> +     bool punch_hole = false;
>>> +
>>> +     inode_dio_wait(inode);
>>> +
>>> +     em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> +                           alloc_start, alloc_end - alloc_start, 0);
>>> +     if (IS_ERR(em)) {
>>> +             ret = PTR_ERR(em);
>>> +             goto out;
>>> +     }
>>> +
>>> +     /*
>>> +      * Avoid hole punching and extent allocation for some cases. More cases
>>> +      * could be considered, but these are unlikely common and we keep things
>>> +      * as simple as possible for now. Also, intentionally, if the target
>>> +      * range contains one or more prealloc extents together with regular
>>> +      * extents and holes, we drop all the existing extents and allocate a
>>> +      * new prealloc extent, so that we get a larger contiguous disk extent.
>>> +      */
>>> +     if (em->start <= alloc_start &&
>>> +         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> +             const u64 em_end = em->start + em->len;
>>> +
>>> +             if (em_end >= offset + len) {
>>> +                     /*
>>> +                      * The whole range is already a prealloc extent,
>>> +                      * do nothing except updating the inode's i_size if
>>> +                      * needed.
>>> +                      */
>>> +                     free_extent_map(em);
>>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>>> +                                                         len, mode);
>>> +                     goto out;
>>> +             }
>>> +             /*
>>> +              * Part of the range is already a prealloc extent, so operate
>>> +              * only on the remaining part of the range.
>>> +              */
>>> +             alloc_start = em_end;
>>> +             ASSERT(IS_ALIGNED(alloc_start, sectorsize));
>>> +             len = offset + len - alloc_start;
>>> +             offset = alloc_start;
>>> +             alloc_hint = em->block_start + em->len;
>>> +     }
>>> +     free_extent_map(em);
>>> +
>>> +     if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
>>> +         BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
>>> +             em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
>>> +                                   alloc_start, sectorsize, 0);
>>> +             if (IS_ERR(em)) {
>>> +                     ret = PTR_ERR(em);
>>> +                     goto out;
>>> +             }
>>> +
>>> +             if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
>>> +                     free_extent_map(em);
>>> +                     ret = btrfs_zero_range_update_isize(inode, offset,
>>> +                                                         len, mode);
>>> +                     goto out;
>>> +             }
>>> +             if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE)
>>> +                     punch_hole = true;
>>> +             free_extent_map(em);
>>> +             if (punch_hole)
>>> +                     goto punch_hole;
>>
>> This here is correct for a very non-obvious reason. If punch_hole is
>> true, this means we are only ever going to execute the partial truncate
>> code in btrfs_punch_hole and not punch a hole at all; this is a very
>> convoluted way of invoking truncation!
>
> Well, it might be non-obvious for people not experienced with fs
> development, but I don't think it's as terrible as you picture it.
> Every fs developer knows that punching into a range smaller than the
> sector size means zeroing part of a block.
> It's not this sort of thing that makes people unable to contribute or
> understand things.
>
>>
>> Instead, I propose something similar to the attached diff, which just
>> calls btrfs_truncate_block directly. This allows removing one of the
>> labels and simplifies the code flow. If this check triggers:
>> (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) then it's
>> guaranteed that we are within the inode boundaries, so there is no need
>> to update the inode size,
>
> There isn't???
> There is, of course, if the file size is not sector-size aligned and
> the target range affects the last block and goes beyond the current
> i_size.

Plus, your diff introduces another bug when alloc_start == alloc_end,
in which case we shouldn't do anything other than the previous
truncation of the boundary pages. As you left it, it results in locking
a range with an end smaller than its start, plus some other
unpredictable behaviour.
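
For illustration, a sketch (in the style of your attached diff, untested
and only meant to show the shape of the guard) of how that flow would
need to skip the locking and preallocation when alloc_start ends up
equal to alloc_end:

	if (alloc_start < alloc_end) {
		bytes_to_reserve = alloc_end - alloc_start;
		/* reserve data space, lock [alloc_start, alloc_end - 1],
		 * call btrfs_prealloc_file_range(), then unlock */
	}
	/* otherwise nothing is left to allocate and only the i_size/ctime
	 * update remains */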


>
>> hence I've omitted it, though I'm not 100%
>> sure; perhaps we want to update the inode's ctime?
>>
>> This passes generic/008 and generic/009
>
> Well keep in mind those tests don't cover all possible scenarios.
>
> I've integrated those cleanups and will send a v3 later after some
> stress testing.
>
>
>>
>>> +             alloc_start = round_down(offset, sectorsize);
>>> +             alloc_end = alloc_start + sectorsize;
>>> +             goto reserve_space;
>>> +     }
>>> +
>>> +     alloc_start = round_up(offset, sectorsize);
>>> +     alloc_end = round_down(offset + len, sectorsize);
>>> +
>>> +     /*
>>> +      * For unaligned ranges, check the pages at the boundaries, they might
>>> +      * map to an extent, in which case we need to partially zero them, or
>>> +      * they might map to a hole, in which case we need our allocation range
>>> +      * to cover them.
>>> +      */
>>> +     if (!IS_ALIGNED(offset, sectorsize)) {
>>> +             ret = btrfs_zero_range_check_range_boundary(inode, offset);
>>> +             if (ret < 0)
>>> +                     goto out;
>>> +             if (ret) {
>>> +                     alloc_start = round_down(offset, sectorsize);
>>> +                     ret = 0;
>>> +             } else {
>>> +                     ret = btrfs_truncate_block(inode, offset, 0, 0);
>>> +                     if (ret)
>>> +                             goto out;
>>> +             }
>>> +     }
>>> +
>>> +     if (!IS_ALIGNED(offset + len, sectorsize)) {
>>> +             ret = btrfs_zero_range_check_range_boundary(inode,
>>> +                                                         offset + len);
>>> +             if (ret < 0)
>>> +                     goto out;
>>> +             if (ret) {
>>> +                     alloc_end = round_up(offset + len, sectorsize);
>>> +                     ret = 0;
>>> +             } else {
>>> +                     ret = btrfs_truncate_block(inode, offset + len, 0, 1);
>>> +                     if (ret)
>>> +                             goto out;
>>> +             }
>>> +     }
>>> +
>>> +reserve_space:
>>> +     if (alloc_start < alloc_end)
>>> +             bytes_to_reserve += alloc_end - alloc_start;
>>> +
>>> +     if (!punch_hole && bytes_to_reserve > 0) {
>>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> +                                                   bytes_to_reserve);
>>> +             if (ret < 0)
>>> +                     goto out;
>>> +             space_reserved = true;
>>> +             ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
>>> +                                             alloc_start, bytes_to_reserve);
>>> +             if (ret)
>>> +                     goto out;
>>> +     }
>>> +
>>> +punch_hole:
>>> +     if (punch_hole) {
>>> +             ret = btrfs_punch_hole(inode, offset, len, false);
>>> +             if (ret)
>>> +                     goto out;
>>> +             ret = btrfs_zero_range_update_isize(inode, offset, len, mode);
>>> +     } else {
>>> +             struct extent_state *cached_state = NULL;
>>> +             const u64 lockstart = alloc_start;
>>> +             const u64 lockend = alloc_end - 1;
>>> +
>>> +             ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
>>> +                                               &cached_state);
>>> +             if (ret)
>>> +                     goto out;
>>> +             ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
>>> +                                             alloc_end - alloc_start,
>>> +                                             i_blocksize(inode),
>>> +                                             offset + len, &alloc_hint);
>>> +             unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
>>> +                                  lockend, &cached_state, GFP_KERNEL);
>>> +             /* btrfs_prealloc_file_range releases reserved space on error */
>>> +             if (ret)
>>> +                     space_reserved = false;
>>> +     }
>>> + out:
>>> +     if (ret && space_reserved)
>>> +             btrfs_free_reserved_data_space(inode, data_reserved,
>>> +                                            alloc_start, bytes_to_reserve);
>>> +     extent_changeset_free(data_reserved);
>>> +
>>> +     return ret;
>>> +}
>>> +
>>>  static long btrfs_fallocate(struct file *file, int mode,
>>>                           loff_t offset, loff_t len)
>>>  {
>>> @@ -2829,21 +3068,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>       cur_offset = alloc_start;
>>>
>>>       /* Make sure we aren't being give some crap mode */
>>> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>>> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>>> +                  FALLOC_FL_ZERO_RANGE))
>>>               return -EOPNOTSUPP;
>>>
>>>       if (mode & FALLOC_FL_PUNCH_HOLE)
>>> -             return btrfs_punch_hole(inode, offset, len);
>>> +             return btrfs_punch_hole(inode, offset, len, true);
>>>
>>>       /*
>>>        * Only trigger disk allocation, don't trigger qgroup reserve
>>>        *
>>>        * For qgroup space, it will be checked later.
>>>        */
>>> -     ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> -                     alloc_end - alloc_start);
>>> -     if (ret < 0)
>>> -             return ret;
>>> +     if (!(mode & FALLOC_FL_ZERO_RANGE)) {
>>> +             ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
>>> +                                                   alloc_end - alloc_start);
>>> +             if (ret < 0)
>>> +                     return ret;
>>> +     }
>>>
>>>       inode_lock(inode);
>>>
>>> @@ -2885,6 +3127,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>       if (ret)
>>>               goto out;
>>>
>>> +     if (mode & FALLOC_FL_ZERO_RANGE) {
>>> +             ret = btrfs_zero_range(inode, offset, len, mode);
>>> +             inode_unlock(inode);
>>> +             return ret;
>>> +     }
>>> +
>>>       locked_end = alloc_end - 1;
>>>       while (1) {
>>>               struct btrfs_ordered_extent *ordered;
>>> @@ -3010,7 +3258,7 @@ static long btrfs_fallocate(struct file *file, int mode,
>>>  out:
>>>       inode_unlock(inode);
>>>       /* Let go of our reservation. */
>>> -     if (ret != 0)
>>> +     if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>>>               btrfs_free_reserved_data_space(inode, data_reserved,
>>>                               alloc_start, alloc_end - cur_offset);
>>>       extent_changeset_free(data_reserved);
>>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] Btrfs: add support for fallocate's zero range operation
  2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
  2017-10-25 14:59 ` [PATCH v2] " fdmanana
@ 2017-11-03 17:20 ` fdmanana
  2017-11-03 20:59   ` Edmund Nadolski
  2017-11-04  4:07   ` [PATCH v4] " fdmanana
  1 sibling, 2 replies; 14+ messages in thread
From: fdmanana @ 2017-11-03 17:20 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

This implements support for the zero range operation of fallocate. For now
at least it's as simple as possible while reusing most of the existing
fallocate and hole punching infrastructure.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Removed double inode unlock on error path from failure to lock range.
V3: Factored common code to update isize and inode item into a helper
    function, plus some minor cleanup.

 fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 285 insertions(+), 66 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..2cc1aed1c564 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
+			    bool lock_inode)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	inode_lock(inode);
+	if (lock_inode)
+		inode_lock(inode);
 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
 	ret = find_first_non_hole(inode, &offset, &len);
 	if (ret < 0)
@@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncated_block = true;
 		ret = btrfs_truncate_block(inode, offset, 0, 0);
 		if (ret) {
-			inode_unlock(inode);
+			if (lock_inode)
+				inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		if (lock_inode)
 			inode_unlock(inode);
-			return ret;
-		}
+		return ret;
 	}
 
 	path = btrfs_alloc_path();
@@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			ret = btrfs_end_transaction(trans);
 		}
 	}
-	inode_unlock(inode);
+	if (lock_inode)
+		inode_unlock(inode);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -2804,6 +2822,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 	return 0;
 }
 
+static int btrfs_fallocate_update_isize(struct inode *inode,
+					const u64 end,
+					const int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	int ret2;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	inode->i_ctime = current_time(inode);
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	ret = btrfs_update_inode(trans, root, inode);
+	ret2 = btrfs_end_transaction(trans);
+
+	return ret ? ret : ret2;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em = NULL;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			free_extent_map(em);
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			if (!ret)
+				ret = btrfs_fallocate_update_isize(inode,
+								   offset + len,
+								   mode);
+			return ret;
+		}
+		free_extent_map(em);
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end) {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		bytes_to_reserve = alloc_end - alloc_start;
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_KERNEL);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2829,21 +3058,24 @@ static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return btrfs_punch_hole(inode, offset, len);
+		return btrfs_punch_hole(inode, offset, len, true);
 
 	/*
 	 * Only trigger disk allocation, don't trigger qgroup reserve
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2885,6 +3117,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -2980,37 +3218,18 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret < 0)
 		goto out_unlock;
 
-	if (actual_end > inode->i_size &&
-	    !(mode & FALLOC_FL_KEEP_SIZE)) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-
-		/*
-		 * We didn't need to allocate any more space, but we
-		 * still extended the size of the file so we need to
-		 * update i_size and the inode item.
-		 */
-		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-		} else {
-			inode->i_ctime = current_time(inode);
-			i_size_write(inode, actual_end);
-			btrfs_ordered_update_i_size(inode, actual_end, NULL);
-			ret = btrfs_update_inode(trans, root, inode);
-			if (ret)
-				btrfs_end_transaction(trans);
-			else
-				ret = btrfs_end_transaction(trans);
-		}
-	}
+	/*
+	 * We didn't need to allocate any more space, but we still extended the
+	 * size of the file so we need to update i_size and the inode item.
+	 */
+	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_KERNEL);
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] Btrfs: add support for fallocate's zero range operation
  2017-11-03 17:20 ` [PATCH v3] " fdmanana
@ 2017-11-03 20:59   ` Edmund Nadolski
  2017-11-04  4:07   ` [PATCH v4] " fdmanana
  1 sibling, 0 replies; 14+ messages in thread
From: Edmund Nadolski @ 2017-11-03 20:59 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On 11/03/2017 11:20 AM, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> V3: Factored common code to update isize and inode item into a helper
>     function, plus some minor cleanup.
> 
>  fs/btrfs/file.c | 351 +++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 285 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index aafcc785f840..2cc1aed1c564 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2448,7 +2448,48 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
>  	return ret;
>  }
>  
> -static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +static int btrfs_punch_hole_lock_range(struct inode *inode,
> +				       const u64 lockstart,
> +				       const u64 lockend,
> +				       struct extent_state **cached_state)
> +{
> +	while (1) {
> +		struct btrfs_ordered_extent *ordered;
> +		int ret;
> +
> +		truncate_pagecache_range(inode, lockstart, lockend);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> +				 cached_state);
> +		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> +
> +		/*
> +		 * We need to make sure we have no ordered extents in this range
> +		 * and nobody raced in and read a page in this range, if we did
> +		 * we need to try again.
> +		 */
> +		if ((!ordered ||
> +		    (ordered->file_offset + ordered->len <= lockstart ||
> +		     ordered->file_offset > lockend)) &&
> +		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> +			if (ordered)
> +				btrfs_put_ordered_extent(ordered);
> +			break;
> +		}
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, cached_state, GFP_NOFS);
> +		ret = btrfs_wait_ordered_range(inode, lockstart,
> +					       lockend - lockstart + 1);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
> +static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len,
> +			    bool lock_inode)

The lock_inode parameter may no longer be needed, since it looks to
always be true in this version of the patch.

Ed

>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
> @@ -2477,7 +2518,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	if (ret)
>  		return ret;
>  
> -	inode_lock(inode);
> +	if (lock_inode)
> +		inode_lock(inode);
>  	ino_size = round_up(inode->i_size, fs_info->sectorsize);
>  	ret = find_first_non_hole(inode, &offset, &len);
>  	if (ret < 0)
> @@ -2516,7 +2558,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		truncated_block = true;
>  		ret = btrfs_truncate_block(inode, offset, 0, 0);
>  		if (ret) {
> -			inode_unlock(inode);
> +			if (lock_inode)
> +				inode_unlock(inode);
>  			return ret;
>  		}
>  	}
> @@ -2564,38 +2607,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  		goto out_only_mutex;
>  	}
>  
> -	while (1) {
> -		struct btrfs_ordered_extent *ordered;
> -
> -		truncate_pagecache_range(inode, lockstart, lockend);
> -
> -		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> -				 &cached_state);
> -		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
> -
> -		/*
> -		 * We need to make sure we have no ordered extents in this range
> -		 * and nobody raced in and read a page in this range, if we did
> -		 * we need to try again.
> -		 */
> -		if ((!ordered ||
> -		    (ordered->file_offset + ordered->len <= lockstart ||
> -		     ordered->file_offset > lockend)) &&
> -		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
> -			if (ordered)
> -				btrfs_put_ordered_extent(ordered);
> -			break;
> -		}
> -		if (ordered)
> -			btrfs_put_ordered_extent(ordered);
> -		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> -				     lockend, &cached_state, GFP_NOFS);
> -		ret = btrfs_wait_ordered_range(inode, lockstart,
> -					       lockend - lockstart + 1);
> -		if (ret) {
> +	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +					  &cached_state);
> +	if (ret) {
> +		if (lock_inode)
>  			inode_unlock(inode);
> -			return ret;
> -		}
> +		return ret;
>  	}
>  
>  	path = btrfs_alloc_path();
> @@ -2758,7 +2775,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  			ret = btrfs_end_transaction(trans);
>  		}
>  	}
> -	inode_unlock(inode);
> +	if (lock_inode)
> +		inode_unlock(inode);
>  	if (ret && !err)
>  		err = ret;
>  	return err;
> @@ -2804,6 +2822,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
>  	return 0;
>  }
>  
> +static int btrfs_fallocate_update_isize(struct inode *inode,
> +					const u64 end,
> +					const int mode)
> +{
> +	struct btrfs_trans_handle *trans;
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	int ret;
> +	int ret2;
> +
> +	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
> +		return 0;
> +
> +	trans = btrfs_start_transaction(root, 1);
> +	if (IS_ERR(trans))
> +		return PTR_ERR(trans);
> +
> +	inode->i_ctime = current_time(inode);
> +	i_size_write(inode, end);
> +	btrfs_ordered_update_i_size(inode, end, NULL);
> +	ret = btrfs_update_inode(trans, root, inode);
> +	ret2 = btrfs_end_transaction(trans);
> +
> +	return ret ? ret : ret2;
> +}
> +
> +static int btrfs_zero_range_check_range_boundary(struct inode *inode,
> +						 u64 offset)
> +{
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	struct extent_map *em = NULL;
> +	int ret = 0;
> +
> +	offset = round_down(offset, sectorsize);
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
> +	if (IS_ERR(em))
> +		return PTR_ERR(em);
> +
> +	if (em->block_start == EXTENT_MAP_HOLE)
> +		ret = 1;
> +
> +	free_extent_map(em);
> +	return ret;
> +}
> +
> +static int btrfs_zero_range(struct inode *inode,
> +			    loff_t offset,
> +			    loff_t len,
> +			    const int mode)
> +{
> +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
> +	struct extent_map *em;
> +	struct extent_changeset *data_reserved = NULL;
> +	int ret;
> +	u64 alloc_hint = 0;
> +	const u64 sectorsize = btrfs_inode_sectorsize(inode);
> +	u64 alloc_start = round_down(offset, sectorsize);
> +	u64 alloc_end = round_up(offset + len, sectorsize);
> +	u64 bytes_to_reserve = 0;
> +	bool space_reserved = false;
> +
> +	inode_dio_wait(inode);
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +			      alloc_start, alloc_end - alloc_start, 0);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Avoid hole punching and extent allocation for some cases. More cases
> +	 * could be considered, but these are unlikely common and we keep things
> +	 * as simple as possible for now. Also, intentionally, if the target
> +	 * range contains one or more prealloc extents together with regular
> +	 * extents and holes, we drop all the existing extents and allocate a
> +	 * new prealloc extent, so that we get a larger contiguous disk extent.
> +	 */
> +	if (em->start <= alloc_start &&
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		const u64 em_end = em->start + em->len;
> +
> +		if (em_end >= offset + len) {
> +			/*
> +			 * The whole range is already a prealloc extent,
> +			 * do nothing except updating the inode's i_size if
> +			 * needed.
> +			 */
> +			free_extent_map(em);
> +			ret = btrfs_fallocate_update_isize(inode, offset + len,
> +							   mode);
> +			goto out;
> +		}
> +		/*
> +		 * Part of the range is already a prealloc extent, so operate
> +		 * only on the remaining part of the range.
> +		 */
> +		alloc_start = em_end;
> +		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
> +		len = offset + len - alloc_start;
> +		offset = alloc_start;
> +		alloc_hint = em->block_start + em->len;
> +	}
> +	free_extent_map(em);
> +
> +	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
> +	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
> +		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
> +				      alloc_start, sectorsize, 0);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +
> +		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +			free_extent_map(em);
> +			ret = btrfs_fallocate_update_isize(inode, offset + len,
> +							   mode);
> +			goto out;
> +		}
> +		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
> +			free_extent_map(em);
> +			ret = btrfs_truncate_block(inode, offset, len, 0);
> +			if (!ret)
> +				ret = btrfs_fallocate_update_isize(inode,
> +								   offset + len,
> +								   mode);
> +			return ret;
> +		}
> +		free_extent_map(em);
> +		alloc_start = round_down(offset, sectorsize);
> +		alloc_end = alloc_start + sectorsize;
> +		goto reserve_space;
> +	}
> +
> +	alloc_start = round_up(offset, sectorsize);
> +	alloc_end = round_down(offset + len, sectorsize);
> +
> +	/*
> +	 * For unaligned ranges, check the pages at the boundaries, they might
> +	 * map to an extent, in which case we need to partially zero them, or
> +	 * they might map to a hole, in which case we need our allocation range
> +	 * to cover them.
> +	 */
> +	if (!IS_ALIGNED(offset, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode, offset);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_start = round_down(offset, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset, 0, 0);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +	if (!IS_ALIGNED(offset + len, sectorsize)) {
> +		ret = btrfs_zero_range_check_range_boundary(inode,
> +							    offset + len);
> +		if (ret < 0)
> +			goto out;
> +		if (ret) {
> +			alloc_end = round_up(offset + len, sectorsize);
> +			ret = 0;
> +		} else {
> +			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +reserve_space:
> +	if (alloc_start < alloc_end) {
> +		struct extent_state *cached_state = NULL;
> +		const u64 lockstart = alloc_start;
> +		const u64 lockend = alloc_end - 1;
> +
> +		bytes_to_reserve = alloc_end - alloc_start;
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      bytes_to_reserve);
> +		if (ret < 0)
> +			goto out;
> +		space_reserved = true;
> +		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> +						alloc_start, bytes_to_reserve);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
> +						  &cached_state);
> +		if (ret)
> +			goto out;
> +		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
> +						alloc_end - alloc_start,
> +						i_blocksize(inode),
> +						offset + len, &alloc_hint);
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
> +				     lockend, &cached_state, GFP_KERNEL);
> +		/* btrfs_prealloc_file_range releases reserved space on error */
> +		if (ret)
> +			space_reserved = false;
> +	}
> + out:
> +	if (ret && space_reserved)
> +		btrfs_free_reserved_data_space(inode, data_reserved,
> +					       alloc_start, bytes_to_reserve);
> +	extent_changeset_free(data_reserved);
> +
> +	return ret;
> +}
> +
>  static long btrfs_fallocate(struct file *file, int mode,
>  			    loff_t offset, loff_t len)
>  {
> @@ -2829,21 +3058,24 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	cur_offset = alloc_start;
>  
>  	/* Make sure we aren't being give some crap mode */
> -	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> +		     FALLOC_FL_ZERO_RANGE))
>  		return -EOPNOTSUPP;
>  
>  	if (mode & FALLOC_FL_PUNCH_HOLE)
> -		return btrfs_punch_hole(inode, offset, len);
> +		return btrfs_punch_hole(inode, offset, len, true);
>  
>  	/*
>  	 * Only trigger disk allocation, don't trigger qgroup reserve
>  	 *
>  	 * For qgroup space, it will be checked later.
>  	 */
> -	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> -			alloc_end - alloc_start);
> -	if (ret < 0)
> -		return ret;
> +	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
> +		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
> +						      alloc_end - alloc_start);
> +		if (ret < 0)
> +			return ret;
> +	}
>  
>  	inode_lock(inode);
>  
> @@ -2885,6 +3117,12 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	if (ret)
>  		goto out;
>  
> +	if (mode & FALLOC_FL_ZERO_RANGE) {
> +		ret = btrfs_zero_range(inode, offset, len, mode);
> +		inode_unlock(inode);
> +		return ret;
> +	}
> +
>  	locked_end = alloc_end - 1;
>  	while (1) {
>  		struct btrfs_ordered_extent *ordered;
> @@ -2980,37 +3218,18 @@ static long btrfs_fallocate(struct file *file, int mode,
>  	if (ret < 0)
>  		goto out_unlock;
>  
> -	if (actual_end > inode->i_size &&
> -	    !(mode & FALLOC_FL_KEEP_SIZE)) {
> -		struct btrfs_trans_handle *trans;
> -		struct btrfs_root *root = BTRFS_I(inode)->root;
> -
> -		/*
> -		 * We didn't need to allocate any more space, but we
> -		 * still extended the size of the file so we need to
> -		 * update i_size and the inode item.
> -		 */
> -		trans = btrfs_start_transaction(root, 1);
> -		if (IS_ERR(trans)) {
> -			ret = PTR_ERR(trans);
> -		} else {
> -			inode->i_ctime = current_time(inode);
> -			i_size_write(inode, actual_end);
> -			btrfs_ordered_update_i_size(inode, actual_end, NULL);
> -			ret = btrfs_update_inode(trans, root, inode);
> -			if (ret)
> -				btrfs_end_transaction(trans);
> -			else
> -				ret = btrfs_end_transaction(trans);
> -		}
> -	}
> +	/*
> +	 * We didn't need to allocate any more space, but we still extended the
> +	 * size of the file so we need to update i_size and the inode item.
> +	 */
> +	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
>  out_unlock:
>  	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
>  			     &cached_state, GFP_KERNEL);
>  out:
>  	inode_unlock(inode);
>  	/* Let go of our reservation. */
> -	if (ret != 0)
> +	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
>  		btrfs_free_reserved_data_space(inode, data_reserved,
>  				alloc_start, alloc_end - cur_offset);
>  	extent_changeset_free(data_reserved);
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v4] Btrfs: add support for fallocate's zero range operation
  2017-11-03 17:20 ` [PATCH v3] " fdmanana
  2017-11-03 20:59   ` Edmund Nadolski
@ 2017-11-04  4:07   ` fdmanana
  2017-11-10 16:43     ` Nikolay Borisov
  2018-01-05 16:49     ` David Sterba
  1 sibling, 2 replies; 14+ messages in thread
From: fdmanana @ 2017-11-04  4:07 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

This implements support for the zero range operation of fallocate. For now
at least it's as simple as possible while reusing most of the existing
fallocate and hole punching infrastructure.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---

V2: Removed double inode unlock on error path from failure to lock range.
V3: Factored common code to update isize and inode item into a helper
    function, plus some minor cleanup.
V4: Removed the lock_inode parameter, which is no longer needed as of V3.

 fs/btrfs/file.c | 338 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 276 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aafcc785f840..ea2e863eb540 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2448,6 +2448,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state, GFP_NOFS);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2564,38 +2604,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
-			inode_unlock(inode);
-			return ret;
-		}
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		inode_unlock(inode);
+		goto out_only_mutex;
 	}
 
 	path = btrfs_alloc_path();
@@ -2804,6 +2817,217 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
 	return 0;
 }
 
+static int btrfs_fallocate_update_isize(struct inode *inode,
+					const u64 end,
+					const int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	int ret2;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	inode->i_ctime = current_time(inode);
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	ret = btrfs_update_inode(trans, root, inode);
+	ret2 = btrfs_end_transaction(trans);
+
+	return ret ? ret : ret2;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			free_extent_map(em);
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			if (!ret)
+				ret = btrfs_fallocate_update_isize(inode,
+								   offset + len,
+								   mode);
+			return ret;
+		}
+		free_extent_map(em);
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end) {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		bytes_to_reserve = alloc_end - alloc_start;
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_KERNEL);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2829,7 +3053,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -2840,10 +3065,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2885,6 +3112,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -2980,37 +3213,18 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret < 0)
 		goto out_unlock;
 
-	if (actual_end > inode->i_size &&
-	    !(mode & FALLOC_FL_KEEP_SIZE)) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-
-		/*
-		 * We didn't need to allocate any more space, but we
-		 * still extended the size of the file so we need to
-		 * update i_size and the inode item.
-		 */
-		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-		} else {
-			inode->i_ctime = current_time(inode);
-			i_size_write(inode, actual_end);
-			btrfs_ordered_update_i_size(inode, actual_end, NULL);
-			ret = btrfs_update_inode(trans, root, inode);
-			if (ret)
-				btrfs_end_transaction(trans);
-			else
-				ret = btrfs_end_transaction(trans);
-		}
-	}
+	/*
+	 * We didn't need to allocate any more space, but we still extended the
+	 * size of the file so we need to update i_size and the inode item.
+	 */
+	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_KERNEL);
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] Btrfs: add support for fallocate's zero range operation
  2017-11-04  4:07   ` [PATCH v4] " fdmanana
@ 2017-11-10 16:43     ` Nikolay Borisov
  2018-01-05 16:49     ` David Sterba
  1 sibling, 0 replies; 14+ messages in thread
From: Nikolay Borisov @ 2017-11-10 16:43 UTC (permalink / raw)
  To: fdmanana, linux-btrfs



On  4.11.2017 06:07, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
> 
> V2: Removed double inode unlock on error path from failure to lock range.
> V3: Factored common code to update isize and inode item into a helper
>     function, plus some minor cleanup.
> V4: Removed the lock_inode parameter, which is no longer needed as of V3.
> 

When this gets merged into Linus' tree, we'd need to update fallocate's
man page as well.
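
(For reference, a minimal userspace sketch of what the updated man page
would describe; the path and sizes below are placeholders. Plain
FALLOC_FL_ZERO_RANGE zeroes the byte range and may extend the file,
while adding FALLOC_FL_KEEP_SIZE leaves the file size untouched.)

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        int fd = open("/mnt/btrfs/file", O_RDWR);

        if (fd < 0)
                return 1;
        /* Zero 1 MiB at offset 4 KiB; the file may grow if the range ends past EOF. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 1 << 20) < 0)
                return 1;
        /* Same range, but never change the file size. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, 4096, 1 << 20) < 0)
                return 1;
        close(fd);
        return 0;
}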

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v4] Btrfs: add support for fallocate's zero range operation
  2017-11-04  4:07   ` [PATCH v4] " fdmanana
  2017-11-10 16:43     ` Nikolay Borisov
@ 2018-01-05 16:49     ` David Sterba
  1 sibling, 0 replies; 14+ messages in thread
From: David Sterba @ 2018-01-05 16:49 UTC (permalink / raw)
  To: fdmanana; +Cc: linux-btrfs

On Sat, Nov 04, 2017 at 04:07:47AM +0000, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> This implements support for the zero range operation of fallocate. For now
> at least it's as simple as possible while reusing most of the existing
> fallocate and hole punching infrastructure.
> 
> Signed-off-by: Filipe Manana <fdmanana@suse.com>

FYI, I've added this patch to the rest of the 4.16 queue.

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-01-05 16:51 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-25 12:53 [PATCH] Btrfs: add support for fallocate's zero range operation fdmanana
2017-10-25 14:59 ` [PATCH v2] " fdmanana
2017-10-30 14:57   ` David Sterba
2017-11-01 10:34   ` Nikolay Borisov
2017-11-01 10:59     ` Filipe Manana
2017-11-02  8:33   ` Nikolay Borisov
2017-11-03  9:30   ` Nikolay Borisov
2017-11-03 10:29     ` Filipe Manana
2017-11-03 10:45       ` Filipe Manana
2017-11-03 17:20 ` [PATCH v3] " fdmanana
2017-11-03 20:59   ` Edmund Nadolski
2017-11-04  4:07   ` [PATCH v4] " fdmanana
2017-11-10 16:43     ` Nikolay Borisov
2018-01-05 16:49     ` David Sterba
