* [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data
From: fdmanana @ 2022-05-05 17:16 UTC
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

When doing a send operation, we read the data of all extents we need to
send into the page cache, which is almost always wasteful, as it can lead
to eviction of other data from the page cache that is more useful for
applications (and maybe other kernel subsystems). This patchset makes send
evict the data from the page cache once it has sent it. The actual work
is in the second patch, while the first one is just preparatory work.
More details in the changelogs.
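
As a rough userspace analogue of the idea (illustration only - in the
kernel the patchset uses truncate_inode_pages_range(); posix_fadvise()
is just the closest userspace knob for dropping a file's cached pages
once we are done with them):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
          char buf[64 * 1024];
          ssize_t n;
          int fd;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <file>\n", argv[0]);
                  return 1;
          }
          fd = open(argv[1], O_RDONLY);
          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          /* Stream the whole file, like send reading extent data. */
          while ((n = read(fd, buf, sizeof(buf))) > 0)
                  ;
          /* Done with the data: ask the kernel to drop this file's
           * pages so the read doesn't evict more useful data. */
          posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
          close(fd);
          return n < 0 ? 1 : 0;
  }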

Filipe Manana (2):
  btrfs: send: keep the current inode open while processing it
  btrfs: send: avoid trashing the page cache

 fs/btrfs/send.c | 128 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 109 insertions(+), 19 deletions(-)

-- 
2.35.1



* [PATCH 1/2] btrfs: send: keep the current inode open while processing it
From: fdmanana @ 2022-05-05 17:16 UTC
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

Every time we send a write command, we open the inode, read some data into
a buffer and then close the inode. The amount of data we read for each
write command is at most 48K, returned by max_send_read_size(), and that
corresponds to: BTRFS_SEND_BUF_SIZE - 16K = 48K. In practice this does
not add any significant overhead, because the time elapsed between every
close (iput()) and open (btrfs_iget()) is very short, so the inode is kept
in the VFS's cache after the iput() and it's still there by the time we
do the next btrfs_iget().

Since we don't do anything else between processing extents of the current
inode, it makes sense to keep the inode open after we process its first
extent that needs to be sent, and to keep it open until we start processing
the next inode. This facilitates the next change, which aims to avoid
having send operations trash the page cache with data extents.
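
In userspace terms the pattern is roughly the following (illustration
only, with hypothetical helper names and file name - the kernel code
uses btrfs_iget() and iput() rather than open() and close()):

  #include <fcntl.h>
  #include <unistd.h>

  static int cur_fd = -1;

  /* Read one chunk of the current file, opening it lazily on the
   * first chunk instead of doing an open/close pair per chunk. */
  static ssize_t read_chunk(const char *path, char *buf, size_t len,
                            off_t off)
  {
          if (cur_fd < 0) {
                  cur_fd = open(path, O_RDONLY);
                  if (cur_fd < 0)
                          return -1;
          }
          return pread(cur_fd, buf, len, off);
  }

  /* Close the handle only when moving on to the next file. */
  static void close_current(void)
  {
          if (cur_fd >= 0) {
                  close(cur_fd);
                  cur_fd = -1;
          }
  }

  int main(void)
  {
          char buf[48 * 1024]; /* the 48K read size mentioned above */
          off_t off = 0;
          ssize_t n;

          while ((n = read_chunk("file1", buf, sizeof(buf), off)) > 0)
                  off += n; /* each chunk feeds one write command */
          close_current();
          return n < 0 ? 1 : 0;
  }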

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 54 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 330bef72a555..55275ba90cb4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -131,6 +131,11 @@ struct send_ctx {
 	struct list_head name_cache_list;
 	int name_cache_size;
 
+	/*
+	 * The inode we are currently processing. It's not NULL only when we
+	 * need to issue write commands for data extents from this inode.
+	 */
+	struct inode *cur_inode;
 	struct file_ra_state ra;
 
 	/*
@@ -4868,7 +4873,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 {
 	struct btrfs_root *root = sctx->send_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct inode *inode;
 	struct page *page;
 	pgoff_t index = offset >> PAGE_SHIFT;
 	pgoff_t last_index;
@@ -4879,37 +4883,30 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 	if (ret)
 		return ret;
 
-	inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
 	last_index = (offset + len - 1) >> PAGE_SHIFT;
 
-	/* initial readahead */
-	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
-	file_ra_state_init(&sctx->ra, inode->i_mapping);
-
 	while (index <= last_index) {
 		unsigned cur_len = min_t(unsigned, len,
 					 PAGE_SIZE - pg_offset);
 
-		page = find_lock_page(inode->i_mapping, index);
+		page = find_lock_page(sctx->cur_inode->i_mapping, index);
 		if (!page) {
-			page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
-				NULL, index, last_index + 1 - index);
+			page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+						  &sctx->ra, NULL, index,
+						  last_index + 1 - index);
 
-			page = find_or_create_page(inode->i_mapping, index,
-					GFP_KERNEL);
+			page = find_or_create_page(sctx->cur_inode->i_mapping,
+						   index, GFP_KERNEL);
 			if (!page) {
 				ret = -ENOMEM;
 				break;
 			}
 		}
 
-		if (PageReadahead(page)) {
-			page_cache_async_readahead(inode->i_mapping, &sctx->ra,
-				NULL, page, index, last_index + 1 - index);
-		}
+		if (PageReadahead(page))
+			page_cache_async_readahead(sctx->cur_inode->i_mapping,
+						   &sctx->ra, NULL, page, index,
+						   last_index + 1 - index);
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -4935,7 +4932,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 		len -= cur_len;
 		sctx->send_size += cur_len;
 	}
-	iput(inode);
+
 	return ret;
 }
 
@@ -5148,6 +5145,20 @@ static int send_extent_data(struct send_ctx *sctx,
 	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
 		return send_update_extent(sctx, offset, len);
 
+	if (sctx->cur_inode == NULL) {
+		struct btrfs_root *root = sctx->send_root;
+
+		sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+		if (IS_ERR(sctx->cur_inode)) {
+			int err = PTR_ERR(sctx->cur_inode);
+
+			sctx->cur_inode = NULL;
+			return err;
+		}
+		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+	}
+
 	while (sent < len) {
 		u64 size = min(len - sent, read_size);
 		int ret;
@@ -6171,6 +6182,9 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
+	iput(sctx->cur_inode);
+	sctx->cur_inode = NULL;
+
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
 	sctx->cur_inode_last_extent = (u64)-1;
@@ -7657,6 +7671,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 
 		name_cache_free(sctx);
 
+		iput(sctx->cur_inode);
+
 		kfree(sctx);
 	}
 
-- 
2.35.1



* [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: fdmanana @ 2022-05-05 17:16 UTC
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

A send operation reads extent data using the buffered IO path to get the
data it sends in write commands, both because it's simple and because it
makes use of the generic readahead infrastructure, which results in a
massive speedup.

However, this fills the page cache with data that, most of the time, is
really only used by the send operation - once the write commands are sent,
it's not useful to keep the data in the page cache anymore. For large
snapshots, bringing all the data into the page cache eventually leads to
the eviction of other data from the page cache that may be more useful for
applications (and kernel subsystems).

Even if extents are shared with the subvolume a snapshot is based on, and
the data is currently in the page cache due to being read through the
subvolume, attempting to read the data through the snapshot will always
result in bringing a new copy of the data into another location in the
page cache (there's currently no sharing of page cache pages for shared
extents).

So make send evict the data it has read if, when it first opened the
inode, its mapping had no pages loaded: that is, when
inode->i_mapping->nrpages is 0. Decide this way, instead of checking the
return value of filemap_range_has_page() before reading each extent,
because the generic readahead mechanism may read pages beyond the range we
request (and it very often does), which means that a call to
filemap_range_has_page() would return true due to the readahead triggered
when processing a previous extent - and we have no simple way to
distinguish that case from the case where the data was brought into the
page cache by some other process. Checking that the mapping's number of
pages is 0 when we first open the inode is simple, cheap and generally
accomplishes the goal of not trashing the page cache. The only exception
is when part of the data was previously loaded into the page cache through
the snapshot by some other process; in that case we end up not evicting
any data send brings into the page cache, just like before this change -
but that is not the common case.
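
As a rough userspace analogue of this decide-once approach
(illustration only, not part of the patch - mincore() stands in for
the nrpages check):

  #include <fcntl.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  /* Probe, once, whether any page of the file is already resident. */
  static bool file_has_resident_pages(int fd)
  {
          long psz = sysconf(_SC_PAGESIZE);
          unsigned char *vec;
          bool resident = false;
          struct stat st;
          size_t npages;
          void *map;

          if (fstat(fd, &st) != 0 || st.st_size == 0)
                  return false;
          npages = (st.st_size + psz - 1) / psz;
          map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
          if (map == MAP_FAILED)
                  return true; /* be conservative, skip cleaning */
          vec = malloc(npages);
          if (vec && mincore(map, st.st_size, vec) == 0) {
                  for (size_t i = 0; i < npages; i++) {
                          if (vec[i] & 1) {
                                  resident = true;
                                  break;
                          }
                  }
          }
          free(vec);
          munmap(map, st.st_size);
          return resident;
  }

  int main(int argc, char **argv)
  {
          int fd;

          if (argc != 2)
                  return 1;
          fd = open(argv[1], O_RDONLY);
          if (fd < 0)
                  return 1;
          /* Decide once, at open time, like the nrpages == 0 check. */
          printf("clean_page_cache = %d\n",
                 !file_has_resident_pages(fd));
          close(fd);
          return 0;
  }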

Example scenario, on a box with 32G of RAM:

  $ btrfs subvolume create /mnt/sv1
  $ xfs_io -f -c "pwrite 0 4G" /mnt/sv1/file1

  $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1

  $ free -m
                 total        used        free      shared  buff/cache   available
  Mem:           31937         186       26866           0        4883       31297
  Swap:           8188           0        8188

  # After this we have about 4G less free memory.
  $ btrfs send /mnt/snap1 >/dev/null

  $ free -m
                 total        used        free      shared  buff/cache   available
  Mem:           31937         186       22814           0        8935       31297
  Swap:           8188           0        8188

The same, obviously, applies to an incremental send.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 55275ba90cb4..d899049dea53 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -137,6 +137,8 @@ struct send_ctx {
 	 */
 	struct inode *cur_inode;
 	struct file_ra_state ra;
+	u64 prev_extent_end;
+	bool clean_page_cache;
 
 	/*
 	 * We process inodes by their increasing order, so if before an
@@ -5157,6 +5159,28 @@ static int send_extent_data(struct send_ctx *sctx,
 		}
 		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
 		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+		/*
+		 * It's very likely there are no pages from this inode in the page
+		 * cache, so after reading extents and sending their data, we clean
+		 * the page cache to avoid trashing the page cache (adding pressure
+		 * to the page cache and forcing eviction of other data more useful
+		 * for applications).
+		 *
+		 * We decide if we should clean the page cache simply by checking
+		 * if the inode's mapping nrpages is 0 when we first open it, and
+		 * not by using something like filemap_range_has_page() before
+		 * reading an extent because when we ask the readahead code to
+		 * read a given file range, it may (and almost always does) read
+		 * pages from beyond that range (see the documentation for
+		 * page_cache_sync_readahead()), so it would not be reliable,
+		 * because after reading the first extent future calls to
+		 * filemap_range_has_page() would return true because the readahead
+		 * on the previous extent resulted in reading pages of the current
+		 * extent as well.
+		 */
+		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+		sctx->prev_extent_end = offset;
 	}
 
 	while (sent < len) {
@@ -5168,6 +5192,33 @@ static int send_extent_data(struct send_ctx *sctx,
 			return ret;
 		sent += size;
 	}
+
+	if (sctx->clean_page_cache) {
+		const u64 end = round_up(offset + len, PAGE_SIZE);
+
+		/*
+		 * Always start from the end offset of the last processed extent.
+		 * This is because the readahead code may (and very often does)
+		 * read pages beyond the range we request for readahead. So if
+		 * we have an extent layout like this:
+		 *
+		 *            [ extent A ] [ extent B ] [ extent C ]
+		 *
+		 * When we ask page_cache_sync_readahead() to read extent A, it
+		 * may also trigger reads for pages of extent B. If we are doing
+		 * an incremental send and extent B has not changed between the
+		 * parent and send snapshots, some or all of its pages may end
+		 * up being read and placed in the page cache. So when truncating
+		 * the page cache we always start from the end offset of the
+		 * previously processed extent up to the end of the current
+		 * extent.
+		 */
+		truncate_inode_pages_range(&sctx->cur_inode->i_data,
+					   sctx->prev_extent_end,
+					   end - 1);
+		sctx->prev_extent_end = end;
+	}
+
 	return 0;
 }
 
@@ -6172,6 +6223,30 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
 	return ret;
 }
 
+static void close_current_inode(struct send_ctx *sctx)
+{
+	u64 i_size;
+
+	if (sctx->cur_inode == NULL)
+		return;
+
+	i_size = i_size_read(sctx->cur_inode);
+
+	/*
+	 * If we are doing an incremental send, we may have extents between the
+	 * last processed extent and the i_size that have not been processed
+	 * because they haven't changed but we may have read some of their pages
+	 * through readahead, see the comments at send_extent_data().
+	 */
+	if (sctx->clean_page_cache && sctx->prev_extent_end < i_size)
+		truncate_inode_pages_range(&sctx->cur_inode->i_data,
+					   sctx->prev_extent_end,
+					   round_up(i_size, PAGE_SIZE) - 1);
+
+	iput(sctx->cur_inode);
+	sctx->cur_inode = NULL;
+}
+
 static int changed_inode(struct send_ctx *sctx,
 			 enum btrfs_compare_tree_result result)
 {
@@ -6182,8 +6257,7 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
-	iput(sctx->cur_inode);
-	sctx->cur_inode = NULL;
+	close_current_inode(sctx);
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
@@ -7671,7 +7745,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 
 		name_cache_free(sctx);
 
-		iput(sctx->cur_inode);
+		close_current_inode(sctx);
 
 		kfree(sctx);
 	}
-- 
2.35.1



* Re: [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data
From: David Sterba @ 2022-05-09 19:08 UTC
  To: fdmanana; +Cc: linux-btrfs

On Thu, May 05, 2022 at 06:16:13PM +0100, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> [...]
> 
> Filipe Manana (2):
>   btrfs: send: keep the current inode open while processing it
>   btrfs: send: avoid trashing the page cache

Added to misc-next, thanks.


* Re: [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: Qu Wenruo @ 2022-05-17  6:35 UTC
  To: fdmanana, linux-btrfs



On 2022/5/6 01:16, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
>
> [...]
>
> Signed-off-by: Filipe Manana <fdmanana@suse.com>

Unfortunately, this patch seems to cause subpage cases to fail test case
btrfs/007. The reproducibility is around 50%, so it's better to use
"-I 8" to be extra safe.

And I believe it also causes other send-related failures for subpage cases.

I guess it's because truncate_inode_pages_range() only truncates full
pages, and for the subpage case, since one sector is smaller than one
page, it doesn't work as expected?
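
For illustration, a standalone sketch of the alignment issue (not
kernel code; truncate_inode_pages_range() can only evict pages fully
covered by the range, while a partially covered page stays cached with
the covered part zeroed - which here would corrupt the cached data):

  #include <stdio.h>

  /* Hypothetical subpage setup: 64K pages (e.g. arm64), 4K sectors. */
  #define PAGE_SIZE  65536ULL
  #define SECTORSIZE 4096ULL

  int main(void)
  {
          /* An extent that is sector aligned but not page aligned. */
          unsigned long long start = 3 * SECTORSIZE;        /* 12288 */
          unsigned long long end = start + 20 * SECTORSIZE; /* 94208 */

          unsigned long long astart = start & ~(PAGE_SIZE - 1);
          unsigned long long aend = (end + PAGE_SIZE - 1) &
                                    ~(PAGE_SIZE - 1);

          printf("raw range     [%llu, %llu): partial pages zeroed\n",
                 start, end);
          printf("aligned range [%llu, %llu): whole pages evictable\n",
                 astart, aend);
          return 0;
  }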

If needed, I can provide you access to my aarch64 VM for debugging.

Thanks,
Qu

> [...]


* Re: [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: Qu Wenruo @ 2022-05-17  7:26 UTC
  To: fdmanana, linux-btrfs



On 2022/5/17 14:35, Qu Wenruo wrote:
> 
> 
> On 2022/5/6 01:16, fdmanana@kernel.org wrote:
>> From: Filipe Manana <fdmanana@suse.com>
>>
>> [...]
>>
>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> 
> Unfortunately, this patch seems to cause subpage cases to fail test case
> btrfs/007. The reproducibility is around 50%, so it's better to use
> "-I 8" to be extra safe.
> 
> And I believe it also causes other send-related failures for subpage cases.
> 
> I guess it's because truncate_inode_pages_range() only truncates full
> pages, and for the subpage case, since one sector is smaller than one
> page, it doesn't work as expected?

It looks like that's the case.

The following diff fixed the send-related bugs here
(my mail client seems to screw up the indentation):

https://paste.opensuse.org/95871661

Thanks,
Qu
> [...]


* Re: [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: Filipe Manana @ 2022-05-17  9:39 UTC
  To: Qu Wenruo; +Cc: linux-btrfs

On Tue, May 17, 2022 at 03:26:14PM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/5/17 14:35, Qu Wenruo wrote:
> > 
> > 
> > On 2022/5/6 01:16, fdmanana@kernel.org wrote:
> > > From: Filipe Manana <fdmanana@suse.com>
> > > 
> > > [...]
> > > 
> > > Signed-off-by: Filipe Manana <fdmanana@suse.com>
> > 
> > Unfortunately, this patch seems to cause subpage cases to fail test case
> > btrfs/007. The reproducibility is around 50%, so it's better to use
> > "-I 8" to be extra safe.
> > 
> > And I believe it also causes other send-related failures for subpage cases.
> > 
> > I guess it's because truncate_inode_pages_range() only truncates full
> > pages, and for the subpage case, since one sector is smaller than one
> > page, it doesn't work as expected?
> 
> It looks like that's the case.
> 
> The following diff fixed the send-related bugs here
> (my mail client seems to screw up the indentation):
> 
> https://paste.opensuse.org/95871661

That may seem to work often, but it's not really correct because prev_extent_end
is already aligned to the page size, except possibly for the first extent processed.

For the case of the first processed extent not starting at an offset that is page
size aligned, we also need to round down that offset so that we evict the page
(and not simply zero out part of its content).
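
In other words, the bookkeeping should look roughly like this (a
sketch of the scheme just described, with hypothetical offsets - not
the contents of the link below):

  #include <stdio.h>

  typedef unsigned long long u64;

  #define PAGE_SIZE 4096ULL
  #define round_down_p(x) ((x) & ~(PAGE_SIZE - 1))
  #define round_up_p(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

  int main(void)
  {
          /* Extents as (offset, length); only the first may start
           * unaligned. */
          u64 extents[][2] = { { 6144, 61440 }, { 67584, 131072 },
                               { 198656, 4096 } };
          /* Round the first range's start *down* to a page boundary. */
          u64 clear_start = round_down_p(extents[0][0]);

          for (int i = 0; i < 3; i++) {
                  u64 end = round_up_p(extents[i][0] + extents[i][1]);

                  /* truncate_inode_pages_range(mapping, clear_start,
                   *                            end - 1); */
                  printf("truncate [%llu, %llu)\n", clear_start, end);
                  clear_start = end; /* later ranges start aligned */
          }
          return 0;
  }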

Can you try this instead:  https://pastebin.com/raw/kRPZKYxG ?

Thanks.


> [...]


* Re: [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: Qu Wenruo @ 2022-05-17 10:36 UTC
  To: Filipe Manana; +Cc: linux-btrfs



On 2022/5/17 17:39, Filipe Manana wrote:
> On Tue, May 17, 2022 at 03:26:14PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2022/5/17 14:35, Qu Wenruo wrote:
>>>
>>>
>>> On 2022/5/6 01:16, fdmanana@kernel.org wrote:
>>>> From: Filipe Manana <fdmanana@suse.com>
>>>>
>>>> [...]
>>>>
>>>> Signed-off-by: Filipe Manana <fdmanana@suse.com>
>>>
>>> Unfortunately, this patch seems to cause subpage cases to fail test case
>>> btrfs/007. The reproducibility is around 50%, so it's better to use
>>> "-I 8" to be extra safe.
>>>
>>> And I believe it also causes other send-related failures for subpage cases.
>>>
>>> I guess it's because truncate_inode_pages_range() only truncates full
>>> pages, and for the subpage case, since one sector is smaller than one
>>> page, it doesn't work as expected?
>>
>> It looks like that's the case.
>>
>> The following diff fixed the send-related bugs here
>> (my mail client seems to screw up the indentation):
>>
>> https://paste.opensuse.org/95871661
>
> That may seem to work often, but it's not really correct because prev_extent_end
> is already aligned to the page size, except possibly for the first extent processed.

You're right, my snippet is really just a dirty hack.

>
> For the case of the first processed extent not starting at an offset that is page
> size aligned, we also need to round down that offset so that we evict the page
> (and not simply zero out part of its content).
>
> Can you try this instead:  https://pastebin.com/raw/kRPZKYxG ?

Tested with btrfs/007 and it passed without problems (8 runs).

Also tested it with the full send group, no regression found here.


And with this fix, the following subpage failures now all pass.

- btrfs/007
- btrfs/016
- btrfs/025
- btrfs/034
- btrfs/097
- btrfs/149
- btrfs/252

Thanks,
Qu

> [...]


* Re: [PATCH 2/2] btrfs: send: avoid trashing the page cache
From: Filipe Manana @ 2022-05-17 10:46 UTC
  To: Qu Wenruo; +Cc: linux-btrfs

On Tue, May 17, 2022 at 06:36:57PM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/5/17 17:39, Filipe Manana wrote:
> > On Tue, May 17, 2022 at 03:26:14PM +0800, Qu Wenruo wrote:
> > > 
> > > 
> > > On 2022/5/17 14:35, Qu Wenruo wrote:
> > > > 
> > > > 
> > > > On 2022/5/6 01:16, fdmanana@kernel.org wrote:
> > > > > From: Filipe Manana <fdmanana@suse.com>
> > > > > 
> > > > > A send operation reads extent data using the buffered IO path for getting
> > > > > extent data to send in write commands and this is both because it's simple
> > > > > and to make use of the generic readahead infrastructure, which results in
> > > > > a massive speedup.
> > > > > 
> > > > > However, this fills the page cache with data that, most of the time, is
> > > > > really only used by the send operation - once the write commands are sent,
> > > > > it's not useful to have the data in the page cache anymore. For large
> > > > > snapshots, bringing all data into the page cache eventually leads to the
> > > > > need to evict other data from the page cache that may be more useful for
> > > > > applications (and kernel subsystems).
> > > > > 
> > > > > Even if extents are shared with the subvolume on which a snapshot is based
> > > > > and the data is currently in the page cache due to being read through
> > > > > the subvolume, attempting to read the data through the snapshot will
> > > > > always result in bringing a new copy of the data into another location in
> > > > > the page cache (there's currently no shared memory for shared extents).
> > > > > 
> > > > > So make send evict the data it has read if, when it first opened
> > > > > the inode, its mapping had no pages currently loaded: when
> > > > > inode->i_mapping->nrpages has a value of 0. Do this instead of deciding
> > > > > based on the return value of filemap_range_has_page() before reading an
> > > > > extent because the generic readahead mechanism may read pages beyond the
> > > > > range we request (and it very often does it), which means a call to
> > > > > filemap_range_has_page() will return true due to the readahead that was
> > > > > triggered when processing a previous extent - we don't have a simple way
> > > > > to distinguish this case from the case where the data was brought into
> > > > > the page cache through someone else. So checking for the mapping number
> > > > > of pages being 0 when we first open the inode is simple, cheap and it
> > > > > generally accomplishes the goal of not trashing the page cache - the
> > > > > only exception is if part of data was previously loaded into the page
> > > > > cache through the snapshot by some other process, in that case we end
> > > > > up not evicting any data send brings into the page cache, just like
> > > > > before this change - but that however is not the common case.
> > > > > 
> > > > > Example scenario, on a box with 32G of RAM:
> > > > > 
> > > > >     $ btrfs subvolume create /mnt/sv1
> > > > >     $ xfs_io -f -c "pwrite 0 4G" /mnt/sv1/file1
> > > > > 
> > > > >     $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1
> > > > > 
> > > > >     $ free -m
> > > > >                  total        used        free      shared  buff/cache   available
> > > > >     Mem:           31937         186       26866           0        4883       31297
> > > > >     Swap:           8188           0        8188
> > > > > 
> > > > >     # After this we have about 4G less free memory.
> > > > >     $ btrfs send /mnt/snap1 >/dev/null
> > > > > 
> > > > >     $ free -m
> > > > >                  total        used        free      shared  buff/cache   available
> > > > >     Mem:           31937         186       22814           0        8935       31297
> > > > >     Swap:           8188           0        8188
> > > > > 
> > > > > The same, obviously, applies to an incremental send.
> > > > > 
> > > > > Signed-off-by: Filipe Manana <fdmanana@suse.com>
> > > > 
> > > > Unfortunately, this patch seems to cause subpage cases to fail test case
> > > > btrfs/007; the reproducibility is around 50%, so it's better to run it
> > > > with "-I 8" to be extra safe.
> > > > 
> > > > And I believe it also causes other send related failures for subpage cases.
> > > > 
> > > > I guess it's truncate_inode_pages_range() only truncating full pages,
> > > > but for the subpage case, since one sector is smaller than one page, it
> > > > doesn't work as expected?
> > > 
> > > It looks like that's the case.
> > > 
> > > The following diff fixed the send related bugs here:
> > > (mail client seems to screw up the indent)
> > > 
> > > https://paste.opensuse.org/95871661
> > 
> > That may seem to work often, but it's not really correct because prev_extent_end
> > is already aligned to the page size, except possibly for the first extent processed.
> 
> You're right, my snippet is really just a dirty hack.
> 
> > 
> > For the case of the first processed extent not starting at an offset that is page
> > size aligned, we also need to round down that offset so that we evict the page
> > (and not simply zero out part of its content).
> > 
> > Can you try this instead:  https://pastebin.com/raw/kRPZKYxG ?
> 
> Tested with btrfs/007 and it passed without problems (8 runs).
> 
> Also tested it with the full send group, no regression found here.
> 
> 
> And with this fix, the following subpage failures now all pass.
> 
> - btrfs/007
> - btrfs/016
> - btrfs/025
> - btrfs/034
> - btrfs/097
> - btrfs/149
> - btrfs/252

Thanks for testing and the report! I'll send a v2 with that patch incorporated
(and a few comment fixes).

> 
> Thanks,
> Qu
> 
> > 
> > Thanks.
> > 
> > 
> > > 
> > > Thanks,
> > > Qu
> > > > 
> > > > If needed, I can provide you access to my aarch64 VM for debugging.
> > > > 
> > > > Thanks,
> > > > Qu
> > > > 
> > > > > ---
> > > > >    fs/btrfs/send.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++--
> > > > >    1 file changed, 77 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
> > > > > index 55275ba90cb4..d899049dea53 100644
> > > > > --- a/fs/btrfs/send.c
> > > > > +++ b/fs/btrfs/send.c
> > > > > @@ -137,6 +137,8 @@ struct send_ctx {
> > > > >         */
> > > > >        struct inode *cur_inode;
> > > > >        struct file_ra_state ra;
> > > > > +    u64 prev_extent_end;
> > > > > +    bool clean_page_cache;
> > > > > 
> > > > >        /*
> > > > >         * We process inodes by their increasing order, so if before an
> > > > > @@ -5157,6 +5159,28 @@ static int send_extent_data(struct send_ctx *sctx,
> > > > >            }
> > > > >            memset(&sctx->ra, 0, sizeof(struct file_ra_state));
> > > > >            file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
> > > > > +
> > > > > +        /*
> > > > > +         * It's very likely there are no pages from this inode in the page
> > > > > +         * cache, so after reading extents and sending their data, we clean
> > > > > +         * the page cache to avoid trashing the page cache (adding pressure
> > > > > +         * to the page cache and forcing eviction of other data more useful
> > > > > +         * for applications).
> > > > > +         *
> > > > > +         * We decide if we should clean the page cache simply by checking
> > > > > +         * if the inode's mapping nrpages is 0 when we first open it, and
> > > > > +         * not by using something like filemap_range_has_page() before
> > > > > +         * reading an extent because when we ask the readahead code to
> > > > > +         * read a given file range, it may (and almost always does) read
> > > > > +         * pages from beyond that range (see the documentation for
> > > > > +         * page_cache_sync_readahead()), so it would not be reliable,
> > > > > +         * because after reading the first extent future calls to
> > > > > +         * filemap_range_has_page() would return true because the readahead
> > > > > +         * on the previous extent resulted in reading pages of the current
> > > > > +         * extent as well.
> > > > > +         */
> > > > > +        sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
> > > > > +        sctx->prev_extent_end = offset;
> > > > >        }
> > > > > 
> > > > >        while (sent < len) {
> > > > > @@ -5168,6 +5192,33 @@ static int send_extent_data(struct send_ctx *sctx,
> > > > >                return ret;
> > > > >            sent += size;
> > > > >        }
> > > > > +
> > > > > +    if (sctx->clean_page_cache) {
> > > > > +        const u64 end = round_up(offset + len, PAGE_SIZE);
> > > > > +
> > > > > +        /*
> > > > > +         * Always start from the end offset of the last processed extent.
> > > > > +         * This is because the readahead code may (and very often does)
> > > > > +         * read pages beyond the range we request for readahead. So if
> > > > > +         * we have an extent layout like this:
> > > > > +         *
> > > > > +         *            [ extent A ] [ extent B ] [ extent C ]
> > > > > +         *
> > > > > +         * When we ask page_cache_sync_readahead() to read extent A, it
> > > > > +         * may also trigger reads for pages of extent B. If we are doing
> > > > > +         * an incremental send and extent B has not changed between the
> > > > > +         * parent and send snapshots, some or all of its pages may end
> > > > > +         * up being read and placed in the page cache. So when truncating
> > > > > +         * the page cache we always start from the end offset of the
> > > > > +         * previously processed extent up to the end of the current
> > > > > +         * extent.
> > > > > +         */
> > > > > +        truncate_inode_pages_range(&sctx->cur_inode->i_data,
> > > > > +                       sctx->prev_extent_end,
> > > > > +                       end - 1);
> > > > > +        sctx->prev_extent_end = end;
> > > > > +    }
> > > > > +
> > > > >        return 0;
> > > > >    }
> > > > > 
> > > > > @@ -6172,6 +6223,30 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
> > > > >        return ret;
> > > > >    }
> > > > > 
> > > > > +static void close_current_inode(struct send_ctx *sctx)
> > > > > +{
> > > > > +    u64 i_size;
> > > > > +
> > > > > +    if (sctx->cur_inode == NULL)
> > > > > +        return;
> > > > > +
> > > > > +    i_size = i_size_read(sctx->cur_inode);
> > > > > +
> > > > > +    /*
> > > > > +     * If we are doing an incremental send, we may have extents between the
> > > > > +     * last processed extent and the i_size that have not been processed
> > > > > +     * because they haven't changed but we may have read some of their pages
> > > > > +     * through readahead, see the comments at send_extent_data().
> > > > > +     */
> > > > > +    if (sctx->clean_page_cache && sctx->prev_extent_end < i_size)
> > > > > +        truncate_inode_pages_range(&sctx->cur_inode->i_data,
> > > > > +                       sctx->prev_extent_end,
> > > > > +                       round_up(i_size, PAGE_SIZE) - 1);
> > > > > +
> > > > > +    iput(sctx->cur_inode);
> > > > > +    sctx->cur_inode = NULL;
> > > > > +}
> > > > > +
> > > > >    static int changed_inode(struct send_ctx *sctx,
> > > > >                 enum btrfs_compare_tree_result result)
> > > > >    {
> > > > > @@ -6182,8 +6257,7 @@ static int changed_inode(struct send_ctx *sctx,
> > > > >        u64 left_gen = 0;
> > > > >        u64 right_gen = 0;
> > > > > 
> > > > > -    iput(sctx->cur_inode);
> > > > > -    sctx->cur_inode = NULL;
> > > > > +    close_current_inode(sctx);
> > > > > 
> > > > >        sctx->cur_ino = key->objectid;
> > > > >        sctx->cur_inode_new_gen = 0;
> > > > > @@ -7671,7 +7745,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
> > > > > 
> > > > >            name_cache_free(sctx);
> > > > > 
> > > > > -        iput(sctx->cur_inode);
> > > > > +        close_current_inode(sctx);
> > > > > 
> > > > >            kfree(sctx);
> > > > >        }

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 0/2] btrfs: teach send to avoid trashing the page cache with data
  2022-05-05 17:16 [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data fdmanana
                   ` (2 preceding siblings ...)
  2022-05-09 19:08 ` [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data David Sterba
@ 2022-05-17 10:47 ` fdmanana
  2022-05-17 10:47   ` [PATCH 1/2] btrfs: send: keep the current inode open while processing it fdmanana
                     ` (2 more replies)
  3 siblings, 3 replies; 13+ messages in thread
From: fdmanana @ 2022-05-17 10:47 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

When doing a send operation, we read the data of all extents we need to
send into the page cache, which almost always is wasteful as it can lead
to eviction of other things from the page cache that are more useful for
applications (and maybe other kernel subsystems). This patchset makes send
evict the data from the page cache once it has sent it. The actual work
is in the second patch, while the first one is just preparatory work.
More details in the changelogs.

V2: Fixed it to work with subpage sector size. It was broken as we could
    end up zeroing out part of a page when extents are not page size
    aligned. Reported by Qu.
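
As a minimal sketch of what the fix boils down to (illustrative only,
not the exact code from patch 2/2): the eviction range must be page
aligned on both ends, because truncate_inode_pages_range() zeroes,
rather than evicts, partial pages:

  #include <linux/fs.h>
  #include <linux/mm.h>

  /*
   * Illustrative sketch: evict a byte range from an inode's page cache
   * without zeroing partial pages. Both bounds are page aligned first;
   * truncate_inode_pages_range() takes an inclusive end offset.
   */
  static void evict_page_cache_range(struct inode *inode, u64 start, u64 end)
  {
          truncate_inode_pages_range(&inode->i_data,
                                     round_down(start, PAGE_SIZE),
                                     round_up(end, PAGE_SIZE) - 1);
  }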

Filipe Manana (2):
  btrfs: send: keep the current inode open while processing it
  btrfs: send: avoid trashing the page cache

 fs/btrfs/send.c | 133 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 114 insertions(+), 19 deletions(-)

-- 
2.35.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/2] btrfs: send: keep the current inode open while processing it
  2022-05-17 10:47 ` [PATCH v2 " fdmanana
@ 2022-05-17 10:47   ` fdmanana
  2022-05-17 10:47   ` [PATCH 2/2] btrfs: send: avoid trashing the page cache fdmanana
  2022-05-17 18:18   ` [PATCH v2 0/2] btrfs: teach send to avoid trashing the page cache with data David Sterba
  2 siblings, 0 replies; 13+ messages in thread
From: fdmanana @ 2022-05-17 10:47 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

Every time we send a write command, we open the inode, read some data to
a buffer and then close the inode. The amount of data we read for each
write command is at most 48K, returned by max_send_read_size(), and that
corresponds to: BTRFS_SEND_BUF_SIZE - 16K = 48K. In practice this does
not add any significant overhead, because the time elapsed between every
close (iput()) and open (btrfs_iget()) is very short, so the inode is kept
in the VFS's cache after the iput() and it's still there by the time we
do the next btrfs_iget().

As we don't do anything else between processing extents of the current
inode, it makes sense to keep the inode open after we process its first
extent that needs to be sent and keep it open until we start processing
the next inode. This serves to facilitate the next change, which aims
to avoid having send operations trash the page cache with data extents.
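
As a sketch of the lifecycle this introduces (illustrative only; it
assumes the send_ctx fields added by this patch, and the real code is
in the diff below):

  /*
   * Illustrative sketch: open the inode lazily on the first data
   * extent, reuse it for every later extent of the same inode, and
   * only drop the reference when send moves on to the next inode.
   */
  static int ensure_current_inode(struct send_ctx *sctx)
  {
          struct btrfs_root *root = sctx->send_root;

          if (sctx->cur_inode)
                  return 0;

          sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
          if (IS_ERR(sctx->cur_inode)) {
                  int err = PTR_ERR(sctx->cur_inode);

                  sctx->cur_inode = NULL;
                  return err;
          }
          return 0;
  }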

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 54 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 330bef72a555..55275ba90cb4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -131,6 +131,11 @@ struct send_ctx {
 	struct list_head name_cache_list;
 	int name_cache_size;
 
+	/*
+	 * The inode we are currently processing. It's not NULL only when we
+	 * need to issue write commands for data extents from this inode.
+	 */
+	struct inode *cur_inode;
 	struct file_ra_state ra;
 
 	/*
@@ -4868,7 +4873,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 {
 	struct btrfs_root *root = sctx->send_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct inode *inode;
 	struct page *page;
 	pgoff_t index = offset >> PAGE_SHIFT;
 	pgoff_t last_index;
@@ -4879,37 +4883,30 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 	if (ret)
 		return ret;
 
-	inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
 	last_index = (offset + len - 1) >> PAGE_SHIFT;
 
-	/* initial readahead */
-	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
-	file_ra_state_init(&sctx->ra, inode->i_mapping);
-
 	while (index <= last_index) {
 		unsigned cur_len = min_t(unsigned, len,
 					 PAGE_SIZE - pg_offset);
 
-		page = find_lock_page(inode->i_mapping, index);
+		page = find_lock_page(sctx->cur_inode->i_mapping, index);
 		if (!page) {
-			page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
-				NULL, index, last_index + 1 - index);
+			page_cache_sync_readahead(sctx->cur_inode->i_mapping,
+						  &sctx->ra, NULL, index,
+						  last_index + 1 - index);
 
-			page = find_or_create_page(inode->i_mapping, index,
-					GFP_KERNEL);
+			page = find_or_create_page(sctx->cur_inode->i_mapping,
+						   index, GFP_KERNEL);
 			if (!page) {
 				ret = -ENOMEM;
 				break;
 			}
 		}
 
-		if (PageReadahead(page)) {
-			page_cache_async_readahead(inode->i_mapping, &sctx->ra,
-				NULL, page, index, last_index + 1 - index);
-		}
+		if (PageReadahead(page))
+			page_cache_async_readahead(sctx->cur_inode->i_mapping,
+						   &sctx->ra, NULL, page, index,
+						   last_index + 1 - index);
 
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
@@ -4935,7 +4932,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
 		len -= cur_len;
 		sctx->send_size += cur_len;
 	}
-	iput(inode);
+
 	return ret;
 }
 
@@ -5148,6 +5145,20 @@ static int send_extent_data(struct send_ctx *sctx,
 	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
 		return send_update_extent(sctx, offset, len);
 
+	if (sctx->cur_inode == NULL) {
+		struct btrfs_root *root = sctx->send_root;
+
+		sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+		if (IS_ERR(sctx->cur_inode)) {
+			int err = PTR_ERR(sctx->cur_inode);
+
+			sctx->cur_inode = NULL;
+			return err;
+		}
+		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+	}
+
 	while (sent < len) {
 		u64 size = min(len - sent, read_size);
 		int ret;
@@ -6171,6 +6182,9 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
+	iput(sctx->cur_inode);
+	sctx->cur_inode = NULL;
+
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
 	sctx->cur_inode_last_extent = (u64)-1;
@@ -7657,6 +7671,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 
 		name_cache_free(sctx);
 
+		iput(sctx->cur_inode);
+
 		kfree(sctx);
 	}
 
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/2] btrfs: send: avoid trashing the page cache
  2022-05-17 10:47 ` [PATCH v2 " fdmanana
  2022-05-17 10:47   ` [PATCH 1/2] btrfs: send: keep the current inode open while processing it fdmanana
@ 2022-05-17 10:47   ` fdmanana
  2022-05-17 18:18   ` [PATCH v2 0/2] btrfs: teach send to avoid trashing the page cache with data David Sterba
  2 siblings, 0 replies; 13+ messages in thread
From: fdmanana @ 2022-05-17 10:47 UTC (permalink / raw)
  To: linux-btrfs

From: Filipe Manana <fdmanana@suse.com>

A send operation reads extent data using the buffered IO path for getting
extent data to send in write commands and this is both because it's simple
and to make use of the generic readahead infrastructure, which results in
a massive speedup.

However, this fills the page cache with data that, most of the time, is
really only used by the send operation - once the write commands are sent,
it's not useful to have the data in the page cache anymore. For large
snapshots, bringing all data into the page cache eventually leads to the
need to evict other data from the page cache that may be more useful for
applications (and kernel subsystems).

Even if extents are shared with the subvolume on which a snapshot is based
and the data is currently in the page cache due to being read through
the subvolume, attempting to read the data through the snapshot will
always result in bringing a new copy of the data into another location in
the page cache (there's currently no shared memory for shared extents).

So make send evict the data it has read if, when it first opened the
inode, its mapping had no pages currently loaded: when
inode->i_mapping->nrpages has a value of 0. Do this instead of deciding
based on the return value of filemap_range_has_page() before reading an
extent because the generic readahead mechanism may read pages beyond the
range we request (and it very often does it), which means a call to
filemap_range_has_page() will return true due to the readahead that was
triggered when processing a previous extent - we don't have a simple way
to distinguish this case from the case where the data was brought into
the page cache through someone else. So checking for the mapping number
of pages being 0 when we first open the inode is simple, cheap and it
generally accomplishes the goal of not trashing the page cache - the
only exception is if part of the data was previously loaded into the page
cache through the snapshot by some other process, in that case we end
up not evicting any data send brings into the page cache, just like
before this change - but that is not the common case.
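
Put differently, the whole heuristic boils down to a single check (a
condensed sketch of the logic in the diff below, not a separate helper
this patch adds):

  #include <linux/fs.h>

  /*
   * Condensed sketch: send may evict what it reads only when the
   * mapping was empty at the time the inode was first opened, i.e.
   * nothing else had already pulled this inode's data into the cache.
   */
  static bool send_should_clean_page_cache(const struct inode *inode)
  {
          return inode->i_mapping->nrpages == 0;
  }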

Example scenario, on a box with 32G of RAM:

  $ btrfs subvolume create /mnt/sv1
  $ xfs_io -f -c "pwrite 0 4G" /mnt/sv1/file1

  $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap1

  $ free -m
                 total        used        free      shared  buff/cache   available
  Mem:           31937         186       26866           0        4883       31297
  Swap:           8188           0        8188

  # After this we have about 4G less free memory.
  $ btrfs send /mnt/snap1 >/dev/null

  $ free -m
                 total        used        free      shared  buff/cache   available
  Mem:           31937         186       22814           0        8935       31297
  Swap:           8188           0        8188

The same, obviously, applies to an incremental send.
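
For instance (a hypothetical second snapshot, shown for illustration):

  $ btrfs subvolume snapshot -r /mnt/sv1 /mnt/snap2
  $ btrfs send -p /mnt/snap1 /mnt/snap2 >/dev/null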

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 55275ba90cb4..5a05beabf0c3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -137,6 +137,8 @@ struct send_ctx {
 	 */
 	struct inode *cur_inode;
 	struct file_ra_state ra;
+	u64 page_cache_clear_start;
+	bool clean_page_cache;
 
 	/*
 	 * We process inodes by their increasing order, so if before an
@@ -5139,6 +5141,7 @@ static int send_extent_data(struct send_ctx *sctx,
 			    const u64 offset,
 			    const u64 len)
 {
+	const u64 end = offset + len;
 	u64 read_size = max_send_read_size(sctx);
 	u64 sent = 0;
 
@@ -5157,6 +5160,28 @@ static int send_extent_data(struct send_ctx *sctx,
 		}
 		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
 		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+
+		/*
+		 * It's very likely there are no pages from this inode in the page
+		 * cache, so after reading extents and sending their data, we clean
+		 * the page cache to avoid trashing the page cache (adding pressure
+		 * to the page cache and forcing eviction of other data more useful
+		 * for applications).
+		 *
+		 * We decide if we should clean the page cache simply by checking
+		 * if the inode's mapping nrpages is 0 when we first open it, and
+		 * not by using something like filemap_range_has_page() before
+		 * reading an extent because when we ask the readahead code to
+		 * read a given file range, it may (and almost always does) read
+		 * pages from beyond that range (see the documentation for
+		 * page_cache_sync_readahead()), so it would not be reliable,
+		 * because after reading the first extent future calls to
+		 * filemap_range_has_page() would return true because the readahead
+		 * on the previous extent resulted in reading pages of the current
+		 * extent as well.
+		 */
+		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
+		sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
 	}
 
 	while (sent < len) {
@@ -5168,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx,
 			return ret;
 		sent += size;
 	}
+
+	if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
+		/*
+		 * Always operate only on ranges that are a multiple of the page
+		 * size. This is not only to prevent zeroing parts of a page in
+		 * the case of subpage sector size, but also to guarantee we evict
+		 * pages, as passing a range that is smaller than page size does
+		 * not evict the respective page (only zeroes part of its content).
+		 *
+		 * Always start from the end offset of the last range cleared.
+		 * This is because the readahead code may (and very often does)
+	 * read pages beyond the range we request for readahead. So if
+		 * we have an extent layout like this:
+		 *
+		 *            [ extent A ] [ extent B ] [ extent C ]
+		 *
+		 * When we ask page_cache_sync_readahead() to read extent A, it
+		 * may also trigger reads for pages of extent B. If we are doing
+		 * an incremental send and extent B has not changed between the
+		 * parent and send snapshots, some or all of its pages may end
+		 * up being read and placed in the page cache. So when truncating
+		 * the page cache we always start from the end offset of the
+		 * previously processed extent up to the end of the current
+		 * extent.
+		 */
+		truncate_inode_pages_range(&sctx->cur_inode->i_data,
+					   sctx->page_cache_clear_start,
+					   end - 1);
+		sctx->page_cache_clear_start = end;
+	}
+
 	return 0;
 }
 
@@ -6172,6 +6228,30 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
 	return ret;
 }
 
+static void close_current_inode(struct send_ctx *sctx)
+{
+	u64 i_size;
+
+	if (sctx->cur_inode == NULL)
+		return;
+
+	i_size = i_size_read(sctx->cur_inode);
+
+	/*
+	 * If we are doing an incremental send, we may have extents between the
+	 * last processed extent and the i_size that have not been processed
+	 * because they haven't changed but we may have read some of their pages
+	 * through readahead, see the comments at send_extent_data().
+	 */
+	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
+		truncate_inode_pages_range(&sctx->cur_inode->i_data,
+					   sctx->page_cache_clear_start,
+					   round_up(i_size, PAGE_SIZE) - 1);
+
+	iput(sctx->cur_inode);
+	sctx->cur_inode = NULL;
+}
+
 static int changed_inode(struct send_ctx *sctx,
 			 enum btrfs_compare_tree_result result)
 {
@@ -6182,8 +6262,7 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
-	iput(sctx->cur_inode);
-	sctx->cur_inode = NULL;
+	close_current_inode(sctx);
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
@@ -7671,7 +7750,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
 
 		name_cache_free(sctx);
 
-		iput(sctx->cur_inode);
+		close_current_inode(sctx);
 
 		kfree(sctx);
 	}
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 0/2] btrfs: teach send to avoid trashing the page cache with data
  2022-05-17 10:47 ` [PATCH v2 " fdmanana
  2022-05-17 10:47   ` [PATCH 1/2] btrfs: send: keep the current inode open while processing it fdmanana
  2022-05-17 10:47   ` [PATCH 2/2] btrfs: send: avoid trashing the page cache fdmanana
@ 2022-05-17 18:18   ` David Sterba
  2 siblings, 0 replies; 13+ messages in thread
From: David Sterba @ 2022-05-17 18:18 UTC (permalink / raw)
  To: fdmanana; +Cc: linux-btrfs

On Tue, May 17, 2022 at 11:47:28AM +0100, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
> 
> When doing a send operation, we read the data of all extents we need to
> send into the page cache, which almost always is wasteful as it can lead
> to eviction of other things from the page cache that are more useful for
> applications (and maybe other kernel subsystems). This patchset makes send
> evict the data from the page cache once it has sent it. The actual work
> is in the second patch, while the first one is just preparatory work.
> More details in the changelogs.
> 
> V2: Fixed it to work with subpage sector size. It was broken as we could
>     end up zeroing out part of a page when extents are not sector size
>     aligned. Reported by Qu.
> 
> Filipe Manana (2):
>   btrfs: send: keep the current inode open while processing it
>   btrfs: send: avoid trashing the page cache

Patch 1 is the same, so I kept the committed version, and patch 2 has
been replaced. Still queued for the first 5.19 pull, as it's fixing a
problem that hampers testing. Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-05-17 18:22 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-05 17:16 [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data fdmanana
2022-05-05 17:16 ` [PATCH 1/2] btrfs: send: keep the current inode open while processing it fdmanana
2022-05-05 17:16 ` [PATCH 2/2] btrfs: send: avoid trashing the page cache fdmanana
2022-05-17  6:35   ` Qu Wenruo
2022-05-17  7:26     ` Qu Wenruo
2022-05-17  9:39       ` Filipe Manana
2022-05-17 10:36         ` Qu Wenruo
2022-05-17 10:46           ` Filipe Manana
2022-05-09 19:08 ` [PATCH 0/2] btrfs: teach send to avoid trashing the page cache with data David Sterba
2022-05-17 10:47 ` [PATCH v2 " fdmanana
2022-05-17 10:47   ` [PATCH 1/2] btrfs: send: keep the current inode open while processing it fdmanana
2022-05-17 10:47   ` [PATCH 2/2] btrfs: send: avoid trashing the page cache fdmanana
2022-05-17 18:18   ` [PATCH v2 0/2] btrfs: teach send to avoid trashing the page cache with data David Sterba
