All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] btrfs: use preallocated pages for super block write
@ 2022-06-09 16:46 David Sterba
  2022-06-09 21:00 ` Matthew Wilcox
                   ` (7 more replies)
  0 siblings, 8 replies; 25+ messages in thread
From: David Sterba @ 2022-06-09 16:46 UTC (permalink / raw)
  To: linux-btrfs; +Cc: willy, nborisov, David Sterba

Currently the super block page is from the mapping of the block device,
this is result of direct conversion from the previous buffer_head to bio
API.  We don't use the page cache or the mapping anywhere else, the page
is a temporary space for the associated bio.

Allocate pages for all super block copies at device allocation time,
also to avoid any later allocation problems when writing the super
block. This simplifies the page reference tracking, but the page lock is
still used as waiting mechanism for the write and write error is tracked
in the page.

As there is a separate page for each super block copy all can be
submitted in parallel, as before.

This was inspired by Matthew's question
https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/

Signed-off-by: David Sterba <dsterba@suse.com>
---

v2:

- allocate 3 pages per device to keep parallelism, otherwise the
  submission would be serialized on the page lock

fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
 fs/btrfs/volumes.c | 12 ++++++++++++
 fs/btrfs/volumes.h |  3 +++
 3 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 800ad3a9c68e..8a9c7a868727 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
 			SetPageUptodate(page);
 		}
 
-		put_page(page);
 		unlock_page(page);
 	}
 
@@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	int i;
 	int errors = 0;
@@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
 	for (i = 0; i < max_mirrors; i++) {
 		struct page *page;
 		struct bio *bio;
-		struct btrfs_super_block *disk_super;
 
 		bytenr_orig = btrfs_sb_offset(i);
 		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
 				    sb->csum);
 
-		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
-					   GFP_NOFS);
-		if (!page) {
-			btrfs_err(device->fs_info,
-			    "couldn't get super block page for bytenr %llu",
-			    bytenr);
-			errors++;
-			continue;
-		}
-
-		/* Bump the refcount for wait_dev_supers() */
-		get_page(page);
+		/*
+		 * Super block is copied to a temporary page, which is locked
+		 * and submitted for write. Page is unlocked after IO finishes.
+		 * No page references are needed, write error is returned as
+		 * page Error bit.
+		 */
+		page = device->sb_write_page[i];
+		ClearPageError(page);
+		lock_page(page);
 
-		disk_super = page_address(page);
-		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
+		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
 
 		/*
 		 * Directly use bios here instead of relying on the page cache
@@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		    device->commit_total_bytes)
 			break;
 
-		page = find_get_page(device->bdev->bd_inode->i_mapping,
-				     bytenr >> PAGE_SHIFT);
-		if (!page) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-			continue;
-		}
+		page = device->sb_write_page[i];
 		/* Page is submitted locked and unlocked once the IO completes */
 		wait_on_page_locked(page);
 		if (PageError(page)) {
@@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 			if (i == 0)
 				primary_failed = true;
 		}
-
-		/* Drop our reference */
-		put_page(page);
-
-		/* Drop the reference from the writing run */
-		put_page(page);
 	}
 
 	/* log error, force error return */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 12a6150ee19d..a00546d2c7ea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
 	rcu_string_free(device->name);
 	extent_io_tree_release(&device->alloc_state);
 	btrfs_destroy_dev_zone_info(device);
+	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
+		__free_page(device->sb_write_page[i]);
 	kfree(device);
 }
 
@@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
+		if (!dev->sb_write_page[i]) {
+			while (--i >= 0)
+				__free_page(dev->sb_write_page[i]);
+			kfree(dev);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
 	INIT_LIST_HEAD(&dev->post_commit_list);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 588367c76c46..516709e1d9f8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -10,6 +10,7 @@
 #include <linux/sort.h>
 #include <linux/btrfs.h>
 #include "async-thread.h"
+#include "disk-io.h"
 
 #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
 
@@ -158,6 +159,8 @@ struct btrfs_device {
 	/* Bio used for flushing device barriers */
 	struct bio flush_bio;
 	struct completion flush_wait;
+	/* Temporary pages for writing the super block copies */
+	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
 
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_ctx;
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
@ 2022-06-09 21:00 ` Matthew Wilcox
  2022-06-09 22:54   ` David Sterba
  2022-06-09 22:58 ` Qu Wenruo
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 25+ messages in thread
From: Matthew Wilcox @ 2022-06-09 21:00 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, nborisov

On Thu, Jun 09, 2022 at 06:46:29PM +0200, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
> 
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
> 
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.

Why not submit the same page to all IOs?  Something like this
(uncompiled, probably full of brokenness, really just for getting the
concept across)


diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 31c3f592e587..f7c2de47dc88 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1767,8 +1767,8 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(fs_info->block_group_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
-	kfree(fs_info->super_copy);
-	kfree(fs_info->super_for_commit);
+	free_page(fs_info->super_copy);
+	free_page(fs_info->super_for_commit);
 	kfree(fs_info->subpage_info);
 	kvfree(fs_info);
 }
@@ -3981,20 +3981,19 @@ static void btrfs_end_super_write(struct bio *bio)
 	bio_for_each_segment_all(bvec, bio, iter_all) {
 		page = bvec->bv_page;
 
+		lock_page(page);
 		if (bio->bi_status) {
 			btrfs_warn_rl_in_rcu(device->fs_info,
 				"lost page write due to IO error on %s (%d)",
 				rcu_str_deref(device->name),
 				blk_status_to_errno(bio->bi_status));
-			ClearPageUptodate(page);
-			SetPageError(page);
+
 			btrfs_dev_stat_inc_and_print(device,
 						     BTRFS_DEV_STAT_WRITE_ERRS);
-		} else {
-			SetPageUptodate(page);
+			page->index++;
 		}
-
-		put_page(page);
+		if (--page->private == 0)
+			end_page_writeback(page);
 		unlock_page(page);
 	}
 
@@ -4083,8 +4082,8 @@ static int write_dev_supers(struct btrfs_device *device,
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+	struct page *page = virt_to_page(sb);
 	int i;
-	int errors = 0;
 	int ret;
 	u64 bytenr, bytenr_orig;
 
@@ -4092,21 +4091,31 @@ static int write_dev_supers(struct btrfs_device *device,
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
 	shash->tfm = fs_info->csum_shash;
+	page->private = max_mirrors;
+	page->index = 0;	/* errors */
+	SetPageWriteback(page);
 
 	for (i = 0; i < max_mirrors; i++) {
-		struct page *page;
 		struct bio *bio;
 		struct btrfs_super_block *disk_super;
 
 		bytenr_orig = btrfs_sb_offset(i);
 		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
 		if (ret == -ENOENT) {
+			lock_page(page);
+			if (--page->private == 0)
+				end_page_writeback(page);
+			unlock_page(page);
 			continue;
 		} else if (ret < 0) {
 			btrfs_err(device->fs_info,
 				"couldn't get super block location for mirror %d",
 				i);
-			errors++;
+			lock_page(page);
+			page->index++;
+			if (--page->private == 0)
+				end_page_writeback(page);
+			unlock_page(page);
 			continue;
 		}
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -4119,22 +4128,6 @@ static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
 				    sb->csum);
 
-		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
-					   GFP_NOFS);
-		if (!page) {
-			btrfs_err(device->fs_info,
-			    "couldn't get super block page for bytenr %llu",
-			    bytenr);
-			errors++;
-			continue;
-		}
-
-		/* Bump the refcount for wait_dev_supers() */
-		get_page(page);
-
-		disk_super = page_address(page);
-		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
-
 		/*
 		 * Directly use bios here instead of relying on the page cache
 		 * to do I/O, so we don't lose the ability to do integrity
@@ -4146,8 +4139,7 @@ static int write_dev_supers(struct btrfs_device *device,
 		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
 		bio->bi_private = device;
 		bio->bi_end_io = btrfs_end_super_write;
-		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
-			       offset_in_page(bytenr));
+		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, 0);
 
 		/*
 		 * We FUA only the first super block.  The others we allow to
@@ -4159,8 +4151,11 @@ static int write_dev_supers(struct btrfs_device *device,
 
 		btrfsic_submit_bio(bio);
 
-		if (btrfs_advance_sb_log(device, i))
-			errors++;
+		if (btrfs_advance_sb_log(device, i)) {
+			lock_page(page);
+			page->index++;
+			unlock_page(page);
+		}
 	}
 	return errors < i ? 0 : -1;
 }
@@ -4172,64 +4167,13 @@ static int write_dev_supers(struct btrfs_device *device,
  * Return number of errors when page is not found or not marked up to
  * date.
  */
-static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
+static int wait_dev_supers(struct btrfs_device *device,
+		struct btrfs_super_block *sb)
 {
-	int i;
-	int errors = 0;
-	bool primary_failed = false;
-	int ret;
-	u64 bytenr;
-
-	if (max_mirrors == 0)
-		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
-
-	for (i = 0; i < max_mirrors; i++) {
-		struct page *page;
-
-		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
-		if (ret == -ENOENT) {
-			break;
-		} else if (ret < 0) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-			continue;
-		}
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-		    device->commit_total_bytes)
-			break;
-
-		page = find_get_page(device->bdev->bd_inode->i_mapping,
-				     bytenr >> PAGE_SHIFT);
-		if (!page) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-			continue;
-		}
-		/* Page is submitted locked and unlocked once the IO completes */
-		wait_on_page_locked(page);
-		if (PageError(page)) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-		}
-
-		/* Drop our reference */
-		put_page(page);
-
-		/* Drop the reference from the writing run */
-		put_page(page);
-	}
-
-	/* log error, force error return */
-	if (primary_failed) {
-		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
-			  device->devid);
-		return -1;
-	}
+	struct page *page = virt_to_page(sb);
 
-	return errors < i ? 0 : -1;
+	wait_on_page_writeback(page);
+	return page->index;
 }
 
 /*
@@ -4475,18 +4419,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		return -EIO;
 	}
 
-	total_errors = 0;
-	list_for_each_entry(dev, head, dev_list) {
-		if (!dev->bdev)
-			continue;
-		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
-		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
-			continue;
-
-		ret = wait_dev_supers(dev, max_mirrors);
-		if (ret)
-			total_errors++;
-	}
+	total_errors = wait_dev_supers(dev, sb);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		btrfs_handle_fs_error(fs_info, -EIO,

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 21:00 ` Matthew Wilcox
@ 2022-06-09 22:54   ` David Sterba
  0 siblings, 0 replies; 25+ messages in thread
From: David Sterba @ 2022-06-09 22:54 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: David Sterba, linux-btrfs, nborisov

On Thu, Jun 09, 2022 at 10:00:19PM +0100, Matthew Wilcox wrote:
> On Thu, Jun 09, 2022 at 06:46:29PM +0200, David Sterba wrote:
> > Currently the super block page is from the mapping of the block device,
> > this is result of direct conversion from the previous buffer_head to bio
> > API.  We don't use the page cache or the mapping anywhere else, the page
> > is a temporary space for the associated bio.
> > 
> > Allocate pages for all super block copies at device allocation time,
> > also to avoid any later allocation problems when writing the super
> > block. This simplifies the page reference tracking, but the page lock is
> > still used as waiting mechanism for the write and write error is tracked
> > in the page.
> > 
> > As there is a separate page for each super block copy all can be
> > submitted in parallel, as before.
> 
> Why not submit the same page to all IOs?  Something like this

The byte offset and thus checksum is different in each super block copy,
so we can't use the same page.

In write_dev_supers:

- for each super block copy i
  - bytenr_orig = btrfs_sb_offset(i);
  - btrfs_set_super_bytenr(sb, bytenr_orig);
    - ie write the offset into the sb structure
  - memcpy sb -> page to write
  - calculate checksum
  - write page

Also you've removed too much code, for example the whole loop over all
devices in write_all_supers, but that's not important given the above.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
  2022-06-09 21:00 ` Matthew Wilcox
@ 2022-06-09 22:58 ` Qu Wenruo
  2022-06-09 22:59   ` David Sterba
  2022-06-10  0:07 ` Qu Wenruo
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 25+ messages in thread
From: Qu Wenruo @ 2022-06-09 22:58 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy, nborisov



On 2022/6/10 00:46, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
>
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
>
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
>
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
>
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
>
> v2:
>
> - allocate 3 pages per device to keep parallelism, otherwise the
>    submission would be serialized on the page lock

Wouldn't this cause extra memory overhead for non-4K page size systems?

Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
page size systems?

Thanks,
Qu
>
> fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
>   fs/btrfs/volumes.c | 12 ++++++++++++
>   fs/btrfs/volumes.h |  3 +++
>   3 files changed, 26 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..8a9c7a868727 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
>   			SetPageUptodate(page);
>   		}
>
> -		put_page(page);
>   		unlock_page(page);
>   	}
>
> @@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   			    struct btrfs_super_block *sb, int max_mirrors)
>   {
>   	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
>   	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   	int i;
>   	int errors = 0;
> @@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   	for (i = 0; i < max_mirrors; i++) {
>   		struct page *page;
>   		struct bio *bio;
> -		struct btrfs_super_block *disk_super;
>
>   		bytenr_orig = btrfs_sb_offset(i);
>   		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
>   				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>   				    sb->csum);
>
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> -			btrfs_err(device->fs_info,
> -			    "couldn't get super block page for bytenr %llu",
> -			    bytenr);
> -			errors++;
> -			continue;
> -		}
> -
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> +		/*
> +		 * Super block is copied to a temporary page, which is locked
> +		 * and submitted for write. Page is unlocked after IO finishes.
> +		 * No page references are needed, write error is returned as
> +		 * page Error bit.
> +		 */
> +		page = device->sb_write_page[i];
> +		ClearPageError(page);
> +		lock_page(page);
>
> -		disk_super = page_address(page);
> -		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>
>   		/*
>   		 * Directly use bios here instead of relying on the page cache
> @@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   		    device->commit_total_bytes)
>   			break;
>
> -		page = find_get_page(device->bdev->bd_inode->i_mapping,
> -				     bytenr >> PAGE_SHIFT);
> -		if (!page) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> -			continue;
> -		}
> +		page = device->sb_write_page[i];
>   		/* Page is submitted locked and unlocked once the IO completes */
>   		wait_on_page_locked(page);
>   		if (PageError(page)) {
> @@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   			if (i == 0)
>   				primary_failed = true;
>   		}
> -
> -		/* Drop our reference */
> -		put_page(page);
> -
> -		/* Drop the reference from the writing run */
> -		put_page(page);
>   	}
>
>   	/* log error, force error return */
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 12a6150ee19d..a00546d2c7ea 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
>   	rcu_string_free(device->name);
>   	extent_io_tree_release(&device->alloc_state);
>   	btrfs_destroy_dev_zone_info(device);
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
> +		__free_page(device->sb_write_page[i]);
>   	kfree(device);
>   }
>
> @@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
>   	if (!dev)
>   		return ERR_PTR(-ENOMEM);
>
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
> +		if (!dev->sb_write_page[i]) {
> +			while (--i >= 0)
> +				__free_page(dev->sb_write_page[i]);
> +			kfree(dev);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +	}
> +
>   	INIT_LIST_HEAD(&dev->dev_list);
>   	INIT_LIST_HEAD(&dev->dev_alloc_list);
>   	INIT_LIST_HEAD(&dev->post_commit_list);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 588367c76c46..516709e1d9f8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -10,6 +10,7 @@
>   #include <linux/sort.h>
>   #include <linux/btrfs.h>
>   #include "async-thread.h"
> +#include "disk-io.h"
>
>   #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
>
> @@ -158,6 +159,8 @@ struct btrfs_device {
>   	/* Bio used for flushing device barriers */
>   	struct bio flush_bio;
>   	struct completion flush_wait;
> +	/* Temporary pages for writing the super block copies */
> +	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
>
>   	/* per-device scrub information */
>   	struct scrub_ctx *scrub_ctx;

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 22:58 ` Qu Wenruo
@ 2022-06-09 22:59   ` David Sterba
  2022-06-09 23:15     ` Qu Wenruo
  2022-06-10  1:40     ` Matthew Wilcox
  0 siblings, 2 replies; 25+ messages in thread
From: David Sterba @ 2022-06-09 22:59 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: David Sterba, linux-btrfs, willy, nborisov

On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
> > v2:
> >
> > - allocate 3 pages per device to keep parallelism, otherwise the
> >    submission would be serialized on the page lock
> 
> Wouldn't this cause extra memory overhead for non-4K page size systems?
> 
> Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
> page size systems?

Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
possible, for bios we need the page pointer but we should be able to get
it from the kmalloc address. I'd rather do that in a separate change
though.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 22:59   ` David Sterba
@ 2022-06-09 23:15     ` Qu Wenruo
  2022-06-09 23:35       ` David Sterba
  2022-06-10  1:40     ` Matthew Wilcox
  1 sibling, 1 reply; 25+ messages in thread
From: Qu Wenruo @ 2022-06-09 23:15 UTC (permalink / raw)
  To: dsterba, David Sterba, linux-btrfs, willy, nborisov



On 2022/6/10 06:59, David Sterba wrote:
> On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
>>> v2:
>>>
>>> - allocate 3 pages per device to keep parallelism, otherwise the
>>>     submission would be serialized on the page lock
>>
>> Wouldn't this cause extra memory overhead for non-4K page size systems?
>>
>> Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
>> page size systems?
>
> Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
> possible, for bios we need the page pointer but we should be able to get
> it from the kmalloc address. I'd rather do that in a separate change
> though.

That would work for me.

Just want to bring a little more love to those larger page sized systems.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 23:15     ` Qu Wenruo
@ 2022-06-09 23:35       ` David Sterba
  0 siblings, 0 replies; 25+ messages in thread
From: David Sterba @ 2022-06-09 23:35 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, David Sterba, linux-btrfs, willy, nborisov

On Fri, Jun 10, 2022 at 07:15:17AM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/6/10 06:59, David Sterba wrote:
> > On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
> >>> v2:
> >>>
> >>> - allocate 3 pages per device to keep parallelism, otherwise the
> >>>     submission would be serialized on the page lock
> >>
> >> Wouldn't this cause extra memory overhead for non-4K page size systems?
> >>
> >> Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
> >> page size systems?
> >
> > Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
> > possible, for bios we need the page pointer but we should be able to get
> > it from the kmalloc address. I'd rather do that in a separate change
> > though.
> 
> That would work for me.
> 
> Just want to bring a little more love to those larger page sized systems.

Yeah that's a good point, memory usage should be always considered in
kernel. My other concern about the preallocated pages is that pinning
3 for the filesystem mount duration, that could be long, could create
fragmentation issues as the pages could prevent some internal
optimizations like coalescing pages into larger chunks. But, I think
that should not be that bad, with eg. up to 20 devices it's something
like 60 pages, so before implementing anything more efficient I'd need
an expert opinion.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
  2022-06-09 21:00 ` Matthew Wilcox
  2022-06-09 22:58 ` Qu Wenruo
@ 2022-06-10  0:07 ` Qu Wenruo
  2022-06-10  7:23   ` Nikolay Borisov
  2022-06-10  9:07 ` Qu Wenruo
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  0:07 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy, nborisov



On 2022/6/10 00:46, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
>
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
>
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.

Is there any history on why we want parallel super block submission?

In fact, for the 3 super blocks, the primary one has FUA flag, while the
other backup ones doesn't.

This means, even we wait for the super block write, only the first one
would take some real time, while the other two can return almost
immediate for devices with write cache.

In fact, waiting for the super block write back can tell us if our
primary super block did really reach disk, allowing us to do better
error handling, other than the almost non-exist check in endio.

And such synchronous submission allows us to have only one copy of the sb.


Furthermore, if we really go parallel, I don't think the current way is
the correct way.

One device can have at most 3 superblocks, but a btrfs can easily have
more than 4 devices.

Shouldn't we parallel based on device, other than each superblock?

Thanks,
Qu

>
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
>
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
>
> v2:
>
> - allocate 3 pages per device to keep parallelism, otherwise the
>    submission would be serialized on the page lock
>
> fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
>   fs/btrfs/volumes.c | 12 ++++++++++++
>   fs/btrfs/volumes.h |  3 +++
>   3 files changed, 26 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..8a9c7a868727 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
>   			SetPageUptodate(page);
>   		}
>
> -		put_page(page);
>   		unlock_page(page);
>   	}
>
> @@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   			    struct btrfs_super_block *sb, int max_mirrors)
>   {
>   	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
>   	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   	int i;
>   	int errors = 0;
> @@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   	for (i = 0; i < max_mirrors; i++) {
>   		struct page *page;
>   		struct bio *bio;
> -		struct btrfs_super_block *disk_super;
>
>   		bytenr_orig = btrfs_sb_offset(i);
>   		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
>   				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>   				    sb->csum);
>
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> -			btrfs_err(device->fs_info,
> -			    "couldn't get super block page for bytenr %llu",
> -			    bytenr);
> -			errors++;
> -			continue;
> -		}
> -
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> +		/*
> +		 * Super block is copied to a temporary page, which is locked
> +		 * and submitted for write. Page is unlocked after IO finishes.
> +		 * No page references are needed, write error is returned as
> +		 * page Error bit.
> +		 */
> +		page = device->sb_write_page[i];
> +		ClearPageError(page);
> +		lock_page(page);
>
> -		disk_super = page_address(page);
> -		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>
>   		/*
>   		 * Directly use bios here instead of relying on the page cache
> @@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   		    device->commit_total_bytes)
>   			break;
>
> -		page = find_get_page(device->bdev->bd_inode->i_mapping,
> -				     bytenr >> PAGE_SHIFT);
> -		if (!page) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> -			continue;
> -		}
> +		page = device->sb_write_page[i];
>   		/* Page is submitted locked and unlocked once the IO completes */
>   		wait_on_page_locked(page);
>   		if (PageError(page)) {
> @@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   			if (i == 0)
>   				primary_failed = true;
>   		}
> -
> -		/* Drop our reference */
> -		put_page(page);
> -
> -		/* Drop the reference from the writing run */
> -		put_page(page);
>   	}
>
>   	/* log error, force error return */
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 12a6150ee19d..a00546d2c7ea 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
>   	rcu_string_free(device->name);
>   	extent_io_tree_release(&device->alloc_state);
>   	btrfs_destroy_dev_zone_info(device);
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
> +		__free_page(device->sb_write_page[i]);
>   	kfree(device);
>   }
>
> @@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
>   	if (!dev)
>   		return ERR_PTR(-ENOMEM);
>
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
> +		if (!dev->sb_write_page[i]) {
> +			while (--i >= 0)
> +				__free_page(dev->sb_write_page[i]);
> +			kfree(dev);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +	}
> +
>   	INIT_LIST_HEAD(&dev->dev_list);
>   	INIT_LIST_HEAD(&dev->dev_alloc_list);
>   	INIT_LIST_HEAD(&dev->post_commit_list);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 588367c76c46..516709e1d9f8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -10,6 +10,7 @@
>   #include <linux/sort.h>
>   #include <linux/btrfs.h>
>   #include "async-thread.h"
> +#include "disk-io.h"
>
>   #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
>
> @@ -158,6 +159,8 @@ struct btrfs_device {
>   	/* Bio used for flushing device barriers */
>   	struct bio flush_bio;
>   	struct completion flush_wait;
> +	/* Temporary pages for writing the super block copies */
> +	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
>
>   	/* per-device scrub information */
>   	struct scrub_ctx *scrub_ctx;

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 22:59   ` David Sterba
  2022-06-09 23:15     ` Qu Wenruo
@ 2022-06-10  1:40     ` Matthew Wilcox
  2022-06-10  2:46       ` Qu Wenruo
  1 sibling, 1 reply; 25+ messages in thread
From: Matthew Wilcox @ 2022-06-10  1:40 UTC (permalink / raw)
  To: dsterba, Qu Wenruo, David Sterba, linux-btrfs, nborisov

On Fri, Jun 10, 2022 at 12:59:06AM +0200, David Sterba wrote:
> On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
> > > v2:
> > >
> > > - allocate 3 pages per device to keep parallelism, otherwise the
> > >    submission would be serialized on the page lock
> > 
> > Wouldn't this cause extra memory overhead for non-4K page size systems?
> > 
> > Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
> > page size systems?
> 
> Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
> possible, for bios we need the page pointer but we should be able to get
> it from the kmalloc address. I'd rather do that in a separate change
> though.

Slab uses the entirety of the struct page; if you use kmalloc, you
need a separate side structure to keep your metadata in rather than
using the struct page for your metadata.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  1:40     ` Matthew Wilcox
@ 2022-06-10  2:46       ` Qu Wenruo
  2022-06-10  3:31         ` Matthew Wilcox
  0 siblings, 1 reply; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  2:46 UTC (permalink / raw)
  To: Matthew Wilcox, dsterba, David Sterba, linux-btrfs, nborisov



On 2022/6/10 09:40, Matthew Wilcox wrote:
> On Fri, Jun 10, 2022 at 12:59:06AM +0200, David Sterba wrote:
>> On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
>>>> v2:
>>>>
>>>> - allocate 3 pages per device to keep parallelism, otherwise the
>>>>     submission would be serialized on the page lock
>>>
>>> Wouldn't this cause extra memory overhead for non-4K page size systems?
>>>
>>> Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
>>> page size systems?
>>
>> Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
>> possible, for bios we need the page pointer but we should be able to get
>> it from the kmalloc address. I'd rather do that in a separate change
>> though.
>
> Slab uses the entirety of the struct page; if you use kmalloc, you
> need a separate side structure to keep your metadata in rather than
> using the struct page for your metadata.

Any idea what structure in page we need for this super block write scenario?

Currently in btrfs_end_super_write(), it only handle PageUptodate and
PageError.

But we only set them, and never really utilize them, resulting btrfs to
ignore any IO error on superblocks.

I'd say it's really way worse than submit_bio_wait() and handle the
errors properly, and forget the parrallel bio submissions for sb.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  2:46       ` Qu Wenruo
@ 2022-06-10  3:31         ` Matthew Wilcox
  2022-06-10  4:53           ` Qu Wenruo
  0 siblings, 1 reply; 25+ messages in thread
From: Matthew Wilcox @ 2022-06-10  3:31 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, David Sterba, linux-btrfs, nborisov

On Fri, Jun 10, 2022 at 10:46:18AM +0800, Qu Wenruo wrote:
> On 2022/6/10 09:40, Matthew Wilcox wrote:
> > On Fri, Jun 10, 2022 at 12:59:06AM +0200, David Sterba wrote:
> > > On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
> > > > > v2:
> > > > > 
> > > > > - allocate 3 pages per device to keep parallelism, otherwise the
> > > > >     submission would be serialized on the page lock
> > > > 
> > > > Wouldn't this cause extra memory overhead for non-4K page size systems?
> > > > 
> > > > Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
> > > > page size systems?
> > > 
> > > Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
> > > possible, for bios we need the page pointer but we should be able to get
> > > it from the kmalloc address. I'd rather do that in a separate change
> > > though.
> > 
> > Slab uses the entirety of the struct page; if you use kmalloc, you
> > need a separate side structure to keep your metadata in rather than
> > using the struct page for your metadata.
> 
> Any idea what structure in page we need for this super block write scenario?
> 
> Currently in btrfs_end_super_write(), it only handle PageUptodate and
> PageError.
> 
> But we only set them, and never really utilize them, resulting btrfs to
> ignore any IO error on superblocks.

Huh?  I see btrfs reporting errors using them.  eg write_all_supers()
sums up total_errors.

I mean, it's your filesystem; you decide what information you need to
keep about each write.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  3:31         ` Matthew Wilcox
@ 2022-06-10  4:53           ` Qu Wenruo
  0 siblings, 0 replies; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  4:53 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: dsterba, David Sterba, linux-btrfs, nborisov



On 2022/6/10 11:31, Matthew Wilcox wrote:
> On Fri, Jun 10, 2022 at 10:46:18AM +0800, Qu Wenruo wrote:
>> On 2022/6/10 09:40, Matthew Wilcox wrote:
>>> On Fri, Jun 10, 2022 at 12:59:06AM +0200, David Sterba wrote:
>>>> On Fri, Jun 10, 2022 at 06:58:00AM +0800, Qu Wenruo wrote:
>>>>>> v2:
>>>>>>
>>>>>> - allocate 3 pages per device to keep parallelism, otherwise the
>>>>>>      submission would be serialized on the page lock
>>>>>
>>>>> Wouldn't this cause extra memory overhead for non-4K page size systems?
>>>>>
>>>>> Would simpler kmalloc() fulfill the requirement for both 4K and non-4K
>>>>> page size systems?
>>>>
>>>> Yeah on pages larger than 4K it's a bit wasteful. kmalloc should be
>>>> possible, for bios we need the page pointer but we should be able to get
>>>> it from the kmalloc address. I'd rather do that in a separate change
>>>> though.
>>>
>>> Slab uses the entirety of the struct page; if you use kmalloc, you
>>> need a separate side structure to keep your metadata in rather than
>>> using the struct page for your metadata.
>>
>> Any idea what structure in page we need for this super block write scenario?
>>
>> Currently in btrfs_end_super_write(), it only handle PageUptodate and
>> PageError.
>>
>> But we only set them, and never really utilize them, resulting btrfs to
>> ignore any IO error on superblocks.
>
> Huh?  I see btrfs reporting errors using them.  eg write_all_supers()
> sums up total_errors.

Unfortunately, it doesn't take IO error into consideration.

In btrfs_end_super_write() it just set PageError, output an error
message for the error, and that's all.

The total_errors doesn't include the most important and common error...

>
> I mean, it's your filesystem; you decide what information you need to
> keep about each write.
>

But sometimes we can have such stupid bugs like missing error handling
for super block writeback...

And thank you for touching this part of code, and let us find this
hidden bug.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  0:07 ` Qu Wenruo
@ 2022-06-10  7:23   ` Nikolay Borisov
  2022-06-10  7:33     ` Qu Wenruo
  0 siblings, 1 reply; 25+ messages in thread
From: Nikolay Borisov @ 2022-06-10  7:23 UTC (permalink / raw)
  To: Qu Wenruo, David Sterba, linux-btrfs; +Cc: willy



On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
> 
> 
> On 2022/6/10 00:46, David Sterba wrote:
>> Currently the super block page is from the mapping of the block device,
>> this is result of direct conversion from the previous buffer_head to bio
>> API.  We don't use the page cache or the mapping anywhere else, the page
>> is a temporary space for the associated bio.
>>
>> Allocate pages for all super block copies at device allocation time,
>> also to avoid any later allocation problems when writing the super
>> block. This simplifies the page reference tracking, but the page lock is
>> still used as waiting mechanism for the write and write error is tracked
>> in the page.
>>
>> As there is a separate page for each super block copy all can be
>> submitted in parallel, as before.
> 
> Is there any history on why we want parallel super block submission?

Because it's in the transaction critical path so it affects latency of 
operations.

> 
> In fact, for the 3 super blocks, the primary one has FUA flag, while the
> other backup ones doesn't.
> 
> This means, even we wait for the super block write, only the first one
> would take some real time, while the other two can return almost
> immediate for devices with write cache.
> 
> In fact, waiting for the super block write back can tell us if our
> primary super block did really reach disk, allowing us to do better
> error handling, other than the almost non-exist check in endio.
> 
> And such synchronous submission allows us to have only one copy of the sb.
> 
> 
> Furthermore, if we really go parallel, I don't think the current way is
> the correct way.
> 
> One device can have at most 3 superblocks, but a btrfs can easily have
> more than 4 devices.
> 
> Shouldn't we parallel based on device, other than each superblock?

I agree that instead of having 3 pages per-device we can go the 1 page 
per device, and parallelize based on the device, rather than the super 
block copies.

<snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  7:23   ` Nikolay Borisov
@ 2022-06-10  7:33     ` Qu Wenruo
  2022-06-10  7:39       ` Nikolay Borisov
  2022-06-10  8:39       ` Filipe Manana
  0 siblings, 2 replies; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  7:33 UTC (permalink / raw)
  To: Nikolay Borisov, David Sterba, linux-btrfs; +Cc: willy



On 2022/6/10 15:23, Nikolay Borisov wrote:
>
>
> On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
>>
>>
>> On 2022/6/10 00:46, David Sterba wrote:
>>> Currently the super block page is from the mapping of the block device,
>>> this is result of direct conversion from the previous buffer_head to bio
>>> API.  We don't use the page cache or the mapping anywhere else, the page
>>> is a temporary space for the associated bio.
>>>
>>> Allocate pages for all super block copies at device allocation time,
>>> also to avoid any later allocation problems when writing the super
>>> block. This simplifies the page reference tracking, but the page lock is
>>> still used as waiting mechanism for the write and write error is tracked
>>> in the page.
>>>
>>> As there is a separate page for each super block copy all can be
>>> submitted in parallel, as before.
>>
>> Is there any history on why we want parallel super block submission?
>
> Because it's in the transaction critical path so it affects latency of
> operations.

Not exactly.

Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
means new transaction can already be started.

Thus even if we don't do any parallel submission, there is no
performance impact on transaction path.

Thanks,
Qu
>
>>
>> In fact, for the 3 super blocks, the primary one has FUA flag, while the
>> other backup ones doesn't.
>>
>> This means, even we wait for the super block write, only the first one
>> would take some real time, while the other two can return almost
>> immediate for devices with write cache.
>>
>> In fact, waiting for the super block write back can tell us if our
>> primary super block did really reach disk, allowing us to do better
>> error handling, other than the almost non-exist check in endio.
>>
>> And such synchronous submission allows us to have only one copy of the
>> sb.
>>
>>
>> Furthermore, if we really go parallel, I don't think the current way is
>> the correct way.
>>
>> One device can have at most 3 superblocks, but a btrfs can easily have
>> more than 4 devices.
>>
>> Shouldn't we parallel based on device, other than each superblock?
>
> I agree that instead of having 3 pages per-device we can go the 1 page
> per device, and parallelize based on the device, rather than the super
> block copies.
>
> <snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  7:33     ` Qu Wenruo
@ 2022-06-10  7:39       ` Nikolay Borisov
  2022-06-10  7:55         ` Qu Wenruo
  2022-06-10  8:39       ` Filipe Manana
  1 sibling, 1 reply; 25+ messages in thread
From: Nikolay Borisov @ 2022-06-10  7:39 UTC (permalink / raw)
  To: Qu Wenruo, David Sterba, linux-btrfs; +Cc: willy



On 10.06.22 г. 10:33 ч., Qu Wenruo wrote:
> 
> 
> On 2022/6/10 15:23, Nikolay Borisov wrote:
>>
>>
>> On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
>>>
>>>
>>> On 2022/6/10 00:46, David Sterba wrote:
>>>> Currently the super block page is from the mapping of the block device,
>>>> this is result of direct conversion from the previous buffer_head to 
>>>> bio
>>>> API.  We don't use the page cache or the mapping anywhere else, the 
>>>> page
>>>> is a temporary space for the associated bio.
>>>>
>>>> Allocate pages for all super block copies at device allocation time,
>>>> also to avoid any later allocation problems when writing the super
>>>> block. This simplifies the page reference tracking, but the page 
>>>> lock is
>>>> still used as waiting mechanism for the write and write error is 
>>>> tracked
>>>> in the page.
>>>>
>>>> As there is a separate page for each super block copy all can be
>>>> submitted in parallel, as before.
>>>
>>> Is there any history on why we want parallel super block submission?
>>
>> Because it's in the transaction critical path so it affects latency of
>> operations.
> 
> Not exactly.
> 
> Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
> means new transaction can already be started.

You are right, in this case then I guess we can still stay with a single 
page and synchronous writeout but this needs to be explicitly mentioned 
in the changelog.

<snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  7:39       ` Nikolay Borisov
@ 2022-06-10  7:55         ` Qu Wenruo
  0 siblings, 0 replies; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  7:55 UTC (permalink / raw)
  To: Nikolay Borisov, David Sterba, linux-btrfs; +Cc: willy



On 2022/6/10 15:39, Nikolay Borisov wrote:
>
>
> On 10.06.22 г. 10:33 ч., Qu Wenruo wrote:
>>
>>
>> On 2022/6/10 15:23, Nikolay Borisov wrote:
>>>
>>>
>>> On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
>>>>
>>>>
>>>> On 2022/6/10 00:46, David Sterba wrote:
>>>>> Currently the super block page is from the mapping of the block
>>>>> device,
>>>>> this is result of direct conversion from the previous buffer_head
>>>>> to bio
>>>>> API.  We don't use the page cache or the mapping anywhere else, the
>>>>> page
>>>>> is a temporary space for the associated bio.
>>>>>
>>>>> Allocate pages for all super block copies at device allocation time,
>>>>> also to avoid any later allocation problems when writing the super
>>>>> block. This simplifies the page reference tracking, but the page
>>>>> lock is
>>>>> still used as waiting mechanism for the write and write error is
>>>>> tracked
>>>>> in the page.
>>>>>
>>>>> As there is a separate page for each super block copy all can be
>>>>> submitted in parallel, as before.
>>>>
>>>> Is there any history on why we want parallel super block submission?
>>>
>>> Because it's in the transaction critical path so it affects latency of
>>> operations.
>>
>> Not exactly.
>>
>> Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
>> means new transaction can already be started.
>
> You are right, in this case then I guess we can still stay with a single
> page and synchronous writeout but this needs to be explicitly mentioned
> in the changelog.

Then I would be way more happier to convert all these bio submission
path to submit_bio_wait(), which makes our error handling easier and get
rid of the super stupid endio function.

Thanks,
Qu

>
> <snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  7:33     ` Qu Wenruo
  2022-06-10  7:39       ` Nikolay Borisov
@ 2022-06-10  8:39       ` Filipe Manana
  2022-06-10  8:44         ` Qu Wenruo
  1 sibling, 1 reply; 25+ messages in thread
From: Filipe Manana @ 2022-06-10  8:39 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: Nikolay Borisov, David Sterba, linux-btrfs, willy

On Fri, Jun 10, 2022 at 03:33:47PM +0800, Qu Wenruo wrote:
> 
> 
> On 2022/6/10 15:23, Nikolay Borisov wrote:
> > 
> > 
> > On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
> > > 
> > > 
> > > On 2022/6/10 00:46, David Sterba wrote:
> > > > Currently the super block page is from the mapping of the block device,
> > > > this is result of direct conversion from the previous buffer_head to bio
> > > > API.  We don't use the page cache or the mapping anywhere else, the page
> > > > is a temporary space for the associated bio.
> > > > 
> > > > Allocate pages for all super block copies at device allocation time,
> > > > also to avoid any later allocation problems when writing the super
> > > > block. This simplifies the page reference tracking, but the page lock is
> > > > still used as waiting mechanism for the write and write error is tracked
> > > > in the page.
> > > > 
> > > > As there is a separate page for each super block copy all can be
> > > > submitted in parallel, as before.
> > > 
> > > Is there any history on why we want parallel super block submission?
> > 
> > Because it's in the transaction critical path so it affects latency of
> > operations.
> 
> Not exactly.
> 
> Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
> means new transaction can already be started.
> 
> Thus even if we don't do any parallel submission, there is no
> performance impact on transaction path.

Hell, no. There's more than transaction states to consider.

Taking longer to write super blocks can have a performance impact on fsync
for example. And fsync always has to write super blocks and wait for them
to complete. In fact, a large part of time spent on fsync is precisely on
writing super blocks.

In some cases fsync has to fallback to a transaction commit and wait for
it to complete before returning to user space - which again requires writing
super blocks and waiting for their completion.

Similarly, there are ioctls like snapshot and subvolume creation which
commit a transaction, and any changes to the way super blocks are written
can also affect their latency and impact applications.

> 
> Thanks,
> Qu
> > 
> > > 
> > > In fact, for the 3 super blocks, the primary one has FUA flag, while the
> > > other backup ones doesn't.
> > > 
> > > This means, even we wait for the super block write, only the first one
> > > would take some real time, while the other two can return almost
> > > immediate for devices with write cache.
> > > 
> > > In fact, waiting for the super block write back can tell us if our
> > > primary super block did really reach disk, allowing us to do better
> > > error handling, other than the almost non-exist check in endio.
> > > 
> > > And such synchronous submission allows us to have only one copy of the
> > > sb.
> > > 
> > > 
> > > Furthermore, if we really go parallel, I don't think the current way is
> > > the correct way.
> > > 
> > > One device can have at most 3 superblocks, but a btrfs can easily have
> > > more than 4 devices.
> > > 
> > > Shouldn't we parallel based on device, other than each superblock?
> > 
> > I agree that instead of having 3 pages per-device we can go the 1 page
> > per device, and parallelize based on the device, rather than the super
> > block copies.
> > 
> > <snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  8:39       ` Filipe Manana
@ 2022-06-10  8:44         ` Qu Wenruo
  2022-06-10  8:49           ` Qu Wenruo
  0 siblings, 1 reply; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  8:44 UTC (permalink / raw)
  To: Filipe Manana; +Cc: Nikolay Borisov, David Sterba, linux-btrfs, willy



On 2022/6/10 16:39, Filipe Manana wrote:
> On Fri, Jun 10, 2022 at 03:33:47PM +0800, Qu Wenruo wrote:
>>
>>
>> On 2022/6/10 15:23, Nikolay Borisov wrote:
>>>
>>>
>>> On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
>>>>
>>>>
>>>> On 2022/6/10 00:46, David Sterba wrote:
>>>>> Currently the super block page is from the mapping of the block device,
>>>>> this is result of direct conversion from the previous buffer_head to bio
>>>>> API.  We don't use the page cache or the mapping anywhere else, the page
>>>>> is a temporary space for the associated bio.
>>>>>
>>>>> Allocate pages for all super block copies at device allocation time,
>>>>> also to avoid any later allocation problems when writing the super
>>>>> block. This simplifies the page reference tracking, but the page lock is
>>>>> still used as waiting mechanism for the write and write error is tracked
>>>>> in the page.
>>>>>
>>>>> As there is a separate page for each super block copy all can be
>>>>> submitted in parallel, as before.
>>>>
>>>> Is there any history on why we want parallel super block submission?
>>>
>>> Because it's in the transaction critical path so it affects latency of
>>> operations.
>>
>> Not exactly.
>>
>> Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
>> means new transaction can already be started.
>>
>> Thus even if we don't do any parallel submission, there is no
>> performance impact on transaction path.
>
> Hell, no. There's more than transaction states to consider.
>
> Taking longer to write super blocks can have a performance impact on fsync
> for example. And fsync always has to write super blocks and wait for them
> to complete. In fact, a large part of time spent on fsync is precisely on
> writing super blocks.

Fsync() only write the first super block, thus even if we go synchronous
submission, it's completely the same for that specific fsync use case.
We will wait for the write back of super blocks anyway.

>
> In some cases fsync has to fallback to a transaction commit and wait for
> it to complete before returning to user space - which again requires writing
> super blocks and waiting for their completion.

Although in that case, it's going to cause some differences, since we
are serializing the super block writeback for all super blocks.
And btrfs_commit_transaction() will only return if everything is done.

>
> Similarly, there are ioctls like snapshot and subvolume creation which
> commit a transaction, and any changes to the way super blocks are written
> can also affect their latency and impact applications.

Then I'd say, we would also want parallel device submissions too, which
can further save some time, even we're just writing back the primary
super blocks.

Thanks,
Qu

>
>>
>> Thanks,
>> Qu
>>>
>>>>
>>>> In fact, for the 3 super blocks, the primary one has FUA flag, while the
>>>> other backup ones doesn't.
>>>>
>>>> This means, even we wait for the super block write, only the first one
>>>> would take some real time, while the other two can return almost
>>>> immediate for devices with write cache.
>>>>
>>>> In fact, waiting for the super block write back can tell us if our
>>>> primary super block did really reach disk, allowing us to do better
>>>> error handling, other than the almost non-exist check in endio.
>>>>
>>>> And such synchronous submission allows us to have only one copy of the
>>>> sb.
>>>>
>>>>
>>>> Furthermore, if we really go parallel, I don't think the current way is
>>>> the correct way.
>>>>
>>>> One device can have at most 3 superblocks, but a btrfs can easily have
>>>> more than 4 devices.
>>>>
>>>> Shouldn't we parallel based on device, other than each superblock?
>>>
>>> I agree that instead of having 3 pages per-device we can go the 1 page
>>> per device, and parallelize based on the device, rather than the super
>>> block copies.
>>>
>>> <snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-10  8:44         ` Qu Wenruo
@ 2022-06-10  8:49           ` Qu Wenruo
  0 siblings, 0 replies; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  8:49 UTC (permalink / raw)
  To: Filipe Manana; +Cc: Nikolay Borisov, David Sterba, linux-btrfs, willy



On 2022/6/10 16:44, Qu Wenruo wrote:
>
>
> On 2022/6/10 16:39, Filipe Manana wrote:
>> On Fri, Jun 10, 2022 at 03:33:47PM +0800, Qu Wenruo wrote:
>>>
>>>
>>> On 2022/6/10 15:23, Nikolay Borisov wrote:
>>>>
>>>>
>>>> On 10.06.22 г. 3:07 ч., Qu Wenruo wrote:
>>>>>
>>>>>
>>>>> On 2022/6/10 00:46, David Sterba wrote:
>>>>>> Currently the super block page is from the mapping of the block
>>>>>> device,
>>>>>> this is result of direct conversion from the previous buffer_head
>>>>>> to bio
>>>>>> API.  We don't use the page cache or the mapping anywhere else,
>>>>>> the page
>>>>>> is a temporary space for the associated bio.
>>>>>>
>>>>>> Allocate pages for all super block copies at device allocation time,
>>>>>> also to avoid any later allocation problems when writing the super
>>>>>> block. This simplifies the page reference tracking, but the page
>>>>>> lock is
>>>>>> still used as waiting mechanism for the write and write error is
>>>>>> tracked
>>>>>> in the page.
>>>>>>
>>>>>> As there is a separate page for each super block copy all can be
>>>>>> submitted in parallel, as before.
>>>>>
>>>>> Is there any history on why we want parallel super block submission?
>>>>
>>>> Because it's in the transaction critical path so it affects latency of
>>>> operations.
>>>
>>> Not exactly.
>>>
>>> Super block writeback happens with TRANS_STATE_UNBLOCKED status, which
>>> means new transaction can already be started.
>>>
>>> Thus even if we don't do any parallel submission, there is no
>>> performance impact on transaction path.
>>
>> Hell, no. There's more than transaction states to consider.
>>
>> Taking longer to write super blocks can have a performance impact on
>> fsync
>> for example. And fsync always has to write super blocks and wait for them
>> to complete. In fact, a large part of time spent on fsync is precisely on
>> writing super blocks.
>
> Fsync() only write the first super block, thus even if we go synchronous
> submission, it's completely the same for that specific fsync use case.
> We will wait for the write back of super blocks anyway.
>
>>
>> In some cases fsync has to fallback to a transaction commit and wait for
>> it to complete before returning to user space - which again requires
>> writing
>> super blocks and waiting for their completion.
>
> Although in that case, it's going to cause some differences, since we
> are serializing the super block writeback for all super blocks.
> And btrfs_commit_transaction() will only return if everything is done.
>
>>
>> Similarly, there are ioctls like snapshot and subvolume creation which
>> commit a transaction, and any changes to the way super blocks are written
>> can also affect their latency and impact applications.
>
> Then I'd say, we would also want parallel device submissions too, which
> can further save some time, even we're just writing back the primary
> super blocks.

Oh, the existing code is already doing that.

Now I got the reason why we're doing the existing way of sb submission.
It's already firing all bios for all devices in one go, then waiting for
each device.

To improve the parallelism to max, and reduce the fsync() wait time for
sb writeback.

Thanks,
Qu

>
> Thanks,
> Qu
>
>>
>>>
>>> Thanks,
>>> Qu
>>>>
>>>>>
>>>>> In fact, for the 3 super blocks, the primary one has FUA flag,
>>>>> while the
>>>>> other backup ones doesn't.
>>>>>
>>>>> This means, even we wait for the super block write, only the first one
>>>>> would take some real time, while the other two can return almost
>>>>> immediate for devices with write cache.
>>>>>
>>>>> In fact, waiting for the super block write back can tell us if our
>>>>> primary super block did really reach disk, allowing us to do better
>>>>> error handling, other than the almost non-exist check in endio.
>>>>>
>>>>> And such synchronous submission allows us to have only one copy of the
>>>>> sb.
>>>>>
>>>>>
>>>>> Furthermore, if we really go parallel, I don't think the current
>>>>> way is
>>>>> the correct way.
>>>>>
>>>>> One device can have at most 3 superblocks, but a btrfs can easily have
>>>>> more than 4 devices.
>>>>>
>>>>> Shouldn't we parallel based on device, other than each superblock?
>>>>
>>>> I agree that instead of having 3 pages per-device we can go the 1 page
>>>> per device, and parallelize based on the device, rather than the super
>>>> block copies.
>>>>
>>>> <snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
                   ` (2 preceding siblings ...)
  2022-06-10  0:07 ` Qu Wenruo
@ 2022-06-10  9:07 ` Qu Wenruo
  2022-06-10 12:06 ` Nikolay Borisov
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 25+ messages in thread
From: Qu Wenruo @ 2022-06-10  9:07 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy, nborisov



On 2022/6/10 00:46, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
>
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
>
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
>
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
>
> Signed-off-by: David Sterba <dsterba@suse.com>

Reviewed-by: Qu Wenruo <wqu@suse.com>

After a lot of try-and-error, it turns out that the parallelism here is
intended to reduce the latency of super block writeback.
Mostly for fsync (thanks Filipe for the comments).

Thus I think this patch is all what we can do right now.

Going to one page per-device means we have to wait for previously
superblock inside the same device, thus increasing the latency.

Thanks,
Qu
> ---
>
> v2:
>
> - allocate 3 pages per device to keep parallelism, otherwise the
>    submission would be serialized on the page lock
>
> fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
>   fs/btrfs/volumes.c | 12 ++++++++++++
>   fs/btrfs/volumes.h |  3 +++
>   3 files changed, 26 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..8a9c7a868727 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
>   			SetPageUptodate(page);
>   		}
>
> -		put_page(page);
>   		unlock_page(page);
>   	}
>
> @@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   			    struct btrfs_super_block *sb, int max_mirrors)
>   {
>   	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
>   	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   	int i;
>   	int errors = 0;
> @@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   	for (i = 0; i < max_mirrors; i++) {
>   		struct page *page;
>   		struct bio *bio;
> -		struct btrfs_super_block *disk_super;
>
>   		bytenr_orig = btrfs_sb_offset(i);
>   		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
>   				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>   				    sb->csum);
>
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> -			btrfs_err(device->fs_info,
> -			    "couldn't get super block page for bytenr %llu",
> -			    bytenr);
> -			errors++;
> -			continue;
> -		}
> -
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> +		/*
> +		 * Super block is copied to a temporary page, which is locked
> +		 * and submitted for write. Page is unlocked after IO finishes.
> +		 * No page references are needed, write error is returned as
> +		 * page Error bit.
> +		 */
> +		page = device->sb_write_page[i];
> +		ClearPageError(page);
> +		lock_page(page);
>
> -		disk_super = page_address(page);
> -		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>
>   		/*
>   		 * Directly use bios here instead of relying on the page cache
> @@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   		    device->commit_total_bytes)
>   			break;
>
> -		page = find_get_page(device->bdev->bd_inode->i_mapping,
> -				     bytenr >> PAGE_SHIFT);
> -		if (!page) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> -			continue;
> -		}
> +		page = device->sb_write_page[i];
>   		/* Page is submitted locked and unlocked once the IO completes */
>   		wait_on_page_locked(page);
>   		if (PageError(page)) {
> @@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   			if (i == 0)
>   				primary_failed = true;
>   		}
> -
> -		/* Drop our reference */
> -		put_page(page);
> -
> -		/* Drop the reference from the writing run */
> -		put_page(page);
>   	}
>
>   	/* log error, force error return */
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 12a6150ee19d..a00546d2c7ea 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
>   	rcu_string_free(device->name);
>   	extent_io_tree_release(&device->alloc_state);
>   	btrfs_destroy_dev_zone_info(device);
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
> +		__free_page(device->sb_write_page[i]);
>   	kfree(device);
>   }
>
> @@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
>   	if (!dev)
>   		return ERR_PTR(-ENOMEM);
>
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
> +		if (!dev->sb_write_page[i]) {
> +			while (--i >= 0)
> +				__free_page(dev->sb_write_page[i]);
> +			kfree(dev);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +	}
> +
>   	INIT_LIST_HEAD(&dev->dev_list);
>   	INIT_LIST_HEAD(&dev->dev_alloc_list);
>   	INIT_LIST_HEAD(&dev->post_commit_list);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 588367c76c46..516709e1d9f8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -10,6 +10,7 @@
>   #include <linux/sort.h>
>   #include <linux/btrfs.h>
>   #include "async-thread.h"
> +#include "disk-io.h"
>
>   #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
>
> @@ -158,6 +159,8 @@ struct btrfs_device {
>   	/* Bio used for flushing device barriers */
>   	struct bio flush_bio;
>   	struct completion flush_wait;
> +	/* Temporary pages for writing the super block copies */
> +	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
>
>   	/* per-device scrub information */
>   	struct scrub_ctx *scrub_ctx;

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
                   ` (3 preceding siblings ...)
  2022-06-10  9:07 ` Qu Wenruo
@ 2022-06-10 12:06 ` Nikolay Borisov
  2022-06-11 13:30 ` Anand Jain
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 25+ messages in thread
From: Nikolay Borisov @ 2022-06-10 12:06 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy



On 9.06.22 г. 19:46 ч., David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
> 
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
> 
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
> 
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
> 
> Signed-off-by: David Sterba <dsterba@suse.com>

Reviewed-by: Nikolay Borisov <nborisov@suse.com>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
                   ` (4 preceding siblings ...)
  2022-06-10 12:06 ` Nikolay Borisov
@ 2022-06-11 13:30 ` Anand Jain
  2022-06-13  6:37   ` Nikolay Borisov
  2022-06-13  6:35 ` Nikolay Borisov
  2022-06-21 13:24 ` David Sterba
  7 siblings, 1 reply; 25+ messages in thread
From: Anand Jain @ 2022-06-11 13:30 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy, nborisov

On 6/9/22 22:16, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio

> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.


We use mapping for SB reading.

btrfs_read_dev_one_super(...)
::
  page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);

  So isn't find_or_create_page() re-using the same page?

Thanks, Anand




> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
> 
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
> 
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
> 
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
> 
> v2:
> 
> - allocate 3 pages per device to keep parallelism, otherwise the
>    submission would be serialized on the page lock
> 
> fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
>   fs/btrfs/volumes.c | 12 ++++++++++++
>   fs/btrfs/volumes.h |  3 +++
>   3 files changed, 26 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..8a9c7a868727 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
>   			SetPageUptodate(page);
>   		}
>   
> -		put_page(page);
>   		unlock_page(page);
>   	}
>   
> @@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   			    struct btrfs_super_block *sb, int max_mirrors)
>   {
>   	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
>   	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   	int i;
>   	int errors = 0;
> @@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   	for (i = 0; i < max_mirrors; i++) {
>   		struct page *page;
>   		struct bio *bio;
> -		struct btrfs_super_block *disk_super;
>   
>   		bytenr_orig = btrfs_sb_offset(i);
>   		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
>   				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>   				    sb->csum);
>   
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> -			btrfs_err(device->fs_info,
> -			    "couldn't get super block page for bytenr %llu",
> -			    bytenr);
> -			errors++;
> -			continue;
> -		}
> -
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> +		/*
> +		 * Super block is copied to a temporary page, which is locked
> +		 * and submitted for write. Page is unlocked after IO finishes.
> +		 * No page references are needed, write error is returned as
> +		 * page Error bit.
> +		 */
> +		page = device->sb_write_page[i];
> +		ClearPageError(page);
> +		lock_page(page);
>   
> -		disk_super = page_address(page);
> -		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>   
>   		/*
>   		 * Directly use bios here instead of relying on the page cache
> @@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   		    device->commit_total_bytes)
>   			break;
>   
> -		page = find_get_page(device->bdev->bd_inode->i_mapping,
> -				     bytenr >> PAGE_SHIFT);
> -		if (!page) {
> -			errors++;
> -			if (i == 0)
> -				primary_failed = true;
> -			continue;
> -		}
> +		page = device->sb_write_page[i];
>   		/* Page is submitted locked and unlocked once the IO completes */
>   		wait_on_page_locked(page);
>   		if (PageError(page)) {
> @@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>   			if (i == 0)
>   				primary_failed = true;
>   		}
> -
> -		/* Drop our reference */
> -		put_page(page);
> -
> -		/* Drop the reference from the writing run */
> -		put_page(page);
>   	}
>   
>   	/* log error, force error return */
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 12a6150ee19d..a00546d2c7ea 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
>   	rcu_string_free(device->name);
>   	extent_io_tree_release(&device->alloc_state);
>   	btrfs_destroy_dev_zone_info(device);
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
> +		__free_page(device->sb_write_page[i]);
>   	kfree(device);
>   }
>   
> @@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
>   	if (!dev)
>   		return ERR_PTR(-ENOMEM);
>   
> +	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
> +		if (!dev->sb_write_page[i]) {
> +			while (--i >= 0)
> +				__free_page(dev->sb_write_page[i]);
> +			kfree(dev);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +	}
> +
>   	INIT_LIST_HEAD(&dev->dev_list);
>   	INIT_LIST_HEAD(&dev->dev_alloc_list);
>   	INIT_LIST_HEAD(&dev->post_commit_list);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 588367c76c46..516709e1d9f8 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -10,6 +10,7 @@
>   #include <linux/sort.h>
>   #include <linux/btrfs.h>
>   #include "async-thread.h"
> +#include "disk-io.h"
>   
>   #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
>   
> @@ -158,6 +159,8 @@ struct btrfs_device {
>   	/* Bio used for flushing device barriers */
>   	struct bio flush_bio;
>   	struct completion flush_wait;
> +	/* Temporary pages for writing the super block copies */
> +	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
>   
>   	/* per-device scrub information */
>   	struct scrub_ctx *scrub_ctx;


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
                   ` (5 preceding siblings ...)
  2022-06-11 13:30 ` Anand Jain
@ 2022-06-13  6:35 ` Nikolay Borisov
  2022-06-21 13:24 ` David Sterba
  7 siblings, 0 replies; 25+ messages in thread
From: Nikolay Borisov @ 2022-06-13  6:35 UTC (permalink / raw)
  To: David Sterba, linux-btrfs; +Cc: willy



On 9.06.22 г. 19:46 ч., David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
> 
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
> 
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
> 
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
> 
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
> 
> v2:
> 
> - allocate 3 pages per device to keep parallelism, otherwise the
>    submission would be serialized on the page lock
> 
> fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
>   fs/btrfs/volumes.c | 12 ++++++++++++
>   fs/btrfs/volumes.h |  3 +++
>   3 files changed, 26 insertions(+), 31 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 800ad3a9c68e..8a9c7a868727 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
>   			SetPageUptodate(page);
>   		}
>   
> -		put_page(page);
>   		unlock_page(page);
>   	}
>   
> @@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   			    struct btrfs_super_block *sb, int max_mirrors)
>   {
>   	struct btrfs_fs_info *fs_info = device->fs_info;
> -	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
>   	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   	int i;
>   	int errors = 0;
> @@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
>   	for (i = 0; i < max_mirrors; i++) {
>   		struct page *page;
>   		struct bio *bio;
> -		struct btrfs_super_block *disk_super;
>   
>   		bytenr_orig = btrfs_sb_offset(i);
>   		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> @@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
>   				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
>   				    sb->csum);
>   
> -		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
> -					   GFP_NOFS);
> -		if (!page) {
> -			btrfs_err(device->fs_info,
> -			    "couldn't get super block page for bytenr %llu",
> -			    bytenr);
> -			errors++;
> -			continue;
> -		}
> -
> -		/* Bump the refcount for wait_dev_supers() */
> -		get_page(page);
> +		/*
> +		 * Super block is copied to a temporary page, which is locked
> +		 * and submitted for write. Page is unlocked after IO finishes.
> +		 * No page references are needed, write error is returned as
> +		 * page Error bit.
> +		 */
> +		page = device->sb_write_page[i];
> +		ClearPageError(page);

In light of Anand's comment about lingering usage of page cache pages 
just a remark:

ClearPageError should be done under the page lock, as well as clearing 
pageuptodate. Subsequently other users of those page can makedo without 
actually calling into page cache but directly working with in-memory 
pages i.e scratching the super block etc.

> +		lock_page(page);
>   


<snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-11 13:30 ` Anand Jain
@ 2022-06-13  6:37   ` Nikolay Borisov
  0 siblings, 0 replies; 25+ messages in thread
From: Nikolay Borisov @ 2022-06-13  6:37 UTC (permalink / raw)
  To: Anand Jain, David Sterba, linux-btrfs; +Cc: willy



On 11.06.22 г. 16:30 ч., Anand Jain wrote:
> On 6/9/22 22:16, David Sterba wrote:
>> Currently the super block page is from the mapping of the block device,
>> this is result of direct conversion from the previous buffer_head to bio
> 
>> API.  We don't use the page cache or the mapping anywhere else, the page
>> is a temporary space for the associated bio.
> 
> 
> We use mapping for SB reading.
> 
> btrfs_read_dev_one_super(...)
> ::
>   page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
> 
>   So isn't find_or_create_page() re-using the same page?

Good observation, the page cache is also used in 
btrfs_scratch_superblocks. IMO this change should also eliminate the 
usage of pagecache in those contexts as well. And this is actually 
critical, since the writes bypassing the pagecache means all readers of 
the sb form page cache would see stale data as the pages will never be 
set to  !uptodate

<snip>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v2] btrfs: use preallocated pages for super block write
  2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
                   ` (6 preceding siblings ...)
  2022-06-13  6:35 ` Nikolay Borisov
@ 2022-06-21 13:24 ` David Sterba
  7 siblings, 0 replies; 25+ messages in thread
From: David Sterba @ 2022-06-21 13:24 UTC (permalink / raw)
  To: David Sterba; +Cc: linux-btrfs, willy, nborisov

On Thu, Jun 09, 2022 at 06:46:29PM +0200, David Sterba wrote:
> Currently the super block page is from the mapping of the block device,
> this is result of direct conversion from the previous buffer_head to bio
> API.  We don't use the page cache or the mapping anywhere else, the page
> is a temporary space for the associated bio.
> 
> Allocate pages for all super block copies at device allocation time,
> also to avoid any later allocation problems when writing the super
> block. This simplifies the page reference tracking, but the page lock is
> still used as waiting mechanism for the write and write error is tracked
> in the page.
> 
> As there is a separate page for each super block copy all can be
> submitted in parallel, as before.
> 
> This was inspired by Matthew's question
> https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/
> 
> Signed-off-by: David Sterba <dsterba@suse.com>
> ---
> 
> v2:
> 
> - allocate 3 pages per device to keep parallelism, otherwise the
>   submission would be serialized on the page lock

I'll continue with the super block write in the future, so far the patch
has uncovered several issues:

- preallocated page must be for each super block copy (offset, checksum)

- bio for each page can be preallocated too

- wait on super block page/bio should move from page lock (eg.
  completion)

- we want parallel submission due to performance reasons and impact on
  operations that may block on commit (fsync)

- using page cache vs direct write, read from user space may get stale
  data if super block write does not use page cache, there's also page
  cache read in kernel but that won't race with write

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2022-06-21 13:34 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-09 16:46 [PATCH v2] btrfs: use preallocated pages for super block write David Sterba
2022-06-09 21:00 ` Matthew Wilcox
2022-06-09 22:54   ` David Sterba
2022-06-09 22:58 ` Qu Wenruo
2022-06-09 22:59   ` David Sterba
2022-06-09 23:15     ` Qu Wenruo
2022-06-09 23:35       ` David Sterba
2022-06-10  1:40     ` Matthew Wilcox
2022-06-10  2:46       ` Qu Wenruo
2022-06-10  3:31         ` Matthew Wilcox
2022-06-10  4:53           ` Qu Wenruo
2022-06-10  0:07 ` Qu Wenruo
2022-06-10  7:23   ` Nikolay Borisov
2022-06-10  7:33     ` Qu Wenruo
2022-06-10  7:39       ` Nikolay Borisov
2022-06-10  7:55         ` Qu Wenruo
2022-06-10  8:39       ` Filipe Manana
2022-06-10  8:44         ` Qu Wenruo
2022-06-10  8:49           ` Qu Wenruo
2022-06-10  9:07 ` Qu Wenruo
2022-06-10 12:06 ` Nikolay Borisov
2022-06-11 13:30 ` Anand Jain
2022-06-13  6:37   ` Nikolay Borisov
2022-06-13  6:35 ` Nikolay Borisov
2022-06-21 13:24 ` David Sterba

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.