[PATCH v2] btrfs: use preallocated pages for super block write

* [PATCH v2] btrfs: use preallocated pages for super block write
@ 2022-06-09 16:46 David Sterba
  2022-06-09 21:00 ` Matthew Wilcox
                   ` (7 more replies)
  0 siblings, 8 replies; 25+ messages in thread
From: David Sterba @ 2022-06-09 16:46 UTC (permalink / raw)
  To: linux-btrfs; +Cc: willy, nborisov, David Sterba

Currently the super block page comes from the mapping of the block
device, which is a result of the direct conversion from the previous
buffer_head API to the bio API.  We don't use the page cache or the
mapping anywhere else; the page is a temporary space for the associated
bio.

Allocate pages for all super block copies at device allocation time,
also to avoid any later allocation problems when writing the super
block. This simplifies the page reference tracking, but the page lock is
still used as a waiting mechanism for the write and the write error is
tracked in the page.

As there is a separate page for each super block copy, all copies can
be submitted in parallel, as before.

This was inspired by Matthew's question
https://lore.kernel.org/all/Yn%2FtxWbij5voeGOB@casper.infradead.org/

Signed-off-by: David Sterba <dsterba@suse.com>
---

v2:

- allocate 3 pages per device to keep parallelism, otherwise the
  submission would be serialized on the page lock

fs/btrfs/disk-io.c | 42 +++++++++++-------------------------------
 fs/btrfs/volumes.c | 12 ++++++++++++
 fs/btrfs/volumes.h |  3 +++
 3 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 800ad3a9c68e..8a9c7a868727 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3887,7 +3887,6 @@ static void btrfs_end_super_write(struct bio *bio)
 			SetPageUptodate(page);
 		}
 
-		put_page(page);
 		unlock_page(page);
 	}
 
@@ -3974,7 +3973,6 @@ static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	int i;
 	int errors = 0;
@@ -3989,7 +3987,6 @@ static int write_dev_supers(struct btrfs_device *device,
 	for (i = 0; i < max_mirrors; i++) {
 		struct page *page;
 		struct bio *bio;
-		struct btrfs_super_block *disk_super;
 
 		bytenr_orig = btrfs_sb_offset(i);
 		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
@@ -4012,21 +4009,17 @@ static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
 				    sb->csum);
 
-		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
-					   GFP_NOFS);
-		if (!page) {
-			btrfs_err(device->fs_info,
-			    "couldn't get super block page for bytenr %llu",
-			    bytenr);
-			errors++;
-			continue;
-		}
-
-		/* Bump the refcount for wait_dev_supers() */
-		get_page(page);
+		/*
+		 * Super block is copied to a temporary page, which is locked
+		 * and submitted for write. Page is unlocked after IO finishes.
+		 * No page references are needed, write error is returned as
+		 * page Error bit.
+		 */
+		page = device->sb_write_page[i];
+		ClearPageError(page);
+		lock_page(page);
 
-		disk_super = page_address(page);
-		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);
+		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
 
 		/*
 		 * Directly use bios here instead of relying on the page cache
@@ -4093,14 +4086,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		    device->commit_total_bytes)
 			break;
 
-		page = find_get_page(device->bdev->bd_inode->i_mapping,
-				     bytenr >> PAGE_SHIFT);
-		if (!page) {
-			errors++;
-			if (i == 0)
-				primary_failed = true;
-			continue;
-		}
+		page = device->sb_write_page[i];
 		/* Page is submitted locked and unlocked once the IO completes */
 		wait_on_page_locked(page);
 		if (PageError(page)) {
@@ -4108,12 +4094,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 			if (i == 0)
 				primary_failed = true;
 		}
-
-		/* Drop our reference */
-		put_page(page);
-
-		/* Drop the reference from the writing run */
-		put_page(page);
 	}
 
 	/* log error, force error return */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 12a6150ee19d..a00546d2c7ea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -394,6 +394,8 @@ void btrfs_free_device(struct btrfs_device *device)
 	rcu_string_free(device->name);
 	extent_io_tree_release(&device->alloc_state);
 	btrfs_destroy_dev_zone_info(device);
+	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
+		__free_page(device->sb_write_page[i]);
 	kfree(device);
 }
 
@@ -6898,6 +6900,16 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		dev->sb_write_page[i] = alloc_page(GFP_KERNEL);
+		if (!dev->sb_write_page[i]) {
+			while (--i >= 0)
+				__free_page(dev->sb_write_page[i]);
+			kfree(dev);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
 	INIT_LIST_HEAD(&dev->post_commit_list);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 588367c76c46..516709e1d9f8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -10,6 +10,7 @@
 #include <linux/sort.h>
 #include <linux/btrfs.h>
 #include "async-thread.h"
+#include "disk-io.h"
 
 #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
 
@@ -158,6 +159,8 @@ struct btrfs_device {
 	/* Bio used for flushing device barriers */
 	struct bio flush_bio;
 	struct completion flush_wait;
+	/* Temporary pages for writing the super block copies */
+	struct page *sb_write_page[BTRFS_SUPER_MIRROR_MAX];
 
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_ctx;
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread