All of lore.kernel.org
 help / color / mirror / Atom feed
From: Naohiro Aota <naohiro.aota@wdc.com>
To: linux-btrfs@vger.kernel.org
Cc: David Sterba <dsterba@suse.com>, Naohiro Aota <naohiro.aota@wdc.com>
Subject: [PATCH 2/5] btrfs-progs: introduce btrfs_pwrite wrapper for pwrite
Date: Mon, 27 Sep 2021 13:15:51 +0900	[thread overview]
Message-ID: <20210927041554.325884-3-naohiro.aota@wdc.com> (raw)
In-Reply-To: <20210927041554.325884-1-naohiro.aota@wdc.com>

Wrap pwrite with btrfs_pwrite(). It simply calls pwrite() on non-zoned
btrfs (= opened without O_DIRECT). On zoned mode (= opened with O_DIRECT),
it allocates an aligned bounce buffer, copy the contents and use it for
direct-IO writing.

Writes in device_zero_blocks() and btrfs_wipe_existing_sb() are a little
tricky. We don't have fs_info on our hands, so use zinfo to determine it is
a zoned device or not.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 common/device-utils.c     | 76 ++++++++++++++++++++++++++++++++++++---
 common/device-utils.h     | 19 +++++++++-
 kernel-shared/extent_io.c |  7 ++--
 kernel-shared/zoned.c     |  4 +--
 mkfs/common.c             | 14 +++++---
 5 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/common/device-utils.c b/common/device-utils.c
index 503705c43754..3ba4dccba689 100644
--- a/common/device-utils.c
+++ b/common/device-utils.c
@@ -26,6 +26,7 @@
 #include <dirent.h>
 #include <blkid/blkid.h>
 #include <linux/limits.h>
+#include <linux/fs.h>
 #include <limits.h>
 #include "kernel-lib/sizes.h"
 #include "kernel-shared/disk-io.h"
@@ -76,7 +77,7 @@ int device_discard_blocks(int fd, u64 start, u64 len)
 /*
  * Write zeros to the given range [start, start + len)
  */
-int device_zero_blocks(int fd, off_t start, size_t len)
+int device_zero_blocks(int fd, off_t start, size_t len, const bool direct)
 {
 	char *buf = malloc(len);
 	int ret = 0;
@@ -85,7 +86,7 @@ int device_zero_blocks(int fd, off_t start, size_t len)
 	if (!buf)
 		return -ENOMEM;
 	memset(buf, 0, len);
-	written = pwrite(fd, buf, len, start);
+	written = btrfs_pwrite(fd, buf, len, start, direct);
 	if (written != len)
 		ret = -EIO;
 	free(buf);
@@ -115,7 +116,7 @@ static int zero_dev_clamped(int fd, struct btrfs_zoned_device_info *zinfo,
 	if (zinfo && zinfo->model == ZONED_HOST_MANAGED)
 		return zero_zone_blocks(fd, zinfo, start, end - start);
 
-	return device_zero_blocks(fd, start, end - start);
+	return device_zero_blocks(fd, start, end - start, false);
 }
 
 /*
@@ -157,8 +158,10 @@ static int btrfs_wipe_existing_sb(int fd, struct btrfs_zoned_device_info *zinfo)
 		len = sizeof(buf);
 
 	if (!zone_is_sequential(zinfo, offset)) {
+		const bool direct = zinfo && zinfo->model == ZONED_HOST_MANAGED;
+
 		memset(buf, 0, len);
-		ret = pwrite(fd, buf, len, offset);
+		ret = btrfs_pwrite(fd, buf, len, offset, direct);
 		if (ret < 0) {
 			error("cannot wipe existing superblock: %m");
 			ret = -1;
@@ -491,3 +494,68 @@ out:
 	close(sysfs_fd);
 	return ret;
 }
+
+ssize_t btrfs_direct_pio(int rw, int fd, void *buf, size_t count, off_t offset)
+{
+	int alignment;
+	size_t iosize;
+	void *bounce_buf = NULL;
+	struct stat stat_buf;
+	unsigned long req;
+	int ret;
+	ssize_t ret_rw;
+
+	ASSERT(rw == READ || rw == WRITE);
+
+	if (fstat(fd, &stat_buf) == -1) {
+		error("fstat failed (%m)");
+		return 0;
+	}
+
+	if ((stat_buf.st_mode & S_IFMT) == S_IFBLK)
+		req = BLKSSZGET;
+	else
+		req = FIGETBSZ;
+
+	if (ioctl(fd, req, &alignment)) {
+		error("failed to get block size: %m");
+		return 0;
+	}
+
+	if (IS_ALIGNED((size_t)buf, alignment) && IS_ALIGNED(count, alignment)) {
+		if (rw == WRITE)
+			return pwrite(fd, buf, count, offset);
+		else
+			return pread(fd, buf, count, offset);
+	}
+
+	/* Cannot do anything if the write size is not aligned */
+	if (rw == WRITE && !IS_ALIGNED(count, alignment)) {
+		error("%lu is not aligned to %d", count, alignment);
+		return 0;
+	}
+
+	iosize = round_up(count, alignment);
+
+	ret = posix_memalign(&bounce_buf, alignment, iosize);
+	if (ret) {
+		error("failed to allocate bounce buffer: %m");
+		errno = ret;
+		return 0;
+	}
+
+	if (rw == WRITE) {
+		ASSERT(iosize == count);
+		memcpy(bounce_buf, buf, count);
+		ret_rw = pwrite(fd, bounce_buf, iosize, offset);
+	} else {
+		ret_rw = pread(fd, bounce_buf, iosize, offset);
+		if (ret_rw >= count) {
+			ret_rw = count;
+			memcpy(buf, bounce_buf, count);
+		}
+	}
+
+	free(bounce_buf);
+	return ret_rw;
+}
diff --git a/common/device-utils.h b/common/device-utils.h
index 099520bf9737..767dab4370e1 100644
--- a/common/device-utils.h
+++ b/common/device-utils.h
@@ -17,6 +17,8 @@
 #ifndef __DEVICE_UTILS_H__
 #define __DEVICE_UTILS_H__
 
+#include <stdbool.h>
+#include <unistd.h>
 #include "kerncompat.h"
 #include "sys/stat.h"
 
@@ -35,7 +37,7 @@
  * Generic block device helpers
  */
 int device_discard_blocks(int fd, u64 start, u64 len);
-int device_zero_blocks(int fd, off_t start, size_t len);
+int device_zero_blocks(int fd, off_t start, size_t len, const bool direct);
 u64 device_get_partition_size(const char *dev);
 u64 device_get_partition_size_fd(int fd);
 int device_get_queue_param(const char *file, const char *param, char *buf, size_t len);
@@ -47,5 +49,20 @@ u64 device_get_zone_size(int fd, const char *name);
 u64 btrfs_device_size(int fd, struct stat *st);
 int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
 		u64 max_block_count, unsigned opflags);
+ssize_t btrfs_direct_pio(int rw, int fd, void *buf, size_t count, off_t offset);
+
+#ifdef BTRFS_ZONED
+static inline ssize_t btrfs_pwrite(int fd, void *buf, size_t count,
+				   off_t offset, bool direct)
+{
+	if (!direct)
+		return pwrite(fd, buf, count, offset);
+
+	return btrfs_direct_pio(WRITE, fd, buf, count, offset);
+}
+#else
+#define btrfs_pwrite(fd, buf, count, offset, direct) \
+	({ (void)(direct); pwrite(fd, buf, count, offset); })
+#endif
 
 #endif
diff --git a/kernel-shared/extent_io.c b/kernel-shared/extent_io.c
index d3d79bc8f748..b5984949f431 100644
--- a/kernel-shared/extent_io.c
+++ b/kernel-shared/extent_io.c
@@ -29,6 +29,7 @@
 #include "kernel-shared/ctree.h"
 #include "kernel-shared/volumes.h"
 #include "common/utils.h"
+#include "common/device-utils.h"
 #include "common/internal.h"
 
 void extent_io_tree_init(struct extent_io_tree *tree)
@@ -809,7 +810,8 @@ out:
 int write_extent_to_disk(struct extent_buffer *eb)
 {
 	int ret;
-	ret = pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr);
+	ret = btrfs_pwrite(eb->fd, eb->data, eb->len, eb->dev_bytenr,
+			   eb->fs_info->zoned);
 	if (ret < 0)
 		goto out;
 	if (ret != eb->len) {
@@ -932,7 +934,8 @@ int write_data_to_disk(struct btrfs_fs_info *info, void *buf, u64 offset,
 			this_len = min(this_len, bytes_left);
 			dev_nr++;
 
-			ret = pwrite(device->fd, buf + total_write, this_len, dev_bytenr);
+			ret = btrfs_pwrite(device->fd, buf + total_write,
+					   this_len, dev_bytenr, info->zoned);
 			if (ret != this_len) {
 				if (ret < 0) {
 					fprintf(stderr, "Error writing to "
diff --git a/kernel-shared/zoned.c b/kernel-shared/zoned.c
index 8d94f98a7fce..c2cce3b5f366 100644
--- a/kernel-shared/zoned.c
+++ b/kernel-shared/zoned.c
@@ -424,7 +424,7 @@ int zero_zone_blocks(int fd, struct btrfs_zoned_device_info *zinfo, off_t start,
 			count = zone_len - (ofst & (zone_len - 1));
 
 		if (!zone_is_sequential(zinfo, ofst)) {
-			ret = device_zero_blocks(fd, ofst, count);
+			ret = device_zero_blocks(fd, ofst, count, true);
 			if (ret != 0)
 				return ret;
 		}
@@ -595,7 +595,7 @@ size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
 	if (rw == READ)
 		ret_sz = pread64(fd, buf, count, mapped);
 	else
-		ret_sz = pwrite64(fd, buf, count, mapped);
+		ret_sz = btrfs_pwrite(fd, buf, count, mapped, true);
 
 	if (ret_sz != count)
 		return ret_sz;
diff --git a/mkfs/common.c b/mkfs/common.c
index 20a7d1155972..5c8d6ac13a3b 100644
--- a/mkfs/common.c
+++ b/mkfs/common.c
@@ -54,7 +54,7 @@ static int btrfs_write_empty_tree(int fd, struct btrfs_mkfs_config *cfg,
 	btrfs_set_header_nritems(buf, 0);
 	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
 			     cfg->csum_type);
-	ret = pwrite(fd, buf->data, cfg->nodesize, block);
+	ret = btrfs_pwrite(fd, buf->data, cfg->nodesize, block, cfg->zone_size);
 	if (ret != cfg->nodesize)
 		return ret < 0 ? -errno : -EIO;
 	return 0;
@@ -134,7 +134,8 @@ static int btrfs_create_tree_root(int fd, struct btrfs_mkfs_config *cfg,
 			     cfg->csum_type);
 
 	/* write back root tree */
-	ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_ROOT_TREE]);
+	ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+			   cfg->blocks[MKFS_ROOT_TREE], cfg->zone_size);
 	if (ret != cfg->nodesize)
 		return (ret < 0 ? -errno : -EIO);
 
@@ -422,7 +423,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
 	btrfs_set_header_nritems(buf, nritems);
 	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
 			     cfg->csum_type);
-	ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_EXTENT_TREE]);
+	ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+			   cfg->blocks[MKFS_EXTENT_TREE], cfg->zone_size);
 	if (ret != cfg->nodesize) {
 		ret = (ret < 0 ? -errno : -EIO);
 		goto out;
@@ -510,7 +512,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
 	btrfs_set_header_nritems(buf, nritems);
 	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
 			     cfg->csum_type);
-	ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_CHUNK_TREE]);
+	ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+			   cfg->blocks[MKFS_CHUNK_TREE], cfg->zone_size);
 	if (ret != cfg->nodesize) {
 		ret = (ret < 0 ? -errno : -EIO);
 		goto out;
@@ -550,7 +553,8 @@ int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
 	btrfs_set_header_nritems(buf, nritems);
 	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
 			     cfg->csum_type);
-	ret = pwrite(fd, buf->data, cfg->nodesize, cfg->blocks[MKFS_DEV_TREE]);
+	ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
+			   cfg->blocks[MKFS_DEV_TREE], cfg->zone_size);
 	if (ret != cfg->nodesize) {
 		ret = (ret < 0 ? -errno : -EIO);
 		goto out;
-- 
2.33.0


  parent reply	other threads:[~2021-09-27  4:17 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-27  4:15 [PATCH 0/5] btrfs-progs: use direct-IO for zoned device Naohiro Aota
2021-09-27  4:15 ` [PATCH 1/5] btrfs-progs: mkfs: do not set zone size on non-zoned mode Naohiro Aota
2021-09-27  9:19   ` Johannes Thumshirn
2021-09-27  4:15 ` Naohiro Aota [this message]
2021-09-27  9:39   ` [PATCH 2/5] btrfs-progs: introduce btrfs_pwrite wrapper for pwrite Johannes Thumshirn
2021-09-27  4:15 ` [PATCH 3/5] btrfs-progs: introduce btrfs_pread wrapper for pread Naohiro Aota
2021-09-27 10:23   ` Johannes Thumshirn
2021-09-27 18:41     ` David Sterba
2021-09-27  4:15 ` [PATCH 4/5] btrfs-progs: temporally set zoned flag for initial tree reading Naohiro Aota
2021-09-27 12:38   ` Johannes Thumshirn
2021-09-27  4:15 ` [PATCH 5/5] btrfs-progs: use direct-IO for zoned device Naohiro Aota
2021-09-27 18:48   ` David Sterba
2021-09-27 19:26 ` [PATCH 0/5] " David Sterba
2021-09-29  2:21   ` Naohiro Aota
2021-09-29 10:16     ` David Sterba
2021-09-27 21:51 ` David Sterba
2021-09-29  2:24   ` Naohiro Aota
2021-09-29 10:22     ` David Sterba
2021-10-05  6:11       ` Naohiro Aota

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210927041554.325884-3-naohiro.aota@wdc.com \
    --to=naohiro.aota@wdc.com \
    --cc=dsterba@suse.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.