linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
To: linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org
Cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>, Jan Kara <jack@suse.cz>,
	Theodore Ts'o <tytso@mit.edu>,
	Matthew Wilcox <willy@infradead.org>,
	"Darrick J . Wong" <djwong@kernel.org>,
	Luis Chamberlain <mcgrof@kernel.org>,
	John Garry <john.g.garry@oracle.com>,
	linux-kernel@vger.kernel.org,
	"Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Subject: [RFC 3/8] iomap: Add atomic write support for direct-io
Date: Sat,  2 Mar 2024 13:12:00 +0530	[thread overview]
Message-ID: <6a09654d152d3d1a07636174f5abcfce9948c20c.1709361537.git.ritesh.list@gmail.com> (raw)
In-Reply-To: <555cc3e262efa77ee5648196362f415a1efc018d.1709361537.git.ritesh.list@gmail.com>

This adds direct-io atomic writes support in iomap. This adds -
1. IOMAP_ATOMIC flag for iomap iter.
2. Sets REQ_ATOMIC to bio opflags.
3. Adds necessary checks in iomap_dio code to ensure a single bio is
   submitted for an atomic write request. (since we only support ubuf
   type iocb). Otherwise return an error EIO.
4. Adds a common helper routine iomap_dio_check_atomic(). It helps in
   verifying mapped length and start/end physical offset against the hw
   device constraints for supporting atomic writes.

This patch is based on a patch from John Garry <john.g.garry@oracle.com>
which adds such support of DIO atomic writes to iomap.

Co-developed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/iomap/direct-io.c  | 75 +++++++++++++++++++++++++++++++++++++++++--
 fs/iomap/trace.h      |  3 +-
 include/linux/iomap.h |  1 +
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..b4548acb74e7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -256,7 +256,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
  * clearing the WRITE_THROUGH flag in the dio request.
  */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua)
+		const struct iomap *iomap, bool use_fua, bool atomic_write)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
 
@@ -269,6 +269,9 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
 
+	if (atomic_write)
+		opflags |= REQ_ATOMIC;
+
 	return opflags;
 }
 
@@ -279,11 +282,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
 	loff_t length = iomap_length(iter);
+	const size_t orig_len = iter->len;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
 	bool need_zeroout = false;
-	bool use_fua = false;
+	bool use_fua = false, atomic_write = iter->flags & IOMAP_ATOMIC;
 	int nr_pages, ret = 0;
 	size_t copied = 0;
 	size_t orig_count;
@@ -356,6 +360,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	if (need_zeroout) {
 		/* zero out from the start of the block to the write offset */
 		pad = pos & (fs_block_size - 1);
+		if (unlikely(pad && atomic_write)) {
+			WARN_ON_ONCE("pos not atomic write aligned\n");
+			ret = -EINVAL;
+			goto out;
+		}
 		if (pad)
 			iomap_dio_zero(iter, dio, pos - pad, pad);
 	}
@@ -365,7 +374,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	 * can set up the page vector appropriately for a ZONE_APPEND
 	 * operation.
 	 */
-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_write);
 
 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
@@ -397,6 +406,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		}
 
 		n = bio->bi_iter.bi_size;
+
+		/* This bio should have covered the complete length */
+		if (unlikely(atomic_write && n != orig_len)) {
+			WARN_ON_ONCE(1);
+			ret = -EINVAL;
+			bio_put(bio);
+			goto out;
+		}
 		if (dio->flags & IOMAP_DIO_WRITE) {
 			task_io_account_write(n);
 		} else {
@@ -429,6 +446,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
 		/* zero out from the end of the write to the end of the block */
 		pad = pos & (fs_block_size - 1);
+		/* This should never happen */
+		WARN_ON_ONCE(unlikely(pad && atomic_write));
 		if (pad)
 			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
 	}
@@ -516,6 +535,44 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
 	}
 }
 
+/*
+ * iomap_dio_check_atomic:	DIO Atomic checks before calling bio submission.
+ * @iter:			iomap iterator
+ * This function is called after filesystem block mapping and before bio
+ * formation/submission. This is the right place to verify hw device/block
+ * layer constraints to be followed for doing atomic writes. Hence do those
+ * common checks here.
+ */
+static bool iomap_dio_check_atomic(struct iomap_iter *iter)
+{
+	struct block_device *bdev = iter->iomap.bdev;
+	unsigned long long map_len = iomap_length(iter);
+	unsigned long long start = iomap_sector(&iter->iomap, iter->pos)
+						<< SECTOR_SHIFT;
+	unsigned long long end = start + map_len - 1;
+	unsigned int awu_min =
+			queue_atomic_write_unit_min_bytes(bdev->bd_queue);
+	unsigned int awu_max =
+			queue_atomic_write_unit_max_bytes(bdev->bd_queue);
+	unsigned long boundary =
+			queue_atomic_write_boundary_bytes(bdev->bd_queue);
+	unsigned long mask = ~(boundary - 1);
+
+
+	/* map_len should be same as user specified iter->len */
+	if (map_len < iter->len)
+		return false;
+	/* start should be aligned to block device min atomic unit alignment */
+	if (!IS_ALIGNED(start, awu_min))
+		return false;
+	/* If top bits doesn't match, means atomic unit boundary is crossed */
+	if (boundary && ((start | mask) != (end | mask)))
+		return false;
+
+	return true;
+}
+
+
 /*
  * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
  * is being issued as AIO or not.  This allows us to optimise pure data writes
@@ -554,12 +611,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 	loff_t ret = 0;
+	bool atomic_write = iocb->ki_flags & IOCB_ATOMIC;
 
 	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
 
 	if (!iomi.len)
 		return NULL;
 
+	if (atomic_write && !iter_is_ubuf(iter))
+		return ERR_PTR(-EINVAL);
+
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	if (!dio)
 		return ERR_PTR(-ENOMEM);
@@ -605,6 +666,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
 			dio->flags |= IOMAP_DIO_CALLER_COMP;
 
+		if (atomic_write)
+			iomi.flags |= IOMAP_ATOMIC;
+
 		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
 			ret = -EAGAIN;
 			if (iomi.pos >= dio->i_size ||
@@ -656,6 +720,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 
 	blk_start_plug(&plug);
 	while ((ret = iomap_iter(&iomi, ops)) > 0) {
+		if (atomic_write && !iomap_dio_check_atomic(&iomi)) {
+			ret = -EIO;
+			break;
+		}
+
 		iomi.processed = iomap_dio_iter(&iomi, dio);
 
 		/*
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..c95576420bca 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..9eac704a0d6f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC		(1 << 9)
 
 struct iomap_ops {
 	/*
-- 
2.43.0


  parent reply	other threads:[~2024-03-02  7:42 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-02  7:41 [RFC 0/9] ext4: Add direct-io atomic write support using fsawu Ritesh Harjani (IBM)
2024-03-02  7:41 ` [RFC 1/8] fs: Add FS_XFLAG_ATOMICWRITES flag Ritesh Harjani (IBM)
2024-03-02  7:41   ` [RFC 2/8] fs: Reserve inode flag FS_ATOMICWRITES_FL for atomic writes Ritesh Harjani (IBM)
2024-03-04  0:59     ` Dave Chinner
2024-03-08  7:19       ` Ojaswin Mujoo
2024-03-02  7:42   ` Ritesh Harjani (IBM) [this message]
2024-03-04  1:16     ` [RFC 3/8] iomap: Add atomic write support for direct-io Dave Chinner
2024-03-04  5:33       ` Ritesh Harjani
2024-03-04  8:49         ` John Garry
2024-03-04 10:31           ` Ritesh Harjani
2024-03-04 20:56         ` Dave Chinner
2024-03-02  7:42   ` [RFC 4/8] ext4: Add statx and other atomic write helper routines Ritesh Harjani (IBM)
2024-03-06 11:14     ` John Garry
2024-03-08  8:10       ` Ritesh Harjani
2024-03-02  7:42   ` [RFC 5/8] ext4: Adds direct-io atomic writes checks Ritesh Harjani (IBM)
2024-03-02  7:42   ` [RFC 6/8] ext4: Add an inode flag for atomic writes Ritesh Harjani (IBM)
2024-03-04 20:34     ` Dave Chinner
2024-03-08  8:02       ` Ritesh Harjani
2024-03-02  7:42   ` [RFC 7/8] ext4: Enable FMODE_CAN_ATOMIC_WRITE in open for direct-io Ritesh Harjani (IBM)
2024-03-02  7:42   ` [RFC 8/8] ext4: Adds atomic writes using fsawu Ritesh Harjani (IBM)
2024-03-02  7:42 ` [RFC 9/9] e2fsprogs/chattr: Supports atomic writes attribute Ritesh Harjani (IBM)
2024-03-06 11:22 ` [RFC 0/9] ext4: Add direct-io atomic write support using fsawu John Garry
2024-03-06 13:13   ` Ritesh Harjani
2024-03-08 20:25   ` [RFC] ext4: Add support for ext4_map_blocks_atomic() Ritesh Harjani (IBM)
2024-03-09  2:37     ` Ritesh Harjani
2024-03-13 18:40     ` John Garry
2024-03-14 15:52       ` Ritesh Harjani
2024-03-18  8:22         ` John Garry

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6a09654d152d3d1a07636174f5abcfce9948c20c.1709361537.git.ritesh.list@gmail.com \
    --to=ritesh.list@gmail.com \
    --cc=djwong@kernel.org \
    --cc=jack@suse.cz \
    --cc=john.g.garry@oracle.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mcgrof@kernel.org \
    --cc=ojaswin@linux.ibm.com \
    --cc=tytso@mit.edu \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).