From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
To: linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org
Cc: Ojaswin Mujoo <ojaswin@linux.ibm.com>, Jan Kara <jack@suse.cz>,
Theodore Ts'o <tytso@mit.edu>,
Matthew Wilcox <willy@infradead.org>,
"Darrick J . Wong" <djwong@kernel.org>,
Luis Chamberlain <mcgrof@kernel.org>,
John Garry <john.g.garry@oracle.com>,
linux-kernel@vger.kernel.org,
"Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Subject: [RFC 3/8] iomap: Add atomic write support for direct-io
Date: Sat, 2 Mar 2024 13:12:00 +0530 [thread overview]
Message-ID: <6a09654d152d3d1a07636174f5abcfce9948c20c.1709361537.git.ritesh.list@gmail.com> (raw)
In-Reply-To: <555cc3e262efa77ee5648196362f415a1efc018d.1709361537.git.ritesh.list@gmail.com>
This adds direct-io atomic write support to iomap. It adds:
1. An IOMAP_ATOMIC flag for the iomap iter.
2. Setting of REQ_ATOMIC in the bio opflags.
3. Adds necessary checks in iomap_dio code to ensure a single bio is
submitted for an atomic write request (since we only support ubuf
type iocb). Otherwise return an error (-EINVAL).
4. Adds a common helper routine iomap_dio_check_atomic(). It helps in
verifying mapped length and start/end physical offset against the hw
device constraints for supporting atomic writes.
This patch is based on a patch from John Garry <john.g.garry@oracle.com>
which adds such support of DIO atomic writes to iomap.
Co-developed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
fs/iomap/direct-io.c | 75 +++++++++++++++++++++++++++++++++++++++++--
fs/iomap/trace.h | 3 +-
include/linux/iomap.h | 1 +
3 files changed, 75 insertions(+), 4 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..b4548acb74e7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -256,7 +256,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic_write)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -269,6 +269,9 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ if (atomic_write)
+ opflags |= REQ_ATOMIC;
+
return opflags;
}
@@ -279,11 +282,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
+ const size_t orig_len = iter->len;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
+ bool use_fua = false, atomic_write = iter->flags & IOMAP_ATOMIC;
int nr_pages, ret = 0;
size_t copied = 0;
size_t orig_count;
@@ -356,6 +360,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
pad = pos & (fs_block_size - 1);
+ if (unlikely(pad && atomic_write)) {
+ WARN_ON_ONCE("pos not atomic write aligned\n");
+ ret = -EINVAL;
+ goto out;
+ }
if (pad)
iomap_dio_zero(iter, dio, pos - pad, pad);
}
@@ -365,7 +374,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_write);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -397,6 +406,14 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
+
+ /* This bio should have covered the complete length */
+ if (unlikely(atomic_write && n != orig_len)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ bio_put(bio);
+ goto out;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -429,6 +446,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
/* zero out from the end of the write to the end of the block */
pad = pos & (fs_block_size - 1);
+ /* This should never happen */
+ WARN_ON_ONCE(unlikely(pad && atomic_write));
if (pad)
iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
}
@@ -516,6 +535,44 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
}
}
+/*
+ * iomap_dio_check_atomic: DIO atomic checks before calling bio submission.
+ * @iter: iomap iterator holding the mapping returned by the filesystem
+ *
+ * Called after filesystem block mapping and before bio formation/submission,
+ * so the hw device/block layer constraints for atomic writes can be verified
+ * against the mapped physical range in one common place.
+ *
+ * Returns true if the write may be issued atomically, false otherwise.
+ */
+static bool iomap_dio_check_atomic(struct iomap_iter *iter)
+{
+	struct block_device *bdev = iter->iomap.bdev;
+	unsigned long long map_len = iomap_length(iter);
+	unsigned long long start = iomap_sector(&iter->iomap, iter->pos)
+						<< SECTOR_SHIFT;
+	unsigned long long end = start + map_len - 1;
+	unsigned int awu_min =
+		queue_atomic_write_unit_min_bytes(bdev->bd_queue);
+	unsigned long long boundary =
+		queue_atomic_write_boundary_bytes(bdev->bd_queue);
+	/* 64-bit mask so that '&' keeps the upper bits of start/end */
+	unsigned long long mask = ~(boundary - 1);
+
+	/* the mapping must cover the whole user specified iter->len */
+	if (map_len < iter->len)
+		return false;
+	/* start should be aligned to block device min atomic unit alignment */
+	if (!IS_ALIGNED(start, awu_min))
+		return false;
+	/* If the bits above the boundary differ, a boundary is crossed */
+	if (boundary && ((start & mask) != (end & mask)))
+		return false;
+
+	return true;
+}
+
+
/*
* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
* is being issued as AIO or not. This allows us to optimise pure data writes
@@ -554,12 +611,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct blk_plug plug;
struct iomap_dio *dio;
loff_t ret = 0;
+ bool atomic_write = iocb->ki_flags & IOCB_ATOMIC;
trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
if (!iomi.len)
return NULL;
+ if (atomic_write && !iter_is_ubuf(iter))
+ return ERR_PTR(-EINVAL);
+
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return ERR_PTR(-ENOMEM);
@@ -605,6 +666,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
dio->flags |= IOMAP_DIO_CALLER_COMP;
+ if (atomic_write)
+ iomi.flags |= IOMAP_ATOMIC;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -656,6 +720,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
+ if (atomic_write && !iomap_dio_check_atomic(&iomi)) {
+ ret = -EIO;
+ break;
+ }
+
iomi.processed = iomap_dio_iter(&iomi, dio);
/*
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..c95576420bca 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..9eac704a0d6f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*
--
2.43.0
next prev parent reply other threads:[~2024-03-02 7:42 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-02 7:41 [RFC 0/9] ext4: Add direct-io atomic write support using fsawu Ritesh Harjani (IBM)
2024-03-02 7:41 ` [RFC 1/8] fs: Add FS_XFLAG_ATOMICWRITES flag Ritesh Harjani (IBM)
2024-03-02 7:41 ` [RFC 2/8] fs: Reserve inode flag FS_ATOMICWRITES_FL for atomic writes Ritesh Harjani (IBM)
2024-03-04 0:59 ` Dave Chinner
2024-03-08 7:19 ` Ojaswin Mujoo
2024-03-02 7:42 ` Ritesh Harjani (IBM) [this message]
2024-03-04 1:16 ` [RFC 3/8] iomap: Add atomic write support for direct-io Dave Chinner
2024-03-04 5:33 ` Ritesh Harjani
2024-03-04 8:49 ` John Garry
2024-03-04 10:31 ` Ritesh Harjani
2024-03-04 20:56 ` Dave Chinner
2024-03-02 7:42 ` [RFC 4/8] ext4: Add statx and other atomic write helper routines Ritesh Harjani (IBM)
2024-03-06 11:14 ` John Garry
2024-03-08 8:10 ` Ritesh Harjani
2024-03-02 7:42 ` [RFC 5/8] ext4: Adds direct-io atomic writes checks Ritesh Harjani (IBM)
2024-03-02 7:42 ` [RFC 6/8] ext4: Add an inode flag for atomic writes Ritesh Harjani (IBM)
2024-03-04 20:34 ` Dave Chinner
2024-03-08 8:02 ` Ritesh Harjani
2024-03-02 7:42 ` [RFC 7/8] ext4: Enable FMODE_CAN_ATOMIC_WRITE in open for direct-io Ritesh Harjani (IBM)
2024-03-02 7:42 ` [RFC 8/8] ext4: Adds atomic writes using fsawu Ritesh Harjani (IBM)
2024-03-02 7:42 ` [RFC 9/9] e2fsprogs/chattr: Supports atomic writes attribute Ritesh Harjani (IBM)
2024-03-06 11:22 ` [RFC 0/9] ext4: Add direct-io atomic write support using fsawu John Garry
2024-03-06 13:13 ` Ritesh Harjani
2024-03-08 20:25 ` [RFC] ext4: Add support for ext4_map_blocks_atomic() Ritesh Harjani (IBM)
2024-03-09 2:37 ` Ritesh Harjani
2024-03-13 18:40 ` John Garry
2024-03-14 15:52 ` Ritesh Harjani
2024-03-18 8:22 ` John Garry
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6a09654d152d3d1a07636174f5abcfce9948c20c.1709361537.git.ritesh.list@gmail.com \
--to=ritesh.list@gmail.com \
--cc=djwong@kernel.org \
--cc=jack@suse.cz \
--cc=john.g.garry@oracle.com \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mcgrof@kernel.org \
--cc=ojaswin@linux.ibm.com \
--cc=tytso@mit.edu \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).