All of lore.kernel.org
 help / color / mirror / Atom feed
From: John Garry <john.g.garry@oracle.com>
To: axboe@kernel.dk, brauner@kernel.org, djwong@kernel.org,
	viro@zeniv.linux.org.uk, jack@suse.cz, akpm@linux-foundation.org,
	willy@infradead.org, dchinner@redhat.com, tytso@mit.edu,
	hch@lst.de, martin.petersen@oracle.com, nilay@linux.ibm.com,
	ritesh.list@gmail.com, mcgrof@kernel.org
Cc: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, ojaswin@linux.ibm.com, p.raghav@samsung.com,
	jbongio@google.com, okiselev@amazon.com,
	John Garry <john.g.garry@oracle.com>
Subject: [PATCH RFC 5/7] fs: iomap: buffered atomic write support
Date: Mon, 22 Apr 2024 14:39:21 +0000	[thread overview]
Message-ID: <20240422143923.3927601-6-john.g.garry@oracle.com> (raw)
In-Reply-To: <20240422143923.3927601-1-john.g.garry@oracle.com>

Add special handling of PG_atomic flag to iomap buffered write path.

To flag an iomap iter for an atomic write, set IOMAP_ATOMIC.

For a folio associated with a write which has IOMAP_ATOMIC set, set
PG_atomic.

Otherwise, when IOMAP_ATOMIC is unset, clear PG_atomic.

This means that for an "atomic" folio which has not been written back, it
loses it "atomicity". So if userspace issues a write with RWF_ATOMIC set
and another write with RWF_ATOMIC unset and which fully or partially
overwrites that same region as the first write, that folio is not written
back atomically. For such a scenario to occur, it would be considered a
userspace usage error.

To ensure that a buffered atomic write is written back atomically when
the write syscall returns, RWF_SYNC or similar needs to be used (in
conjunction with RWF_ATOMIC).

As a safety check, when getting a folio for an atomic write in
iomap_get_folio(), ensure that the length matches the inode mapping folio
order-limit.

Only a single BIO should ever be submitted for an atomic write. So modify
iomap_add_to_ioend() to ensure that we don't try to write back an atomic
folio as part of a larger mixed-atomicity BIO.

In iomap_alloc_ioend(), handle an atomic write by setting REQ_ATOMIC for
the allocated BIO.

When a folio is written back, again clear PG_atomic, as it is no longer
required. I assume it will not be needlessly written back a second time...

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 fs/iomap/buffered-io.c | 53 ++++++++++++++++++++++++++++++++++++------
 fs/iomap/trace.h       |  3 ++-
 include/linux/iomap.h  |  1 +
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 4e8e41c8b3c0..ac2a014c91a9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -586,13 +586,25 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  */
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
+	struct address_space *mapping = iter->inode->i_mapping;
 	fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
 	fgp |= fgf_set_order(len);
 
-	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+	if (iter->flags & IOMAP_ATOMIC) {
+		unsigned int min_order = mapping_min_folio_order(mapping);
+		unsigned int max_order = mapping_max_folio_order(mapping);
+		unsigned int order = FGF_GET_ORDER(fgp);
+
+		if (order != min_order)
+			return ERR_PTR(-EINVAL);
+		if (order != max_order)
+			return ERR_PTR(-EINVAL);
+	}
+
+	return __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
 			fgp, mapping_gfp_mask(iter->inode->i_mapping));
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -769,6 +781,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 {
 	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
+	bool is_atomic = iter->flags & IOMAP_ATOMIC;
 	struct folio *folio;
 	int status = 0;
 
@@ -786,6 +799,11 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
+	if (is_atomic)
+		folio_set_atomic(folio);
+	else
+		folio_clear_atomic(folio);
+
 	/*
 	 * Now we have a locked folio, before we do anything with it we need to
 	 * check that the iomap we have cached is not stale. The inode extent
@@ -1010,6 +1028,8 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iter.flags |= IOMAP_NOWAIT;
+	if (iocb->ki_flags & IOCB_ATOMIC)
+		iter.flags |= IOMAP_ATOMIC;
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.processed = iomap_write_iter(&iter, i);
@@ -1499,8 +1519,10 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
 
-	if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+	if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) {
+		folio_clear_atomic(folio);
 		folio_end_writeback(folio);
+	}
 }
 
 /*
@@ -1679,14 +1701,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 }
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct inode *inode, loff_t pos)
+		struct writeback_control *wbc, struct inode *inode, loff_t pos,
+		bool atomic)
 {
+	blk_opf_t opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 	struct iomap_ioend *ioend;
 	struct bio *bio;
 
+	if (atomic)
+		opf |= REQ_ATOMIC;
+
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
-			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
-			       GFP_NOFS, &iomap_ioend_bioset);
+			       opf, GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
 	bio->bi_end_io = iomap_writepage_end_bio;
 	wbc_init_bio(wbc, bio);
@@ -1744,14 +1770,27 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
+	bool is_atomic = folio_test_atomic(folio);
 	int error;
 
-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+	if (!wpc->ioend || is_atomic || !iomap_can_add_to_ioend(wpc, pos)) {
 new_ioend:
 		error = iomap_submit_ioend(wpc, 0);
 		if (error)
 			return error;
-		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, is_atomic);
+	}
+
+	/* We must not append anything later if atomic, so submit now */
+	if (is_atomic) {
+		if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+			return -EINVAL;
+		wpc->ioend->io_size = len;
+		wbc_account_cgroup_owner(wbc, &folio->page, len);
+		if (ifs)
+			atomic_add(len, &ifs->write_bytes_pending);
+
+		return iomap_submit_ioend(wpc, 0);
 	}
 
 	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
 	{ IOMAP_REPORT,		"REPORT" }, \
 	{ IOMAP_FAULT,		"FAULT" }, \
 	{ IOMAP_DIRECT,		"DIRECT" }, \
-	{ IOMAP_NOWAIT,		"NOWAIT" }
+	{ IOMAP_NOWAIT,		"NOWAIT" }, \
+	{ IOMAP_ATOMIC,		"ATOMIC" }
 
 #define IOMAP_F_FLAGS_STRINGS \
 	{ IOMAP_F_NEW,		"NEW" }, \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f726f0058fd6..2f50abe06f27 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -179,6 +179,7 @@ struct iomap_folio_ops {
 #else
 #define IOMAP_DAX		0
 #endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC	(1 << 9)
 
 struct iomap_ops {
 	/*
-- 
2.31.1


  parent reply	other threads:[~2024-04-22 14:40 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-22 14:39 [PATCH RFC 0/7] buffered block atomic writes John Garry
2024-04-22 14:39 ` [PATCH RFC 1/7] fs: Rename STATX{_ATTR}_WRITE_ATOMIC -> STATX{_ATTR}_WRITE_ATOMIC_DIO John Garry
2024-04-22 14:39 ` [PATCH RFC 2/7] filemap: Change mapping_set_folio_min_order() -> mapping_set_folio_orders() John Garry
2024-04-25 14:47   ` Pankaj Raghav (Samsung)
2024-04-26  8:02     ` John Garry
2024-04-22 14:39 ` [PATCH RFC 3/7] mm: Add PG_atomic John Garry
2024-04-22 14:39 ` [PATCH RFC 4/7] fs: Add initial buffered atomic write support info to statx John Garry
2024-04-22 14:39 ` John Garry [this message]
2024-04-22 15:03   ` [PATCH RFC 5/7] fs: iomap: buffered atomic write support Matthew Wilcox
2024-04-22 16:02     ` John Garry
2024-04-22 14:39 ` [PATCH RFC 6/7] fs: xfs: buffered atomic writes statx support John Garry
2024-04-22 14:39 ` [PATCH RFC 7/7] fs: xfs: Enable buffered atomic writes John Garry

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240422143923.3927601-6-john.g.garry@oracle.com \
    --to=john.g.garry@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=dchinner@redhat.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=jbongio@google.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=mcgrof@kernel.org \
    --cc=nilay@linux.ibm.com \
    --cc=ojaswin@linux.ibm.com \
    --cc=okiselev@amazon.com \
    --cc=p.raghav@samsung.com \
    --cc=ritesh.list@gmail.com \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.