Linux-ext4 Archive on lore.kernel.org
 help / color / Atom feed
From: Jan Kara <jack@suse.cz>
To: Ted Tso <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>, Eric Whitney <enwlinux@gmail.com>,
	<linux-fsdevel@vger.kernel.org>,
	"Darrick J . Wong" <djwong@kernel.org>, Jan Kara <jack@suse.cz>
Subject: [PATCH 2/3] ext4: Fix occasional generic/418 failure
Date: Mon, 12 Apr 2021 12:23:32 +0200
Message-ID: <20210412102333.2676-3-jack@suse.cz> (raw)
In-Reply-To: <20210412102333.2676-1-jack@suse.cz>

Eric has noticed that after pagecache read rework, generic/418 is
occasionally failing for ext4 when blocksize < pagesize. In fact, the
pagecache rework just made hard to hit race in ext4 more likely. The
problem is that since ext4 conversion of direct IO writes to iomap
framework (commit 378f32bab371), we update inode size after direct IO
write only after invalidating page cache. Thus if buffered read sneaks
at unfortunate moment like:

CPU1 - write at offset 1k                       CPU2 - read from offset 0
iomap_dio_rw(..., IOMAP_DIO_FORCE_WAIT);
                                                ext4_readpage();
ext4_handle_inode_extension()

the read will zero out tail of the page as it still sees smaller inode
size and thus page cache becomes inconsistent with on-disk contents with
all the consequences.

Fix the problem by moving inode size update into end_io handler which
gets called before the page cache is invalidated.

Reported-by: Eric Whitney <enwlinux@gmail.com>
Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/file.c | 185 ++++++++++++++++++++++++++-----------------------
 1 file changed, 99 insertions(+), 86 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2505313d96b0..ec59ea078f53 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -280,11 +280,67 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	return ret;
 }
 
+static int ext4_prepare_dio_extend(struct inode *inode)
+{
+	handle_t *handle;
+	int ret;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ext4_fc_start_update(inode);
+	ret = ext4_orphan_add(handle, inode);
+	ext4_fc_stop_update(inode);
+	if (ret) {
+		ext4_journal_stop(handle);
+		return ret;
+	}
+	ext4_journal_stop(handle);
+	return 0;
+}
+
+static void ext4_cleanup_dio_extend(struct inode *inode, loff_t offset,
+				    ssize_t written, size_t count)
+{
+	handle_t *handle;
+	u8 blkbits = inode->i_blkbits;
+	ext4_lblk_t written_blk, end_blk;
+
+	written_blk = ALIGN(offset + written, 1 << blkbits);
+	end_blk = ALIGN(offset + count, 1 << blkbits);
+	if (written_blk < end_blk && ext4_can_truncate(inode)) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If the truncate operation failed early, then the inode may
+		 * still be on the orphan list. In that case, we need to try
+		 * remove the inode from the in-memory linked list.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+		return;
+	}
+
+	/*
+	 * We need to ensure that the inode is removed from the orphan
+	 * list if it has been added prematurely, due to writeback of
+	 * delalloc blocks.
+	 */
+	if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ext4_orphan_del(NULL, inode);
+			return;
+		}
+		ext4_orphan_del(handle, inode);
+		ext4_journal_stop(handle);
+	}
+}
+
 static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
 					   ssize_t written, size_t count)
 {
 	handle_t *handle;
-	bool truncate = false;
 	u8 blkbits = inode->i_blkbits;
 	ext4_lblk_t written_blk, end_blk;
 	int ret;
@@ -298,73 +354,31 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
 	 * as much as we intended.
 	 */
 	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
-	if (offset + count <= EXT4_I(inode)->i_disksize) {
-		/*
-		 * We need to ensure that the inode is removed from the orphan
-		 * list if it has been added prematurely, due to writeback of
-		 * delalloc blocks.
-		 */
-		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
-			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
-			if (IS_ERR(handle)) {
-				ext4_orphan_del(NULL, inode);
-				return PTR_ERR(handle);
-			}
-
-			ext4_orphan_del(handle, inode);
-			ext4_journal_stop(handle);
-		}
-
+	if (offset + count <= EXT4_I(inode)->i_disksize)
 		return written;
-	}
-
-	if (written < 0)
-		goto truncate;
 
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-	if (IS_ERR(handle)) {
-		written = PTR_ERR(handle);
-		goto truncate;
-	}
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
 	if (ext4_update_inode_size(inode, offset + written)) {
 		ret = ext4_mark_inode_dirty(handle, inode);
 		if (unlikely(ret)) {
-			written = ret;
 			ext4_journal_stop(handle);
-			goto truncate;
+			return ret;
 		}
 	}
 
 	/*
-	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 * If there cannot be allocated but unused blocks beyond EOF, remove
+	 * the inode from the orphan list as we are done with it.
 	 */
 	written_blk = ALIGN(offset + written, 1 << blkbits);
 	end_blk = ALIGN(offset + count, 1 << blkbits);
-	if (written_blk < end_blk && ext4_can_truncate(inode))
-		truncate = true;
-
-	/*
-	 * Remove the inode from the orphan list if it has been extended and
-	 * everything went OK.
-	 */
-	if (!truncate && inode->i_nlink)
+	if (written_blk == end_blk && inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 	ext4_journal_stop(handle);
 
-	if (truncate) {
-truncate:
-		ext4_truncate_failed_write(inode);
-		/*
-		 * If the truncate operation failed early, then the inode may
-		 * still be on the orphan list. In that case, we need to try
-		 * remove the inode from the in-memory linked list.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-
 	return written;
 }
 
@@ -385,10 +399,28 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
 	return 0;
 }
 
+static int ext4_dio_extending_write_end_io(struct kiocb *iocb, ssize_t size,
+					   ssize_t orig_size, int error,
+					   unsigned int flags)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	int ret;
+
+	ret = ext4_dio_write_end_io(iocb, size, orig_size, error, flags);
+	if (ret < 0)
+		return ret;
+	return ext4_handle_inode_extension(inode, iocb->ki_pos, size,
+					   orig_size);
+}
+
 static const struct iomap_dio_ops ext4_dio_write_ops = {
 	.end_io = ext4_dio_write_end_io,
 };
 
+static const struct iomap_dio_ops ext4_dio_extending_write_ops = {
+	.end_io = ext4_dio_extending_write_end_io,
+};
+
 /*
  * The intention here is to start with shared lock acquired then see if any
  * condition requires an exclusive inode lock. If yes, then we restart the
@@ -455,11 +487,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
 static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	ssize_t ret;
-	handle_t *handle;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
+	const struct iomap_dio_ops *dio_ops = &ext4_dio_write_ops;
 	bool extend = false, unaligned_io = false;
 	bool ilock_shared = true;
 
@@ -530,33 +562,21 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		inode_dio_wait(inode);
 
 	if (extend) {
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
+		ret = ext4_prepare_dio_extend(inode);
+		if (ret)
 			goto out;
-		}
-
-		ext4_fc_start_update(inode);
-		ret = ext4_orphan_add(handle, inode);
-		ext4_fc_stop_update(inode);
-		if (ret) {
-			ext4_journal_stop(handle);
-			goto out;
-		}
-
-		ext4_journal_stop(handle);
 	}
 
 	if (ilock_shared)
 		iomap_ops = &ext4_iomap_overwrite_ops;
-	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
-			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
-	if (ret == -ENOTBLK)
-		ret = 0;
-
 	if (extend)
-		ret = ext4_handle_inode_extension(inode, offset, ret, count);
+		dio_ops = &ext4_dio_extending_write_ops;
 
+	ret = iomap_dio_rw(iocb, from, iomap_ops, dio_ops,
+			   (extend || unaligned_io) ? IOMAP_DIO_FORCE_WAIT : 0);
+	if (ret == -ENOTBLK)
+		ret = 0;
+	ext4_cleanup_dio_extend(inode, offset, ret, count);
 out:
 	if (ilock_shared)
 		inode_unlock_shared(inode);
@@ -599,7 +619,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 	size_t count;
 	loff_t offset;
-	handle_t *handle;
 	bool extend = false;
 	struct inode *inode = file_inode(iocb->ki_filp);
 
@@ -618,26 +637,20 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	count = iov_iter_count(from);
 
 	if (offset + count > EXT4_I(inode)->i_disksize) {
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-
-		ret = ext4_orphan_add(handle, inode);
-		if (ret) {
-			ext4_journal_stop(handle);
+		ret = ext4_prepare_dio_extend(inode);
+		if (ret < 0)
 			goto out;
-		}
-
 		extend = true;
-		ext4_journal_stop(handle);
 	}
 
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 
-	if (extend)
-		ret = ext4_handle_inode_extension(inode, offset, ret, count);
+	if (extend) {
+		if (ret > 0)
+			ret = ext4_handle_inode_extension(inode, offset, ret,
+							  count);
+		ext4_cleanup_dio_extend(inode, offset, ret, count);
+	}
 out:
 	inode_unlock(inode);
 	if (ret > 0)
-- 
2.26.2


  parent reply index

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-12 10:23 [PATCH 0/3] ext4: Fix data corruption when extending DIO write races with buffered read Jan Kara
2021-04-12 10:23 ` [PATCH 1/3] iomap: Pass original DIO size to completion handler Jan Kara
2021-04-12 14:07   ` kernel test robot
2021-04-12 14:12   ` kernel test robot
2021-04-12 16:37   ` kernel test robot
2021-04-12 10:23 ` Jan Kara [this message]
2021-04-12 21:50   ` [PATCH 2/3] ext4: Fix occasional generic/418 failure Dave Chinner
2021-04-13  9:11     ` Jan Kara
2021-04-13 22:45       ` Dave Chinner
2021-04-14 11:56         ` Jan Kara
2021-04-12 10:23 ` [PATCH 3/3] ext4: Fix overflow in ext4_iomap_alloc() Jan Kara
2021-04-12 11:30   ` Matthew Wilcox
2021-06-16 12:22   ` Theodore Ts'o

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210412102333.2676-3-jack@suse.cz \
    --to=jack@suse.cz \
    --cc=djwong@kernel.org \
    --cc=enwlinux@gmail.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-ext4 Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-ext4/0 linux-ext4/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-ext4 linux-ext4/ https://lore.kernel.org/linux-ext4 \
		linux-ext4@vger.kernel.org
	public-inbox-index linux-ext4

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-ext4


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git