All of lore.kernel.org
 help / color / mirror / Atom feed
From: Zhang Yi <yi.zhang@huaweicloud.com>
To: linux-ext4@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org, tytso@mit.edu,
	adilger.kernel@dilger.ca, jack@suse.cz, ritesh.list@gmail.com,
	hch@infradead.org, djwong@kernel.org, yi.zhang@huawei.com,
	yi.zhang@huaweicloud.com, chengzhihao1@huawei.com,
	yukuai3@huawei.com
Subject: [RFC PATCH 10/18] ext4: implement buffered write iomap path
Date: Thu, 23 Nov 2023 20:51:12 +0800	[thread overview]
Message-ID: <20231123125121.4064694-11-yi.zhang@huaweicloud.com> (raw)
In-Reply-To: <20231123125121.4064694-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Implement the buffered write paths both with and without the delayed
allocation feature, and also inherit the fallback-to-nodelalloc logic
from the buffer_head path for when free space is about to run out. After
switching to iomap, we support mapping multiple blocks at a time, which
could bring significant performance gains.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 209 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 207 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4c206cf37a49..9229297e1efc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3525,13 +3525,154 @@ const struct iomap_ops ext4_iomap_report_ops = {
 	.iomap_begin = ext4_iomap_begin_report,
 };
 
+/*
+ * ext4_iomap_da_map_blocks - map blocks for a delalloc buffered write
+ * @inode: inode being written
+ * @map:   in: m_lblk/m_len describe the requested range;
+ *         out: mapping result (m_pblk/m_len/m_flags)
+ *
+ * Look the range up in the extent status tree first, falling back to the
+ * on-disk extent/indirect trees; ranges that turn out to be holes get
+ * delayed-allocation reservations instead of real blocks.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_iomap_da_map_blocks(struct inode *inode,
+				    struct ext4_map_blocks *map)
+{
+	struct extent_status es;
+	unsigned int status;
+	ext4_lblk_t next;
+	int mapped_len;
+	int ret = 0;
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
+
+	map->m_flags = 0;
+	ext_debug(inode, "max_blocks %u, logical block %llu\n", map->m_len,
+		  (unsigned long long)map->m_lblk);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+		int es_len = es.es_len - (map->m_lblk - es.es_lblk);
+
+		/* Clamp the request to the cached extent we found. */
+		map->m_len = min_t(unsigned int, map->m_len, es_len);
+		if (ext4_es_is_delonly(&es)) {
+			/* Already has a delalloc reservation, nothing to do. */
+			map->m_pblk = 0;
+			map->m_flags |= EXT4_MAP_DELAYED;
+			return 0;
+		}
+		if (ext4_es_is_hole(&es)) {
+			/* Cached hole: reserve delayed blocks over it. */
+			down_read(&EXT4_I(inode)->i_data_sem);
+			goto add_delayed;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG();
+
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+		/* Already delayed */
+		if (ext4_es_is_delayed(&es))
+			return 0;
+
+		/*
+		 * Allocated but not delayed yet: fall through and re-insert
+		 * the cached range with updated status bits.
+		 */
+		down_read(&EXT4_I(inode)->i_data_sem);
+		goto insert_extent;
+	}
+
+	/*
+	 * Not found cached extents, adjust the length if it has been
+	 * partially allocated.
+	 * NOTE(review): this reads @es after a failed lookup — it relies on
+	 * ext4_es_lookup_extent() returning the next extent on a miss (an
+	 * earlier patch in this series); confirm es is always initialized.
+	 */
+	if (es.es_lblk > map->m_lblk &&
+	    es.es_lblk < map->m_lblk + map->m_len) {
+		next = es.es_lblk;
+		if (ext4_es_is_hole(&es))
+			next = ext4_es_skip_hole_extent(inode, map->m_lblk,
+							map->m_len);
+		map->m_len = next - map->m_lblk;
+	}
+
+	/*
+	 * Try to see if we can get blocks without requesting new file
+	 * system blocks.
+	 */
+	down_read(&EXT4_I(inode)->i_data_sem);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		mapped_len = ext4_ext_map_blocks(NULL, inode, map, 0);
+	else
+		mapped_len = ext4_ind_map_blocks(NULL, inode, map, 0);
+	if (mapped_len < 0) {
+		ret = mapped_len;
+		goto out_unlock;
+	}
+	/* Nothing on disk either: reserve delayed blocks for the hole. */
+	if (mapped_len == 0)
+		goto add_delayed;
+
+	if (unlikely(mapped_len != map->m_len)) {
+		ext4_warning(inode->i_sb,
+			     "ES len assertion failed for inode %lu: "
+			     "retval %d != map->m_len %d",
+			     inode->i_ino, mapped_len, map->m_len);
+		WARN_ON(1);
+	}
+
+insert_extent:
+	/*
+	 * Cache the mapping.  Unwritten extents are also marked delayed —
+	 * presumably so the da path tracks its reservation over them;
+	 * NOTE(review): verify against the rest of this series.
+	 */
+	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+	if (status == EXTENT_STATUS_UNWRITTEN)
+		status |= EXTENT_STATUS_DELAYED;
+	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+			      map->m_pblk, status);
+	goto out_unlock;
+add_delayed:
+	ret = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
+out_unlock:
+	up_read((&EXT4_I(inode)->i_data_sem));
+	return ret;
+}
+
+/*
+ * ext4_iomap_noda_map_blocks - allocate blocks for a nodelalloc
+ * buffered write
+ * @inode: inode being written
+ * @map:   in: m_lblk/m_len describe the requested range;
+ *         out: mapping result
+ *
+ * Start a journal handle and allocate blocks immediately.  On success
+ * the handle is intentionally left running; it is picked up via
+ * ext4_journal_current_handle() and stopped in
+ * ext4_iomap_buffered_write_end().  Returns 0 or a negative errno.
+ */
+static int ext4_iomap_noda_map_blocks(struct inode *inode,
+				      struct ext4_map_blocks *map)
+{
+	handle_t *handle;
+	int ret, needed_blocks;
+	int flags;
+
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason.
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	/* With dioread_nolock, allocate as unwritten extents instead. */
+	if (ext4_should_dioread_nolock(inode))
+		flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+	else
+		flags = EXT4_GET_BLOCKS_CREATE;
+
+	ret = ext4_map_blocks(handle, inode, map, flags);
+	if (ret < 0) {
+		/* Only the error path stops the handle here. */
+		ext4_journal_stop(handle);
+		return ret;
+	}
+
+	return 0;
+}
+
+#define IOMAP_F_EXT4_NONDELALLOC IOMAP_F_PRIVATE
+
 static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
 				loff_t length, unsigned int flags,
 				struct iomap *iomap, struct iomap *srcmap)
 {
-	int ret;
+	int ret, retries = 0;
 	struct ext4_map_blocks map;
 	u8 blkbits = inode->i_blkbits;
+	bool no_delalloc = false;
+
+	if ((flags & IOMAP_WRITE) &&
+	    unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
 
 	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
 		return -EINVAL;
@@ -3539,6 +3680,7 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
 
+retry:
 	/*
 	 * Calculate the first and last logical blocks respectively.
 	 */
@@ -3546,14 +3688,77 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
 	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
 			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (flags & IOMAP_WRITE) {
+		if (test_opt(inode->i_sb, DELALLOC) &&
+		    !ext4_nonda_switch(inode->i_sb)) {
+			ret = ext4_iomap_da_map_blocks(inode, &map);
+		} else {
+			ret = ext4_iomap_noda_map_blocks(inode, &map);
+			no_delalloc = true;
+		}
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+	} else {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	}
 	if (ret < 0)
 		return ret;
 
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	if (no_delalloc)
+		iomap->flags |= IOMAP_F_EXT4_NONDELALLOC;
+
 	return 0;
 }
 
+/*
+ * ext4_iomap_buffered_write_end - finish a buffered write
+ *
+ * Only nodelalloc writes (tagged IOMAP_F_EXT4_NONDELALLOC in
+ * ->iomap_begin) need work here: stop the journal handle started in
+ * ext4_iomap_noda_map_blocks(), push i_disksize forward if the size
+ * changed, and truncate blocks allocated beyond i_size when fewer
+ * bytes were copied than were allocated.
+ */
+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+					 loff_t length, ssize_t written,
+					 unsigned flags, struct iomap *iomap)
+{
+	handle_t *handle;
+	int ret = 0, ret2;
+
+	/* Reads and delalloc writes have nothing to clean up. */
+	if (!(flags & IOMAP_WRITE))
+		return 0;
+	if (!(iomap->flags & IOMAP_F_EXT4_NONDELALLOC))
+		return 0;
+
+	/* Handle left running by ext4_iomap_noda_map_blocks(). */
+	handle = ext4_journal_current_handle();
+	if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+		ext4_update_i_disksize(inode, inode->i_size);
+		ret = ext4_mark_inode_dirty(handle, inode);
+	}
+
+	/*
+	 * If we have allocated more blocks and copied less.
+	 * We will have blocks allocated outside inode->i_size,
+	 * so truncate them.
+	 */
+	if (offset + length > inode->i_size)
+		ext4_orphan_add(handle, inode);
+
+	ret2 = ext4_journal_stop(handle);
+	ret = ret ? ret : ret2;
+
+	if (offset + length > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+	return ret;
+}
+
+/* Buffered write path: ->iomap_begin maps/allocates, ->iomap_end finishes nodelalloc writes. */
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_io_begin,
+	.iomap_end = ext4_iomap_buffered_write_end,
+};
+
 const struct iomap_ops ext4_iomap_read_ops = {
 	.iomap_begin = ext4_iomap_buffered_io_begin,
 };
-- 
2.39.2


  parent reply	other threads:[~2023-11-23 12:52 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-23 12:51 [RFC PATCH 00/18] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 01/18] ext4: introduce ext4_es_skip_hole_extent() to skip hole extents Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 02/18] ext4: make ext4_es_lookup_extent() return the next extent if not found Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 03/18] ext4: correct the hole length returned by ext4_map_blocks() Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 04/18] ext4: add a hole extent entry in cache after punch Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 05/18] ext4: make ext4_map_blocks() distinguish delayed only mapping Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 06/18] ext4: make ext4_set_iomap() recognize IOMAP_DELALLOC mapping type Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 07/18] ext4: allow reserving multi-delayed blocks Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 08/18] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 09/18] ext4: implement buffered read iomap path Zhang Yi
2023-11-23 12:51 ` Zhang Yi [this message]
2023-11-23 12:51 ` [RFC PATCH 11/18] iomap: add a fs private parameter to iomap_ioend Zhang Yi
2023-11-23 15:36   ` Christoph Hellwig
2023-11-24  1:36     ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 12/18] iomap: don't increase i_size if it's not a write operation Zhang Yi
2023-11-23 15:34   ` Christoph Hellwig
2023-11-24  1:41     ` Zhang Yi
2023-11-30 12:26     ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 13/18] ext4: implement writeback iomap path Zhang Yi
2023-11-24  7:56   ` [PATCH RFC " kernel test robot
2023-11-24 15:41   ` kernel test robot
2023-11-23 12:51 ` [RFC PATCH 14/18] ext4: implement zero_range " Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 15/18] ext4: writeback partial blocks before zero range Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 16/18] ext4: implement mmap iomap path Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 17/18] ext4: partial enable iomap for regular file's buffered IO path Zhang Yi
2023-11-24 13:57   ` Zhang Yi
2023-11-23 12:51 ` Zhang Yi
2023-11-23 12:51 ` [RFC PATCH 18/18] ext4: enable large folio for regular file which has been switched to use iomap Zhang Yi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231123125121.4064694-11-yi.zhang@huaweicloud.com \
    --to=yi.zhang@huaweicloud.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=chengzhihao1@huawei.com \
    --cc=djwong@kernel.org \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=ritesh.list@gmail.com \
    --cc=tytso@mit.edu \
    --cc=yi.zhang@huawei.com \
    --cc=yukuai3@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.