* [PATCH v4 01/34] ext4: factor out a common helper to query extent map
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 02/34] ext4: check the extent status again before inserting delalloc block Zhang Yi
` (28 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Factor out a new common helper, ext4_map_query_blocks(), from
ext4_da_map_blocks(). It queries and returns the extent map status on
the inode's extent path. No logic changes.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 57 +++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 537803250ca9..6a41172c06e1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ unsigned int status;
+ int retval;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ else
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
+
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ return retval;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -1744,33 +1773,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode))
retval = 0;
- else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
- if (retval < 0) {
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
+ retval = ext4_map_query_blocks(NULL, inode, map);
up_read(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval;
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 02/34] ext4: check the extent status again before inserting delalloc block
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
2024-04-10 13:27 ` [PATCH v4 01/34] ext4: factor out a common helper to query extent map Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 03/34] ext4: trim delalloc extent Zhang Yi
` (27 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Now we look up the extent status entry without holding i_data_sem before
inserting a delalloc block. This works fine in the buffered write path
because it holds i_rwsem and the folio lock, and the mmap path holds the
folio lock, so the extent found locklessly can't be modified concurrently.
But it can race with fallocate, since fallocate allocates blocks without
holding i_rwsem and the folio lock.
ext4_page_mkwrite() ext4_fallocate()
block_page_mkwrite()
ext4_da_map_blocks()
//find hole in extent status tree
ext4_alloc_file_blocks()
ext4_map_blocks()
//allocate block and unwritten extent
ext4_insert_delayed_block()
ext4_da_reserve_space()
//reserve one more block
ext4_es_insert_delayed_block()
//drop unwritten extent and add delayed extent by mistake
Then the delalloc extent is wrong until writeback, and the extra
reserved block can never be released, which triggers the warning below:
EXT4-fs (pmem2): Inode 13 (00000000bbbd4d23): i_reserved_data_blocks(1) not cleared!
Holding i_data_sem in write mode directly can fix the problem, but it's
expensive. We should keep the lockless check and check the extent again
once we need to add a new delalloc block.
Cc: stable@vger.kernel.org
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6a41172c06e1..118b0497a954 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1737,6 +1737,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
if (ext4_es_is_hole(&es))
goto add_delayed;
+found:
/*
* Delayed extent could be allocated by fallocate.
* So we need to check it.
@@ -1781,6 +1782,24 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
+ /*
+ * Lookup extents tree again under i_data_sem, make sure this
+ * inserting delalloc range haven't been delayed or allocated
+ * whitout holding i_rwsem and folio lock.
+ */
+ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ if (!ext4_es_is_hole(&es)) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto found;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ retval = ext4_map_query_blocks(NULL, inode, map);
+ if (retval) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return retval;
+ }
+ }
+
retval = ext4_insert_delayed_block(inode, map->m_lblk);
up_write(&EXT4_I(inode)->i_data_sem);
if (retval)
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 03/34] ext4: trim delalloc extent
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
2024-04-10 13:27 ` [PATCH v4 01/34] ext4: factor out a common helper to query extent map Zhang Yi
2024-04-10 13:27 ` [PATCH v4 02/34] ext4: check the extent status again before inserting delalloc block Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 04/34] ext4: drop iblock parameter Zhang Yi
` (26 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
The cached delalloc or hole extent should be trimmed to map->m_len
if we map delalloc blocks in ext4_da_map_blocks(). But it doesn't
trigger any issue now because map->m_len is always set to one and we
always insert one delayed block at a time. Fix this by trimming the
extent once we get one from the cached extent tree, preparing for
mapping an extent with multiple delalloc blocks.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 118b0497a954..e4043ddb07a5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1734,6 +1734,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+
if (ext4_es_is_hole(&es))
goto add_delayed;
@@ -1750,10 +1755,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
}
map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
- retval = es.es_len - (iblock - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1788,6 +1789,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* whitout holding i_rwsem and folio lock.
*/
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+
if (!ext4_es_is_hole(&es)) {
up_write(&EXT4_I(inode)->i_data_sem);
goto found;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 04/34] ext4: drop iblock parameter
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (2 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 03/34] ext4: trim delalloc extent Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 05/34] ext4: make ext4_es_insert_delayed_block() insert multi-blocks Zhang Yi
` (25 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
The start block of the delalloc extent to be inserted is equal to
map->m_lblk, so just drop the duplicate iblock input parameter.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e4043ddb07a5..cccc16506f5f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1712,8 +1712,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* time. This function looks up the requested blocks and sets the
* buffer delay bit under the protection of i_data_sem.
*/
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
struct buffer_head *bh)
{
struct extent_status es;
@@ -1733,8 +1732,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- retval = es.es_len - (iblock - es.es_lblk);
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
map->m_len = retval;
@@ -1754,7 +1753,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
return 0;
}
- map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1788,8 +1787,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* inserting delalloc range haven't been delayed or allocated
* whitout holding i_rwsem and folio lock.
*/
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- retval = es.es_len - (iblock - es.es_lblk);
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
map->m_len = retval;
@@ -1846,7 +1845,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+ ret = ext4_da_map_blocks(inode, &map, bh);
if (ret <= 0)
return ret;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 05/34] ext4: make ext4_es_insert_delayed_block() insert multi-blocks
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (3 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 04/34] ext4: drop iblock parameter Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 06/34] ext4: make ext4_da_reserve_space() reserve multi-clusters Zhang Yi
` (24 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Rename ext4_es_insert_delayed_block() to ext4_es_insert_delayed_extent()
and pass a length parameter so that it can insert multiple delalloc
blocks at a time. For the bigalloc case, expand the allocated parameter
into lclu_allocated and end_allocated. lclu_allocated indicates the
allocation state of the cluster containing lblk, end_allocated indicates
the allocation state of the cluster containing the end block, and the
middle clusters must be unallocated.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 63 ++++++++++++++++++++++++-------------
fs/ext4/extents_status.h | 5 +--
fs/ext4/inode.c | 2 +-
include/trace/events/ext4.h | 16 +++++-----
4 files changed, 55 insertions(+), 31 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 4a00e2f019d9..2320b0d71001 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -2052,34 +2052,42 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the block
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
- struct pending_reservation *pr = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2088,11 +2096,15 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && allocated && !pr)
- pr = __alloc_pending(true);
+ if (err1 || err2 || err3) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2112,13 +2124,22 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
es2 = NULL;
}
- if (allocated) {
- err3 = __insert_pending(inode, lblk, &pr);
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
if (err3 != 0)
goto error;
- if (pr) {
- __free_pending(pr);
- pr = NULL;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 != 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
}
}
error:
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d9847a4a25db..3c8e2edee5d5 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cccc16506f5f..d37233e2ed0b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1702,7 +1702,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
}
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, 1, allocated, false);
return 0;
}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index a697f4b77162..6b41ac61310f 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2478,11 +2478,11 @@ TRACE_EVENT(ext4_es_shrink,
__entry->scan_time, __entry->nr_skipped, __entry->retried)
);
-TRACE_EVENT(ext4_es_insert_delayed_block,
+TRACE_EVENT(ext4_es_insert_delayed_extent,
TP_PROTO(struct inode *inode, struct extent_status *es,
- bool allocated),
+ bool lclu_allocated, bool end_allocated),
- TP_ARGS(inode, es, allocated),
+ TP_ARGS(inode, es, lclu_allocated, end_allocated),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -2491,7 +2491,8 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
__field( ext4_lblk_t, len )
__field( ext4_fsblk_t, pblk )
__field( char, status )
- __field( bool, allocated )
+ __field( bool, lclu_allocated )
+ __field( bool, end_allocated )
),
TP_fast_assign(
@@ -2501,16 +2502,17 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
__entry->len = es->es_len;
__entry->pblk = ext4_es_show_pblock(es);
__entry->status = ext4_es_status(es);
- __entry->allocated = allocated;
+ __entry->lclu_allocated = lclu_allocated;
+ __entry->end_allocated = end_allocated;
),
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
- "allocated %d",
+ "allocated %d %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->lblk, __entry->len,
__entry->pblk, show_extent_status(__entry->status),
- __entry->allocated)
+ __entry->lclu_allocated, __entry->end_allocated)
);
/* fsmap traces */
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 06/34] ext4: make ext4_da_reserve_space() reserve multi-clusters
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (4 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 05/34] ext4: make ext4_es_insert_delayed_block() insert multi-blocks Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 07/34] ext4: factor out check for whether a cluster is allocated Zhang Yi
` (23 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Add an 'nr_resv' parameter to ext4_da_reserve_space(), which indicates
the number of clusters to reserve, so that it can reserve multiple
clusters at a time.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 18 +++++++++---------
include/trace/events/ext4.h | 10 ++++++----
2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d37233e2ed0b..1180a9eb4362 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1479,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve space for a single cluster
+ * Reserve space for 'nr_resv' clusters
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1492,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += nr_resv;
+ trace_ext4_da_reserve_space(inode, nr_resv);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1678,7 +1678,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, 1);
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
@@ -1690,7 +1690,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
if (ret < 0)
return ret;
if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, 1);
if (ret != 0) /* ENOSPC */
return ret;
} else {
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 6b41ac61310f..cc5e9b7b2b44 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -1246,14 +1246,15 @@ TRACE_EVENT(ext4_da_update_reserve_space,
);
TRACE_EVENT(ext4_da_reserve_space,
- TP_PROTO(struct inode *inode),
+ TP_PROTO(struct inode *inode, int nr_resv),
- TP_ARGS(inode),
+ TP_ARGS(inode, nr_resv),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u64, i_blocks )
+ __field( int, reserve_blocks )
__field( int, reserved_data_blocks )
__field( __u16, mode )
),
@@ -1262,16 +1263,17 @@ TRACE_EVENT(ext4_da_reserve_space,
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->i_blocks = inode->i_blocks;
+ __entry->reserve_blocks = nr_resv;
__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
__entry->mode = inode->i_mode;
),
- TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
+ TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d"
"reserved_data_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->mode, __entry->i_blocks,
- __entry->reserved_data_blocks)
+ __entry->reserve_blocks, __entry->reserved_data_blocks)
);
TRACE_EVENT(ext4_da_release_space,
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 07/34] ext4: factor out check for whether a cluster is allocated
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (5 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 06/34] ext4: make ext4_da_reserve_space() reserve multi-clusters Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 08/34] ext4: make ext4_insert_delayed_block() insert multi-blocks Zhang Yi
` (22 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Factor out a common helper, ext4_da_check_clu_allocated(), which checks
whether the cluster containing a delalloc block to be added has been
delayed or allocated. No logic changes.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 52 +++++++++++++++++++++++++++++++++----------------
1 file changed, 35 insertions(+), 17 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1180a9eb4362..46c34baa848a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1649,6 +1649,34 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
+/*
+ * Check whether the cluster containing lblk has been delayed or allocated,
+ * if not, it means we should reserve a cluster when add delalloc, return 1,
+ * otherwise return 0 or error code.
+ */
+static int ext4_da_check_clu_allocated(struct inode *inode, ext4_lblk_t lblk,
+ bool *allocated)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ *allocated = false;
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ return 0;
+
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ goto allocated;
+
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return 1;
+allocated:
+ *allocated = true;
+ return 0;
+}
+
/*
* ext4_insert_delayed_block - adds a delayed block to the extents status
* tree, incrementing the reserved cluster/block
@@ -1682,23 +1710,13 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode, 1);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
- }
+ ret = ext4_da_check_clu_allocated(inode, lblk, &allocated);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret != 0) /* ENOSPC */
+ return ret;
}
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 08/34] ext4: make ext4_insert_delayed_block() insert multi-blocks
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (6 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 07/34] ext4: factor out check for whether a cluster is allocated Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [PATCH v4 09/34] ext4: make ext4_da_map_blocks() buffer_head unaware Zhang Yi
` (21 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Rename ext4_insert_delayed_block() to ext4_insert_delayed_blocks() and
pass a length parameter so that it can insert multiple delalloc blocks
at a time. For the non-bigalloc case, just reserve len blocks and insert
the delalloc extent. For the bigalloc case, we can ensure the middle
clusters are not allocated, but we need to check whether the start and
end clusters are delayed/allocated; if not, we should reserve more space
for the start and/or end block(s).
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 51 ++++++++++++++++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 15 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 46c34baa848a..08e2692b7286 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1678,24 +1678,28 @@ static int ext4_da_check_clu_allocated(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
*
* Returns 0 on success, negative error code on failure.
*/
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int ret;
- bool allocated = false;
+ int resv_clu, ret;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t end = lblk + len - 1;
/*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
* written, or unwritten extent in a bigalloc file system, it's
* already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
@@ -1706,21 +1710,38 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode, 1);
+ ret = ext4_da_reserve_space(inode, len);
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
- ret = ext4_da_check_clu_allocated(inode, lblk, &allocated);
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) - 1;
+ if (resv_clu < 0)
+ resv_clu = 0;
+
+ ret = ext4_da_check_clu_allocated(inode, lblk, &lclu_allocated);
if (ret < 0)
return ret;
- if (ret > 0) {
- ret = ext4_da_reserve_space(inode, 1);
+ if (ret > 0)
+ resv_clu++;
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_da_check_clu_allocated(inode, end,
+ &end_allocated);
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ resv_clu++;
+ }
+
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
if (ret != 0) /* ENOSPC */
return ret;
}
}
- ext4_es_insert_delayed_extent(inode, lblk, 1, allocated, false);
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
return 0;
}
@@ -1823,7 +1844,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
}
}
- retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
up_write(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [PATCH v4 09/34] ext4: make ext4_da_map_blocks() buffer_head unaware
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (7 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 08/34] ext4: make ext4_insert_delayed_block() insert multi-blocks Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 10/34] ext4: factor out ext4_map_create_blocks() to allocate new blocks Zhang Yi
` (20 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
After calling ext4_da_map_blocks(), a delalloc extent can be identified
by the EXT4_MAP_DELAYED flag in the map. So factor the buffer_head
related handling out of ext4_da_map_blocks() to make it buffer_head
unaware, turning it into a common helper that can be used for iomap in
the future.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 54 +++++++++++++++++++++++--------------------------
1 file changed, 25 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 08e2692b7286..1731c1d24362 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1749,33 +1749,26 @@ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
* This function is grabs code from the very beginning of
* ext4_map_blocks, but assumes that the caller is from delayed write
* time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * delalloc extent map under the protection of i_data_sem.
*/
-static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
struct extent_status es;
int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
memcpy(&orig_map, map, sizeof(*map));
#endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
map->m_flags = 0;
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- retval = es.es_len - (map->m_lblk - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
if (ext4_es_is_hole(&es))
goto add_delayed;
@@ -1785,10 +1778,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delonly(&es)) {
+ map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
@@ -1803,7 +1794,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
- return retval;
+ return 0;
}
/*
@@ -1817,7 +1808,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
retval = ext4_map_query_blocks(NULL, inode, map);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
- return retval;
+ return retval < 0 ? retval : 0;
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
@@ -1827,10 +1818,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
* whitout holding i_rwsem and folio lock.
*/
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- retval = es.es_len - (map->m_lblk - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
if (!ext4_es_is_hole(&es)) {
up_write(&EXT4_I(inode)->i_data_sem);
@@ -1840,18 +1829,14 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map,
retval = ext4_map_query_blocks(NULL, inode, map);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
- return retval;
+ return retval < 0 ? retval : 0;
}
}
+ map->m_flags |= EXT4_MAP_DELAYED;
retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
up_write(&EXT4_I(inode)->i_data_sem);
- if (retval)
- return retval;
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
return retval;
}
@@ -1871,11 +1856,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
int ret = 0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
map.m_lblk = iblock;
map.m_len = 1;
@@ -1884,10 +1873,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 10/34] ext4: factor out ext4_map_create_blocks() to allocate new blocks
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (8 preceding siblings ...)
2024-04-10 13:27 ` [PATCH v4 09/34] ext4: make ext4_da_map_blocks() buffer_head unaware Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 11/34] ext4: optimize the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag set Zhang Yi
` (19 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Factor out a common helper, ext4_map_create_blocks(), from
ext4_map_blocks(). It allocates new blocks and creates a new extent. No
logic changes.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 157 +++++++++++++++++++++++++-----------------------
1 file changed, 81 insertions(+), 76 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1731c1d24362..10a256cfcaa1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -482,6 +482,86 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
return retval;
}
+static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ struct extent_status es;
+ unsigned int status;
+ int err, retval = 0;
+
+ /*
+ * Here we clear m_flags because after allocating an new extent,
+ * it will be set again.
+ */
+ map->m_flags &= ~EXT4_MAP_FLAGS;
+
+ /*
+ * We need to check for EXT4 here because migrate could have
+ * changed the inode type in between.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ /*
+ * We allocated new blocks which will result in i_data's
+ * format changing. Force the migrate to fail by clearing
+ * migrate flags.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode %lu: "
+ "retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ /*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed. We also have to
+ * unmap metadata before zeroing as otherwise writeback can
+ * overwrite zeros with stale data from block device.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
+ err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
+ map->m_len);
+ if (err)
+ return err;
+ }
+
+ /*
+ * If the extent has been zeroed out, we don't need to update
+ * extent status tree.
+ */
+ if (flags & EXT4_GET_BLOCKS_PRE_IO &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_is_written(&es))
+ return retval;
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ !(status & EXTENT_STATUS_WRITTEN) &&
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
+ status |= EXTENT_STATUS_DELAYED;
+
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+
+ return retval;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -630,12 +710,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
- /*
- * Here we clear m_flags because after allocating an new extent,
- * it will be set again.
- */
- map->m_flags &= ~EXT4_MAP_FLAGS;
-
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -643,76 +717,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* with create == 1 flag.
*/
down_write(&EXT4_I(inode)->i_data_sem);
-
- /*
- * We need to check for EXT4 here because migrate
- * could have changed the inode type in between
- */
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags);
-
- if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
- /*
- * We allocated new blocks which will result in
- * i_data's format changing. Force the migrate
- * to fail by clearing migrate flags
- */
- ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
- }
- }
-
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- /*
- * We have to zeroout blocks before inserting them into extent
- * status tree. Otherwise someone could look them up there and
- * use them before they are really zeroed. We also have to
- * unmap metadata before zeroing as otherwise writeback can
- * overwrite zeros with stale data from block device.
- */
- if (flags & EXT4_GET_BLOCKS_ZERO &&
- map->m_flags & EXT4_MAP_MAPPED &&
- map->m_flags & EXT4_MAP_NEW) {
- ret = ext4_issue_zeroout(inode, map->m_lblk,
- map->m_pblk, map->m_len);
- if (ret) {
- retval = ret;
- goto out_sem;
- }
- }
-
- /*
- * If the extent has been zeroed out, we don't need to update
- * extent status tree.
- */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
- if (ext4_es_is_written(&es))
- goto out_sem;
- }
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
-
-out_sem:
+ retval = ext4_map_create_blocks(handle, inode, map, flags);
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 11/34] ext4: optimize the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag set
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (9 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 10/34] ext4: factor out ext4_map_create_blocks() to allocate new blocks Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 12/34] ext4: don't set EXTENT_STATUS_DELAYED on allocated blocks Zhang Yi
` (18 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
The magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag means the caller is on
the delayed allocation writeout path, where the blocks and quotas have
already been checked when the data was copied into the page cache, so
there is no need to check them again when we do the real allocation.
Currently we only set this flag when allocating delayed allocated
clusters. That makes things complicated, because we also have to deal
with allocating non-delayed allocated clusters, e.g. from fallocate: in
that case free space has already been claimed by ext4_mb_new_blocks()
and we shouldn't claim it again. Moving the setting of
EXT4_GET_BLOCKS_DELALLOC_RESERVE to where we actually do the block
allocation simplifies the handling a lot: we always set this flag once
the allocation range covers delalloc blocks, regardless of the
allocation path.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10a256cfcaa1..fd5a27db62c0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -489,6 +489,14 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
unsigned int status;
int err, retval = 0;
+ /*
+ * Passing the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag
+ * indicates that the blocks and quotas have already been
+ * checked when the data was copied into the page cache.
+ */
+ if (map->m_flags & EXT4_MAP_DELAYED)
+ flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
/*
* Here we clear m_flags because after allocating an new extent,
* it will be set again.
@@ -2216,11 +2224,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
* possible.
- *
- * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
- * the blocks in question are delalloc blocks. This indicates
- * that the blocks and quotas has already been checked when
- * the data was copied into the page cache.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
@@ -2228,8 +2231,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & BIT(BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 12/34] ext4: don't set EXTENT_STATUS_DELAYED on allocated blocks
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (10 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 11/34] ext4: optimize the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag set Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 13/34] ext4: let __revise_pending() return newly inserted pendings Zhang Yi
` (17 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Since we always set EXT4_GET_BLOCKS_DELALLOC_RESERVE when allocating
delalloc blocks, no matter whether we are allocating delayed or
non-delayed allocated blocks, there is no need to keep the delayed flag
on an unwritten extent status entry, so just drop it after the blocks
have been allocated.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 9 +--------
fs/ext4/inode.c | 11 -----------
2 files changed, 1 insertion(+), 19 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 2320b0d71001..952a38eaea0f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -867,14 +867,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
return;
BUG_ON(end < lblk);
-
- if ((status & EXTENT_STATUS_DELAYED) &&
- (status & EXTENT_STATUS_WRITTEN)) {
- ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
- " delayed and written which can potentially "
- " cause data loss.", lblk, len);
- WARN_ON(1);
- }
+ WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
newes.es_lblk = lblk;
newes.es_len = len;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fd5a27db62c0..752fc0555dc0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -558,12 +558,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
-
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
@@ -682,11 +676,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
- !(status & EXTENT_STATUS_WRITTEN) &&
- ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
- map->m_lblk + map->m_len - 1))
- status |= EXTENT_STATUS_DELAYED;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 13/34] ext4: let __revise_pending() return newly inserted pendings
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (11 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 12/34] ext4: don't set EXTENT_STATUS_DELAYED on allocated blocks Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 14/34] ext4: count removed reserved blocks for delalloc only extent entry Zhang Yi
` (16 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Let __insert_pending() return 1 after successfully inserting a new
pending cluster, and also let __revise_pending() return the number of
newly inserted pendings.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 28 ++++++++++++++++++----------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 952a38eaea0f..382a96c1bc5c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -885,7 +885,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && revise_pending && !pr)
+ if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
@@ -913,7 +913,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (revise_pending) {
err3 = __revise_pending(inode, lblk, len, &pr);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr) {
__free_pending(pr);
@@ -922,7 +922,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -1931,7 +1931,7 @@ static struct pending_reservation *__get_pending(struct inode *inode,
* @lblk - logical block in the cluster to be added
* @prealloc - preallocated pending entry
*
- * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
@@ -1975,6 +1975,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
+ ret = 1;
out:
return ret;
@@ -2089,7 +2090,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if (err1 || err2 || err3) {
+ if (err1 || err2 || err3 < 0) {
if (lclu_allocated && !pr1)
pr1 = __alloc_pending(true);
if (end_allocated && !pr2)
@@ -2119,7 +2120,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
if (lclu_allocated) {
err3 = __insert_pending(inode, lblk, &pr1);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr1) {
__free_pending(pr1);
@@ -2128,7 +2129,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
}
if (end_allocated) {
err3 = __insert_pending(inode, end, &pr2);
- if (err3 != 0)
+ if (err3 < 0)
goto error;
if (pr2) {
__free_pending(pr2);
@@ -2137,7 +2138,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2 || err3)
+ if (err1 || err2 || err3 < 0)
goto retry;
ext4_es_print_tree(inode);
@@ -2247,7 +2248,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
- * status. Must be called while holding i_es_lock.
+ * status. Must be called while holding i_es_lock. Returns the number of
+ * newly inserted pending clusters, which is 0 when pendings are only
+ * removed, or -ENOMEM on failure.
*/
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len,
@@ -2257,6 +2260,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int pendings = 0;
int ret = 0;
if (len == 0)
@@ -2284,6 +2288,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
@@ -2295,6 +2300,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
@@ -2307,6 +2313,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ret = __insert_pending(inode, first, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, first);
@@ -2318,9 +2325,10 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ret = __insert_pending(inode, last, prealloc);
if (ret < 0)
goto out;
+ pendings += ret;
} else
__remove_pending(inode, last);
}
out:
- return ret;
+ return (ret < 0) ? ret : pendings;
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 14/34] ext4: count removed reserved blocks for delalloc only extent entry
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (12 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 13/34] ext4: let __revise_pending() return newly inserted pendings Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:27 ` [RFC PATCH v4 15/34] ext4: update delalloc data reserve spcae in ext4_es_insert_extent() Zhang Yi
` (15 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Currently __es_remove_extent() only counts reserved clusters if the
removed extent entry is delalloc; it doesn't count reserved blocks if
the bigalloc feature is enabled, so we can't get the number of reserved
blocks. However, that number is useful for distinguishing whether we are
allocating a delalloc range within one cluster, so add a parameter to
count the reserved blocks too.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 64 +++++++++++++++++++++++++---------------
1 file changed, 40 insertions(+), 24 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 382a96c1bc5c..38ec2cc5ae3b 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -141,13 +141,18 @@
* -- Extent-level locking
*/
+struct rsvd_info {
+ int delonly_cluster; /* reserved clusters for delalloc es entry */
+ int delonly_block; /* reserved blocks for delalloc es entry */
+};
+
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved,
+ ext4_lblk_t end, struct rsvd_info *rinfo,
struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1042,7 +1047,8 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
}
struct rsvd_count {
- int ndelonly;
+ int ndelonly_cluster;
+ int ndelonly_block;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -1068,7 +1074,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly = 0;
+ rc->ndelonly_cluster = 0;
+ rc->ndelonly_block = 0;
/*
* for bigalloc, note the first delonly block in the range has not
@@ -1116,11 +1123,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly += (int) len;
+ rc->ndelonly_cluster += (int) len;
+ rc->ndelonly_block = rc->ndelonly_cluster;
return;
}
/* bigalloc */
+ rc->ndelonly_block += (int)len;
i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
end = lblk + (ext4_lblk_t) len - 1;
@@ -1140,7 +1149,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly++;
+ rc->ndelonly_cluster++;
rc->partial = false;
}
@@ -1150,7 +1159,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly++;
+ rc->ndelonly_cluster++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1162,7 +1171,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly += nclu;
+ rc->ndelonly_cluster += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1242,9 +1251,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly++;
+ rc->ndelonly_cluster++;
- if (rc->ndelonly == 0)
+ if (rc->ndelonly_cluster == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
@@ -1261,7 +1270,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
+ rc->ndelonly_cluster--;
left_delonly = true;
break;
}
@@ -1281,7 +1290,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
+ rc->ndelonly_cluster--;
right_delonly = true;
break;
}
@@ -1327,7 +1336,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly--;
+ rc->ndelonly_cluster--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr);
@@ -1338,7 +1347,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly;
+ return rc->ndelonly_cluster;
}
@@ -1348,16 +1357,17 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @inode - file containing range
* @lblk - first block in range
* @end - last block in range
- * @reserved - number of cluster reservations released
+ * @rinfo - reserved information collected, includes number of
+ * block/cluster reservations released
* @prealloc - pre-allocated es to avoid memory allocation failures
*
- * If @reserved is not NULL and delayed allocation is enabled, counts
+ * If @rinfo is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
* enabled cancels pending reservations as needed. Returns 0 on success,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved,
+ ext4_lblk_t end, struct rsvd_info *rinfo,
struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
@@ -1367,11 +1377,15 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
int err = 0;
- bool count_reserved = true;
+ bool count_reserved = false;
struct rsvd_count rc;
- if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
- count_reserved = false;
+ if (rinfo) {
+ rinfo->delonly_cluster = 0;
+ rinfo->delonly_block = 0;
+ if (test_opt(inode->i_sb, DELALLOC))
+ count_reserved = true;
+ }
es = __es_tree_search(&tree->root, lblk);
if (!es)
@@ -1469,8 +1483,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
out_get_reserved:
- if (count_reserved)
- *reserved = get_rsvd(inode, end, es, &rc);
+ if (count_reserved) {
+ rinfo->delonly_cluster = get_rsvd(inode, end, es, &rc);
+ rinfo->delonly_block = rc.ndelonly_block;
+ }
out:
return err;
}
@@ -1489,8 +1505,8 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end;
+ struct rsvd_info rinfo;
int err = 0;
- int reserved = 0;
struct extent_status *es = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
@@ -1515,7 +1531,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ err = __es_remove_extent(inode, lblk, end, &rinfo, es);
/* Free preallocated extent if it didn't get used. */
if (es) {
if (!es->es_len)
@@ -1527,7 +1543,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
goto retry;
ext4_es_print_tree(inode);
- ext4_da_release_space(inode, reserved);
+ ext4_da_release_space(inode, rinfo.delonly_cluster);
return;
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 15/34] ext4: update delalloc data reserve space in ext4_es_insert_extent()
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (13 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 14/34] ext4: count removed reserved blocks for delalloc only extent entry Zhang Yi
@ 2024-04-10 13:27 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 16/34] ext4: drop ext4_es_delayed_clu() Zhang Yi
` (14 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:27 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Currently we update the reserved data space for delalloc after allocating
new blocks in ext4_{ind|ext}_map_blocks(). If the bigalloc feature is
enabled, we also need to query the extents_status tree and calculate the
exact number of reserved clusters. This is complicated, and
ext4_es_insert_extent() appears to be a better place to do this job: it
makes things simpler because __es_remove_extent() can count the delalloc
blocks and __revise_pending() can return the newly added pending count.
One special case that needs attention is quota claiming. When bigalloc
is enabled, if the delayed cluster allocation has been raced by another
non-delayed allocation (from fallocate, filemap, DIO...) which doesn't
overlap the delayed blocks, we cannot claim quota as usual because the
racer has already done it, so we also need to check the counted
reserved blocks.
| one cluster |
-------------------------------------------
| | delayed es |
-------------------------------------------
^ ^
| fallocate | <- don't claim quota
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 37 -------------------------------------
fs/ext4/extents_status.c | 22 +++++++++++++++++++++-
fs/ext4/indirect.c | 7 -------
3 files changed, 21 insertions(+), 45 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e57054bdc5fd..8bc8a519f745 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4355,43 +4355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
goto out;
}
- /*
- * Reduce the reserved cluster count to reflect successful deferred
- * allocation of delayed allocated clusters or direct allocation of
- * clusters discovered to be delayed allocated. Once allocated, a
- * cluster is not included in the reserved count.
- */
- if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /*
- * When allocating delayed allocated clusters, simply
- * reduce the reserved cluster count and claim quota
- */
- ext4_da_update_reserve_space(inode, allocated_clusters,
- 1);
- } else {
- ext4_lblk_t lblk, len;
- unsigned int n;
-
- /*
- * When allocating non-delayed allocated clusters
- * (from fallocate, filemap, DIO, or clusters
- * allocated when delalloc has been disabled by
- * ext4_nonda_switch), reduce the reserved cluster
- * count by the number of allocated clusters that
- * have previously been delayed allocated. Quota
- * has been claimed by ext4_mb_new_blocks() above,
- * so release the quota reservations made for any
- * previously delayed allocated clusters.
- */
- lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
- len = allocated_clusters << sbi->s_cluster_bits;
- n = ext4_es_delayed_clu(inode, lblk, len);
- if (n > 0)
- ext4_da_update_reserve_space(inode, (int) n, 0);
- }
- }
-
/*
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an unwritten extent.
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 38ec2cc5ae3b..75227f151b8f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -856,6 +856,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
+ struct rsvd_info rinfo;
+ int resv_used, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
@@ -894,7 +896,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, &rinfo, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -924,9 +926,27 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
__free_pending(pr);
pr = NULL;
}
+ pending = err3;
}
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ /*
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
+ *
+ * When bigalloc is enabled, allocating non-delayed allocated blocks
+ * which belong to delayed allocated clusters (from fallocate, filemap,
+ * DIO, or clusters allocated when delalloc has been disabled by
+ * ext4_nonda_switch()). Quota has been claimed by ext4_mb_new_blocks(),
+ * so release the quota reservations made for any previously delayed
+ * allocated clusters.
+ */
+ resv_used = rinfo.delonly_cluster + pending;
+ if (resv_used)
+ ext4_da_update_reserve_space(inode, resv_used,
+ rinfo.delonly_block);
if (err1 || err2 || err3 < 0)
goto retry;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index d8ca7f64f952..7404f0935c90 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
- /*
- * Update reserved blocks/metadata blocks after successful block
- * allocation which had been deferred till now.
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
- ext4_da_update_reserve_space(inode, count, 1);
-
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 16/34] ext4: drop ext4_es_delayed_clu()
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (14 preceding siblings ...)
2024-04-10 13:27 ` [RFC PATCH v4 15/34] ext4: update delalloc data reserve space in ext4_es_insert_extent() Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 17/34] ext4: use ext4_map_query_blocks() in ext4_map_blocks() Zhang Yi
` (13 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
ext4_es_delayed_clu() and __es_delayed_clu() are not used, drop them.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 88 ----------------------------------------
fs/ext4/extents_status.h | 2 -
2 files changed, 90 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 75227f151b8f..9cac4ea57b73 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -2182,94 +2182,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
return;
}
-/*
- * __es_delayed_clu - count number of clusters containing blocks that
- * are delayed only
- *
- * @inode - file containing block range
- * @start - logical block defining start of range
- * @end - logical block defining end of range
- *
- * Returns the number of clusters containing only delayed (not delayed
- * and unwritten) blocks in the range specified by @start and @end. Any
- * cluster or part of a cluster within the range and containing a delayed
- * and not unwritten block within the range is counted as a whole cluster.
- */
-static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
-{
- struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
- struct extent_status *es;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct rb_node *node;
- ext4_lblk_t first_lclu, last_lclu;
- unsigned long long last_counted_lclu;
- unsigned int n = 0;
-
- /* guaranteed to be unequal to any ext4_lblk_t value */
- last_counted_lclu = ~0ULL;
-
- es = __es_tree_search(&tree->root, start);
-
- while (es && (es->es_lblk <= end)) {
- if (ext4_es_is_delonly(es)) {
- if (es->es_lblk <= start)
- first_lclu = EXT4_B2C(sbi, start);
- else
- first_lclu = EXT4_B2C(sbi, es->es_lblk);
-
- if (ext4_es_end(es) >= end)
- last_lclu = EXT4_B2C(sbi, end);
- else
- last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
-
- if (first_lclu == last_counted_lclu)
- n += last_lclu - first_lclu;
- else
- n += last_lclu - first_lclu + 1;
- last_counted_lclu = last_lclu;
- }
- node = rb_next(&es->rb_node);
- if (!node)
- break;
- es = rb_entry(node, struct extent_status, rb_node);
- }
-
- return n;
-}
-
-/*
- * ext4_es_delayed_clu - count number of clusters containing blocks that
- * are both delayed and unwritten
- *
- * @inode - file containing block range
- * @lblk - logical block defining start of range
- * @len - number of blocks in range
- *
- * Locking for external use of __es_delayed_clu().
- */
-unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_lblk_t end;
- unsigned int n;
-
- if (len == 0)
- return 0;
-
- end = lblk + len - 1;
- WARN_ON(end < lblk);
-
- read_lock(&ei->i_es_lock);
-
- n = __es_delayed_clu(inode, lblk, end);
-
- read_unlock(&ei->i_es_lock);
-
- return n;
-}
-
/*
* __revise_pending - makes, cancels, or leaves unchanged pending cluster
* reservations for a specified block range depending
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 3c8e2edee5d5..5b49cb3b9aff 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -252,8 +252,6 @@ extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated,
bool end_allocated);
-extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 17/34] ext4: use ext4_map_query_blocks() in ext4_map_blocks()
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (15 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 16/34] ext4: drop ext4_es_delayed_clu() Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 18/34] ext4: drop ext4_es_is_delonly() Zhang Yi
` (12 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
The block map querying logic in ext4_map_blocks() is the same as that in
ext4_map_query_blocks(), so use the helper directly.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 22 +---------------------
1 file changed, 1 insertion(+), 21 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 752fc0555dc0..64bdfa9e06b2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -658,27 +658,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
- } else {
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- }
+ retval = ext4_map_query_blocks(handle, inode, map);
up_read((&EXT4_I(inode)->i_data_sem));
found:
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 18/34] ext4: drop ext4_es_is_delonly()
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (16 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 17/34] ext4: use ext4_map_query_blocks() in ext4_map_blocks() Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 19/34] ext4: drop all delonly descriptions Zhang Yi
` (11 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Since we don't add the delayed flag to unwritten extent status entries,
there is no difference between ext4_es_is_delayed() and
ext4_es_is_delonly(), so just drop ext4_es_is_delonly().
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 18 +++++++++---------
fs/ext4/extents_status.h | 5 -----
fs/ext4/inode.c | 4 ++--
3 files changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9cac4ea57b73..062293e739cc 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -561,8 +561,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1))
return 1;
- /* we need to check delayed extent is without unwritten status */
- if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ /* we need to check delayed extent */
+ if (ext4_es_is_delayed(es1))
return 1;
return 0;
@@ -1137,7 +1137,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
- if (!ext4_es_is_delonly(es))
+ if (!ext4_es_is_delayed(es))
return;
WARN_ON(len <= 0);
@@ -1289,7 +1289,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
+ if (ext4_es_is_delayed(es)) {
rc->ndelonly_cluster--;
left_delonly = true;
break;
@@ -1309,7 +1309,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
+ if (ext4_es_is_delayed(es)) {
rc->ndelonly_cluster--;
right_delonly = true;
break;
@@ -2230,7 +2230,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
@@ -2242,7 +2242,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode,
- &ext4_es_is_delonly,
+ &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
@@ -2255,7 +2255,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
} else {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
ret = __insert_pending(inode, first, prealloc);
@@ -2267,7 +2267,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
- l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last);
if (l_del) {
ret = __insert_pending(inode, last, prealloc);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 5b49cb3b9aff..e484c60e55e3 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -184,11 +184,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
}
-static inline int ext4_es_is_delonly(struct extent_status *es)
-{
- return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
-}
-
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 64bdfa9e06b2..2704dca96ee7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1643,7 +1643,7 @@ static int ext4_da_check_clu_allocated(struct inode *inode, ext4_lblk_t lblk,
int ret;
*allocated = false;
- if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
return 0;
if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
@@ -1760,7 +1760,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delonly(&es)) {
+ if (ext4_es_is_delayed(&es)) {
map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 19/34] ext4: drop all delonly descriptions
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (17 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 18/34] ext4: drop ext4_es_is_delonly() Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 20/34] ext4: use reserved metadata blocks when splitting extent on endio Zhang Yi
` (10 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Drop all delonly descriptions in parameters and comments.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents_status.c | 92 ++++++++++++++++++++--------------------
1 file changed, 45 insertions(+), 47 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 062293e739cc..926669d8eb3e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -142,8 +142,8 @@
*/
struct rsvd_info {
- int delonly_cluster; /* reserved clusters for delalloc es entry */
- int delonly_block; /* reserved blocks for delalloc es entry */
+ int delayed_cluster; /* reserved clusters for delalloc es entry */
+ int delayed_block; /* reserved blocks for delalloc es entry */
};
static struct kmem_cache *ext4_es_cachep;
@@ -943,10 +943,10 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
* so release the quota reservations made for any previously delayed
* allocated clusters.
*/
- resv_used = rinfo.delonly_cluster + pending;
+ resv_used = rinfo.delayed_cluster + pending;
if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
- rinfo.delonly_block);
+ rinfo.delayed_block);
if (err1 || err2 || err3 < 0)
goto retry;
@@ -1067,8 +1067,8 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
}
struct rsvd_count {
- int ndelonly_cluster;
- int ndelonly_block;
+ int ndelayed_cluster;
+ int ndelayed_block;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -1094,11 +1094,11 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly_cluster = 0;
- rc->ndelonly_block = 0;
+ rc->ndelayed_cluster = 0;
+ rc->ndelayed_block = 0;
/*
- * for bigalloc, note the first delonly block in the range has not
+ * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial
* cluster to track
@@ -1118,9 +1118,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * count_rsvd - count the clusters containing delayed and not unwritten
- * (delonly) blocks in a range within an extent and add to
- * the running tally in rsvd_count
+ * count_rsvd - count the clusters containing delayed blocks in a range
+ * within an extent and add to the running tally in rsvd_count
*
* @inode - file containing extent
* @lblk - first block in range
@@ -1143,19 +1142,19 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly_cluster += (int) len;
- rc->ndelonly_block = rc->ndelonly_cluster;
+ rc->ndelayed_cluster += (int) len;
+ rc->ndelayed_block = rc->ndelayed_cluster;
return;
}
/* bigalloc */
- rc->ndelonly_block += (int)len;
+ rc->ndelayed_block += (int)len;
i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
- /* record the first block of the first delonly extent seen */
+ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
@@ -1169,7 +1168,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly_cluster++;
+ rc->ndelayed_cluster++;
rc->partial = false;
}
@@ -1179,7 +1178,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly_cluster++;
+ rc->ndelayed_cluster++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1187,11 +1186,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/*
* if the current cluster starts on a cluster boundary, count the
- * number of whole delonly clusters in the extent
+ * number of whole delayed clusters in the extent
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly_cluster += nclu;
+ rc->ndelayed_cluster += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1251,10 +1250,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data
*
* The number of reservations to be released is equal to the number of
- * clusters containing delayed and not unwritten (delonly) blocks within
- * the range, minus the number of clusters still containing delonly blocks
- * at the ends of the range, and minus the number of pending reservations
- * within the range.
+ * clusters containing delayed blocks within the range, minus the number of
+ * clusters still containing delayed blocks at the ends of the range, and
+ * minus the number of pending reservations within the range.
*/
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es,
@@ -1265,33 +1263,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
- bool left_delonly, right_delonly, count_pending;
+ bool left_delayed, right_delayed, count_pending;
struct extent_status *es;
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly_cluster++;
+ rc->ndelayed_cluster++;
- if (rc->ndelonly_cluster == 0)
+ if (rc->ndelayed_cluster == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/*
- * decrease the delonly count by the number of clusters at the
- * ends of the range that still contain delonly blocks -
+ * decrease the delayed count by the number of clusters at the
+ * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved
*/
- left_delonly = right_delonly = false;
+ left_delayed = right_delayed = false;
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
if (ext4_es_is_delayed(es)) {
- rc->ndelonly_cluster--;
- left_delonly = true;
+ rc->ndelayed_cluster--;
+ left_delayed = true;
break;
}
node = rb_prev(&es->rb_node);
@@ -1299,7 +1297,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break;
es = rb_entry(node, struct extent_status, rb_node);
}
- if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
@@ -1310,8 +1308,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
if (ext4_es_is_delayed(es)) {
- rc->ndelonly_cluster--;
- right_delonly = true;
+ rc->ndelayed_cluster--;
+ right_delayed = true;
break;
}
node = rb_next(&es->rb_node);
@@ -1325,21 +1323,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the
- * original removed range containing delonly blocks are
+ * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation
* should be released with the information available in the
* extents status tree.
*/
if (first_lclu == last_lclu) {
- if (left_delonly | right_delonly)
+ if (left_delayed | right_delayed)
count_pending = false;
else
count_pending = true;
} else {
- if (left_delonly)
+ if (left_delayed)
first_lclu++;
- if (right_delonly)
+ if (right_delayed)
last_lclu--;
if (first_lclu <= last_lclu)
count_pending = true;
@@ -1350,13 +1348,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one
- * delonly block, so the delonly total must be reduced by one
+ * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released
*/
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly_cluster--;
+ rc->ndelayed_cluster--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr);
@@ -1367,7 +1365,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly_cluster;
+ return rc->ndelayed_cluster;
}
@@ -1401,8 +1399,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct rsvd_count rc;
if (rinfo) {
- rinfo->delonly_cluster = 0;
- rinfo->delonly_block = 0;
+ rinfo->delayed_cluster = 0;
+ rinfo->delayed_block = 0;
if (test_opt(inode->i_sb, DELALLOC))
count_reserved = true;
}
@@ -1504,8 +1502,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
out_get_reserved:
if (count_reserved) {
- rinfo->delonly_cluster = get_rsvd(inode, end, es, &rc);
- rinfo->delonly_block = rc.ndelonly_block;
+ rinfo->delayed_cluster = get_rsvd(inode, end, es, &rc);
+ rinfo->delayed_block = rc.ndelayed_block;
}
out:
return err;
@@ -1563,7 +1561,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
goto retry;
ext4_es_print_tree(inode);
- ext4_da_release_space(inode, rinfo.delonly_cluster);
+ ext4_da_release_space(inode, rinfo.delayed_cluster);
return;
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 20/34] ext4: use reserved metadata blocks when splitting extent on endio
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (18 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 19/34] ext4: drop all delonly descriptions Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 21/34] ext4: introduce seq counter for the extent status entry Zhang Yi
` (9 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
ext4 only reserves space for data blocks for delalloc; it doesn't reserve
space for metadata blocks in ext4_da_reserve_space(). Besides, if the
dioread_nolock mount option is enabled, it also doesn't reserve metadata
blocks for the extent status conversion.
In order to prevent data loss caused by failing to allocate metadata blocks
on writeback, we reserve 2% space or 4096 blocks for metadata, and use
EXT4_GET_BLOCKS_PRE_IO to do the potential split in advance. But both of
these methods are just best effort: if the filesystem is really running
out of space, there is no difference between splitting the extent on
writeback and on IO completion, both will lead to data loss.
The best way is to reserve enough space for metadata. Before that, we
can at least make sure that things won't get worse if we postpone
splitting the extent to endio. So let's use the reserved space in endio too.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8bc8a519f745..fcb1916a7c29 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3722,7 +3722,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
(unsigned long long)map->m_lblk, map->m_len);
#endif
err = ext4_split_convert_extents(handle, inode, map, ppath,
- EXT4_GET_BLOCKS_CONVERT);
+ EXT4_GET_BLOCKS_CONVERT |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
if (err < 0)
return err;
path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 21/34] ext4: introduce seq counter for the extent status entry
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (19 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 20/34] ext4: use reserved metadata blocks when splitting extent on endio Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 22/34] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
` (8 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Add a modification counter for the extent status entry, which indicates
the version of the extent status of one inode; increase it whenever an
extent changes. It is a preparation for the conversion of the regular
file's buffered write path from bh to iomap.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 1 +
fs/ext4/extents_status.c | 13 ++++++++++++-
fs/ext4/super.c | 1 +
include/trace/events/ext4.h | 20 ++++++++++++++------
4 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d126654019e..7e27e1e7c579 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1115,6 +1115,7 @@ struct ext4_inode_info {
ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
extents to shrink. Protected by
i_es_lock */
+ unsigned int i_es_seq; /* modify counter for extents */
/* ialloc */
ext4_group_t i_last_alloc_group;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 926669d8eb3e..90b58cf42cdd 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -209,6 +209,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
return es->es_lblk + es->es_len - 1;
}
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ WRITE_ONCE(ei->i_es_seq, READ_ONCE(ei->i_es_seq) + 1);
+}
+
/*
* search through the tree for an delayed extent with a given offset. If
* it can't be found, try to find next extent.
@@ -876,6 +883,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
BUG_ON(end < lblk);
WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
+ ext4_es_inc_seq(inode);
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
@@ -1530,13 +1538,15 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len)
return;
+ ext4_es_inc_seq(inode);
+ trace_ext4_es_remove_extent(inode, lblk, len);
+
end = lblk + len - 1;
BUG_ON(end < lblk);
@@ -2111,6 +2121,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
if (!len)
return;
+ ext4_es_inc_seq(inode);
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 044135796f2b..5fce4d2b3b87 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1421,6 +1421,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_es_all_nr = 0;
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
+ ei->i_es_seq = 0;
ei->i_reserved_data_blocks = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index cc5e9b7b2b44..4d583d0248d9 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2183,6 +2183,7 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
__field( ext4_lblk_t, len )
__field( ext4_fsblk_t, pblk )
__field( char, status )
+ __field( unsigned int, seq )
),
TP_fast_assign(
@@ -2192,13 +2193,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent,
__entry->len = es->es_len;
__entry->pblk = ext4_es_show_pblock(es);
__entry->status = ext4_es_status(es);
+ __entry->seq = EXT4_I(inode)->i_es_seq;
),
- TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
+ TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->lblk, __entry->len,
- __entry->pblk, show_extent_status(__entry->status))
+ __entry->pblk, show_extent_status(__entry->status),
+ __entry->seq)
);
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
@@ -2223,6 +2226,7 @@ TRACE_EVENT(ext4_es_remove_extent,
__field( ino_t, ino )
__field( loff_t, lblk )
__field( loff_t, len )
+ __field( unsigned int, seq )
),
TP_fast_assign(
@@ -2230,12 +2234,13 @@ TRACE_EVENT(ext4_es_remove_extent,
__entry->ino = inode->i_ino;
__entry->lblk = lblk;
__entry->len = len;
+ __entry->seq = EXT4_I(inode)->i_es_seq;
),
- TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
+ TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
- __entry->lblk, __entry->len)
+ __entry->lblk, __entry->len, __entry->seq)
);
TRACE_EVENT(ext4_es_find_extent_range_enter,
@@ -2495,6 +2500,7 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
__field( char, status )
__field( bool, lclu_allocated )
__field( bool, end_allocated )
+ __field( unsigned int, seq )
),
TP_fast_assign(
@@ -2506,15 +2512,17 @@ TRACE_EVENT(ext4_es_insert_delayed_extent,
__entry->status = ext4_es_status(es);
__entry->lclu_allocated = lclu_allocated;
__entry->end_allocated = end_allocated;
+ __entry->seq = EXT4_I(inode)->i_es_seq;
),
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
- "allocated %d %d",
+ "allocated %d %d seq %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->lblk, __entry->len,
__entry->pblk, show_extent_status(__entry->status),
- __entry->lclu_allocated, __entry->end_allocated)
+ __entry->lclu_allocated, __entry->end_allocated,
+ __entry->seq)
);
/* fsmap traces */
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 22/34] ext4: add a new iomap aops for regular file's buffered IO path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (20 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 21/34] ext4: introduce seq counter for the extent status entry Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 23/34] ext4: implement buffered read iomap path Zhang Yi
` (7 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Introduce new iomap address space operations, ext4_iomap_aops, to
support the regular file's buffered IO path. Also add an inode state flag,
EXT4_STATE_BUFFERED_IOMAP; if it is set on an inode, that inode uses the
iomap path instead of the buffer_head path for buffered IO.
Most of the callbacks can use generic implementations; the remaining
read_folio, readahead and writepages will be implemented later.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 1 +
fs/ext4/inode.c | 33 +++++++++++++++++++++++++++++++++
2 files changed, 34 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7e27e1e7c579..05949a8136ae 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1913,6 +1913,7 @@ enum {
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
+ EXT4_STATE_BUFFERED_IOMAP, /* Inode use iomap for buffered IO */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2704dca96ee7..4c1fed516d9e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3523,6 +3523,22 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
+{
+ return 0;
+}
+
+static void ext4_iomap_readahead(struct readahead_control *rac)
+{
+
+}
+
+static int ext4_iomap_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return 0;
+}
+
/*
* For data=journal mode, folio should be marked dirty only when it was
* writeably mapped. When that happens, it was already attached to the
@@ -3612,6 +3628,21 @@ static const struct address_space_operations ext4_da_aops = {
.swap_activate = ext4_iomap_swap_activate,
};
+static const struct address_space_operations ext4_iomap_aops = {
+ .read_folio = ext4_iomap_read_folio,
+ .readahead = ext4_iomap_readahead,
+ .writepages = ext4_iomap_writepages,
+ .dirty_folio = iomap_dirty_folio,
+ .bmap = ext4_bmap,
+ .invalidate_folio = iomap_invalidate_folio,
+ .release_folio = iomap_release_folio,
+ .direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_folio = generic_error_remove_folio,
+ .swap_activate = ext4_iomap_swap_activate,
+};
+
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
@@ -3634,6 +3665,8 @@ void ext4_set_aops(struct inode *inode)
}
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext4_dax_aops;
+ else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ inode->i_mapping->a_ops = &ext4_iomap_aops;
else if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
else
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 23/34] ext4: implement buffered read iomap path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (21 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 22/34] ext4: add a new iomap aops for regular file's buffered IO path Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 24/34] ext4: implement buffered write " Zhang Yi
` (6 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Add ext4_iomap_buffered_io_begin() for the iomap read path; it calls
ext4_map_blocks() to query the map status and calls ext4_set_iomap() to
convert the ext4 map to an iomap.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 36 ++++++++++++++++++++++++++++++++++--
1 file changed, 34 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4c1fed516d9e..20eb772f4f62 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3523,14 +3523,46 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
-static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
+static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int iomap_flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+
+ /* Calculate the first and last logical blocks respectively. */
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
return 0;
}
-static void ext4_iomap_readahead(struct readahead_control *rac)
+const struct iomap_ops ext4_iomap_buffered_read_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+};
+
+static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
{
+ return iomap_read_folio(folio, &ext4_iomap_buffered_read_ops);
+}
+static void ext4_iomap_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &ext4_iomap_buffered_read_ops);
}
static int ext4_iomap_writepages(struct address_space *mapping,
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 24/34] ext4: implement buffered write iomap path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (22 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 23/34] ext4: implement buffered read iomap path Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 25/34] ext4: implement writeback " Zhang Yi
` (5 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Implement the buffered write iomap path: use ext4_da_map_blocks() to map
delalloc extents, and add ext4_iomap_get_blocks() to allocate blocks if
delalloc is disabled or free space is about to run out.
Note that we always allocate unwritten extents for new blocks in the
iomap write path; this means that the allocation type is no longer
controlled by the dioread_nolock mount option. After that, we can
postpone the i_disksize update to the writeback path, and drop the journal
handle in the buffered delalloc write path completely.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 3 +
fs/ext4/file.c | 19 +++++-
fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 183 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05949a8136ae..2bd543c43341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 54d6ff22585c..52f37c49572a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return count;
}
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const struct iomap_ops *iomap_ops;
+
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_file_buffered_write(iocb, from, iomap_ops);
+}
+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ ret = ext4_iomap_buffered_write(iocb, from);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 20eb772f4f62..e825ed16fd60 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
return ret;
}
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+ return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+static const struct iomap_folio_ops ext4_iomap_folio_ops = {
+ .iomap_valid = ext4_iomap_valid,
+};
+
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
loff_t length, unsigned int flags)
@@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
iomap->flags |= IOMAP_F_MERGED;
+ iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
+ iomap->folio_ops = &ext4_iomap_folio_ops;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
-static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+static int ext4_iomap_get_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ handle_t *handle;
+ int ret, needed_blocks;
+
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason.
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ /*
+ * Have to stop journal here since there is a potential deadlock
+ * caused by later balance_dirty_pages(), it might wait on the
+ * ditry pages to be written back, which might start another
+ * handle and wait this handle stop.
+ */
+ ext4_journal_stop(handle);
+
+ return ret;
+}
+
+#define IOMAP_F_EXT4_DELALLOC IOMAP_F_PRIVATE
+
+static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
loff_t length, unsigned int iomap_flags,
- struct iomap *iomap, struct iomap *srcmap)
+ struct iomap *iomap, struct iomap *srcmap,
+ bool delalloc)
{
- int ret;
+ int ret, retries = 0;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
return -EINVAL;
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
-
+retry:
/* Calculate the first and last logical blocks respectively. */
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ if (iomap_flags & IOMAP_WRITE) {
+ if (delalloc)
+ ret = ext4_da_map_blocks(inode, &map);
+ else
+ ret = ext4_iomap_get_blocks(inode, &map);
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
if (ret < 0)
return ret;
ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
+ if (delalloc)
+ iomap->flags |= IOMAP_F_EXT4_DELALLOC;
+
+ return 0;
+}
+
+static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+ iomap, srcmap, true);
+}
+
+/*
+ * Drop the staled delayed allocation range from the write failure,
+ * including both start and end blocks. If not, we could leave a range
+ * of delayed extents covered by a clean folio, it could lead to
+ * inaccurate space reservation.
+ */
+static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+ loff_t length)
+{
+ ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+ DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
return 0;
}
+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ handle_t *handle;
+ loff_t end;
+ int ret = 0, ret2;
+
+ /* delalloc */
+ if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
+ ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
+ offset, length, written, ext4_iomap_punch_delalloc);
+ if (ret)
+ ext4_warning(inode->i_sb,
+ "Failed to clean up delalloc for inode %lu, %d",
+ inode->i_ino, ret);
+ return ret;
+ }
+
+ /* nodelalloc */
+ end = offset + length;
+ if (!(iomap->flags & IOMAP_F_SIZE_CHANGED) && end <= inode->i_size)
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+ ext4_update_i_disksize(inode, inode->i_size);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ }
+
+ /*
+ * If we have allocated more blocks and copied less.
+ * We will have blocks allocated outside inode->i_size,
+ * so truncate them.
+ */
+ if (end > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ret2 = ext4_journal_stop(handle);
+ ret = ret ? : ret2;
+
+ if (end > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return ret;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_io_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_da_write_begin,
+ .iomap_end = ext4_iomap_buffered_write_end,
+};
+
const struct iomap_ops ext4_iomap_buffered_read_ops = {
.iomap_begin = ext4_iomap_buffered_io_begin,
};
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 25/34] ext4: implement writeback iomap path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (23 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 24/34] ext4: implement buffered write " Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 26/34] ext4: implement mmap " Zhang Yi
` (4 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Implement the buffered writeback iomap path, including the map_blocks()
and the prepare_ioend() callbacks in iomap_writeback_ops and the
corresponding end io path. Add ext4_iomap_map_blocks() to query the map
status before writeback, start a journal handle and allocate new blocks if
they have not been allocated yet. Add ext4_iomap_prepare_ioend() to
register the end io handler that converts unwritten extents to mapped
extents.
Note that the current iomap code calls iomap_do_writepage() to write back
dirty folios one by one, but we can't map or allocate block(s) for each
dirty folio one by one because that is expensive if the folio size is
small. In order to reduce the number of block mapping calls, we can
calculate the length carefully through wbc->range_end and map an entire
delayed extent on the first call.
Besides, since we always allocate unwritten extents for the newly
allocated blocks, there are 4 other processes that differ from the
buffer_head writeback path, and which can be simpler.
1. We have to allow splitting extents in endio during the unwritten to
written conversion.
2. We don't need to write back the data before the metadata, there is no
risk of exposing stale data, the data=ordered journal mode becomes
useless. So we don't need to attach data to the jinode, and the
journal thread doesn't need to write data.
3. Since data=ordered is not used, we don't need to reserve journal
credits and use reserved handle for the extent status conversion.
4. We can postpone the i_disksize updating to endio path.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 4 +
fs/ext4/ext4_jbd2.c | 6 ++
fs/ext4/extents.c | 23 +++---
fs/ext4/inode.c | 186 +++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/page-io.c | 107 +++++++++++++++++++++++++
fs/ext4/super.c | 2 +
6 files changed, 318 insertions(+), 10 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2bd543c43341..2ec6c7884e9a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1147,6 +1147,8 @@ struct ext4_inode_info {
*/
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
+ struct list_head i_iomap_ioend_list;
+ struct work_struct i_iomap_ioend_work;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -3755,6 +3757,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 5d8055161acd..2f83cd90e132 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -11,6 +11,12 @@ int ext4_inode_journal_mode(struct inode *inode)
{
if (EXT4_JOURNAL(inode) == NULL)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /*
+ * Ordered mode is no longer needed for the inode that use the
+ * iomap path, always use writeback mode.
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index fcb1916a7c29..9849947cec56 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3708,18 +3708,23 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
- /* If extent is larger than requested it is a clear sign that we still
- * have some extent state machine issues left. So extent_split is still
- * required.
- * TODO: Once all related issues will be fixed this situation should be
- * illegal.
+ /*
+ * For the inodes that use the buffered iomap path need to split
+ * extents in endio, other inodes not.
+ *
+ * TODO: Reserve enough sapce for splitting extents, always split
+ * extents here, and totally remove this warning.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
#ifdef CONFIG_EXT4_DEBUG
- ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
- " len %u; IO logical block %llu, len %u",
- inode->i_ino, (unsigned long long)ee_block, ee_len,
- (unsigned long long)map->m_lblk, map->m_len);
+ if (!ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ ext4_warning(inode->i_sb,
+ "Inode (%ld) finished: extent logical block %llu, "
+ "len %u; IO logical block %llu, len %u",
+ inode->i_ino, (unsigned long long)ee_block,
+ ee_len, (unsigned long long)map->m_lblk,
+ map->m_len);
+ }
#endif
err = ext4_split_convert_extents(handle, inode, map, ppath,
EXT4_GET_BLOCKS_CONVERT |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e825ed16fd60..55a4d293177d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -43,6 +43,7 @@
#include <linux/iversion.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"
@@ -3721,10 +3722,193 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
iomap_readahead(rac, &ext4_iomap_buffered_read_ops);
}
+struct ext4_writeback_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct writeback_control *wbc;
+ unsigned int data_seq;
+};
+
+static int ext4_iomap_map_one_extent(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ handle_t *handle = NULL;
+ int credits, map_flags;
+ int retval;
+
+ credits = ext4_da_writepages_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ map->m_flags = 0;
+ /*
+ * In order to protect from the race of truncate, we have to lookup
+ * extent stats and map blocks under i_data_sem, otherwise the
+ * delalloc extent could be stale.
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (likely(ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es))) {
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ map->m_len = min_t(unsigned int, retval, map->m_len);
+
+ if (likely(ext4_es_is_delayed(&es))) {
+ map->m_flags |= EXT4_MAP_DELAYED;
+ trace_ext4_da_write_pages_extent(inode, map);
+ /*
+ * Call ext4_map_create_blocks() to allocate any delayed
+ * allocation blocks. It is possible that we're going to
+ * need more metadata blocks, however we must not fail
+ * because we're in writeback and there is nothing we
+ * can do so it might result in data loss. So use
+ * reserved blocks to allocate metadata if possible.
+ */
+ map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
+ retval = ext4_map_create_blocks(handle, inode, map,
+ map_flags);
+ }
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk -
+ es.es_lblk;
+ map->m_flags = ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ }
+ } else {
+ retval = ext4_map_query_blocks(handle, inode, map);
+ }
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ return retval < 0 ? retval : 0;
+}
+
+static int ext4_iomap_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset,
+ unsigned int dirty_len)
+{
+ struct ext4_writeback_ctx *ewpc =
+ container_of(wpc, struct ext4_writeback_ctx, ctx);
+ struct super_block *sb = inode->i_sb;
+ struct journal_s *journal = EXT4_SB(sb)->s_journal;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int index = offset >> blkbits;
+ unsigned int end, len;
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ /* Check validity of the cached writeback mapping. */
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length &&
+ ewpc->data_seq == READ_ONCE(ei->i_es_seq))
+ return 0;
+
+ end = min_t(unsigned int,
+ (ewpc->wbc->range_end >> blkbits), (UINT_MAX - 1));
+ len = (end > index + dirty_len) ? end - index + 1 : dirty_len;
+
+retry:
+ map.m_lblk = index;
+ map.m_len = min_t(unsigned int, EXT_UNWRITTEN_MAX_LEN, len);
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The map isn't a delalloc extent, it must be a hole or have
+ * already been allocated.
+ */
+ if (!(map.m_flags & EXT4_MAP_DELAYED))
+ goto out;
+
+ /* Map one delalloc extent. */
+ ret = ext4_iomap_map_one_extent(inode, &map);
+ if (ret < 0) {
+ if (ext4_forced_shutdown(sb))
+ return ret;
+
+ /*
+ * Retry transient ENOSPC errors, if
+ * ext4_count_free_blocks() is non-zero, a commit
+ * should free up blocks.
+ */
+ if (ret == -ENOSPC && ext4_count_free_clusters(sb)) {
+ jbd2_journal_force_commit_nested(journal);
+ goto retry;
+ }
+
+ ext4_msg(sb, KERN_CRIT,
+ "Delayed block allocation failed for "
+ "inode %lu at logical offset %llu with "
+ "max blocks %u with error %d",
+ inode->i_ino, (unsigned long long)map.m_lblk,
+ (unsigned int)map.m_len, -ret);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will "
+ "be lost\n");
+ if (ret == -ENOSPC)
+ ext4_print_free_blocks(inode);
+ return ret;
+ }
+out:
+ ewpc->data_seq = READ_ONCE(ei->i_es_seq);
+ ext4_set_iomap(inode, &wpc->iomap, &map, offset,
+ map.m_len << blkbits, 0);
+ return 0;
+}
+
+static int ext4_iomap_prepare_ioend(struct iomap_ioend *ioend, int status)
+{
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+
+ /* Need to convert unwritten extents when I/Os are completed. */
+ if (ioend->io_type == IOMAP_UNWRITTEN ||
+ ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
+ ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+
+ return status;
+}
+
+static void ext4_iomap_discard_folio(struct folio *folio, loff_t pos)
+{
+ struct inode *inode = folio->mapping->host;
+
+ ext4_iomap_punch_delalloc(inode, pos,
+ folio_pos(folio) + folio_size(folio) - pos);
+}
+
+static const struct iomap_writeback_ops ext4_writeback_ops = {
+ .map_blocks = ext4_iomap_map_blocks,
+ .prepare_ioend = ext4_iomap_prepare_ioend,
+ .discard_folio = ext4_iomap_discard_folio,
+};
+
static int ext4_iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return 0;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ long nr = wbc->nr_to_write;
+ int alloc_ctx, ret;
+ struct ext4_writeback_ctx ewpc = {
+ .wbc = wbc,
+ };
+
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+
+ alloc_ctx = ext4_writepages_down_read(sb);
+ trace_ext4_writepages(inode, wbc);
+ ret = iomap_writepages(mapping, wbc, &ewpc.ctx, &ext4_writeback_ops);
+ trace_ext4_writepages_result(inode, wbc, ret, nr - wbc->nr_to_write);
+ ext4_writepages_up_read(sb, alloc_ctx);
+
+ return ret;
}
/*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 312bc6813357..5ad72f725e0c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -22,6 +22,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
+#include <linux/iomap.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
@@ -565,3 +566,109 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
return 0;
}
+
+static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ loff_t pos = ioend->io_offset;
+ size_t size = ioend->io_size;
+ loff_t new_disksize;
+ handle_t *handle;
+ int credits;
+ int ret, err;
+
+ ret = blk_status_to_errno(ioend->io_bio.bi_status);
+ if (unlikely(ret))
+ goto out;
+
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ credits = ext4_chunk_trans_blocks(inode,
+ EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
+ handle = ext4_journal_start(inode, EXT4_HT_EXT_CONVERT, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_err;
+ }
+
+ if (ioend->io_type == IOMAP_UNWRITTEN) {
+ ret = ext4_convert_unwritten_extents(handle, inode, pos, size);
+ if (ret)
+ goto out_journal;
+ }
+
+ /*
+ * Update on-disk size after IO is completed. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
+ new_disksize = pos + size;
+ if (new_disksize > READ_ONCE(ei->i_disksize)) {
+ down_write(&ei->i_data_sem);
+ new_disksize = min(new_disksize, i_size_read(inode));
+ if (new_disksize > ei->i_disksize)
+ ei->i_disksize = new_disksize;
+ up_write(&ei->i_data_sem);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ EXT4_ERROR_INODE_ERR(inode, -ret,
+ "Failed to mark inode dirty");
+ }
+
+out_journal:
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+out_err:
+ if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to "
+ "written extents or update inode size -- "
+ "potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ }
+out:
+ iomap_finish_ioends(ioend, ret);
+}
+
+/*
+ * Work on buffered iomap completed IO, to convert unwritten extents to
+ * mapped extents
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_iomap_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head ioend_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ iomap_sort_ioends(&ioend_list);
+ while (!list_empty(&ioend_list)) {
+ ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ iomap_ioend_try_merge(ioend, &ioend_list);
+ ext4_iomap_finish_ioend(ioend);
+ }
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+ struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ioend->io_inode->i_sb);
+ unsigned long flags;
+
+ /* Only reserved conversions from writeback should enter here */
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (list_empty(&ei->i_iomap_ioend_list))
+ queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+ list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5fce4d2b3b87..6410918161a0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1431,11 +1431,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+ INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 26/34] ext4: implement mmap iomap path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (24 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 25/34] ext4: implement writeback " Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 27/34] ext4: implement zero_range " Zhang Yi
` (3 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Add ext4_iomap_page_mkwrite() for the mmap iomap path. It dirties the folio
and maps blocks; almost all of the work is already done in
iomap_page_mkwrite(), so call it directly.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55a4d293177d..9d694c780007 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6484,6 +6484,26 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
return !buffer_mapped(bh);
}
+static vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ const struct iomap_ops *iomap_ops;
+
+ /*
+ * ext4_nonda_switch() could write back this folio, so it has to
+ * be called before the folio is locked.
+ *
+ * TODO: drop ext4_nonda_switch() after reserving enough space
+ * for metadata and merge delalloc and nodelalloc operations.
+ */
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_page_mkwrite(vmf, iomap_ops);
+}
+
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6507,6 +6527,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
filemap_invalidate_lock_shared(mapping);
+ if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ ret = ext4_iomap_page_mkwrite(vmf);
+ goto out;
+ }
+
err = ext4_convert_inline_data(inode);
if (err)
goto out_ret;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 27/34] ext4: implement zero_range iomap path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (25 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 26/34] ext4: implement mmap " Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 28/34] ext4: writeback partial blocks before zeroing out range Zhang Yi
` (2 subsequent siblings)
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Add ext4_iomap_zero_range() for the zero_range iomap path. It zeroes out
the mapped blocks; all of the work is done in iomap_zero_range(), so
call it directly.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9d694c780007..5af3b8acf1b9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4144,6 +4144,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
return err;
}
+static int ext4_iomap_zero_range(struct inode *inode,
+ loff_t from, loff_t length)
+{
+ return iomap_zero_range(inode, from, length, NULL,
+ &ext4_iomap_buffered_read_ops);
+}
+
/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
@@ -4169,6 +4176,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
if (IS_DAX(inode)) {
return dax_zero_range(inode, from, length, NULL,
&ext4_iomap_ops);
+ } else if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)) {
+ return ext4_iomap_zero_range(inode, from, length);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 28/34] ext4: writeback partial blocks before zeroing out range
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (26 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 27/34] ext4: implement zero_range " Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 29/34] ext4: fall back to buffer_head path for defrag Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 30/34] ext4: partial enable iomap for regular file's buffered IO path Zhang Yi
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
If we zero partial blocks, iomap_zero_iter() will skip zeroing out an
IOMAP_UNWRITTEN srcmap. This works fine in xfs because that type means the
block is purely unwritten and doesn't contain any delayed data. But it
doesn't work in ext4, because an IOMAP_UNWRITTEN extent may contain delayed
data in ext4. For now it's hard to unify the meaning of this flag, so just
fix it by writing back the partial blocks before zeroing out.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9849947cec56..c4c38a323ff7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4580,6 +4580,15 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (ret)
goto out_mutex;
+ ret = filemap_write_and_wait_range(mapping,
+ round_down(offset, 1 << blkbits), offset);
+ if (ret)
+ goto out_mutex;
+
+ ret = filemap_write_and_wait_range(mapping, offset + len,
+ round_up((offset + len), 1 << blkbits));
+ if (ret)
+ goto out_mutex;
}
/* Zero range excluding the unaligned edges */
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 29/34] ext4: fall back to buffer_head path for defrag
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (27 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 28/34] ext4: writeback partial blocks before zeroing out range Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
2024-04-10 13:28 ` [RFC PATCH v4 30/34] ext4: partial enable iomap for regular file's buffered IO path Zhang Yi
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Online defrag doesn't support the iomap path yet, so we have to fall back
to the buffer_head path for an inode which has been using iomap. Changing
an active inode is dangerous: before we start, we must hold the inode lock
and the mapping->invalidate_lock, write back all dirty folios and
drop the inode's pagecache.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/move_extent.c | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7cd4afa4de1d..3db255385367 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -538,6 +538,34 @@ mext_check_arguments(struct inode *orig_inode,
return 0;
}
+/*
+ * Disable the buffered iomap path for an inode that is about to have its
+ * extents moved, and fall back to the buffer_head path.
+ */
+static int ext4_disable_buffered_iomap_aops(struct inode *inode)
+{
+ int err;
+
+ /*
+ * The buffer_head aops don't know how to handle folios
+ * dirtied by iomap, so before falling back, flush all dirty
+ * folios the inode has.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
+ }
+ truncate_inode_pages(inode->i_mapping, 0);
+
+ ext4_clear_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+ ext4_set_aops(inode);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return 0;
+}
+
/**
* ext4_move_extents - Exchange the specified range of a file
*
@@ -609,6 +637,12 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
inode_dio_wait(orig_inode);
inode_dio_wait(donor_inode);
+ /* Fallback to buffer_head aops for inodes with buffered iomap aops */
+ if (ext4_test_inode_state(orig_inode, EXT4_STATE_BUFFERED_IOMAP))
+ ext4_disable_buffered_iomap_aops(orig_inode);
+ if (ext4_test_inode_state(donor_inode, EXT4_STATE_BUFFERED_IOMAP))
+ ext4_disable_buffered_iomap_aops(donor_inode);
+
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v4 30/34] ext4: partial enable iomap for regular file's buffered IO path
2024-04-10 13:27 [RFC PATCH v4 00/34] ext4: use iomap for regular file's buffered IO path and enable large folio Zhang Yi
` (28 preceding siblings ...)
2024-04-10 13:28 ` [RFC PATCH v4 29/34] ext4: fall back to buffer_head path for defrag Zhang Yi
@ 2024-04-10 13:28 ` Zhang Yi
29 siblings, 0 replies; 33+ messages in thread
From: Zhang Yi @ 2024-04-10 13:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-mm, linux-kernel, tytso, adilger.kernel,
jack, ritesh.list, hch, djwong, willy, zokeefe, yi.zhang,
yi.zhang, chengzhihao1, yukuai3, wangkefeng.wang
From: Zhang Yi <yi.zhang@huawei.com>
Partially enable iomap for the regular file's buffered IO path with the
default mount options. This supports the default filesystem features and
the bigalloc feature, but doesn't support inline data, fs_verity, fs_crypt,
defrag and data=journal mode yet (these will be supported gradually in
the future). ext4 will fall back to the buffer_head path automatically if
these options or features are enabled.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 1 +
fs/ext4/ialloc.c | 3 +++
fs/ext4/inode.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 36 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2ec6c7884e9a..4e7667b21c2f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2972,6 +2972,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+bool ext4_should_use_buffered_iomap(struct inode *inode);
int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e9bbb1da2d0a..956b9d69c559 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,6 +1336,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
}
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5af3b8acf1b9..624eac0cc705 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -770,6 +770,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
if (ext4_has_inline_data(inode))
return -ERANGE;
+ if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)))
+ return -EINVAL;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -2567,6 +2569,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
trace_ext4_writepages(inode, wbc);
+ if (WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP)))
+ return -EINVAL;
+
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -5107,6 +5112,30 @@ static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
return NULL;
}
+bool ext4_should_use_buffered_iomap(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (ext4_has_feature_inline_data(sb))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return false;
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (IS_DAX(inode))
+ return false;
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
+ return false;
+
+ return true;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -5371,6 +5400,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (ret)
goto bad_inode;
+ if (ext4_should_use_buffered_iomap(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
--
2.39.2
^ permalink raw reply related [flat|nested] 33+ messages in thread