All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT()
@ 2020-11-07 15:58 Chunguang Xu
  2020-11-07 15:58 ` [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy() Chunguang Xu
                   ` (7 more replies)
  0 siblings, 8 replies; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

There are currently multiple forms of assertion, such as J_ASSERT().
J_ASEERT() is provided for the jbd module, which is a public module.
Maybe we should use custom ASSERT() like other file systems, such as
xfs, which would be better.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/balloc.c   |  2 +-
 fs/ext4/ext4.h     | 10 ++++++++++
 fs/ext4/fsync.c    |  2 +-
 fs/ext4/indirect.c |  4 ++--
 fs/ext4/inode.c    | 10 +++++-----
 fs/ext4/namei.c    | 12 ++++--------
 fs/ext4/super.c    |  2 +-
 7 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d640b1..2d7f4eb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t start, tmp;
 
-	J_ASSERT_BH(bh, buffer_locked(bh));
+	ext4_assert(buffer_locked(bh));
 
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1b399ca..bd88b4a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -98,6 +98,16 @@
 #define ext_debug(ino, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+#define ext4_assert(assert)						\
+do {									\
+	if (unlikely(!(assert))) {					\
+		printk(KERN_EMERG					\
+		       "Assertion failure in %s() at %s:%d: '%s'\n",	\
+		       __func__, __FILE__, __LINE__, #assert);		\
+		BUG();							\
+	}								\
+} while (0)
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a42ca95..7e74279 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -136,7 +136,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (unlikely(ext4_forced_shutdown(sbi)))
 		return -EIO;
 
-	J_ASSERT(ext4_journal_current_handle() == NULL);
+	ext4_assert(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file_enter(file, datasync);
 
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 05efa682..bffc5e4 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -534,8 +534,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t first_block = 0;
 
 	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
-	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
-	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+	ext4_assert(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+	ext4_assert(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d8385a..67fa932 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -830,8 +830,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
 	int err;
 
-	J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-		 || handle != NULL || create == 0);
+	ext4_assert((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+		    || handle != NULL || create == 0);
 
 	map.m_lblk = block;
 	map.m_len = 1;
@@ -846,9 +846,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	if (unlikely(!bh))
 		return ERR_PTR(-ENOMEM);
 	if (map.m_flags & EXT4_MAP_NEW) {
-		J_ASSERT(create != 0);
-		J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
-			 || (handle != NULL));
+		ext4_assert(create != 0);
+		ext4_assert((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+			    || (handle != NULL));
 
 		/*
 		 * Now that we do not always journal data, we should
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3350926..9177352 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -182,10 +182,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	return bh;
 }
 
-#ifndef assert
-#define assert(test) J_ASSERT(test)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
@@ -849,7 +845,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 					break;
 				}
 			}
-			assert (at == p - 1);
+			ext4_assert(at == p - 1);
 		}
 
 		at = p - 1;
@@ -1265,8 +1261,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
 	struct dx_entry *old = frame->at, *new = old + 1;
 	int count = dx_get_count(entries);
 
-	assert(count < dx_get_limit(entries));
-	assert(old < entries + count);
+	ext4_assert(count < dx_get_limit(entries));
+	ext4_assert(old < entries + count);
 	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
 	dx_set_hash(new, hash);
 	dx_set_block(new, block);
@@ -2961,7 +2957,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * hold i_mutex, or the inode can not be referenced from outside,
 	 * so i_nlink should not be bumped due to race
 	 */
-	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	ext4_assert((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(sbi->s_sbh, "get_write_access");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c3b8645..5006c42 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1251,7 +1251,7 @@ static void ext4_put_super(struct super_block *sb)
 	 * in-memory list had better be clean by this point. */
 	if (!list_empty(&sbi->s_orphan))
 		dump_orphan_list(sb, sbi);
-	J_ASSERT(list_empty(&sbi->s_orphan));
+	ext4_assert(list_empty(&sbi->s_orphan));
 
 	sync_blockdev(sb->s_bdev);
 	invalidate_bdev(sb->s_bdev);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy()
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-03 14:42   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block Chunguang Xu
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

After this patch (163a203), if an abnormal bitmap is detected, we
will mark the group as corrupt, and we will not use this group in
the future. Therefore, it should be meaningless to regenerate the
buddy bitmap of this group, It might be better to delete it.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/mballoc.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 24af9ed..5b74555 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -822,24 +822,6 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 	spin_unlock(&sbi->s_bal_lock);
 }
 
-static void mb_regenerate_buddy(struct ext4_buddy *e4b)
-{
-	int count;
-	int order = 1;
-	void *buddy;
-
-	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
-		ext4_set_bits(buddy, 0, count);
-	}
-	e4b->bd_info->bb_fragments = 0;
-	memset(e4b->bd_info->bb_counters, 0,
-		sizeof(*e4b->bd_info->bb_counters) *
-		(e4b->bd_sb->s_blocksize_bits + 2));
-
-	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
-		e4b->bd_bitmap, e4b->bd_group);
-}
-
 /* The buddy information is attached the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involve
@@ -1512,7 +1494,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 				sb, e4b->bd_group,
 				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
 		}
-		mb_regenerate_buddy(e4b);
 		goto done;
 	}
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
  2020-11-07 15:58 ` [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy() Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-03 14:43   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone Chunguang Xu
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

The code of mb_find_order_for_block is a bit obscure, but we can
simplify it with mb_find_buddy(), make the code more concise.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/mballoc.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b74555..29dfeb04 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1289,22 +1289,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 
 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 {
-	int order = 1;
-	int bb_incr = 1 << (e4b->bd_blkbits - 1);
+	int order = 1, max;
 	void *bb;
 
 	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
 
-	bb = e4b->bd_buddy;
 	while (order <= e4b->bd_blkbits + 1) {
-		block = block >> 1;
-		if (!mb_test_bit(block, bb)) {
+		bb = mb_find_buddy(e4b, order, &max);
+		if (!mb_test_bit(block >> order, bb)) {
 			/* this block is part of buddy of order 'order' */
 			return order;
 		}
-		bb += bb_incr;
-		bb_incr >>= 1;
 		order++;
 	}
 	return 0;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
  2020-11-07 15:58 ` [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy() Chunguang Xu
  2020-11-07 15:58 ` [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-03 15:08   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments Chunguang Xu
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

In order to avoid poor search efficiency of system_zone, the
system only adds metadata of some sparse group to system_zone.
In the meta_bg scenario, the non-sparse group may contain gdt
blocks. Perhaps we should add these blocks to system_zone to
improve fault tolerance without significantly reducing system
performance.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 fs/ext4/block_validity.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e6ca23..37025e3 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -218,6 +218,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
 	int flex_size = ext4_flex_bg_size(sbi);
+	int gd_blks;
 	int ret;
 
 	system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
@@ -226,13 +227,16 @@ int ext4_setup_system_zone(struct super_block *sb)
 
 	for (i=0; i < ngroups; i++) {
 		cond_resched();
-		if (ext4_bg_has_super(sb, i) &&
-		    ((i < 5) || ((i % flex_size) == 0))) {
-			ret = add_system_zone(system_blks,
-					ext4_group_first_block_no(sb, i),
-					ext4_bg_num_gdb(sb, i) + 1, 0);
-			if (ret)
-				goto err;
+		if ((i < 5) || ((i % flex_size) == 0)) {
+			gd_blks = ext4_bg_has_super(sb, i) +
+				ext4_bg_num_gdb(sb, i);
+			if (gd_blks) {
+				ret = add_system_zone(system_blks,
+						ext4_group_first_block_no(sb, i),
+						gd_blks, 0);
+				if (ret)
+					goto err;
+			}
 		}
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		ret = add_system_zone(system_blks,
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
                   ` (2 preceding siblings ...)
  2020-11-07 15:58 ` [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-09 19:11   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH 6/8] ext4: add a helper function to validate metadata block Chunguang Xu
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

Since ext4_data_block_valid() has been renamed to ext4_inode_block_valid(),
the related comments need to be updated.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/block_validity.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 37025e3..07e9dc3 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -206,7 +206,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. That's why we first build the rbtree and then
  * swap it in place.
  */
@@ -262,7 +262,7 @@ int ext4_setup_system_zone(struct super_block *sb)
 
 	/*
 	 * System blks rbtree complete, announce it once to prevent racing
-	 * with ext4_data_block_valid() accessing the rbtree at the same
+	 * with ext4_inode_block_valid() accessing the rbtree at the same
 	 * time.
 	 */
 	rcu_assign_pointer(sbi->s_system_blks, system_blks);
@@ -282,7 +282,7 @@ int ext4_setup_system_zone(struct super_block *sb)
  *
  * The update of system_blks pointer in this function is protected by
  * sb->s_umount semaphore. However we have to be careful as we can be
- * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * racing with ext4_inode_block_valid() calls reading system_blks rbtree
  * protected only by RCU. So we first clear the system_blks pointer and
  * then free the rbtree only after RCU grace period expires.
  */
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 6/8] ext4: add a helper function to validate metadata block
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
                   ` (3 preceding siblings ...)
  2020-11-07 15:58 ` [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-09  4:55   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set() Chunguang Xu
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

There is a need to check whether a block or a segment overlaps
with metadata, since information of system_zone is incomplete,
we need a more accurate function. Now we check whether it
overlaps with block bitmap, inode bitmap, and inode table.
Perhaps it is better to add a check of super_block and block
group descriptor and provide a helper function.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 fs/ext4/ext4.h    |  5 ++++-
 fs/ext4/mballoc.c | 57 +++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bd88b4a..73d59cb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2801,7 +2801,10 @@ extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
 				     unsigned int nr, int *cnt);
 extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
 				  unsigned int nr);
-
+extern int ext4_metadata_block_overlaps(struct super_block *sb,
+					ext4_group_t block_group,
+					ext4_fsblk_t block,
+					unsigned long count);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 29dfeb04..d8704fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5061,6 +5061,49 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 	kmem_cache_free(ext4_free_data_cachep, entry);
 }
 
+/*
+ * Returns 1 if the passed-in block region (block, block+count)
+ * overlaps with some other filesystem metadata blocks. Others,
+ * return 0.
+ */
+int ext4_metadata_block_overlaps(struct super_block *sb,
+				 ext4_group_t block_group,
+				 ext4_fsblk_t block,
+				 unsigned long count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	int gd_first = ext4_group_first_block_no(sb, block_group);
+	int itable, gd_blk;
+	int ret = 0;
+
+	gdp = ext4_get_group_desc(sb, block_group, NULL);
+	// check block bitmap and inode bitmap
+	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, gdp), block, count))
+		ret = 1;
+
+	// check inode table
+	itable = ext4_inode_table(sb, gdp);
+	if (!(block >= itable + sbi->s_itb_per_group ||
+	      block + count - 1  < itable))
+		ret = 1;
+
+	/* check super_block and block group descriptor table, the
+	 * reserved space of the block group descriptor is managed
+	 * by resize_inode, it may not be processed now due to
+	 * performance.
+	 */
+	gd_blk = ext4_bg_has_super(sb, block_group) +
+		ext4_bg_num_gdb(sb, block_group);
+	if (gd_blk) {
+		if (!(block >= gd_first + gd_blk ||
+		      block + count - 1 < gd_first))
+			ret = 1;
+	}
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		      struct ext4_free_data *new_entry)
@@ -5360,13 +5403,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		goto error_return;
 	}
 
-	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
-	    in_range(block, ext4_inode_table(sb, gdp),
-		     sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
-		     sbi->s_itb_per_group)) {
-
+	if (ext4_metadata_block_overlaps(sb, block_group, block, count)) {
 		ext4_error(sb, "Freeing blocks in system zone - "
 			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
@@ -5552,11 +5589,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 		goto error_return;
 	}
 
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
+	if (ext4_metadata_block_overlaps(sb, block_group, block, count)) {
 		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set()
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
                   ` (4 preceding siblings ...)
  2020-11-07 15:58 ` [PATCH 6/8] ext4: add a helper function to validate metadata block Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-09 19:24   ` Theodore Y. Ts'o
  2020-11-07 15:58 ` [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data Chunguang Xu
  2020-12-03 14:38 ` [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Theodore Y. Ts'o
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

Delete invalid code inside ext4_xattr_block_set().

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
 fs/ext4/xattr.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6127e94..4e3b1f8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1927,7 +1927,6 @@ struct ext4_xattr_block_find {
 	} else {
 		/* Allocate a buffer where we construct the new block. */
 		s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
-		/* assert(header == s->base) */
 		error = -ENOMEM;
 		if (s->base == NULL)
 			goto cleanup;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
                   ` (5 preceding siblings ...)
  2020-11-07 15:58 ` [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set() Chunguang Xu
@ 2020-11-07 15:58 ` Chunguang Xu
  2020-12-09 19:29   ` Theodore Y. Ts'o
  2020-12-03 14:38 ` [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Theodore Y. Ts'o
  7 siblings, 1 reply; 27+ messages in thread
From: Chunguang Xu @ 2020-11-07 15:58 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

When freeing metadata, we will create an ext4_free_data and
insert it into the pending free list. After the current
transaction is committed, the object will be freed.

ext4_mb_free_metadata() will check whether the area to be
freed overlaps with the pending free list. If true, return
directly. At this time, ext4_free_data is leaked. Fortunately,
the probability of this problem is relatively small, maybe we
should fix this problem.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8704fe..03b4522 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5146,6 +5146,7 @@ int ext4_metadata_block_overlaps(struct super_block *sb,
 				ext4_group_first_block_no(sb, group) +
 				EXT4_C2B(sbi, cluster),
 				"Block already on to-be-freed list");
+			kmem_cache_free(ext4_free_data_cachep, new_entry);
 			return 0;
 		}
 	}
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT()
  2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
                   ` (6 preceding siblings ...)
  2020-11-07 15:58 ` [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data Chunguang Xu
@ 2020-12-03 14:38 ` Theodore Y. Ts'o
  7 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-03 14:38 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:11PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> There are currently multiple forms of assertion, such as J_ASSERT().
> J_ASEERT() is provided for the jbd module, which is a public module.
> Maybe we should use custom ASSERT() like other file systems, such as
> xfs, which would be better.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>

Thanks, applied --- but I changed ext4_assert() to ASSERT().  All caps
makes it easier to spot the assertion, and since ext4.h is a private
header file, there's no reason to use a "ext4_" or "EXT4_" prefix.

       	     	     	       	  - Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy()
  2020-11-07 15:58 ` [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy() Chunguang Xu
@ 2020-12-03 14:42   ` Theodore Y. Ts'o
  0 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-03 14:42 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:12PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> After this patch (163a203), if an abnormal bitmap is detected, we
> will mark the group as corrupt, and we will not use this group in
> the future. Therefore, it should be meaningless to regenerate the
> buddy bitmap of this group, It might be better to delete it.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>

Thanks, applied.

					- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block
  2020-11-07 15:58 ` [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block Chunguang Xu
@ 2020-12-03 14:43   ` Theodore Y. Ts'o
  0 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-03 14:43 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:13PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> The code of mb_find_order_for_block is a bit obscure, but we can
> simplify it with mb_find_buddy(), make the code more concise.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>

Thanks, applied.

					- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-11-07 15:58 ` [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone Chunguang Xu
@ 2020-12-03 15:08   ` Theodore Y. Ts'o
  2020-12-04  1:26     ` brookxu
  2020-12-04  1:29     ` brookxu
  0 siblings, 2 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-03 15:08 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:14PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> In order to avoid poor search efficiency of system_zone, the
> system only adds metadata of some sparse group to system_zone.
> In the meta_bg scenario, the non-sparse group may contain gdt
> blocks. Perhaps we should add these blocks to system_zone to
> improve fault tolerance without significantly reducing system
> performance.

> @@ -226,13 +227,16 @@ int ext4_setup_system_zone(struct super_block *sb)
>  
>  	for (i=0; i < ngroups; i++) {
>  		cond_resched();
> -		if (ext4_bg_has_super(sb, i) &&
> -		    ((i < 5) || ((i % flex_size) == 0))) {
> -			ret = add_system_zone(system_blks,
> -					ext4_group_first_block_no(sb, i),
> -					ext4_bg_num_gdb(sb, i) + 1, 0);
> -			if (ret)
> -				goto err;
> +		if ((i < 5) || ((i % flex_size) == 0)) {

If we're going to do this, why not just drop the above conditional,
and just always do this logic for all block groups?

> +			gd_blks = ext4_bg_has_super(sb, i) +
> +				ext4_bg_num_gdb(sb, i);
> +			if (gd_blks) {
> +				ret = add_system_zone(system_blks,
> +						ext4_group_first_block_no(sb, i),
> +						gd_blks, 0);
> +				if (ret)
> +					goto err;
> +			}

						- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-03 15:08   ` Theodore Y. Ts'o
@ 2020-12-04  1:26     ` brookxu
  2020-12-09  4:34       ` Theodore Y. Ts'o
  2020-12-04  1:29     ` brookxu
  1 sibling, 1 reply; 27+ messages in thread
From: brookxu @ 2020-12-04  1:26 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4


Theodore Y. Ts'o wrote on 2020/12/3 23:08:
> On Sat, Nov 07, 2020 at 11:58:14PM +0800, Chunguang Xu wrote:
>> From: Chunguang Xu <brookxu@tencent.com>
>>
>> In order to avoid poor search efficiency of system_zone, the
>> system only adds metadata of some sparse group to system_zone.
>> In the meta_bg scenario, the non-sparse group may contain gdt
>> blocks. Perhaps we should add these blocks to system_zone to
>> improve fault tolerance without significantly reducing system
>> performance.

Thanks, in the large-market scenario, if we deal with all groups,
the system_zone will be very large, which may reduce performance.
I think the previous method is good, but it needs to be changed
slightly, so that the fault tolerance in the meta_bg scenario
can be improved without the risk of performance degradation.

>> @@ -226,13 +227,16 @@ int ext4_setup_system_zone(struct super_block *sb)
>>  
>>  	for (i=0; i < ngroups; i++) {
>>  		cond_resched();
>> -		if (ext4_bg_has_super(sb, i) &&
>> -		    ((i < 5) || ((i % flex_size) == 0))) {
>> -			ret = add_system_zone(system_blks,
>> -					ext4_group_first_block_no(sb, i),
>> -					ext4_bg_num_gdb(sb, i) + 1, 0);
>> -			if (ret)
>> -				goto err;
>> +		if ((i < 5) || ((i % flex_size) == 0)) {
> 
> If we're going to do this, why not just drop the above conditional,
> and just always do this logic for all block groups?
> 
>> +			gd_blks = ext4_bg_has_super(sb, i) +
>> +				ext4_bg_num_gdb(sb, i);
>> +			if (gd_blks) {
>> +				ret = add_system_zone(system_blks,
>> +						ext4_group_first_block_no(sb, i),
>> +						gd_blks, 0);
>> +				if (ret)
>> +					goto err;
>> +			}
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-03 15:08   ` Theodore Y. Ts'o
  2020-12-04  1:26     ` brookxu
@ 2020-12-04  1:29     ` brookxu
  1 sibling, 0 replies; 27+ messages in thread
From: brookxu @ 2020-12-04  1:29 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4



Theodore Y. Ts'o wrote on 2020/12/3 23:08:
> On Sat, Nov 07, 2020 at 11:58:14PM +0800, Chunguang Xu wrote:
>> From: Chunguang Xu <brookxu@tencent.com>
>>
>> In order to avoid poor search efficiency of system_zone, the
>> system only adds metadata of some sparse group to system_zone.
>> In the meta_bg scenario, the non-sparse group may contain gdt
>> blocks. Perhaps we should add these blocks to system_zone to
>> improve fault tolerance without significantly reducing system
>> performance.
> 
>> @@ -226,13 +227,16 @@ int ext4_setup_system_zone(struct super_block *sb)
>>  
>>  	for (i=0; i < ngroups; i++) {
>>  		cond_resched();
>> -		if (ext4_bg_has_super(sb, i) &&
>> -		    ((i < 5) || ((i % flex_size) == 0))) {
>> -			ret = add_system_zone(system_blks,
>> -					ext4_group_first_block_no(sb, i),
>> -					ext4_bg_num_gdb(sb, i) + 1, 0);
>> -			if (ret)
>> -				goto err;
>> +		if ((i < 5) || ((i % flex_size) == 0)) {
> 
> If we're going to do this, why not just drop the above conditional,
> and just always do this logic for all block groups?

Thanks, in the large disk scenario, if we deal with all groups, the
system_zone will be very large, which may reduce performance. I think
the previous method is good, but it needs to be changed slightly, so
that the fault tolerance in the meta_bg scenario can be improved
without the risk of performance degradation.

>> +			gd_blks = ext4_bg_has_super(sb, i) +
>> +				ext4_bg_num_gdb(sb, i);
>> +			if (gd_blks) {
>> +				ret = add_system_zone(system_blks,
>> +						ext4_group_first_block_no(sb, i),
>> +						gd_blks, 0);
>> +				if (ret)
>> +					goto err;
>> +			}
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-04  1:26     ` brookxu
@ 2020-12-09  4:34       ` Theodore Y. Ts'o
  2020-12-09 11:48         ` brookxu
  0 siblings, 1 reply; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09  4:34 UTC (permalink / raw)
  To: brookxu; +Cc: adilger.kernel, linux-ext4

On Fri, Dec 04, 2020 at 09:26:49AM +0800, brookxu wrote:
> 
> Theodore Y. Ts'o wrote on 2020/12/3 23:08:
> > On Sat, Nov 07, 2020 at 11:58:14PM +0800, Chunguang Xu wrote:
> >> From: Chunguang Xu <brookxu@tencent.com>
> >>
> >> In order to avoid poor search efficiency of system_zone, the
> >> system only adds metadata of some sparse group to system_zone.
> >> In the meta_bg scenario, the non-sparse group may contain gdt
> >> blocks. Perhaps we should add these blocks to system_zone to
> >> improve fault tolerance without significantly reducing system
> >> performance.
> 
> Thanks, in the large-market scenario, if we deal with all groups,
> the system_zone will be very large, which may reduce performance.
> I think the previous method is good, but it needs to be changed
> slightly, so that the fault tolerance in the meta_bg scenario
> can be improved without the risk of performance degradation.

OK, I see.   But this is not actually reliable:

> >> +		if ((i < 5) || ((i % flex_size) == 0)) {

This only works if the flex_size is less than or equal to 64 (assuming
a 4k blocksize).  That's because on 64-bit file systems, we can fit 64
block group descripters in a 4k block group descriptor block, so
that's the size of the meta_bg.  The default flex_bg size is 16, but
it's quite possible to create a file system via "mke2fs -t ext4 -G
256".  In that case, the flex_size will be 256, and we would not be
including all of the meta_bg groups.  So i % flex_size needs to be
replaced by "i % meta_bg_size", where meta_bg_size would be
initialized to EXT4_DESC_PER_BLOCK(sb).

Does that make sense?

						- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 6/8] ext4: add a helper function to validate metadata block
  2020-11-07 15:58 ` [PATCH 6/8] ext4: add a helper function to validate metadata block Chunguang Xu
@ 2020-12-09  4:55   ` Theodore Y. Ts'o
  2020-12-09 12:12     ` brookxu
  0 siblings, 1 reply; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09  4:55 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:16PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> There is a need to check whether a block or a segment overlaps
> with metadata, since information of system_zone is incomplete,
> we need a more accurate function. Now we check whether it
> overlaps with block bitmap, inode bitmap, and inode table.
> Perhaps it is better to add a check of super_block and block
> group descriptor and provide a helper function.

The original code was valid only for file systems that are not using
flex_bg.  I suspect the Lustre folks who implemented mballoc.c did so
before flex_bg, and fortunately, on flex_bg we the check is simply
going to have more false negaties, but not any false positives, so no
one noticed.

> +/*
> + * Returns 1 if the passed-in block region (block, block+count)
> + * overlaps with some other filesystem metadata blocks. Others,
> + * return 0.
> + */
> +int ext4_metadata_block_overlaps(struct super_block *sb,
> +				 ext4_group_t block_group,
> +				 ext4_fsblk_t block,
> +				 unsigned long count)
> +{
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +	struct ext4_group_desc *gdp;
> +	int gd_first = ext4_group_first_block_no(sb, block_group);
> +	int itable, gd_blk;
> +	int ret = 0;
> +
> +	gdp = ext4_get_group_desc(sb, block_group, NULL);
> +	// check block bitmap and inode bitmap
> +	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
> +	    in_range(ext4_inode_bitmap(sb, gdp), block, count))

We are only checking a single block group descriptor; this is fine if
the allocation bitmaps and inode table are guaranteed to be located in
their own block group.  But this is no longer true when flex_bg is
enabled.

I think what we should do is to rely on the rb tree maintained by
block_validity.c (if the inode number is zero, then the entry refers
to blocks in the "system zone"); that's going to be a much more
complete check.

What do you think?

						- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-09  4:34       ` Theodore Y. Ts'o
@ 2020-12-09 11:48         ` brookxu
  2020-12-09 19:39           ` Theodore Y. Ts'o
  0 siblings, 1 reply; 27+ messages in thread
From: brookxu @ 2020-12-09 11:48 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4



Theodore Y. Ts'o wrote on 2020/12/9 12:34:
> On Fri, Dec 04, 2020 at 09:26:49AM +0800, brookxu wrote:
>>
>> Theodore Y. Ts'o wrote on 2020/12/3 23:08:
>>> On Sat, Nov 07, 2020 at 11:58:14PM +0800, Chunguang Xu wrote:
>>>> From: Chunguang Xu <brookxu@tencent.com>
>>>>
>>>> In order to avoid poor search efficiency of system_zone, the
>>>> system only adds metadata of some sparse group to system_zone.
>>>> In the meta_bg scenario, the non-sparse group may contain gdt
>>>> blocks. Perhaps we should add these blocks to system_zone to
>>>> improve fault tolerance without significantly reducing system
>>>> performance.
>>
>> Thanks, in the large-market scenario, if we deal with all groups,
>> the system_zone will be very large, which may reduce performance.
>> I think the previous method is good, but it needs to be changed
>> slightly, so that the fault tolerance in the meta_bg scenario
>> can be improved without the risk of performance degradation.
> 
> OK, I see.   But this is not actually reliable:
> 
>>>> +		if ((i < 5) || ((i % flex_size) == 0)) {
> 
> This only works if the flex_size is less than or equal to 64 (assuming
> a 4k blocksize).  That's because on 64-bit file systems, we can fit 64
> block group descripters in a 4k block group descriptor block, so
> that's the size of the meta_bg.  The default flex_bg size is 16, but
> it's quite possible to create a file system via "mke2fs -t ext4 -G
> 256".  In that case, the flex_size will be 256, and we would not be
> including all of the meta_bg groups.  So i % flex_size needs to be
> replaced by "i % meta_bg_size", where meta_bg_size would be
> initialized to EXT4_DESC_PER_BLOCK(sb).
> 
> Does that make sense?
Maybe I missed something. If i% meta_bg_size is used instead, if
flex_size <64, then we will miss some flex_bg. There seems to be
a contradiction here. In the scenario where only flex_bg is
enabled, it may not be appropriate to use meta_bg_size. In the
scenario where only meta_bg is enabled, it may not be appropriate
to use flex_size.

As you said before, it maybe better to remove

	if ((i <5) || ((i% flex_size) == 0))

and do it for all groups. 

In this way we won't miss some flex_bg, meta_bg, and sparse_bg.
I tested it on an 80T disk and found that the performance loss
was small:

 unpatched kernel:
 ext4_setup_system_zone() takes 524ms, 
 mount-3137    [006] ....    89.548026: ext4_setup_system_zone: (ext4_setup_system_zone+0x0/0x3f0)
 mount-3137    [006] d...    90.072895: ext4_setup_system_zone_1: (ext4_fill_super+0x2057/0x39b0 <- ext4_setup_system_zone)

 patched kernel:
 ext4_setup_system_zone() takes 552ms, 
 mount-4425    [006] ....   402.555793: ext4_setup_system_zone: (ext4_setup_system_zone+0x0/0x3d0)
 mount-4425    [006] d...   403.107307: ext4_setup_system_zone_1: (ext4_fill_super+0x2057/0x39b0 <- ext4_setup_system_zone)
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 6/8] ext4: add a helper function to validate metadata block
  2020-12-09  4:55   ` Theodore Y. Ts'o
@ 2020-12-09 12:12     ` brookxu
  0 siblings, 0 replies; 27+ messages in thread
From: brookxu @ 2020-12-09 12:12 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4



Theodore Y. Ts'o wrote on 2020/12/9 12:55:
> On Sat, Nov 07, 2020 at 11:58:16PM +0800, Chunguang Xu wrote:
>> From: Chunguang Xu <brookxu@tencent.com>
>>
>> There is a need to check whether a block or a segment overlaps
>> with metadata, since information of system_zone is incomplete,
>> we need a more accurate function. Now we check whether it
>> overlaps with block bitmap, inode bitmap, and inode table.
>> Perhaps it is better to add a check of super_block and block
>> group descriptor and provide a helper function.
> 
> The original code was valid only for file systems that are not using
> flex_bg.  I suspect the Lustre folks who implemented mballoc.c did so
> before flex_bg, and fortunately, on flex_bg we the check is simply
> going to have more false negaties, but not any false positives, so no
> one noticed.
> 
>> +/*
>> + * Returns 1 if the passed-in block region (block, block+count)
>> + * overlaps with some other filesystem metadata blocks. Others,
>> + * return 0.
>> + */
>> +int ext4_metadata_block_overlaps(struct super_block *sb,
>> +				 ext4_group_t block_group,
>> +				 ext4_fsblk_t block,
>> +				 unsigned long count)
>> +{
>> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> +	struct ext4_group_desc *gdp;
>> +	int gd_first = ext4_group_first_block_no(sb, block_group);
>> +	int itable, gd_blk;
>> +	int ret = 0;
>> +
>> +	gdp = ext4_get_group_desc(sb, block_group, NULL);
>> +	// check block bitmap and inode bitmap
>> +	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
>> +	    in_range(ext4_inode_bitmap(sb, gdp), block, count))
> 
> We are only checking a single block group descriptor; this is fine if
> the allocation bitmaps and inode table are guaranteed to be located in
> their own block group.  But this is no longer true when flex_bg is
> enabled.

 Right, the check of bb and ib here is not very correct.

> I think what we should do is to rely on the rb tree maintained by
> block_validity.c (if the inode number is zero, then the entry refers
> to blocks in the "system zone"); that's going to be a much more
> complete check.
> 
> What do you think?

This is a good idea. After we merge ext4: add the gdt block of
meta_bg to system_zone, the metadata information of system_zone
is relatively complete. Using system_zone makes the logic
clearer. However, due to the need for additional tree search,
there is a performance risk. I will try this method later and
test the performance overhead.

> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments
  2020-11-07 15:58 ` [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments Chunguang Xu
@ 2020-12-09 19:11   ` Theodore Y. Ts'o
  0 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09 19:11 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:15PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> Since ext4_data_block_valid() has been renamed to ext4_inode_block_valid(),
> the related comments need to be updated.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>

Thanks, applied.

					- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set()
  2020-11-07 15:58 ` [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set() Chunguang Xu
@ 2020-12-09 19:24   ` Theodore Y. Ts'o
  0 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09 19:24 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:17PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> Delete invalid code inside ext4_xattr_block_set().
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>

Thanks, applied, with a slighly reworded commit description:

    ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()

 	     		     	  	 - Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data
  2020-11-07 15:58 ` [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data Chunguang Xu
@ 2020-12-09 19:29   ` Theodore Y. Ts'o
  0 siblings, 0 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09 19:29 UTC (permalink / raw)
  To: Chunguang Xu; +Cc: adilger.kernel, linux-ext4

On Sat, Nov 07, 2020 at 11:58:18PM +0800, Chunguang Xu wrote:
> From: Chunguang Xu <brookxu@tencent.com>
> 
> When freeing metadata, we will create an ext4_free_data and
> insert it into the pending free list. After the current
> transaction is committed, the object will be freed.
> 
> ext4_mb_free_metadata() will check whether the area to be
> freed overlaps with the pending free list. If true, return
> directly. At this time, ext4_free_data is leaked. Fortunately,
> the probability of this problem is relatively small, maybe we
> should fix this problem.
> 
> Signed-off-by: Chunguang Xu <brookxu@tencent.com>

Thanks, applied.  I added an explanatory note that the leak would only
happen when the file system is corrupted (a block claimed by more than
one inode, with those inodes deleted within a single jbd2 transaction).

    	   	      	     	     	    - Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-09 11:48         ` brookxu
@ 2020-12-09 19:39           ` Theodore Y. Ts'o
  2020-12-10 11:00             ` brookxu
  2020-12-15  1:14             ` brookxu
  0 siblings, 2 replies; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-09 19:39 UTC (permalink / raw)
  To: brookxu; +Cc: adilger.kernel, linux-ext4

On Wed, Dec 09, 2020 at 07:48:09PM +0800, brookxu wrote:
> 
> Maybe I missed something. If i% meta_bg_size is used instead, if
> flex_size <64, then we will miss some flex_bg. There seems to be
> a contradiction here. In the scenario where only flex_bg is
> enabled, it may not be appropriate to use meta_bg_size. In the
> scenario where only meta_bg is enabled, it may not be appropriate
> to use flex_size.
> 
> As you said before, it maybe better to remove
> 
> 	if ((i <5) || ((i% flex_size) == 0))
> 
> and do it for all groups.

I don't think the original (i % flex_size) made any sense in the first
place.

What flex_bg does is that it collects the allocation bitmaps and inode
tables for each block group and locates them within the first block
group in a flex_bg.  It doesn't have anything to do with whether or
not a particular block group has a backup copy of the superblock and
block group descriptor table --- in non-meta_bg file systems and the
meta_bg file systems where the block group is less than
s_first_meta_bg * EXT4_DESC_PER_BLOCK(sb).  And the condition in
question is only about whether or not to add the backup superblock and
backup block group descriptors.  So checking for i % flex_size made no
sense, and I'm not sure that check was there in the first place.

> In this way weh won't miss some flex_bg, meta_bg, and sparse_bg.
> I tested it on an 80T disk and found that the performance loss
> was small:
> 
>  unpatched kernel:
>  ext4_setup_system_zone() takes 524ms, 
> 
>  patched kernel:
>  ext4_setup_system_zone() takes 552ms, 

I don't really care that much about the time it takes to execute
ext4_setup_system_zone().

The really interesting question is how large is the rb_tree
constructed by that function, and what is the percentage increase of
time that the ext4_inode_block_valid() function takes.  (e.g., how
much additional memory is the system_blks tree taking, and how deep is
that tree, since ext4_inode_block_valid() gets called every time we
allocate or free a block, and every time we need to validate an extent
tree node.

Cheers,

						- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-09 19:39           ` Theodore Y. Ts'o
@ 2020-12-10 11:00             ` brookxu
  2020-12-15  1:14             ` brookxu
  1 sibling, 0 replies; 27+ messages in thread
From: brookxu @ 2020-12-10 11:00 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4



Theodore Y. Ts'o wrote on 2020/12/10 3:39:
> On Wed, Dec 09, 2020 at 07:48:09PM +0800, brookxu wrote:
>>
>> Maybe I missed something. If i% meta_bg_size is used instead, if
>> flex_size <64, then we will miss some flex_bg. There seems to be
>> a contradiction here. In the scenario where only flex_bg is
>> enabled, it may not be appropriate to use meta_bg_size. In the
>> scenario where only meta_bg is enabled, it may not be appropriate
>> to use flex_size.
>>
>> As you said before, it maybe better to remove
>>
>> 	if ((i <5) || ((i% flex_size) == 0))
>>
>> and do it for all groups.
> 
> I don't think the original (i % flex_size) made any sense in the first
> place.
> 
> What flex_bg does is that it collects the allocation bitmaps and inode
> tables for each block group and locates them within the first block
> group in a flex_bg.  It doesn't have anything to do with whether or
> not a particular block group has a backup copy of the superblock and
> block group descriptor table --- in non-meta_bg file systems and the
> meta_bg file systems where the block group is less than
> s_first_meta_bg * EXT4_DESC_PER_BLOCK(sb).  And the condition in
> question is only about whether or not to add the backup superblock and
> backup block group descriptors.  So checking for i % flex_size made no
> sense, and I'm not sure that check was there in the first place.

I think we should add backup sb and gdt to system_zone, because
these blocks should not be used by applications. In fact, I
think we may have done some work.

>> In this way weh won't miss some flex_bg, meta_bg, and sparse_bg.
>> I tested it on an 80T disk and found that the performance loss
>> was small:
>>
>>  unpatched kernel:
>>  ext4_setup_system_zone() takes 524ms, 
>>
>>  patched kernel:
>>  ext4_setup_system_zone() takes 552ms, 
> 
> I don't really care that much about the time it takes to execute
> ext4_setup_system_zone().
> 
> The really interesting question is how large is the rb_tree
> constructed by that function, and what is the percentage increase of
> time that the ext4_inode_block_valid() function takes.  (e.g., how
> much additional memory is the system_blks tree taking, and how deep is
> that tree, since ext4_inode_block_valid() gets called every time we
> allocate or free a block, and every time we need to validate an extent
> tree node.

During detailed analysis, I found that when the current logic
calls ext4_setup_system_zone(), s_log_groups_per_flex has not
been initialized, and flex_size is always 1, which seems to
be a mistake. therefore

if (ext4_bg_has_super(sb, i) &&
                    ((i <5) || ((i% flex_size) == 0)))

Degenerate to

if (ext4_bg_has_super(sb, i))

So, the existing implementation just adds the backup super
block in sparse_group to system_zone. Due to this mistake,
the behavior of the system in the flex_bg scenario happens to
be correct?

I tested it in three scenarios: only meta_bg, only flex_bg,
both flex_bg and meta_bg were enabled. The test results are as
follows:

Meta_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 866 count 1309087
 
 pacthed kernel:
 ext4_setup_system_zone time 841 count 1309087

Since the backup gdt of meta_bg and BB are connected, they can
be merged, so no additional nodes are added.

Flex_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 529 count 41016

 pacthed kernel:
 ext4_setup_system_zone time 553 count 41016

The system behavior has not changed. All sparse_group backup sb
and gdt are still added, so no additional nodes are added.

Meta_bg & Flex_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 535 count 41016
 
 pacthed kernel:
 ext4_setup_system_zone time 571 count 61508

In addition to sparse_group, the system needs to add the backup
gdt of meta_bg to the system. Set

	N=max(flex_bg_size / meta_bg_size, 1)

then every N meta_bg has a gdt block that can be merged into 
the node corresponding to flex_bg, such as flex_bg_size < meta_bg_size,
then the number of new nodes is 2 * nr_meta_bg. On this 80T
disk, the maximum depth of rbtree is 2log(n+1). According to
this calculation, in this test case, the depth of rbtree is
not increased. Thus, there is no major performance overhead.

Maybe we can deal with it in the same way as discussed before?

> Cheers,
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-09 19:39           ` Theodore Y. Ts'o
  2020-12-10 11:00             ` brookxu
@ 2020-12-15  1:14             ` brookxu
  2020-12-15 20:13               ` Theodore Y. Ts'o
  1 sibling, 1 reply; 27+ messages in thread
From: brookxu @ 2020-12-15  1:14 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: adilger.kernel, linux-ext4

Hi, Ted, how do you think of this, should we need to go ahead? Thanks.

Theodore Y. Ts'o wrote on 2020/12/10 3:39:
> On Wed, Dec 09, 2020 at 07:48:09PM +0800, brookxu wrote:
>>
>> Maybe I missed something. If i% meta_bg_size is used instead, if
>> flex_size <64, then we will miss some flex_bg. There seems to be
>> a contradiction here. In the scenario where only flex_bg is
>> enabled, it may not be appropriate to use meta_bg_size. In the
>> scenario where only meta_bg is enabled, it may not be appropriate
>> to use flex_size.
>>
>> As you said before, it maybe better to remove
>>
>> 	if ((i <5) || ((i% flex_size) == 0))
>>
>> and do it for all groups.
> 
> I don't think the original (i % flex_size) made any sense in the first
> place.
> 
> What flex_bg does is that it collects the allocation bitmaps and inode
> tables for each block group and locates them within the first block
> group in a flex_bg.  It doesn't have anything to do with whether or
> not a particular block group has a backup copy of the superblock and
> block group descriptor table --- in non-meta_bg file systems and the
> meta_bg file systems where the block group is less than
> s_first_meta_bg * EXT4_DESC_PER_BLOCK(sb).  And the condition in
> question is only about whether or not to add the backup superblock and
> backup block group descriptors.  So checking for i % flex_size made no
> sense, and I'm not sure that check was there in the first place.

I think we should add backup sb and gdt to system_zone, because
these blocks should not be used by applications. In fact, I
think we may have done some work.

>> In this way weh won't miss some flex_bg, meta_bg, and sparse_bg.
>> I tested it on an 80T disk and found that the performance loss
>> was small:
>>
>>  unpatched kernel:
>>  ext4_setup_system_zone() takes 524ms, 
>>
>>  patched kernel:
>>  ext4_setup_system_zone() takes 552ms, 
> 
> I don't really care that much about the time it takes to execute
> ext4_setup_system_zone().
> 
> The really interesting question is how large is the rb_tree
> constructed by that function, and what is the percentage increase of
> time that the ext4_inode_block_valid() function takes.  (e.g., how
> much additional memory is the system_blks tree taking, and how deep is
> that tree, since ext4_inode_block_valid() gets called every time we
> allocate or free a block, and every time we need to validate an extent
> tree node.

During detailed analysis, I found that when the current logic
calls ext4_setup_system_zone(), s_log_groups_per_flex has not
been initialized, and flex_size is always 1, which seems to
be a mistake. therefore

if (ext4_bg_has_super(sb, i) &&
                    ((i <5) || ((i% flex_size) == 0)))

Degenerate to

if (ext4_bg_has_super(sb, i))

So, the existing implementation just adds the backup sb and gdt
in sparse_group to system_zone. Due to this mistake, the behavior
of the system in the flex_bg scenario happens to be correct?

I tested it in three scenarios: only meta_bg, only flex_bg,
both flex_bg and meta_bg were enabled. The test results are as
follows:

Meta_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 866ms count 1309087(number of nodes inside rbtree)
 
 pacthed kernel:
 ext4_setup_system_zone time 841ms count 1309087(number of nodes inside rbtree)

Since the backup gdt of meta_bg and BB are connected, they can
be merged, so no additional nodes are added.

Flex_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 529ms count 41016(number of nodes inside rbtree)

 pacthed kernel:
 ext4_setup_system_zone time 553ms count 41016(number of nodes inside rbtree)

The system behavior has not changed. All sparse_group backup sb
and gdt are still added, so no additional nodes are added.

Meta_bg & Flex_bg only
 unpacthed kernel:
 ext4_setup_system_zone time 535ms count 41016(number of nodes inside rbtree)
 
 pacthed kernel:
 ext4_setup_system_zone time 571ms count 61508(number of nodes inside rbtree)

In addition to sparse_group, the system needs to add the backup
gdt of meta_bg to the system. Set

	N=max(flex_bg_size / meta_bg_size, 1)

then every N meta_bg has a gdt block that can be merged into 
the node corresponding to flex_bg, such as flex_bg_size < meta_bg_size,
then the number of new nodes is 2 * nr_meta_bg. On this 80T
disk, the maximum depth of rbtree is 2log(n+1). According to
this calculation, in this test case, the depth of rbtree is
not increased. Thus, there is no major performance overhead.

Maybe we can deal with it in the same way as discussed before?

> Cheers,
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-15  1:14             ` brookxu
@ 2020-12-15 20:13               ` Theodore Y. Ts'o
  2020-12-17 16:01                 ` Andreas Dilger
  0 siblings, 1 reply; 27+ messages in thread
From: Theodore Y. Ts'o @ 2020-12-15 20:13 UTC (permalink / raw)
  To: brookxu; +Cc: adilger.kernel, linux-ext4

You did your test on a 80T file system, but that's not where someone
would be using meta_bg.  Meta_bg ges used for much larger file systems
than that!  With meta_bg, we have 3 block group descriptors every 64
block groups.  Each block group describes 128M of memory.  So for that
means we are going to have 3 entries in the system zone tree for every_
8GB of file system space, 383,216 entries for every PB.  Given that
each entry is 40 bytes, that means that the block_validity entries
will consume 15 megabytes per PB.

Now, one third of these entries overlap with the flex_bg entries
(meta_bg groups are in the first, second, and last block group of each
meta_bg, where are 64 block groups in 4k file systems), and of course,
the default flex_bg size of 16 block groups means that there are
524,288 entries per PB.  So if we include all backup sb and block
groups, in a 1 PB file system, there will be roughly 786,432 entries
in a 1 PB file system.  (I'm ignoring the entries for the backup
superblocks, but that's only about 20 or so extra entries.)

So for a flex_bg 1PB file system, the amount of memory for a
block_validity data structure is roughly 20M, and including all backup
descriptors for meta_bg on a flex_bg + meta_bg setup is roughly 30M.

I agree with you that for a non-meta_bg file system, including all of
the backup superblock and block group descriptors is not going to be
large.  But while protecting the meta_bg group descriptors is
worthwhile, protecting the backup meta_bg's is not free, and will
increase the size of the tree by 33%.

I'm also wondering whether or not Lustre (where they do have some file
systems that are in the PB range) have run into overhead issues with
block_validity.

What do folks think?

						- Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone
  2020-12-15 20:13               ` Theodore Y. Ts'o
@ 2020-12-17 16:01                 ` Andreas Dilger
  0 siblings, 0 replies; 27+ messages in thread
From: Andreas Dilger @ 2020-12-17 16:01 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: brookxu, adilger.kernel, linux-ext4

An extra 20-30MB of RAM for mounting a 1PB filesystem isn't
a huge deal. We already need 512MB for just the 8M group descriptors,
and we have a 1GB journal.

I haven't heard any specific performance issues with block_validity,
but it may be newer than the 3.10 kernels we are currently using on
our servers. 

Cheers, Andreas

> On Dec 15, 2020, at 13:13, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
> You did your test on a 80T file system, but that's not where someone
> would be using meta_bg.  Meta_bg ges used for much larger file systems
> than that!  With meta_bg, we have 3 block group descriptors every 64
> block groups.  Each block group describes 128M of memory.  So for that
> means we are going to have 3 entries in the system zone tree for every_
> 8GB of file system space, 383,216 entries for every PB.  Given that
> each entry is 40 bytes, that means that the block_validity entries
> will consume 15 megabytes per PB.
> 
> Now, one third of these entries overlap with the flex_bg entries
> (meta_bg groups are in the first, second, and last block group of each
> meta_bg, where are 64 block groups in 4k file systems), and of course,
> the default flex_bg size of 16 block groups means that there are
> 524,288 entries per PB.  So if we include all backup sb and block
> groups, in a 1 PB file system, there will be roughly 786,432 entries
> in a 1 PB file system.  (I'm ignoring the entries for the backup
> superblocks, but that's only about 20 or so extra entries.)
> 
> So for a flex_bg 1PB file system, the amount of memory for a
> block_validity data structure is roughly 20M, and including all backup
> descriptors for meta_bg on a flex_bg + meta_bg setup is roughly 30M.
> 
> I agree with you that for a non-meta_bg file system, including all of
> the backup superblock and block group descriptors is not going to be
> large.  But while protecting the meta_bg group descriptors is
> worthwhile, protecting the backup meta_bg's is not free, and will
> increase the size of the tree by 33%.
> 
> I'm also wondering whether or not Lustre (where they do have some file
> systems that are in the PB range) have run into overhead issues with
> block_validity.
> 
> What do folks think?
> 
>                       - Ted

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 6/8] ext4: add a helper function to validate metadata block
  2020-10-21  9:15 [PATCH " Chunguang Xu
@ 2020-10-21  9:15 ` Chunguang Xu
  0 siblings, 0 replies; 27+ messages in thread
From: Chunguang Xu @ 2020-10-21  9:15 UTC (permalink / raw)
  To: tytso, adilger.kernel; +Cc: linux-ext4

From: Chunguang Xu <brookxu@tencent.com>

There is a need to check whether a block or a segment overlaps
with metadata, since information of system_zone is incomplete,
we need a more accurate function. Now we check whether it
overlaps with block bitmap, inode bitmap, and inode table.
Perhaps it is better to add a check of super_block and block
group descriptor and provide a helper function.

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
---
 fs/ext4/ext4.h    |  5 ++++-
 fs/ext4/mballoc.c | 57 +++++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e7344ef..d3ff7d9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2784,7 +2784,10 @@ extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
 				     unsigned int nr, int *cnt);
 extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
 				  unsigned int nr);
-
+extern int ext4_metadata_block_overlaps(struct super_block *sb,
+					ext4_group_t block_group,
+					ext4_fsblk_t block,
+					unsigned long count);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     struct buffer_head *bh, ext4_fsblk_t block,
 			     unsigned long count, int flags);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2efb489..2b1bc6c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5061,6 +5061,49 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 	kmem_cache_free(ext4_free_data_cachep, entry);
 }
 
+/*
+ * Returns 1 if the passed-in block region (block, block+count)
+ * overlaps with some other filesystem metadata blocks. Others,
+ * return 0.
+ */
+int ext4_metadata_block_overlaps(struct super_block *sb,
+				 ext4_group_t block_group,
+				 ext4_fsblk_t block,
+				 unsigned long count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	int gd_first = ext4_group_first_block_no(sb, block_group);
+	int itable, gd_blk;
+	int ret = 0;
+
+	gdp = ext4_get_group_desc(sb, block_group, NULL);
+	// check block bitmap and inode bitmap
+	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, gdp), block, count))
+		ret = 1;
+
+	// check inode table
+	itable = ext4_inode_table(sb, gdp);
+	if (!(block >= itable + sbi->s_itb_per_group ||
+	      block + count - 1  < itable))
+		ret = 1;
+
+	/* check super_block and block group descriptor table, the
+	 * reserved space of the block group descriptor is managed
+	 * by resize_inode, it may not be processed now due to
+	 * performance.
+	 */
+	gd_blk = ext4_bg_has_super(sb, block_group) +
+		ext4_bg_num_gdb(sb, block_group);
+	if (gd_blk) {
+		if (!(block >= gd_first + gd_blk ||
+		      block + count - 1 < gd_first))
+			ret = 1;
+	}
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		      struct ext4_free_data *new_entry)
@@ -5360,13 +5403,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		goto error_return;
 	}
 
-	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
-	    in_range(block, ext4_inode_table(sb, gdp),
-		     sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
-		     sbi->s_itb_per_group)) {
-
+	if (ext4_metadata_block_overlaps(sb, block_group, block, count)) {
 		ext4_error(sb, "Freeing blocks in system zone - "
 			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
@@ -5552,11 +5589,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 		goto error_return;
 	}
 
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
+	if (ext4_metadata_block_overlaps(sb, block_group, block, count)) {
 		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2020-12-17 16:01 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-07 15:58 [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Chunguang Xu
2020-11-07 15:58 ` [PATCH RESEND 2/8] ext4: remove redundant mb_regenerate_buddy() Chunguang Xu
2020-12-03 14:42   ` Theodore Y. Ts'o
2020-11-07 15:58 ` [PATCH RESEND 3/8] ext4: simplify the code of mb_find_order_for_block Chunguang Xu
2020-12-03 14:43   ` Theodore Y. Ts'o
2020-11-07 15:58 ` [PATCH RESEND 4/8] ext4: add the gdt block of meta_bg to system_zone Chunguang Xu
2020-12-03 15:08   ` Theodore Y. Ts'o
2020-12-04  1:26     ` brookxu
2020-12-09  4:34       ` Theodore Y. Ts'o
2020-12-09 11:48         ` brookxu
2020-12-09 19:39           ` Theodore Y. Ts'o
2020-12-10 11:00             ` brookxu
2020-12-15  1:14             ` brookxu
2020-12-15 20:13               ` Theodore Y. Ts'o
2020-12-17 16:01                 ` Andreas Dilger
2020-12-04  1:29     ` brookxu
2020-11-07 15:58 ` [PATCH RESEND 5/8] ext4: update ext4_data_block_valid related comments Chunguang Xu
2020-12-09 19:11   ` Theodore Y. Ts'o
2020-11-07 15:58 ` [PATCH 6/8] ext4: add a helper function to validate metadata block Chunguang Xu
2020-12-09  4:55   ` Theodore Y. Ts'o
2020-12-09 12:12     ` brookxu
2020-11-07 15:58 ` [PATCH RESEND 7/8] ext4: delete invalid code inside ext4_xattr_block_set() Chunguang Xu
2020-12-09 19:24   ` Theodore Y. Ts'o
2020-11-07 15:58 ` [PATCH RESEND 8/8] ext4: fix a memory leak of ext4_free_data Chunguang Xu
2020-12-09 19:29   ` Theodore Y. Ts'o
2020-12-03 14:38 ` [PATCH RESEND 1/8] ext4: use ext4_assert() to replace J_ASSERT() Theodore Y. Ts'o
  -- strict thread matches above, loose matches on Subject: below --
2020-10-21  9:15 [PATCH " Chunguang Xu
2020-10-21  9:15 ` [PATCH 6/8] ext4: add a helper function to validate metadata block Chunguang Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.