All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ext4: fix a NULL pointer when validating an inode bitmap
@ 2022-10-10 14:20 Luís Henriques
  2022-10-11 15:56 ` [PATCH v2] " Luís Henriques
  2022-10-12 13:13 ` [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len Luís Henriques
  0 siblings, 2 replies; 16+ messages in thread
From: Luís Henriques @ 2022-10-10 14:20 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger
  Cc: linux-ext4, linux-kernel, Luís Henriques, stable

It's possible to hit a NULL pointer exception while accessing the
sb->s_group_info in ext4_validate_inode_bitmap(), when calling
ext4_get_group_info().

 EXT4-fs (loop0): warning: mounting unchecked fs, running e2fsck is recommended
 EXT4-fs error (device loop0): ext4_clear_blocks:866: inode #32: comm mount: attempt to clear invalid blocks 16777450 len 1
 EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 1258291200 (level 1)
 EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 7379847 (level 2)
 BUG: kernel NULL pointer dereference, address: 0000000000000000
 ...
 RIP: 0010:ext4_read_inode_bitmap+0x21b/0x5a0
 ...
 Call Trace:
  <TASK>
  ext4_free_inode+0x172/0x5c0
  ext4_evict_inode+0x4a5/0x730
  evict+0xc1/0x1c0
  ext4_setup_system_zone+0x2ea/0x380
  ext4_fill_super+0x249f/0x3910
  ? ext4_reconfigure+0x880/0x880
  ? snprintf+0x49/0x60
  ? ext4_reconfigure+0x880/0x880
  get_tree_bdev+0x169/0x260
  vfs_get_tree+0x16/0x70
  path_mount+0x77d/0xa30
  __x64_sys_mount+0x101/0x140
  do_syscall_64+0x3c/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

Fix the issue by adding an extra NULL check.  And, while there, also ensure the
right error code (-EFSCORRUPTED) is propagated to user-space.  EUCLEAN is more
informative than ENOMEM.

CC: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216541
Signed-off-by: Luís Henriques <lhenriques@suse.de>
---
 fs/ext4/ext4.h     | 17 ++++++++++-------
 fs/ext4/ialloc.c   |  2 +-
 fs/ext4/indirect.c | 14 ++++++++++++--
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3bf9a6926798..91317f592999 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3323,13 +3323,16 @@ static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 					    ext4_group_t group)
 {
-	 struct ext4_group_info **grp_info;
-	 long indexv, indexh;
-	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
-	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
-	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
-	 grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
-	 return grp_info[indexh];
+	struct ext4_group_info **grp_info;
+	long indexv, indexh;
+
+	BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+	if (!EXT4_SB(sb)->s_group_info)
+		return NULL;
+	indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+	indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+	grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+	return grp_info[indexh];
 }
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 208b87ce8858..0e8d35d05b69 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -91,7 +91,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
 
 	if (buffer_verified(bh))
 		return 0;
-	if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+	if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
 		return -EFSCORRUPTED;
 
 	ext4_lock_group(sb, block_group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 860fc5119009..e5ac5c2363df 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	struct super_block *sb = inode->i_sb;
 	Indirect *p = chain;
 	struct buffer_head *bh;
+	unsigned int key;
 	int ret = -EIO;
 
 	*err = 0;
@@ -156,9 +157,18 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
-		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		key = le32_to_cpu(p->key);
+		bh = sb_getblk(sb, key);
 		if (unlikely(!bh)) {
-			ret = -ENOMEM;
+			/*
+			 * sb_getblk() masks different errors by always
+			 * returning NULL.  Let's distinguish at least the case
+			 * where the block is out of range.
+			 */
+			if (key > ext4_blocks_count(EXT4_SB(sb)->s_es))
+				ret = -EFSCORRUPTED;
+			else
+				ret = -ENOMEM;
 			goto failure;
 		}
 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-10-10 14:20 [PATCH] ext4: fix a NULL pointer when validating an inode bitmap Luís Henriques
@ 2022-10-11 15:56 ` Luís Henriques
  2022-11-06  0:32   ` Theodore Ts'o
  2022-10-12 13:13 ` [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len Luís Henriques
  1 sibling, 1 reply; 16+ messages in thread
From: Luís Henriques @ 2022-10-11 15:56 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger
  Cc: linux-ext4, linux-kernel, Luís Henriques, stable

It's possible to hit a NULL pointer exception while accessing the
sb->s_group_info in ext4_validate_inode_bitmap(), when calling
ext4_get_group_info().

 EXT4-fs (loop0): warning: mounting unchecked fs, running e2fsck is recommended
 EXT4-fs error (device loop0): ext4_clear_blocks:866: inode #32: comm mount: attempt to clear invalid blocks 16777450 len 1
 EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 1258291200 (level 1)
 EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 7379847 (level 2)
 BUG: kernel NULL pointer dereference, address: 0000000000000000
 ...
 RIP: 0010:ext4_read_inode_bitmap+0x21b/0x5a0
 ...
 Call Trace:
  <TASK>
  ext4_free_inode+0x172/0x5c0
  ext4_evict_inode+0x4a5/0x730
  evict+0xc1/0x1c0
  ext4_setup_system_zone+0x2ea/0x380
  ext4_fill_super+0x249f/0x3910
  ? ext4_reconfigure+0x880/0x880
  ? snprintf+0x49/0x60
  ? ext4_reconfigure+0x880/0x880
  get_tree_bdev+0x169/0x260
  vfs_get_tree+0x16/0x70
  path_mount+0x77d/0xa30
  __x64_sys_mount+0x101/0x140
  do_syscall_64+0x3c/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

This issue can be fixed by returning NULL in ext4_get_group_info() when
->s_group_info is NULL.  This also requires checking the return code from
ext4_get_group_info() when appropriate.

While there, also ensure the right error code (-EFSCORRUPTED) is propagated
to user-space.  EUCLEAN is more informative than ENOMEM.

CC: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216541
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216539
Signed-off-by: Luís Henriques <lhenriques@suse.de>
---

* Changes since v1:

I found out another bugzilla with the same issue (#216539), but on a
different place.  The pattern was the same: a call to
ext4_get_group_info() followed by a EXT4_MB_GRP_*_CORRUPT check.  I've
grep'ed the code and added the same check in (hopefully) all of them.

 fs/ext4/balloc.c   |  2 +-
 fs/ext4/ext4.h     | 17 ++++++++++-------
 fs/ext4/ialloc.c   |  6 +++---
 fs/ext4/indirect.c | 14 ++++++++++++--
 fs/ext4/mballoc.c  |  9 +++++----
 5 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8ff4b9192a9f..1af1fc8b1891 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -377,7 +377,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
 
 	if (buffer_verified(bh))
 		return 0;
-	if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+	if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 		return -EFSCORRUPTED;
 
 	ext4_lock_group(sb, block_group);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e5f2f5ca5120..1c8b5876a28a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3324,13 +3324,16 @@ static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 					    ext4_group_t group)
 {
-	 struct ext4_group_info **grp_info;
-	 long indexv, indexh;
-	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
-	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
-	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
-	 grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
-	 return grp_info[indexh];
+	struct ext4_group_info **grp_info;
+	long indexv, indexh;
+
+	BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+	if (!EXT4_SB(sb)->s_group_info)
+		return NULL;
+	indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+	indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+	grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+	return grp_info[indexh];
 }
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 208b87ce8858..079b9c3af327 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -91,7 +91,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
 
 	if (buffer_verified(bh))
 		return 0;
-	if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+	if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
 		return -EFSCORRUPTED;
 
 	ext4_lock_group(sb, block_group);
@@ -293,7 +293,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	}
 	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
 		grp = ext4_get_group_info(sb, block_group);
-		if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+		if (unlikely(!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
 			fatal = -EFSCORRUPTED;
 			goto error_return;
 		}
@@ -1048,7 +1048,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
 			 * Skip groups with already-known suspicious inode
 			 * tables
 			 */
-			if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+			if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
 				goto next_group;
 		}
 
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 860fc5119009..e5ac5c2363df 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	struct super_block *sb = inode->i_sb;
 	Indirect *p = chain;
 	struct buffer_head *bh;
+	unsigned int key;
 	int ret = -EIO;
 
 	*err = 0;
@@ -156,9 +157,18 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
-		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		key = le32_to_cpu(p->key);
+		bh = sb_getblk(sb, key);
 		if (unlikely(!bh)) {
-			ret = -ENOMEM;
+			/*
+			 * sb_getblk() masks different errors by always
+			 * returning NULL.  Let's distinguish at least the case
+			 * where the block is out of range.
+			 */
+			if (key > ext4_blocks_count(EXT4_SB(sb)->s_es))
+				ret = -EFSCORRUPTED;
+			else
+				ret = -ENOMEM;
 			goto failure;
 		}
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9dad93059945..577e4d7415c3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2386,7 +2386,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
 
 	BUG_ON(cr < 0 || cr >= 4);
 
-	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+	if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
 		return false;
 
 	free = grp->bb_free;
@@ -2466,7 +2466,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 		goto out;
 	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
 		goto out;
-	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+	if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
 		goto out;
 	if (should_lock) {
 		__acquire(ext4_group_lock_ptr(sb, group));
@@ -5895,6 +5895,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
+	struct ext4_group_info *grp_info;
 	struct ext4_sb_info *sbi;
 	struct ext4_buddy e4b;
 	unsigned int count_clusters;
@@ -5916,8 +5917,8 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
 
-	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
-			ext4_get_group_info(sb, block_group))))
+	grp_info = ext4_get_group_info(sb, block_group);
+	if (unlikely(!grp_info || EXT4_MB_GRP_BBITMAP_CORRUPT(grp_info)))
 		return;
 
 	/*

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
  2022-10-10 14:20 [PATCH] ext4: fix a NULL pointer when validating an inode bitmap Luís Henriques
  2022-10-11 15:56 ` [PATCH v2] " Luís Henriques
@ 2022-10-12 13:13 ` Luís Henriques
  2022-10-12 13:16   ` Luís Henriques
  2022-11-06  6:16   ` Theodore Ts'o
  1 sibling, 2 replies; 16+ messages in thread
From: Luís Henriques @ 2022-10-12 13:13 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger
  Cc: linux-ext4, linux-kernel, Luís Henriques, stable

The rec_len field in the directory entry has to be a multiple of 4.  A
corrupted filesystem image can be used to hit a BUG() in
ext4_rec_len_to_disk(), called from make_indexed_dir().

 ------------[ cut here ]------------
 kernel BUG at fs/ext4/ext4.h:2413!
 ...
 RIP: 0010:make_indexed_dir+0x53f/0x5f0
 ...
 Call Trace:
  <TASK>
  ? add_dirent_to_buf+0x1b2/0x200
  ext4_add_entry+0x36e/0x480
  ext4_add_nondir+0x2b/0xc0
  ext4_create+0x163/0x200
  path_openat+0x635/0xe90
  do_filp_open+0xb4/0x160
  ? __create_object.isra.0+0x1de/0x3b0
  ? _raw_spin_unlock+0x12/0x30
  do_sys_openat2+0x91/0x150
  __x64_sys_open+0x6c/0xa0
  do_syscall_64+0x3c/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

The fix simply adds a call to ext4_check_dir_entry() to validate the
directory entry, returning -EFSCORRUPTED if the entry is invalid.

CC: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540
Signed-off-by: Luís Henriques <lhenriques@suse.de>
---
* Changes since v1:

As suggested by Ted, I've removed the incorrect 'de->rec_len' check from
previous version and replaced it with a call to ext4_check_dir_entry()
instead, which is a much more complete verification.

 fs/ext4/namei.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..ed76e89ffbe9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2254,8 +2254,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 	memset(de, 0, len); /* wipe old data */
 	de = (struct ext4_dir_entry_2 *) data2;
 	top = data2 + len;
-	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+					 (data2 + (blocksize - csum_size) -
+					  (char *) de))) {
+			brelse(bh2);
+			brelse(bh);
+			return -EFSCORRUPTED;
+		}
 		de = de2;
+	}
 	de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
 					   (char *) de, blocksize);
 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
  2022-10-12 13:13 ` [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len Luís Henriques
@ 2022-10-12 13:16   ` Luís Henriques
  2022-10-12 14:21     ` Theodore Ts'o
  2022-11-06  6:16   ` Theodore Ts'o
  1 sibling, 1 reply; 16+ messages in thread
From: Luís Henriques @ 2022-10-12 13:16 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger; +Cc: linux-ext4, linux-kernel, stable

Grr, looks like I accidentally reused a 'git send-email' from shell
history which had a '--in-reply-to' in it.  Please ignore and sorry about
that.  I've just resent a new email.

Cheers,
--
Luís

On Wed, Oct 12, 2022 at 02:13:30PM +0100, Luís Henriques wrote:
> The rec_len field in the directory entry has to be a multiple of 4.  A
> corrupted filesystem image can be used to hit a BUG() in
> ext4_rec_len_to_disk(), called from make_indexed_dir().
> 
>  ------------[ cut here ]------------
>  kernel BUG at fs/ext4/ext4.h:2413!
>  ...
>  RIP: 0010:make_indexed_dir+0x53f/0x5f0
>  ...
>  Call Trace:
>   <TASK>
>   ? add_dirent_to_buf+0x1b2/0x200
>   ext4_add_entry+0x36e/0x480
>   ext4_add_nondir+0x2b/0xc0
>   ext4_create+0x163/0x200
>   path_openat+0x635/0xe90
>   do_filp_open+0xb4/0x160
>   ? __create_object.isra.0+0x1de/0x3b0
>   ? _raw_spin_unlock+0x12/0x30
>   do_sys_openat2+0x91/0x150
>   __x64_sys_open+0x6c/0xa0
>   do_syscall_64+0x3c/0x80
>   entry_SYSCALL_64_after_hwframe+0x46/0xb0
> 
> The fix simply adds a call to ext4_check_dir_entry() to validate the
> directory entry, returning -EFSCORRUPTED if the entry is invalid.
> 
> CC: stable@vger.kernel.org
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540
> Signed-off-by: Luís Henriques <lhenriques@suse.de>
> ---
> * Changes since v1:
> 
> As suggested by Ted, I've removed the incorrect 'de->rec_len' check from
> previous version and replaced it with a call to ext4_check_dir_entry()
> instead, which is a much more complete verification.
> 
>  fs/ext4/namei.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 3a31b662f661..ed76e89ffbe9 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2254,8 +2254,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
>  	memset(de, 0, len); /* wipe old data */
>  	de = (struct ext4_dir_entry_2 *) data2;
>  	top = data2 + len;
> -	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
> +	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
> +		if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
> +					 (data2 + (blocksize - csum_size) -
> +					  (char *) de))) {
> +			brelse(bh2);
> +			brelse(bh);
> +			return -EFSCORRUPTED;
> +		}
>  		de = de2;
> +	}
>  	de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
>  					   (char *) de, blocksize);
>  

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
  2022-10-12 13:16   ` Luís Henriques
@ 2022-10-12 14:21     ` Theodore Ts'o
  2022-10-12 15:18       ` Luís Henriques
  0 siblings, 1 reply; 16+ messages in thread
From: Theodore Ts'o @ 2022-10-12 14:21 UTC (permalink / raw)
  To: Luís Henriques; +Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

On Wed, Oct 12, 2022 at 02:16:42PM +0100, Luís Henriques wrote:
> Grr, looks like I accidentally reused a 'git send-email' from shell
> history which had a '--in-reply-to' in it.  Please ignore and sorry about
> that.  I've just resent a new email.

No worries!  The --in-reply-to wasn't actually a problem, since b4
generally will do the right thing (and sometimes humans prefer the
in-reply-to since they can more easily see the patch that it is
replacing/obsoleting).

b4 can sometimes get confused when a patch series gets split, and both
parts of the patch series are in a reply-to mail thread to the
original patch series, since if it can't use the -vn+1 hueristic or
the "subject line has stayed the same but has a newer date" hueristic,
it falls back to "latest patch in the mail thread".  So if there are
two "valid" patches or patch series in an e-mail thread, b4 -c
(--check-newer-revisions) can get confused.  But even in that case,
that it's more a minor annoyance than anything else.

So in the future, don't feel that you need to resend a patch if
there's an incorrect/older --in-reply-to; it's not a big deal.

Cheers, and thanks!

						- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
  2022-10-12 14:21     ` Theodore Ts'o
@ 2022-10-12 15:18       ` Luís Henriques
  0 siblings, 0 replies; 16+ messages in thread
From: Luís Henriques @ 2022-10-12 15:18 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

On Wed, Oct 12, 2022 at 10:21:39AM -0400, Theodore Ts'o wrote:
> On Wed, Oct 12, 2022 at 02:16:42PM +0100, Luís Henriques wrote:
> > Grr, looks like I accidentally reused a 'git send-email' from shell
> > history which had a '--in-reply-to' in it.  Please ignore and sorry about
> > that.  I've just resent a new email.
> 
> No worries!  The --in-reply-to wasn't actually a problem, since b4
> generally will do the right thing (and sometimes humans prefer the
> in-reply-to since they can more easily see the patch that it is
> replacing/obsoleting).
> 
> b4 can sometimes get confused when a patch series gets split, and both
> parts of the patch series are in a reply-to mail thread to the
> original patch series, since if it can't use the -vn+1 hueristic or
> the "subject line has stayed the same but has a newer date" hueristic,
> it falls back to "latest patch in the mail thread".  So if there are
> two "valid" patches or patch series in an e-mail thread, b4 -c
> (--check-newer-revisions) can get confused.  But even in that case,
> that it's more a minor annoyance than anything else.
> 
> So in the future, don't feel that you need to resend a patch if
> there's an incorrect/older --in-reply-to; it's not a big deal.

Great, I haven't yet included b4 in my workflow so, to be honest, I didn't
really thought about that tool being confused.  What really made me resend
the patch was that I used the *wrong message-ID in the "--in-reply-to"!
And that thread already had a v2 patch, which would could easily confuse
humans.  Hopefully, b4 won't be confused by that either.

Cheers,
--
Luís

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-10-11 15:56 ` [PATCH v2] " Luís Henriques
@ 2022-11-06  0:32   ` Theodore Ts'o
  2022-11-08 14:06     ` Luís Henriques
  0 siblings, 1 reply; 16+ messages in thread
From: Theodore Ts'o @ 2022-11-06  0:32 UTC (permalink / raw)
  To: Luís Henriques; +Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

First of all, you replied to this patch a completely different patch,
"ext4: fix BUG_ON() when directory entry has invalid rec_len".  This
very much confuses b4, so please don't do that.  If you send a patch
series, where the message-id are related, e.g.:

    20221011155623.14840-1-lhenriques@suse.de
    20221011155623.14840-2-lhenriques@suse.de

etc., b4 will figure out what is going on.  But when the message id's
are unrelated, e.g:

    20221011155623.14840-1-lhenriques@suse.de
vs    
    20221012131330.32456-1-lhenriques@suse.de

... b4 will assume that 20221012131330.32456-1-lhenriques@suse.de is a
newer version of 20221011155623.14840-1-lhenriques@suse.de and there
is apparently no way to tell it to not try to use the "newer" version
of the patch.

On Tue, Oct 11, 2022 at 04:56:24PM +0100, Luís Henriques wrote:
> It's possible to hit a NULL pointer exception while accessing the
> sb->s_group_info in ext4_validate_inode_bitmap(), when calling
> ext4_get_group_info().

  ...

> This issue can be fixed by returning NULL in ext4_get_group_info() when
> ->s_group_info is NULL.  This also requires checking the return code from
> ext4_get_group_info() when appropriate.

I don't believe this is a correct diagnosis of what is going on.  Did
you actually confirm the line numbers associated with the call stack?
What makes you believe that?  Look at how s_group_info is initialized
in ext4_mb_alloc_groupinfo() in fs/ext4/mballoc.c.  It's pretty
careful to make sure this is not the case.

>  EXT4-fs (loop0): warning: mounting unchecked fs, running e2fsck is recommended
>  EXT4-fs error (device loop0): ext4_clear_blocks:866: inode #32: comm mount: attempt to clear invalid blocks 16777450 len 1
>  EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 1258291200 (level 1)
>  EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 7379847 (level 2)
>  BUG: kernel NULL pointer dereference, address: 0000000000000000
>  ...
>  RIP: 0010:ext4_read_inode_bitmap+0x21b/0x5a0
>  ...
>  Call Trace:
>   <TASK>
>   ext4_free_inode+0x172/0x5c0
>   ext4_evict_inode+0x4a5/0x730
>   evict+0xc1/0x1c0
>   ext4_setup_system_zone+0x2ea/0x380
>   ext4_fill_super+0x249f/0x3910
>   ? ext4_reconfigure+0x880/0x880
>   ? snprintf+0x49/0x60
>   ? ext4_reconfigure+0x880/0x880
>   get_tree_bdev+0x169/0x260
>   vfs_get_tree+0x16/0x70
>   path_mount+0x77d/0xa30
>   __x64_sys_mount+0x101/0x140
>   do_syscall_64+0x3c/0x80
>   entry_SYSCALL_64_after_hwframe+0x46/0xb0

So we're evicting an inode while in the middle of calling
ext4_setup_system_zone() in fs/ext4/block_validity.c.  That can only
happen if we are calling iput() on an an inode, and the only place
that we do that in block_validity.c is in the function
ext4_protect_reserved_inode() --- which we call on the journal inode.

Given the error messages, I suspect this was a fuzzed file system
where the journal inode was not in the standard reserved ino, but
rather in a the normal inode number, in s_journal_inum (which is a
leftover relic from the very early ext3 days), and that inode number
was then explicitly/maliciously placed on the orphan list, and then
hilarity ensued from there.

We need to add some better error checking to protect against this case
in ext4_orphan_get().

Do you have the file system image which triggered this failure?  Was
it the same syzkaller report, or perhaps was it some other syzkaller
report?


> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 860fc5119009..e5ac5c2363df 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
>  	struct super_block *sb = inode->i_sb;
>  	Indirect *p = chain;
>  	struct buffer_head *bh;
> +	unsigned int key;
>  	int ret = -EIO;
>  
>  	*err = 0;
> @@ -156,9 +157,18 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
>  	if (!p->key)
>  		goto no_block;
>  	while (--depth) {
> -		bh = sb_getblk(sb, le32_to_cpu(p->key));
> +		key = le32_to_cpu(p->key);
> +		bh = sb_getblk(sb, key);
>  		if (unlikely(!bh)) {
> -			ret = -ENOMEM;
> +			/*
> +			 * sb_getblk() masks different errors by always
> +			 * returning NULL.  Let's distinguish at least the case
> +			 * where the block is out of range.
> +			 */
> +			if (key > ext4_blocks_count(EXT4_SB(sb)->s_es))
> +				ret = -EFSCORRUPTED;
> +			else
> +				ret = -ENOMEM;
>  			goto failure;
>  		}
>

And this is fixing a completely different problem and should go in a
different patch.  It's also not the best way of fixing it.  What we
should do is check whether key is out of bounds *before* calling
sb_getblkf(), and then call ext4_error() to mark the file system is
corrupted, and then return -EFSCORRUPTED.

Cheers,

						- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
  2022-10-12 13:13 ` [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len Luís Henriques
  2022-10-12 13:16   ` Luís Henriques
@ 2022-11-06  6:16   ` Theodore Ts'o
  1 sibling, 0 replies; 16+ messages in thread
From: Theodore Ts'o @ 2022-11-06  6:16 UTC (permalink / raw)
  To: Luís Henriques, Andreas Dilger
  Cc: Theodore Ts'o, linux-kernel, stable, linux-ext4

On Wed, 12 Oct 2022 14:13:30 +0100, Luís Henriques wrote:
> The rec_len field in the directory entry has to be a multiple of 4.  A
> corrupted filesystem image can be used to hit a BUG() in
> ext4_rec_len_to_disk(), called from make_indexed_dir().
> 
>  ------------[ cut here ]------------
>  kernel BUG at fs/ext4/ext4.h:2413!
>  ...
>  RIP: 0010:make_indexed_dir+0x53f/0x5f0
>  ...
>  Call Trace:
>   <TASK>
>   ? add_dirent_to_buf+0x1b2/0x200
>   ext4_add_entry+0x36e/0x480
>   ext4_add_nondir+0x2b/0xc0
>   ext4_create+0x163/0x200
>   path_openat+0x635/0xe90
>   do_filp_open+0xb4/0x160
>   ? __create_object.isra.0+0x1de/0x3b0
>   ? _raw_spin_unlock+0x12/0x30
>   do_sys_openat2+0x91/0x150
>   __x64_sys_open+0x6c/0xa0
>   do_syscall_64+0x3c/0x80
>   entry_SYSCALL_64_after_hwframe+0x46/0xb0
> 
> [...]

Applied, thanks!

[1/1] ext4: fix BUG_ON() when directory entry has invalid rec_len
      commit: 17a0bc9bd697f75cfdf9b378d5eb2d7409c91340

Best regards,
-- 
Theodore Ts'o <tytso@mit.edu>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-06  0:32   ` Theodore Ts'o
@ 2022-11-08 14:06     ` Luís Henriques
  2022-11-28 22:28       ` Theodore Ts'o
  0 siblings, 1 reply; 16+ messages in thread
From: Luís Henriques @ 2022-11-08 14:06 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

On Sat, Nov 05, 2022 at 08:32:08PM -0400, Theodore Ts'o wrote:
> First of all, you replied to this patch a completely different patch,
> "ext4: fix BUG_ON() when directory entry has invalid rec_len".  This
> very much confuses b4, so please don't do that.  If you send a patch
> series, where the message-id are related, e.g.:
> 
>     20221011155623.14840-1-lhenriques@suse.de
>     20221011155623.14840-2-lhenriques@suse.de
> 
> etc., b4 will figure out what is going on.  But when the message id's
> are unrelated, e.g:
> 
>     20221011155623.14840-1-lhenriques@suse.de
> vs    
>     20221012131330.32456-1-lhenriques@suse.de
> 
> ... b4 will assume that 20221012131330.32456-1-lhenriques@suse.de is a
> newer version of 20221011155623.14840-1-lhenriques@suse.de and there
> is apparently no way to tell it to not try to use the "newer" version
> of the patch.

Yeah, I'm really sorry for this.  As I mentioned in a reply to that email,
I messed it up by running my scripts from shell history, without cleaning
the extra parameters.  Lesson learned -- *never* use shell history for
sending patches! :-(

> On Tue, Oct 11, 2022 at 04:56:24PM +0100, Luís Henriques wrote:
> > It's possible to hit a NULL pointer exception while accessing the
> > sb->s_group_info in ext4_validate_inode_bitmap(), when calling
> > ext4_get_group_info().
> 
>   ...
> 
> > This issue can be fixed by returning NULL in ext4_get_group_info() when
> > ->s_group_info is NULL.  This also requires checking the return code from
> > ext4_get_group_info() when appropriate.
> 
> I don't believe this is a correct diagnosis of what is going on.  Did
> you actually confirm the line numbers associated with the call stack?

Here's the line numbers:

$ ./scripts/faddr2line fs/ext4/ialloc.o ext4_read_inode_bitmap+0x21b/0x5a0
ext4_read_inode_bitmap+0x21b/0x5a0:
ext4_get_group_info at /home/miguel/kernel/linux/fs/ext4/ext4.h:3332
(inlined by) ext4_validate_inode_bitmap at /home/miguel/kernel/linux/fs/ext4/ialloc.c:90
(inlined by) ext4_read_inode_bitmap at /home/miguel/kernel/linux/fs/ext4/ialloc.c:210

This is on a 6.1.0-rc4 kernel, where I got:

  RIP: 0010:ext4_read_inode_bitmap+0x21b/0x5a0

So, the issue is happening in ext4_read_inode_bitmap(), when
jumping to the 'verify' label from here:

    184         if (buffer_uptodate(bh)) {
    185                 /*
    186                  * if not uninit if bh is uptodate,
    187                  * bitmap is also uptodate
    188                  */
    189                 set_bitmap_uptodate(bh);
    190                 unlock_buffer(bh);
    191                 goto verify;
    192         }
    ...
    209 verify:
==> 210         err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
    211         if (err)
    212                 goto out;
    213         return bh;
    214 out:
    215         put_bh(bh);
    216         return ERR_PTR(err);
    217 }

> What makes you believe that?  Look at how s_group_info is initialized
> in ext4_mb_alloc_groupinfo() in fs/ext4/mballoc.c.  It's pretty
> careful to make sure this is not the case.

Right.  I may be missing something, but I don't think we get that far.
__ext4_fill_super() will first call ext4_setup_system_zone() (which is
where this bug occurs) and only after that ext4_mb_init() will be invoked
(which is where ext4_mb_alloc_groupinfo() will eventually be called).

> >  EXT4-fs (loop0): warning: mounting unchecked fs, running e2fsck is recommended
> >  EXT4-fs error (device loop0): ext4_clear_blocks:866: inode #32: comm mount: attempt to clear invalid blocks 16777450 len 1
> >  EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 1258291200 (level 1)
> >  EXT4-fs error (device loop0): ext4_free_branches:1012: inode #32: comm mount: invalid indirect mapped block 7379847 (level 2)
> >  BUG: kernel NULL pointer dereference, address: 0000000000000000
> >  ...
> >  RIP: 0010:ext4_read_inode_bitmap+0x21b/0x5a0
> >  ...
> >  Call Trace:
> >   <TASK>
> >   ext4_free_inode+0x172/0x5c0
> >   ext4_evict_inode+0x4a5/0x730
> >   evict+0xc1/0x1c0
> >   ext4_setup_system_zone+0x2ea/0x380
> >   ext4_fill_super+0x249f/0x3910
> >   ? ext4_reconfigure+0x880/0x880
> >   ? snprintf+0x49/0x60
> >   ? ext4_reconfigure+0x880/0x880
> >   get_tree_bdev+0x169/0x260
> >   vfs_get_tree+0x16/0x70
> >   path_mount+0x77d/0xa30
> >   __x64_sys_mount+0x101/0x140
> >   do_syscall_64+0x3c/0x80
> >   entry_SYSCALL_64_after_hwframe+0x46/0xb0
> 
> So we're evicting an inode while in the middle of calling
> ext4_setup_system_zone() in fs/ext4/block_validity.c.  That can only
> happen if we are calling iput() on an an inode, and the only place
> that we do that in block_validity.c is in the function
> ext4_protect_reserved_inode() --- which we call on the journal inode.
> 
> Given the error messages, I suspect this was a fuzzed file system
> where the journal inode was not in the standard reserved ino, but
> rather in a the normal inode number, in s_journal_inum (which is a
> leftover relic from the very early ext3 days), and that inode number
> was then explicitly/maliciously placed on the orphan list, and then
> hilarity ensued from there.

Correct, the images do indeed have the wrong inode number (32) in
s_journal_inum.

> We need to add some better error checking to protect against this case
> in ext4_orphan_get().

Unfortunately, after some debug, I don't see ext4_orphan_get() ever being
invoked anywhere.

> 
> Do you have the file system image which triggered this failure?  Was
> it the same syzkaller report, or perhaps was it some other syzkaller
> report?

Yes, these were generated with a fuzzer, and the 2 images I've used as
reproducers were picked from the bugzillas in the commit 'Link' tags:

  Link: https://bugzilla.kernel.org/show_bug.cgi?id=216541
  Link: https://bugzilla.kernel.org/show_bug.cgi?id=216539

To reproduce the issue you simply need to mount those images.

> 
> 
> > diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> > index 860fc5119009..e5ac5c2363df 100644
> > --- a/fs/ext4/indirect.c
> > +++ b/fs/ext4/indirect.c
> > @@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
> >  	struct super_block *sb = inode->i_sb;
> >  	Indirect *p = chain;
> >  	struct buffer_head *bh;
> > +	unsigned int key;
> >  	int ret = -EIO;
> >  
> >  	*err = 0;
> > @@ -156,9 +157,18 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
> >  	if (!p->key)
> >  		goto no_block;
> >  	while (--depth) {
> > -		bh = sb_getblk(sb, le32_to_cpu(p->key));
> > +		key = le32_to_cpu(p->key);
> > +		bh = sb_getblk(sb, key);
> >  		if (unlikely(!bh)) {
> > -			ret = -ENOMEM;
> > +			/*
> > +			 * sb_getblk() masks different errors by always
> > +			 * returning NULL.  Let's distinguish at least the case
> > +			 * where the block is out of range.
> > +			 */
> > +			if (key > ext4_blocks_count(EXT4_SB(sb)->s_es))
> > +				ret = -EFSCORRUPTED;
> > +			else
> > +				ret = -ENOMEM;
> >  			goto failure;
> >  		}
> >
> 
> And this is fixing a completely different problem and should go in a
> different patch.  It's also not the best way of fixing it.  What we
> should do is check whether key is out of bounds *before* calling
> sb_getblkf(), and then call ext4_error() to mark the file system is
> corrupted, and then return -EFSCORRUPTED.

OK, makes sense.  I'll send out a separate patch for this.  Thanks a lot
for your review, Ted.

Cheers,
--
Luís

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-08 14:06     ` Luís Henriques
@ 2022-11-28 22:28       ` Theodore Ts'o
  2022-11-29  3:18         ` Baokun Li
  0 siblings, 1 reply; 16+ messages in thread
From: Theodore Ts'o @ 2022-11-28 22:28 UTC (permalink / raw)
  To: Luís Henriques; +Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

On Tue, Nov 08, 2022 at 02:06:29PM +0000, Luís Henriques wrote:
> > What makes you believe that?  Look at how s_group_info is initialized
> > in ext4_mb_alloc_groupinfo() in fs/ext4/mballoc.c.  It's pretty
> > careful to make sure this is not the case.
> 
> Right.  I may be missing something, but I don't think we get that far.
> __ext4_fill_super() will first call ext4_setup_system_zone() (which is
> where this bug occurs) and only after that ext4_mb_init() will be invoked
> (which is where ext4_mb_alloc_groupinfo() will eventually be called).

I finally got around to taking a closer look at this, and I have a
much better understandign of what is going on.  For more details, and
a suggested fix, please see:

     https://bugzilla.kernel.org/show_bug.cgi?id=216541#c1

						- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-28 22:28       ` Theodore Ts'o
@ 2022-11-29  3:18         ` Baokun Li
  2022-11-29 21:00           ` Theodore Ts'o
  0 siblings, 1 reply; 16+ messages in thread
From: Baokun Li @ 2022-11-29  3:18 UTC (permalink / raw)
  To: Theodore Ts'o, Luís Henriques
  Cc: Andreas Dilger, linux-ext4, linux-kernel, stable

On 2022/11/29 6:28, Theodore Ts'o wrote:
> On Tue, Nov 08, 2022 at 02:06:29PM +0000, Luís Henriques wrote:
>>> What makes you believe that?  Look at how s_group_info is initialized
>>> in ext4_mb_alloc_groupinfo() in fs/ext4/mballoc.c.  It's pretty
>>> careful to make sure this is not the case.
>> Right.  I may be missing something, but I don't think we get that far.
>> __ext4_fill_super() will first call ext4_setup_system_zone() (which is
>> where this bug occurs) and only after that ext4_mb_init() will be invoked
>> (which is where ext4_mb_alloc_groupinfo() will eventually be called).
> I finally got around to taking a closer look at this, and I have a
> much better understandign of what is going on.  For more details, and
> a suggested fix, please see:
>
>       https://bugzilla.kernel.org/show_bug.cgi?id=216541#c1
>
> 						- Ted
>
>
Hi Theodore,

In my opinion, the s_journal_inum should not be modified when the file 
system is
mounted, especially after we have successfully loaded and replayed the 
journal with
the current s_journal_inum. Even if the s_journal_inumon the disk is 
modified, we should
use the current one. This is how journal_devnum is handled in 
ext4_load_journal():

          if (!really_read_only && journal_devnum &&
              journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                  es->s_journal_dev = cpu_to_le32(journal_devnum);

                  /* Make sure we flush the recovery flag to disk. */
                  ext4_commit_super(sb);
          }

We can avoid this problem by adding a similar check for journal_inum in 
ext4_load_journal().

-- 
With Best Regards,
Baokun Li


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-29  3:18         ` Baokun Li
@ 2022-11-29 21:00           ` Theodore Ts'o
  2022-11-30  3:20             ` Baokun Li
  0 siblings, 1 reply; 16+ messages in thread
From: Theodore Ts'o @ 2022-11-29 21:00 UTC (permalink / raw)
  To: Baokun Li
  Cc: Luís Henriques, Andreas Dilger, linux-ext4, linux-kernel, stable

On Tue, Nov 29, 2022 at 11:18:34AM +0800, Baokun Li wrote:
> 
> In my opinion, the s_journal_inum should not be modified when the
> file system is mounted, especially after we have successfully loaded
> and replayed the journal with the current s_journal_inum. Even if
> the s_journal_inumon the disk is modified, we should use the current
> one. This is how journal_devnum is handled in ext4_load_journal():
> 
>          if (!really_read_only && journal_devnum &&
>              journal_devnum != le32_to_cpu(es->s_journal_dev)) {
>                  es->s_journal_dev = cpu_to_le32(journal_devnum);
> 
>                  /* Make sure we flush the recovery flag to disk. */
>                  ext4_commit_super(sb);
>          }
> 
> We can avoid this problem by adding a similar check for journal_inum in
> ext4_load_journal().

This check you've pointed out wasn't actually intended to protect
against the problem where the journal_inum is getting overwritten by
the journal replay.  The s_journal_dev field is a hint about where to
find the external journal.  However, this can change over time --- for
example, if a SCSI disk is removed from the system, so /dev/sdcXX
becomes /dev/sbdXX.  The official way to find the journal device is
via the external journal's UUID.  So userspace might use a command
like:

  mount -t ext4 -o journal_path="$(blkid -U <journal uuid>)" UUID=<fs uuid> /mnt

So s_journal_devnum might get updated, and we don't want the hint to
get overwritten by the journal replay.  So that's why the code that
you've quoted exists (and this goes all the way back to ext3).  It's a
code path that can be quite legitimately triggered when the location
of the external journal device changes (or when the device's
major/minor numbers get renumbered).

Now, we *could* do something like this for s_journal_inum, but it
would be for a different purpose.  In practice, this would never
happen in real life due to random bit flips, since the journal is
protected using checksum.  It can only happen when there are
deliberately, maliciously fuzzed file system images, such as was the
case here.  And s_journal_inum is only one of any number of superblock
fields that shouldn't ever be modified by the journal replay.  We have
had previous failures caused by we validated the superblock fields to
be valid, but then after that, we replay the journal, and then it
turns out the superblock fields are incorrect.  (And then some Red Hat
principal engineer will try to call it a high severity CVE, which is
really bullshit, since if you allow random unprivileged processes to
mount arbitrary file system images, you've got other problems.  Don't
do that.)

If we *really* cared about these sorts of problems, we should special
case the journal replay of certain blocks, such as the superblock, and
validate those blocks to make sure they are not crazy --- and if it is
crazy, we should abort the journal replay right then and there.

Alternatively, one *could* consider making a copy of certain blocks
(in particular the superblock and block group descriptors), and then
do a post-hoc validation of the superblock after the replay --- and if
it is invalid, we could put the old superblock back.  But we need to
remember that sometimes superblock fields *can* change.  For example,
in the case of online resize, we can't just say that if the old
superblock value is different from the new superblock value, something
Must Be Wrong.  That being said, if the result of the journal replay
ends up with a result where s_block_count is greater than the physical
block device, *that's* something that is probably wrong.

That being said, the question is whether all of this complexity is
really all *that* necessary, since again, thanks to checksums, short
of Malicious File System Fuzzing, this should never happen.  If all of
this complexity is only needed to protect against malicious fuzzed
file systems, then maybe it's not the worth it.

If we can protect against the problem by adding a check that has other
value as well (such as making usre that when ext4_iget fetches a
special inode, we enforce that i_links_couint must be > 0), maybe
that's worth it.

So ultimately it's all a question of engineering tradeoffs.  Is it
worth it to check for s_journal_inum changing, and changing it back?
Meh.  At the very least we would want to add a warning, since this is
only going happen in case of a malicious fuzzed file system.  And
since we're only undoing the s_journal_inum update, there may be other
superblock fields beyond s_journal_inum that could get modified by the
malicious file system fuzzer.  So how many post hoc checks do we
really want to be adding here?  Should we also do similar checks for
s_blocks_per_group?  s_inodes_per_group?  s_log_block_size?
s_first_ino?  s_first_data_block?  I realize this is a slipperly slope
argument; but the bottom line that it's not immediately obvious that
it's good idea to only worry about s_journal_inum.

Cheers,

						- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-29 21:00           ` Theodore Ts'o
@ 2022-11-30  3:20             ` Baokun Li
  2022-12-01  4:32               ` Theodore Ts'o
  0 siblings, 1 reply; 16+ messages in thread
From: Baokun Li @ 2022-11-30  3:20 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Luís Henriques, Andreas Dilger, linux-ext4, linux-kernel,
	stable, Baokun Li

On 2022/11/30 5:00, Theodore Ts'o wrote:
> On Tue, Nov 29, 2022 at 11:18:34AM +0800, Baokun Li wrote:
>> In my opinion, the s_journal_inum should not be modified when the
>> file system is mounted, especially after we have successfully loaded
>> and replayed the journal with the current s_journal_inum. Even if
>> the s_journal_inumon the disk is modified, we should use the current
>> one. This is how journal_devnum is handled in ext4_load_journal():
>>
>>           if (!really_read_only && journal_devnum &&
>>               journal_devnum != le32_to_cpu(es->s_journal_dev)) {
>>                   es->s_journal_dev = cpu_to_le32(journal_devnum);
>>
>>                   /* Make sure we flush the recovery flag to disk. */
>>                   ext4_commit_super(sb);
>>           }
>>
>> We can avoid this problem by adding a similar check for journal_inum in
>> ext4_load_journal().
> This check you've pointed out wasn't actually intended to protect
> against the problem where the journal_inum is getting overwritten by
> the journal replay.  The s_journal_dev field is a hint about where to
> find the external journal.  However, this can change over time --- for
> example, if a SCSI disk is removed from the system, so /dev/sdcXX
> becomes /dev/sbdXX.  The official way to find the journal device is
> via the external journal's UUID.  So userspace might use a command
> like:
>
>    mount -t ext4 -o journal_path="$(blkid -U <journal uuid>)" UUID=<fs uuid> /mnt
>
> So s_journal_devnum might get updated, and we don't want the hint to
> get overwritten by the journal replay.  So that's why the code that
> you've quoted exists (and this goes all the way back to ext3).  It's a
> code path that can be quite legitimately triggered when the location
> of the external journal device changes (or when the device's
> major/minor numbers get renumbered).

I get it! Thank you very much for your patient and detailed explanation!

> Now, we *could* do something like this for s_journal_inum, but it
> would be for a different purpose.  In practice, this would never
> happen in real life due to random bit flips, since the journal is
> protected using checksum.  It can only happen when there are
> deliberately, maliciously fuzzed file system images, such as was the
> case here.
Totally agree! Because of this, we should intercept these anomalies in a 
simpler way
at a more peripheral location.

> And s_journal_inum is only one of any number of superblock
> fields that shouldn't ever be modified by the journal replay.  We have
> had previous failures caused by we validated the superblock fields to
> be valid, but then after that, we replay the journal, and then it
> turns out the superblock fields are incorrect.  (And then some Red Hat
> principal engineer will try to call it a high severity CVE, which is
> really bullshit, since if you allow random unprivileged processes to
> mount arbitrary file system images, you've got other problems.  Don't
> do that.)
Indeed, these fuzzy image problems are triggered by mounting first.
However, mounting an unchecked image and loose mounting permission
management are inherently problematic. But many times we can't ask too
much from users, because there are too many scenarios for linux.
>
> If we *really* cared about these sorts of problems, we should special
> case the journal replay of certain blocks, such as the superblock, and
> validate those blocks to make sure they are not crazy --- and if it is
> crazy, we should abort the journal replay right then and there.
>
> Alternatively, one *could* consider making a copy of certain blocks
> (in particular the superblock and block group descriptors), and then
> do a post-hoc validation of the superblock after the replay --- and if
> it is invalid, we could put the old superblock back.  But we need to
> remember that sometimes superblock fields *can* change.  For example,
> in the case of online resize, we can't just say that if the old
> superblock value is different from the new superblock value, something
> Must Be Wrong.  That being said, if the result of the journal replay
> ends up with a result where s_block_count is greater than the physical
> block device, *that's* something that is probably wrong.
>
> That being said, the question is whether all of this complexity is
> really all *that* necessary, since again, thanks to checksums, short
> of Malicious File System Fuzzing, this should never happen.  If all of
> this complexity is only needed to protect against malicious fuzzed
> file systems, then maybe it's not the worth it.

Yes, it's too complicated to fully check the data after journal replays.

> If we can protect against the problem by adding a check that has other
> value as well (such as making usre that when ext4_iget fetches a
> special inode, we enforce that i_links_couint must be > 0), maybe
> that's worth it.
Yes, but some special inodes allow i_links_couint to be zero,
such as the uninitialized boot load inode.
>
> So ultimately it's all a question of engineering tradeoffs.  Is it
> worth it to check for s_journal_inum changing, and changing it back?
> Meh.  At the very least we would want to add a warning, since this is
> only going happen in case of a malicious fuzzed file system.  And
> since we're only undoing the s_journal_inum update, there may be other
> superblock fields beyond s_journal_inum that could get modified by the
> malicious file system fuzzer.  So how many post hoc checks do we
> really want to be adding here?  Should we also do similar checks for
> s_blocks_per_group?  s_inodes_per_group?  s_log_block_size?
> s_first_ino?  s_first_data_block?  I realize this is a slipperly slope
> argument; but the bottom line that it's not immediately obvious that
> it's good idea to only worry about s_journal_inum.
>
> Cheers,
>
> 						- Ted
That's right! We don't have to divergence this problem, just focus on 
s_journal_inum.
We only need to ensure that the s_journal_inum on the disk is the same 
as that in use.
The s_journal_inum in use passes the check. In addition, the current 
journal is recorded
on the journal inode in use.

Thanks again for your patience!

-- 
With Best Regards,
Baokun Li
.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-11-30  3:20             ` Baokun Li
@ 2022-12-01  4:32               ` Theodore Ts'o
  2022-12-01  6:20                 ` Baokun Li
  0 siblings, 1 reply; 16+ messages in thread
From: Theodore Ts'o @ 2022-12-01  4:32 UTC (permalink / raw)
  To: Baokun Li
  Cc: Luís Henriques, Andreas Dilger, linux-ext4, linux-kernel, stable

On Wed, Nov 30, 2022 at 11:20:11AM +0800, Baokun Li wrote:
> > If we can protect against the problem by adding a check that has other
> > value as well (such as making usre that when ext4_iget fetches a
> > special inode, we enforce that i_links_couint must be > 0), maybe
> > that's worth it.
>
> Yes, but some special inodes allow i_links_couint to be zero,
> such as the uninitialized boot load inode.

That's a good point; but the only time when a special inode can
validly have a zero i_links_count is when it has no blocks associated
to it.  Otherwise, when the file system releases the inode using
iput() when the file system is unmounted, all of the blocks will get
released when the inode is evicted.  So we can have ext4_iget() allow
fetching an inode if i_blocks[] is zeros.  But if it has any blocks
and i_links_count is non-zero, something must be terribly wrong with
that inode.

Cheers,

					- Ted

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] ext4: fix a NULL pointer when validating an inode bitmap
  2022-12-01  4:32               ` Theodore Ts'o
@ 2022-12-01  6:20                 ` Baokun Li
  0 siblings, 0 replies; 16+ messages in thread
From: Baokun Li @ 2022-12-01  6:20 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Luís Henriques, Andreas Dilger, linux-ext4, linux-kernel, stable

On 2022/12/1 12:32, Theodore Ts'o wrote:
> On Wed, Nov 30, 2022 at 11:20:11AM +0800, Baokun Li wrote:
>>> If we can protect against the problem by adding a check that has other
>>> value as well (such as making usre that when ext4_iget fetches a
>>> special inode, we enforce that i_links_couint must be > 0), maybe
>>> that's worth it.
>> Yes, but some special inodes allow i_links_couint to be zero,
>> such as the uninitialized boot load inode.
> That's a good point; but the only time when a special inode can
> validly have a zero i_links_count is when it has no blocks associated
> to it.  Otherwise, when the file system releases the inode using
> iput() when the file system is unmounted, all of the blocks will get
> released when the inode is evicted.  So we can have ext4_iget() allow
> fetching an inode if i_blocks[] is zeros.  But if it has any blocks
> and i_links_count is non-zero, something must be terribly wrong with
> that inode.
>
> Cheers,
>
> 					- Ted
>
Totally agree! That sounds good!

-- 
With Best Regards,
Baokun Li
.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len
@ 2022-10-12 13:16 Luís Henriques
  0 siblings, 0 replies; 16+ messages in thread
From: Luís Henriques @ 2022-10-12 13:16 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger
  Cc: linux-ext4, linux-kernel, Luís Henriques, stable

The rec_len field in the directory entry has to be a multiple of 4.  A
corrupted filesystem image can be used to hit a BUG() in
ext4_rec_len_to_disk(), called from make_indexed_dir().

 ------------[ cut here ]------------
 kernel BUG at fs/ext4/ext4.h:2413!
 ...
 RIP: 0010:make_indexed_dir+0x53f/0x5f0
 ...
 Call Trace:
  <TASK>
  ? add_dirent_to_buf+0x1b2/0x200
  ext4_add_entry+0x36e/0x480
  ext4_add_nondir+0x2b/0xc0
  ext4_create+0x163/0x200
  path_openat+0x635/0xe90
  do_filp_open+0xb4/0x160
  ? __create_object.isra.0+0x1de/0x3b0
  ? _raw_spin_unlock+0x12/0x30
  do_sys_openat2+0x91/0x150
  __x64_sys_open+0x6c/0xa0
  do_syscall_64+0x3c/0x80
  entry_SYSCALL_64_after_hwframe+0x46/0xb0

The fix simply adds a call to ext4_check_dir_entry() to validate the
directory entry, returning -EFSCORRUPTED if the entry is invalid.

CC: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216540
Signed-off-by: Luís Henriques <lhenriques@suse.de>
---
* Changes since v1:

As suggested by Ted, I've removed the incorrect 'de->rec_len' check from
previous version and replaced it with a call to ext4_check_dir_entry()
instead, which is a much more complete verification.

 fs/ext4/namei.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3a31b662f661..ed76e89ffbe9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2254,8 +2254,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
 	memset(de, 0, len); /* wipe old data */
 	de = (struct ext4_dir_entry_2 *) data2;
 	top = data2 + len;
-	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+	while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+					 (data2 + (blocksize - csum_size) -
+					  (char *) de))) {
+			brelse(bh2);
+			brelse(bh);
+			return -EFSCORRUPTED;
+		}
 		de = de2;
+	}
 	de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
 					   (char *) de, blocksize);
 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2022-12-01  6:20 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-10 14:20 [PATCH] ext4: fix a NULL pointer when validating an inode bitmap Luís Henriques
2022-10-11 15:56 ` [PATCH v2] " Luís Henriques
2022-11-06  0:32   ` Theodore Ts'o
2022-11-08 14:06     ` Luís Henriques
2022-11-28 22:28       ` Theodore Ts'o
2022-11-29  3:18         ` Baokun Li
2022-11-29 21:00           ` Theodore Ts'o
2022-11-30  3:20             ` Baokun Li
2022-12-01  4:32               ` Theodore Ts'o
2022-12-01  6:20                 ` Baokun Li
2022-10-12 13:13 ` [PATCH v2] ext4: fix BUG_ON() when directory entry has invalid rec_len Luís Henriques
2022-10-12 13:16   ` Luís Henriques
2022-10-12 14:21     ` Theodore Ts'o
2022-10-12 15:18       ` Luís Henriques
2022-11-06  6:16   ` Theodore Ts'o
2022-10-12 13:16 Luís Henriques

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.