All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] ext4: save the error code which triggered an ext4_error() in the superblock
@ 2019-11-21 18:30 Theodore Ts'o
  2019-11-21 18:30 ` [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata Theodore Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Ts'o @ 2019-11-21 18:30 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: Theodore Ts'o

This allows the cause of an ext4_error() report to be categorized
based on whether it was triggered due to an I/O error, or an memory
allocation error, or other possible causes.  Most errors are caused by
a detected file system inconsistency, so the default code stored in
the superblock will be EXT4_ERR_EFSCORRUPTED.

Previous-Version-Link: https://lore.kernel.org/r/20191120030224.22190-1-tytso@mit.edu
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c    |  1 +
 fs/ext4/ext4.h      | 30 +++++++++++++++++++-
 fs/ext4/ext4_jbd2.c |  3 ++
 fs/ext4/extents.c   |  1 +
 fs/ext4/ialloc.c    |  2 ++
 fs/ext4/inline.c    |  2 ++
 fs/ext4/inode.c     |  8 +++++-
 fs/ext4/mballoc.c   |  4 +++
 fs/ext4/mmp.c       |  6 +++-
 fs/ext4/namei.c     |  4 +++
 fs/ext4/super.c     | 68 ++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/xattr.c     |  4 ++-
 12 files changed, 128 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..102c38527a10 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -506,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 		return -EFSCORRUPTED;
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
+		ext4_set_errno(sb, EIO);
 		ext4_error(sb, "Cannot read block bitmap - "
 			   "block_group = %u, block_bitmap = %llu",
 			   block_group, (unsigned long long) bh->b_blocknr);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 61987c106511..1c9ac0fc8715 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1343,7 +1343,8 @@ struct ext4_super_block {
 	__u8	s_lastcheck_hi;
 	__u8	s_first_error_time_hi;
 	__u8	s_last_error_time_hi;
-	__u8	s_pad[2];
+	__u8	s_first_error_errcode;
+	__u8    s_last_error_errcode;
 	__le16  s_encoding;		/* Filename charset encoding */
 	__le16  s_encoding_flags;	/* Filename charset encoding flags */
 	__le32	s_reserved[95];		/* Padding to the end of the block */
@@ -1574,6 +1575,32 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+/*
+ * Error number codes for s_{first,last}_error_errno
+ *
+ * Linux errno numbers are architecture specific, so we need to translate
+ * them into something which is architecture independent.   We don't define
+ * codes for all errno's; just the ones which are most likely to be the cause
+ * of an ext4_error() call.
+ */
+#define EXT4_ERR_UNKNOWN	 1
+#define EXT4_ERR_EIO		 2
+#define EXT4_ERR_ENOMEM		 3
+#define EXT4_ERR_EFSBADCRC	 4
+#define EXT4_ERR_EFSCORRUPTED	 5
+#define EXT4_ERR_ENOSPC		 6
+#define EXT4_ERR_ENOKEY		 7
+#define EXT4_ERR_EROFS		 8
+#define EXT4_ERR_EFBIG		 9
+#define EXT4_ERR_EEXIST		10
+#define EXT4_ERR_ERANGE		11
+#define EXT4_ERR_EOVERFLOW	12
+#define EXT4_ERR_EBUSY		13
+#define EXT4_ERR_ENOTDIR	14
+#define EXT4_ERR_ENOTEMPTY	15
+#define EXT4_ERR_ESHUTDOWN	16
+#define EXT4_ERR_EFAULT		17
+
 /*
  * Inode dynamic state flags
  */
@@ -2686,6 +2713,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
 extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
 					     ext4_group_t block_group,
 					     unsigned int flags);
+extern void ext4_set_errno(struct super_block *sb, int err);
 
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d3b8cdea5df7..19217a3f1ae4 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -58,6 +58,7 @@ static int ext4_journal_check_start(struct super_block *sb)
 	 * take the FS itself readonly cleanly.
 	 */
 	if (journal && is_journal_aborted(journal)) {
+		ext4_set_errno(sb, -journal->j_errno);
 		ext4_abort(sb, "Detected aborted journal");
 		return -EROFS;
 	}
@@ -249,6 +250,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
 	if (err) {
 		ext4_journal_abort_handle(where, line, __func__,
 					  bh, handle, err);
+		ext4_set_errno(inode->i_sb, -err);
 		__ext4_abort(inode->i_sb, where, line,
 			   "error %d when attempting revoke", err);
 	}
@@ -320,6 +322,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 				es = EXT4_SB(inode->i_sb)->s_es;
 				es->s_last_error_block =
 					cpu_to_le64(bh->b_blocknr);
+				ext4_set_errno(inode->i_sb, EIO);
 				ext4_error_inode(inode, where, line,
 						 bh->b_blocknr,
 					"IO error syncing itable block");
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e8708b77da6..ee83fe7c98aa 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -492,6 +492,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
 	return 0;
 
 corrupted:
+	ext4_set_errno(inode->i_sb, -err);
 	ext4_error_inode(inode, function, line, 0,
 			 "pblk %llu bad header/extent: %s - magic %x, "
 			 "entries %u, max %u(%u), depth %u(%u)",
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fa8c3c485e4b..31a5fd6f5e6a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -194,6 +194,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
+		ext4_set_errno(sb, EIO);
 		ext4_error(sb, "Cannot read inode bitmap - "
 			   "block_group = %u, inode_bitmap = %llu",
 			   block_group, bitmap_blk);
@@ -1228,6 +1229,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
+		ext4_set_errno(sb, -err);
 		ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
 			   ino, err);
 		return inode;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2fec62d764fa..e61603f47035 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode)
 
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error) {
+		ext4_set_errno(inode->i_sb, -error);
 		ext4_error_inode(inode, __func__, __LINE__, 0,
 				 "can't get inode location %lu",
 				 inode->i_ino);
@@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
 
 	err = ext4_get_inode_loc(dir, &iloc);
 	if (err) {
+		ext4_set_errno(dir->i_sb, -err);
 		EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
 				 err, dir->i_ino);
 		return true;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index de70f19bfa7e..b67ffa24ae0a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -271,6 +271,7 @@ void ext4_evict_inode(struct inode *inode)
 	if (inode->i_blocks) {
 		err = ext4_truncate(inode);
 		if (err) {
+			ext4_set_errno(inode->i_sb, -err);
 			ext4_error(inode->i_sb,
 				   "couldn't truncate inode %lu (err %d)",
 				   inode->i_ino, err);
@@ -2478,10 +2479,12 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 			EXT4_I(inode)->i_disksize = disksize;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		err2 = ext4_mark_inode_dirty(handle, inode);
-		if (err2)
+		if (err2) {
+			ext4_set_errno(inode->i_sb, -err2);
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
 				   inode->i_ino);
+		}
 		if (!err)
 			err = err2;
 	}
@@ -4338,6 +4341,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
+			ext4_set_errno(inode->i_sb, EIO);
 			EXT4_ERROR_INODE_BLOCK(inode, block,
 					       "unable to read itable block");
 			brelse(bh);
@@ -4552,6 +4556,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	}
 
 	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+		ext4_set_errno(inode->i_sb, EFSBADCRC);
 		ext4_error_inode(inode, function, line, 0,
 				 "iget: checksum invalid");
 		ret = -EFSBADCRC;
@@ -5090,6 +5095,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+			ext4_set_errno(inode->i_sb, EIO);
 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
 					 "IO error syncing inode");
 			err = -EIO;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..f64838187559 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (IS_ERR(bitmap_bh)) {
 		err = PTR_ERR(bitmap_bh);
+		ext4_set_errno(sb, -err);
 		ext4_error(sb, "Error %d reading block bitmap for %u",
 			   err, group);
 		return 0;
@@ -4063,6 +4064,7 @@ void ext4_discard_preallocations(struct inode *inode)
 		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
 					     GFP_NOFS|__GFP_NOFAIL);
 		if (err) {
+			ext4_set_errno(sb, -err);
 			ext4_error(sb, "Error %d loading buddy information for %u",
 				   err, group);
 			continue;
@@ -4071,6 +4073,7 @@ void ext4_discard_preallocations(struct inode *inode)
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (IS_ERR(bitmap_bh)) {
 			err = PTR_ERR(bitmap_bh);
+			ext4_set_errno(sb, -err);
 			ext4_error(sb, "Error %d reading block bitmap for %u",
 					err, group);
 			ext4_mb_unload_buddy(&e4b);
@@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
 					     GFP_NOFS|__GFP_NOFAIL);
 		if (err) {
+			ext4_set_errno(sb, -err);
 			ext4_error(sb, "Error %d loading buddy information for %u",
 				   err, group);
 			continue;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 2305b4374fd3..1c44b1a32001 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -173,8 +173,10 @@ static int kmmpd(void *data)
 		 * (s_mmp_update_interval * 60) seconds.
 		 */
 		if (retval) {
-			if ((failed_writes % 60) == 0)
+			if ((failed_writes % 60) == 0) {
+				ext4_set_errno(sb, -retval);
 				ext4_error(sb, "Error writing to MMP block");
+			}
 			failed_writes++;
 		}
 
@@ -205,6 +207,7 @@ static int kmmpd(void *data)
 
 			retval = read_mmp_block(sb, &bh_check, mmp_block);
 			if (retval) {
+				ext4_set_errno(sb, -retval);
 				ext4_error(sb, "error reading MMP data: %d",
 					   retval);
 				goto exit_thread;
@@ -218,6 +221,7 @@ static int kmmpd(void *data)
 					     "Error while updating MMP info. "
 					     "The filesystem seems to have been"
 					     " multiply mounted.");
+				ext4_set_errno(sb, EBUSY);
 				ext4_error(sb, "abort");
 				put_bh(bh_check);
 				retval = -EBUSY;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a67cae3c8ff5..cee7c28e070d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -156,6 +156,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		if (ext4_dx_csum_verify(inode, dirent))
 			set_buffer_verified(bh);
 		else {
+			ext4_set_errno(inode->i_sb, EFSBADCRC);
 			ext4_error_inode(inode, func, line, block,
 					 "Directory index failed checksum");
 			brelse(bh);
@@ -166,6 +167,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		if (ext4_dirblock_csum_verify(inode, bh))
 			set_buffer_verified(bh);
 		else {
+			ext4_set_errno(inode->i_sb, EFSBADCRC);
 			ext4_error_inode(inode, func, line, block,
 					 "Directory block failed checksum");
 			brelse(bh);
@@ -1527,6 +1529,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
 			goto next;
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
+			ext4_set_errno(sb, EIO);
 			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
 					 (unsigned long) block);
 			brelse(bh);
@@ -1537,6 +1540,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
 		    !is_dx_internal_node(dir, block,
 					 (struct ext4_dir_entry *)bh->b_data) &&
 		    !ext4_dirblock_csum_verify(dir, bh)) {
+			ext4_set_errno(sb, EFSBADCRC);
 			EXT4_ERROR_INODE(dir, "checksumming directory "
 					 "block %lu", (unsigned long)block);
 			brelse(bh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7796e2ffc294..0234e3522b0c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -367,6 +367,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
 	ext4_update_tstamp(es, s_last_error_time);
 	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
 	es->s_last_error_line = cpu_to_le32(line);
+	if (es->s_last_error_errcode == 0)
+		es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED;
 	if (!es->s_first_error_time) {
 		es->s_first_error_time = es->s_last_error_time;
 		es->s_first_error_time_hi = es->s_last_error_time_hi;
@@ -375,6 +377,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
 		es->s_first_error_line = cpu_to_le32(line);
 		es->s_first_error_ino = es->s_last_error_ino;
 		es->s_first_error_block = es->s_last_error_block;
+		es->s_first_error_errcode = es->s_last_error_errcode;
 	}
 	/*
 	 * Start the daily error reporting function if it hasn't been
@@ -631,6 +634,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
 	return errstr;
 }
 
+void ext4_set_errno(struct super_block *sb, int err)
+{
+	if (err < 0)
+		err = -err;
+
+	switch (err) {
+	case EIO:
+		err = EXT4_ERR_EIO;
+		break;
+	case ENOMEM:
+		err = EXT4_ERR_ENOMEM;
+		break;
+	case EFSBADCRC:
+		err = EXT4_ERR_EFSBADCRC;
+		break;
+	case EFSCORRUPTED:
+		err = EXT4_ERR_EFSCORRUPTED;
+		break;
+	case ENOSPC:
+		err = EXT4_ERR_ENOSPC;
+		break;
+	case ENOKEY:
+		err = EXT4_ERR_ENOKEY;
+		break;
+	case EROFS:
+		err = EXT4_ERR_EROFS;
+		break;
+	case EFBIG:
+		err = EXT4_ERR_EFBIG;
+		break;
+	case EEXIST:
+		err = EXT4_ERR_EEXIST;
+		break;
+	case ERANGE:
+		err = EXT4_ERR_ERANGE;
+		break;
+	case EOVERFLOW:
+		err = EXT4_ERR_EOVERFLOW;
+		break;
+	case EBUSY:
+		err = EXT4_ERR_EBUSY;
+		break;
+	case ENOTDIR:
+		err = EXT4_ERR_ENOTDIR;
+		break;
+	case ENOTEMPTY:
+		err = EXT4_ERR_ENOTEMPTY;
+		break;
+	case ESHUTDOWN:
+		err = EXT4_ERR_ESHUTDOWN;
+		break;
+	case EFAULT:
+		err = EXT4_ERR_EFAULT;
+		break;
+	default:
+		err = EXT4_ERR_UNKNOWN;
+	}
+	EXT4_SB(sb)->s_es->s_last_error_errcode = err;
+}
+
 /* __ext4_std_error decodes expected errors from journaling functions
  * automatically and invokes the appropriate error response.  */
 
@@ -655,6 +718,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
 		       sb->s_id, function, line, errstr);
 	}
 
+	ext4_set_errno(sb, -errno);
 	save_error_info(sb, function, line);
 	ext4_handle_error(sb);
 }
@@ -982,8 +1046,10 @@ static void ext4_put_super(struct super_block *sb)
 		aborted = is_journal_aborted(sbi->s_journal);
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
-		if ((err < 0) && !aborted)
+		if ((err < 0) && !aborted) {
+			ext4_set_errno(sb, -err);
 			ext4_abort(sb, "Couldn't clean up the journal");
+		}
 	}
 
 	ext4_unregister_sysfs(sb);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8966a5439a22..246fbeeb6366 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
 		if (IS_ERR(bh)) {
 			error = PTR_ERR(bh);
-			if (error == -EIO)
+			if (error == -EIO) {
+				ext4_set_errno(inode->i_sb, EIO);
 				EXT4_ERROR_INODE(inode, "block %llu read error",
 						 EXT4_I(inode)->i_file_acl);
+			}
 			bh = NULL;
 			goto cleanup;
 		}
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-21 18:30 [PATCH 1/2] ext4: save the error code which triggered an ext4_error() in the superblock Theodore Ts'o
@ 2019-11-21 18:30 ` Theodore Ts'o
  2019-11-22  0:09   ` Darrick J. Wong
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Ts'o @ 2019-11-21 18:30 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: Theodore Ts'o

This allows us to test various error handling code paths

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c |  4 +++-
 fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ialloc.c |  4 +++-
 fs/ext4/inode.c  |  6 +++++-
 fs/ext4/namei.c  | 11 ++++++++---
 fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 102c38527a10..5f993a411251 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
 	if (buffer_verified(bh))
 		goto verified;
 	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
-			desc, bh))) {
+						    desc, bh) ||
+		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
 		ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
 	if (!desc)
 		return -EFSCORRUPTED;
 	wait_on_buffer(bh);
+	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
 	if (!buffer_uptodate(bh)) {
 		ext4_set_errno(sb, EIO);
 		ext4_error(sb, "Cannot read block bitmap - "
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1c9ac0fc8715..e6798db4634c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1557,6 +1557,9 @@ struct ext4_sb_info {
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
 	struct dax_device *s_daxdev;
+#ifdef CONFIG_EXT4_DEBUG
+	unsigned long s_simulate_fail;
+#endif
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1575,6 +1578,45 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+static inline int ext4_simulate_fail(struct super_block *sb,
+				     unsigned long flag)
+{
+#ifdef CONFIG_EXT4_DEBUG
+	unsigned long old, new;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	do {
+		old = READ_ONCE(sbi->s_simulate_fail);
+		if (likely((old & flag) == 0))
+			return 0;
+		new = old & ~flag;
+	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+static inline void ext4_simulate_fail_bh(struct super_block *sb,
+					 struct buffer_head *bh,
+					 unsigned long flag)
+{
+	if (!IS_ERR(bh) && ext4_simulate_fail(sb, flag))
+		clear_buffer_uptodate(bh);
+}
+
+/*
+ * Simulate_fail flags
+ */
+#define EXT4_SIM_BBITMAP_EIO	0x00000001
+#define EXT4_SIM_BBITMAP_CRC	0x00000002
+#define EXT4_SIM_IBITMAP_EIO	0x00000004
+#define EXT4_SIM_IBITMAP_CRC	0x00000008
+#define EXT4_SIM_INODE_EIO	0x00000010
+#define EXT4_SIM_INODE_CRC	0x00000020
+#define EXT4_SIM_DIRBLOCK_EIO	0x00000040
+#define EXT4_SIM_DIRBLOCK_CRC	0x00000080
+
 /*
  * Error number codes for s_{first,last}_error_errno
  *
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 31a5fd6f5e6a..f7033594f4a0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
 		goto verified;
 	blk = ext4_inode_bitmap(sb, desc);
 	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
-					   EXT4_INODES_PER_GROUP(sb) / 8)) {
+					   EXT4_INODES_PER_GROUP(sb) / 8) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
 		ext4_unlock_group(sb, block_group);
 		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
 			   "inode_bitmap = %llu", block_group, blk);
@@ -192,6 +193,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	get_bh(bh);
 	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
+	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_set_errno(sb, EIO);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b67ffa24ae0a..564f88040de9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4243,6 +4243,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	bh = sb_getblk(sb, block);
 	if (unlikely(!bh))
 		return -ENOMEM;
+	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+		goto simulate_eio;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -4341,6 +4343,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		blk_finish_plug(&plug);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
+		simulate_eio:
 			ext4_set_errno(inode->i_sb, EIO);
 			EXT4_ERROR_INODE_BLOCK(inode, block,
 					       "unable to read itable block");
@@ -4555,7 +4558,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 					      sizeof(gen));
 	}
 
-	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
 		ext4_set_errno(inode->i_sb, EFSBADCRC);
 		ext4_error_inode(inode, function, line, 0,
 				 "iget: checksum invalid");
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cee7c28e070d..be14e33e5103 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int is_dx_block = 0;
 
-	bh = ext4_bread(NULL, inode, block, 0);
+	if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+		bh = ERR_PTR(-EIO);
+	else
+		bh = ext4_bread(NULL, inode, block, 0);
 	if (IS_ERR(bh)) {
 		__ext4_warning(inode->i_sb, func, line,
 			       "inode #%lu: lblock %lu: comm %s: "
@@ -153,7 +156,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	 * caller is sure it should be an index block.
 	 */
 	if (is_dx_block && type == INDEX) {
-		if (ext4_dx_csum_verify(inode, dirent))
+		if (ext4_dx_csum_verify(inode, dirent) &&
+		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
 			set_buffer_verified(bh);
 		else {
 			ext4_set_errno(inode->i_sb, EFSBADCRC);
@@ -164,7 +168,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 		}
 	}
 	if (!is_dx_block) {
-		if (ext4_dirblock_csum_verify(inode, bh))
+		if (ext4_dirblock_csum_verify(inode, bh) &&
+		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
 			set_buffer_verified(bh);
 		else {
 			ext4_set_errno(inode->i_sb, EFSBADCRC);
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..a990d28d191b 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,7 @@ typedef enum {
 	attr_last_error_time,
 	attr_feature,
 	attr_pointer_ui,
+	attr_pointer_ul,
 	attr_pointer_atomic,
 	attr_journal_task,
 } attr_id_t;
@@ -160,6 +161,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
 #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
 	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
 
+#define EXT4_RW_ATTR_SBI_UL(_name,_elname)	\
+	EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+
 #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
@@ -194,6 +198,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
 EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+#ifdef CONFIG_EXT4_DEBUG
+EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
+#endif
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_ATTR(first_error_time, 0444, first_error_time);
 EXT4_ATTR(last_error_time, 0444, last_error_time);
@@ -228,6 +235,9 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
 	ATTR_LIST(journal_task),
+#ifdef CONFIG_EXT4_DEBUG
+	ATTR_LIST(simulate_fail),
+#endif
 	NULL,
 };
 ATTRIBUTE_GROUPS(ext4);
@@ -318,6 +328,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		else
 			return snprintf(buf, PAGE_SIZE, "%u\n",
 					*((unsigned int *) ptr));
+	case attr_pointer_ul:
+		if (!ptr)
+			return 0;
+		return snprintf(buf, PAGE_SIZE, "%lu\n",
+				*((unsigned long *) ptr));
 	case attr_pointer_atomic:
 		if (!ptr)
 			return 0;
@@ -361,6 +376,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		else
 			*((unsigned int *) ptr) = t;
 		return len;
+	case attr_pointer_ul:
+		if (!ptr)
+			return 0;
+		ret = kstrtoul(skip_spaces(buf), 0, &t);
+		if (ret)
+			return ret;
+		*((unsigned long *) ptr) = t;
+		return len;
 	case attr_inode_readahead:
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
-- 
2.23.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-21 18:30 ` [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata Theodore Ts'o
@ 2019-11-22  0:09   ` Darrick J. Wong
  2019-11-22  1:00     ` Theodore Y. Ts'o
  2019-11-22 23:46     ` Andreas Dilger
  0 siblings, 2 replies; 7+ messages in thread
From: Darrick J. Wong @ 2019-11-22  0:09 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Ext4 Developers List

On Thu, Nov 21, 2019 at 01:30:36PM -0500, Theodore Ts'o wrote:
> This allows us to test various error handling code paths
> 
> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
> ---
>  fs/ext4/balloc.c |  4 +++-
>  fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/ialloc.c |  4 +++-
>  fs/ext4/inode.c  |  6 +++++-
>  fs/ext4/namei.c  | 11 ++++++++---
>  fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
>  6 files changed, 84 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 102c38527a10..5f993a411251 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>  	if (buffer_verified(bh))
>  		goto verified;
>  	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
> -			desc, bh))) {
> +						    desc, bh) ||
> +		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
>  		ext4_unlock_group(sb, block_group);
>  		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
>  		ext4_mark_group_bitmap_corrupted(sb, block_group,
> @@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
>  	if (!desc)
>  		return -EFSCORRUPTED;
>  	wait_on_buffer(bh);
> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
>  	if (!buffer_uptodate(bh)) {
>  		ext4_set_errno(sb, EIO);
>  		ext4_error(sb, "Cannot read block bitmap - "
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1c9ac0fc8715..e6798db4634c 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1557,6 +1557,9 @@ struct ext4_sb_info {
>  	/* Barrier between changing inodes' journal flags and writepages ops. */
>  	struct percpu_rw_semaphore s_journal_flag_rwsem;
>  	struct dax_device *s_daxdev;
> +#ifdef CONFIG_EXT4_DEBUG
> +	unsigned long s_simulate_fail;
> +#endif
>  };
>  
>  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
> @@ -1575,6 +1578,45 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>  		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>  }
>  
> +static inline int ext4_simulate_fail(struct super_block *sb,
> +				     unsigned long flag)

Nit: bool?

> +{
> +#ifdef CONFIG_EXT4_DEBUG
> +	unsigned long old, new;
> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
> +
> +	do {
> +		old = READ_ONCE(sbi->s_simulate_fail);
> +		if (likely((old & flag) == 0))
> +			return 0;
> +		new = old & ~flag;
> +	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));

If I'm reading this correctly, this means that userspace sets a
s_simulate_fail bit via sysfs knob, and the next time the filesystem
calls ext4_simulate_fail with the same bit set in @flag we'll return
true to say "simulate the failure" and clear the bit in s_simulate_fail?

IOWs, the simulated failures have to be re-armed every time?

Seems reasonable, but consider the possibility that in the future it
might be useful if you could set up periodic failures (e.g. directory
lookups fail 10% of the time) so that you can see how something like
fsstress reacts to less-predictable failures?

Of course that also increases the amount of fugly sysfs boilerplate so
that each knob can have its own sysfs file... that alone is half of a
reason not to do that. :(

--D

> +	return 1;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline void ext4_simulate_fail_bh(struct super_block *sb,
> +					 struct buffer_head *bh,
> +					 unsigned long flag)
> +{
> +	if (!IS_ERR(bh) && ext4_simulate_fail(sb, flag))
> +		clear_buffer_uptodate(bh);
> +}
> +
> +/*
> + * Simulate_fail flags
> + */
> +#define EXT4_SIM_BBITMAP_EIO	0x00000001
> +#define EXT4_SIM_BBITMAP_CRC	0x00000002
> +#define EXT4_SIM_IBITMAP_EIO	0x00000004
> +#define EXT4_SIM_IBITMAP_CRC	0x00000008
> +#define EXT4_SIM_INODE_EIO	0x00000010
> +#define EXT4_SIM_INODE_CRC	0x00000020
> +#define EXT4_SIM_DIRBLOCK_EIO	0x00000040
> +#define EXT4_SIM_DIRBLOCK_CRC	0x00000080
> +
>  /*
>   * Error number codes for s_{first,last}_error_errno
>   *
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 31a5fd6f5e6a..f7033594f4a0 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
>  		goto verified;
>  	blk = ext4_inode_bitmap(sb, desc);
>  	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
> -					   EXT4_INODES_PER_GROUP(sb) / 8)) {
> +					   EXT4_INODES_PER_GROUP(sb) / 8) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
>  		ext4_unlock_group(sb, block_group);
>  		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
>  			   "inode_bitmap = %llu", block_group, blk);
> @@ -192,6 +193,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
>  	get_bh(bh);
>  	submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
>  	wait_on_buffer(bh);
> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
>  	if (!buffer_uptodate(bh)) {
>  		put_bh(bh);
>  		ext4_set_errno(sb, EIO);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index b67ffa24ae0a..564f88040de9 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4243,6 +4243,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
>  	bh = sb_getblk(sb, block);
>  	if (unlikely(!bh))
>  		return -ENOMEM;
> +	if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
> +		goto simulate_eio;
>  	if (!buffer_uptodate(bh)) {
>  		lock_buffer(bh);
>  
> @@ -4341,6 +4343,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
>  		blk_finish_plug(&plug);
>  		wait_on_buffer(bh);
>  		if (!buffer_uptodate(bh)) {
> +		simulate_eio:
>  			ext4_set_errno(inode->i_sb, EIO);
>  			EXT4_ERROR_INODE_BLOCK(inode, block,
>  					       "unable to read itable block");
> @@ -4555,7 +4558,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
>  					      sizeof(gen));
>  	}
>  
> -	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
> +	if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
> +	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
>  		ext4_set_errno(inode->i_sb, EFSBADCRC);
>  		ext4_error_inode(inode, function, line, 0,
>  				 "iget: checksum invalid");
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index cee7c28e070d..be14e33e5103 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  	struct ext4_dir_entry *dirent;
>  	int is_dx_block = 0;
>  
> -	bh = ext4_bread(NULL, inode, block, 0);
> +	if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
> +		bh = ERR_PTR(-EIO);
> +	else
> +		bh = ext4_bread(NULL, inode, block, 0);
>  	if (IS_ERR(bh)) {
>  		__ext4_warning(inode->i_sb, func, line,
>  			       "inode #%lu: lblock %lu: comm %s: "
> @@ -153,7 +156,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  	 * caller is sure it should be an index block.
>  	 */
>  	if (is_dx_block && type == INDEX) {
> -		if (ext4_dx_csum_verify(inode, dirent))
> +		if (ext4_dx_csum_verify(inode, dirent) &&
> +		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
>  			set_buffer_verified(bh);
>  		else {
>  			ext4_set_errno(inode->i_sb, EFSBADCRC);
> @@ -164,7 +168,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
>  		}
>  	}
>  	if (!is_dx_block) {
> -		if (ext4_dirblock_csum_verify(inode, bh))
> +		if (ext4_dirblock_csum_verify(inode, bh) &&
> +		    !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
>  			set_buffer_verified(bh);
>  		else {
>  			ext4_set_errno(inode->i_sb, EFSBADCRC);
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..a990d28d191b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -29,6 +29,7 @@ typedef enum {
>  	attr_last_error_time,
>  	attr_feature,
>  	attr_pointer_ui,
> +	attr_pointer_ul,
>  	attr_pointer_atomic,
>  	attr_journal_task,
>  } attr_id_t;
> @@ -160,6 +161,9 @@ static struct ext4_attr ext4_attr_##_name = {			\
>  #define EXT4_RW_ATTR_SBI_UI(_name,_elname)	\
>  	EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
>  
> +#define EXT4_RW_ATTR_SBI_UL(_name,_elname)	\
> +	EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
> +
>  #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
>  static struct ext4_attr ext4_attr_##_name = {			\
>  	.attr = {.name = __stringify(_name), .mode = _mode },	\
> @@ -194,6 +198,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
>  EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
>  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
>  EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
> +#ifdef CONFIG_EXT4_DEBUG
> +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
> +#endif
>  EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
>  EXT4_ATTR(first_error_time, 0444, first_error_time);
>  EXT4_ATTR(last_error_time, 0444, last_error_time);
> @@ -228,6 +235,9 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(first_error_time),
>  	ATTR_LIST(last_error_time),
>  	ATTR_LIST(journal_task),
> +#ifdef CONFIG_EXT4_DEBUG
> +	ATTR_LIST(simulate_fail),
> +#endif
>  	NULL,
>  };
>  ATTRIBUTE_GROUPS(ext4);
> @@ -318,6 +328,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
>  		else
>  			return snprintf(buf, PAGE_SIZE, "%u\n",
>  					*((unsigned int *) ptr));
> +	case attr_pointer_ul:
> +		if (!ptr)
> +			return 0;
> +		return snprintf(buf, PAGE_SIZE, "%lu\n",
> +				*((unsigned long *) ptr));
>  	case attr_pointer_atomic:
>  		if (!ptr)
>  			return 0;
> @@ -361,6 +376,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>  		else
>  			*((unsigned int *) ptr) = t;
>  		return len;
> +	case attr_pointer_ul:
> +		if (!ptr)
> +			return 0;
> +		ret = kstrtoul(skip_spaces(buf), 0, &t);
> +		if (ret)
> +			return ret;
> +		*((unsigned long *) ptr) = t;
> +		return len;
>  	case attr_inode_readahead:
>  		return inode_readahead_blks_store(sbi, buf, len);
>  	case attr_trigger_test_error:
> -- 
> 2.23.0
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-22  0:09   ` Darrick J. Wong
@ 2019-11-22  1:00     ` Theodore Y. Ts'o
  2019-11-22  1:18       ` Darrick J. Wong
  2019-11-22 23:46     ` Andreas Dilger
  1 sibling, 1 reply; 7+ messages in thread
From: Theodore Y. Ts'o @ 2019-11-22  1:00 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Ext4 Developers List

On Thu, Nov 21, 2019 at 04:09:33PM -0800, Darrick J. Wong wrote:
> > +static inline int ext4_simulate_fail(struct super_block *sb,
> > +				     unsigned long flag)
> 
> Nit: bool?

Sure, I'll do this for the next version.

> If I'm reading this correctly, this means that userspace sets a
> s_simulate_fail bit via sysfs knob, and the next time the filesystem
> calls ext4_simulate_fail with the same bit set in @flag we'll return
> true to say "simulate the failure" and clear the bit in s_simulate_fail?
> 
> IOWs, the simulated failures have to be re-armed every time?

Yes, that's correct.

> Seems reasonable, but consider the possibility that in the future it
> might be useful if you could set up periodic failures (e.g. directory
> lookups fail 10% of the time) so that you can see how something like
> fsstress reacts to less-predictable failures?

So in theory, we could do that with dm_flakey --- but that's a pain in
the tuckus, since you have to specify the LBA for the directory blocks
that you might want to have fail.  I implemented this so I could have
a quick and dirty way of testing the first patch in this series (and
in fact, I found a bug in the first version of the previous patch, so
I'm glad I spent the time to implement the test patch :-).

What might be interesting to do is some kind of eBPF hook where we
pass in the block #, inode #, and metadata type, and the ePBF program
could do use a much more complex set of criteria in terms of whether
or not to trigger an EIO, or how to fuzz a particular block to either
force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
Google Faculty Research Award grant), but using a much more glass-box
style test approach.

[1] https://gts3.org/~sanidhya/pubs/2019/hydra.pdf

This would be a lot more work, and I'm not sufficiently up to speed
with eBPF, and I just needed a quick and dirty testing scheme.

The reason why I think it's worthwhile to land this patch (as opposed
to throwing it away after doing the development work for the previous
patch) is that it's a relatively small set of changes, and all of the
code disappears if CONFIG_DEBUG_EXT4 is not enabled.  So it has no
performance cost on production kernels, and it's highly unlikely that
users would have a reason to use this feature on production use cases,
so ripping this out if and when we have a more functional eBPF testing
infrastructure to replace it shouldn't really be a problem.

					- Ted

P.S.  A fascinating question is whether we could make the hooks for
this hypothetical eBPF hook general enough that it could work for more
than just ext4, but for other file systems.  The problem is that the
fs metadata types are not going to be same across different file
systems, so that makes the API design quite tricky; and perhaps not
worth it?


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-22  1:00     ` Theodore Y. Ts'o
@ 2019-11-22  1:18       ` Darrick J. Wong
  2019-11-22  3:56         ` Theodore Y. Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Darrick J. Wong @ 2019-11-22  1:18 UTC (permalink / raw)
  To: Theodore Y. Ts'o; +Cc: Ext4 Developers List

On Thu, Nov 21, 2019 at 08:00:26PM -0500, Theodore Y. Ts'o wrote:
> On Thu, Nov 21, 2019 at 04:09:33PM -0800, Darrick J. Wong wrote:
> > > +static inline int ext4_simulate_fail(struct super_block *sb,
> > > +				     unsigned long flag)
> > 
> > Nit: bool?
> 
> Sure, I'll do this for the next version.
> 
> > If I'm reading this correctly, this means that userspace sets a
> > s_simulate_fail bit via sysfs knob, and the next time the filesystem
> > calls ext4_simulate_fail with the same bit set in @flag we'll return
> > true to say "simulate the failure" and clear the bit in s_simulate_fail?
> > 
> > IOWs, the simulated failures have to be re-armed every time?
> 
> Yes, that's correct.
> 
> > Seems reasonable, but consider the possibility that in the future it
> > might be useful if you could set up periodic failures (e.g. directory
> > lookups fail 10% of the time) so that you can see how something like
> > fsstress reacts to less-predictable failures?
> 
> So in theory, we could do that with dm_flakey --- but that's a pain in
> the tuckus, since you have to specify the LBA for the directory blocks
> that you might want to have fail.

Funny, I've been working on a fstests helper function to make it easy to
set up dm-flakey based on fiemap/getfsmap output and such. :)

> I implemented this so I could have
> a quick and dirty way of testing the first patch in this series (and
> in fact, I found a bug in the first version of the previous patch, so
> I'm glad I spent the time to implement the test patch :-).

Heh, cool!

> What might be interesting to do is some kind of eBPF hook where we
> pass in the block #, inode #, and metadata type, and the ePBF program
> could do use a much more complex set of criteria in terms of whether
> or not to trigger an EIO, or how to fuzz a particular block to either
> force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
> Google Faculty Research Award grant), but using a much more glass-box
> style test approach.

That would be fun.  Attach an arbitrary eBPF program to a range of
sectors.  I wonder how loud the howls of protest would be for "can we
let ebpf programs scribble on a kernel io buffer pleeze?"...

...a couple of years ago I sent out an RFCRAP patch so that you could
use eBPF's "new" ability to change function return values, which
Christoph immediately NAKd.  I think Josef's original purpose was so
that he could inject arbitrary debugging knobs all over btrfs.

> [1] https://gts3.org/~sanidhya/pubs/2019/hydra.pdf
> 
> This would be a lot more work, and I'm not sufficiently up to speed
> with eBPF, and I just needed a quick and dirty testing scheme.
> 
> The reason why I think it's worthwhile to land this patch (as opposed
> to throwing it away after doing the development work for the previous
> patch) is that it's a relatively small set of changes, and all of the
> code disappears if CONFIG_DEBUG_EXT4 is not enabled.  So it has no
> performance cost on production kernels, and it's highly unlikely that
> users would have a reason to use this feature on production use cases,
> so ripping this out if and when we have a more functional eBPF testing
> infrastructure to replace it shouldn't really be a problem.

Admittedly it's a debug knob so I don't see it as a big deal if you
merge this and some day rip it out or supersede it.  The XFS knobs have
undergone a few, uh, interface revisions.

> 					- Ted
> 
> P.S.  A fascinating question is whether we could make the hooks for
> this hypothetical eBPF hook general enough that it could work for more
> than just ext4, but for other file systems.  The problem is that the
> fs metadata types are not going to be same across different file
> systems, so that makes the API design quite tricky; and perhaps not
> worth it?

Yeah.  I mean, it's eBPF glomming onto random parts of the kernel, so I
don't think there's ever going to be a General API For Brain Slugs[3].

OTOH I need LSF topics so sure lets roll.

--D

[2] https://lore.kernel.org/linux-xfs/20171213061825.GO19219@magnolia/
[3] https://futurama.fandom.com/wiki/Brain_Slug

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-22  1:18       ` Darrick J. Wong
@ 2019-11-22  3:56         ` Theodore Y. Ts'o
  0 siblings, 0 replies; 7+ messages in thread
From: Theodore Y. Ts'o @ 2019-11-22  3:56 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Ext4 Developers List

On Thu, Nov 21, 2019 at 05:18:34PM -0800, Darrick J. Wong wrote:
> > So in theory, we could do that with dm_flakey --- but that's a pain in
> > the tuckus, since you have to specify the LBA for the directory blocks
> > that you might want to have fail.
> 
> Funny, I've been working on a fstests helper function to make it easy to
> set up dm-flakey based on fiemap/getfsmap output and such. :)

Ah, excellent, I'm looking forward to seeing it.  :-)

> > What might be interesting to do is some kind of eBPF hook where we
> > pass in the block #, inode #, and metadata type, and the ePBF program
> > could do use a much more complex set of criteria in terms of whether
> > or not to trigger an EIO, or how to fuzz a particular block to either
> > force a CRC failure, or to try to find bugs ala Hydra[1] (funded via a
> > Google Faculty Research Award grant), but using a much more glass-box
> > style test approach.
> 
> That would be fun.  Attach an arbitrary eBPF program to a range of
> sectors.  I wonder how loud the howls of protest would be for "can we
> let ebpf programs scribble on a kernel io buffer pleeze?"...

Well, because the eBPF hook would include the metadata type (an
allocation bitmap vs inode table vs htree index vs htree leaf block,
etc.) what I was thinking about has to be done in ext4 as a
ext4-specific hook.  And it would only be enabled if CONFIG_EXT4_DEBUG
or a separate Kconfig option is enabled --- and I *hope* no
distribution would be so silly/stupid enough to enable it on a product
kernel build.  :-)

						- Ted

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata
  2019-11-22  0:09   ` Darrick J. Wong
  2019-11-22  1:00     ` Theodore Y. Ts'o
@ 2019-11-22 23:46     ` Andreas Dilger
  1 sibling, 0 replies; 7+ messages in thread
From: Andreas Dilger @ 2019-11-22 23:46 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Theodore Ts'o, Ext4 Developers List

[-- Attachment #1: Type: text/plain, Size: 6727 bytes --]

On Nov 21, 2019, at 5:09 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> 
> On Thu, Nov 21, 2019 at 01:30:36PM -0500, Theodore Ts'o wrote:
>> This allows us to test various error handling code paths
>> 
>> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
>> ---
>> fs/ext4/balloc.c |  4 +++-
>> fs/ext4/ext4.h   | 42 ++++++++++++++++++++++++++++++++++++++++++
>> fs/ext4/ialloc.c |  4 +++-
>> fs/ext4/inode.c  |  6 +++++-
>> fs/ext4/namei.c  | 11 ++++++++---
>> fs/ext4/sysfs.c  | 23 +++++++++++++++++++++++
>> 6 files changed, 84 insertions(+), 6 deletions(-)
>> 
>> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
>> index 102c38527a10..5f993a411251 100644
>> --- a/fs/ext4/balloc.c
>> +++ b/fs/ext4/balloc.c
>> @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>> 	if (buffer_verified(bh))
>> 		goto verified;
>> 	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
>> -			desc, bh))) {
>> +						    desc, bh) ||
>> +		     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
>> 		ext4_unlock_group(sb, block_group);
>> 		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
>> 		ext4_mark_group_bitmap_corrupted(sb, block_group,
>> @@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
>> 	if (!desc)
>> 		return -EFSCORRUPTED;
>> 	wait_on_buffer(bh);
>> +	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
>> 	if (!buffer_uptodate(bh)) {
>> 		ext4_set_errno(sb, EIO);
>> 		ext4_error(sb, "Cannot read block bitmap - "
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1c9ac0fc8715..e6798db4634c 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1557,6 +1557,9 @@ struct ext4_sb_info {
>> 	/* Barrier between changing inodes' journal flags and writepages ops. */
>> 	struct percpu_rw_semaphore s_journal_flag_rwsem;
>> 	struct dax_device *s_daxdev;
>> +#ifdef CONFIG_EXT4_DEBUG
>> +	unsigned long s_simulate_fail;
>> +#endif
>> };
>> 
>> static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
>> @@ -1575,6 +1578,45 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
>> 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
>> }
>> 
>> +static inline int ext4_simulate_fail(struct super_block *sb,
>> +				     unsigned long flag)
> 
> Nit: bool?
> 
>> +{
>> +#ifdef CONFIG_EXT4_DEBUG
>> +	unsigned long old, new;
>> +	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> +
>> +	do {
>> +		old = READ_ONCE(sbi->s_simulate_fail);
>> +		if (likely((old & flag) == 0))
>> +			return 0;
>> +		new = old & ~flag;
>> +	} while (unlikely(cmpxchg(&sbi->s_simulate_fail, old, new) != old));
> 
> If I'm reading this correctly, this means that userspace sets a
> s_simulate_fail bit via sysfs knob, and the next time the filesystem
> calls ext4_simulate_fail with the same bit set in @flag we'll return
> true to say "simulate the failure" and clear the bit in s_simulate_fail?
> 
> IOWs, the simulated failures have to be re-armed every time?
> 
> Seems reasonable, but consider the possibility that in the future it
> might be useful if you could set up periodic failures (e.g. directory
> lookups fail 10% of the time) so that you can see how something like
> fsstress reacts to less-predictable failures?
> 
> Of course that also increases the amount of fugly sysfs boilerplate so
> that each knob can have its own sysfs file... that alone is half of a
> reason not to do that. :(

Just for comparison, Lustre has had a fault injection mechanism for ages
that can do a bunch of things like this.  Each fault location has a unique
number (we separate them by subsystems in the code, but numbers are rather
arbitrary), and then a sysfs parameter "fail_loc" that can be set to match
the fault location to inject errors, and "fail_val" that allows userspace
to adjust/tune the failure behavior (e.g. only affect target N, or sleep N
seconds, ...).

The low 16 bits set in fail_loc is the fault location number, and the high
16 bits of fail_loc are flags that can modify the behavior independent of
which failure number is being used:
- CFS_FAIL_ONCE: the fail_loc should only fail once (default forever)
- CFS_FAIL_SKIP: skip the fail_loc the first "fail_val" times
- CFS_FAIL_SOME: trigger the failure the first "fail_val" times
- CFS_FAIL_RAND: trigger the failure at a rate of 1/fail_val

There are also flags set by the kernel when the failure is hit, so it is
possible to read fail_loc in a test script if the failure was already hit.

Internally in the code, the most common use is just checking if we hit the
currently-set fail_loc (which is unlikely() for minimal impact), like:

	if (CFS_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_RECONNECT))
		 RETURN(1);	 /* don't send early reply */

        if (CFS_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) {
                /* the same error returned by ptlrpc_import_delay_req() */
                rc = -EWOULDBLOCK;
                req->rq_status = rc;

It is possible to inject a delay into a thread to allow something else to
happen (maybe more useful for a distributed system rather than a local one):

	CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLAY_DELAY2, cfs_fail_val);

It is also possible to set up a race between two threads in the same or
different parts of the code on the same node:

	CFS_RACE(CFS_FAIL_CHLOG_USER_REG_UNREG_RACE);

The first thread to hit this fail_loc will sleep, and the second thread
that hits it will wake it up.  There is a variation of this to make it
explicit that only one thread to hit a location should sleep, and a
second thread needs to hit a different location to wake it up:

	thread1:
	CFS_RACE_WAIT(OBD_FAIL_OBD_ZERO_NLINK_RACE);

				thread2:
				CFS_RACE_WAKEUP(OBD_FAIL_OBD_ZERO_NLINK_RACE);

It is also possible to daisy-chain failure conditions:

	if (ns_is_client(ldlm_lock_to_ns(lock)) &&
	    CFS_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
				 OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE))
		ldlm_set_fail_loc(lock);

so here if "OBD_FAIL_LDLM_INTR_CP_AST" is hit, it will reset fail_loc to be "OBD_FAIL_LDLM_CP_BL_RACE" to fail once, flag a particular DLM lock and then
the next two threads that access this lock will race to process it:

	if (ldlm_is_fail_loc(lock))
		CFS_RACE(OBD_FAIL_LDLM_CP_BL_RACE);

The CFS_FAIL functionality has been used for quite a few years and has
proven sufficient and easily used to invoke failure conditions that would
be otherwise impossible to reproduce (over a thousand fault injection
sites in the Lustre code and corresponding tests to trigger them today).

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-11-22 23:47 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-21 18:30 [PATCH 1/2] ext4: save the error code which triggered an ext4_error() in the superblock Theodore Ts'o
2019-11-21 18:30 ` [PATCH 2/2] ext4: simulate various I/O and checksum errors when reading metadata Theodore Ts'o
2019-11-22  0:09   ` Darrick J. Wong
2019-11-22  1:00     ` Theodore Y. Ts'o
2019-11-22  1:18       ` Darrick J. Wong
2019-11-22  3:56         ` Theodore Y. Ts'o
2019-11-22 23:46     ` Andreas Dilger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.