Linux-ext4 Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] e2fsprogs: Try again to solve unreliable io case
@ 2021-04-20  7:18 Haotian Li
  2021-04-20 15:08 ` Darrick J. Wong
  2021-04-20 16:19 ` Theodore Ts'o
  0 siblings, 2 replies; 7+ messages in thread
From: Haotian Li @ 2021-04-20  7:18 UTC (permalink / raw)
  To: Ext4 Developers List
  Cc: Theodore Y. Ts'o, harshad shirwadkar,, liuzhiqiang (I), linfeilong

If some I/O error occured during e2fsck, for example the
fibre channel connections are flasky, the e2fsck may exit.
Try again in these I/O error cases may help e2fsck
successfully execute and fix the disk correctly.

We may try five times when io_channel_write/read failed.
Enable this option by setting true on unreliable_io in
configure file "e2fsck.conf".

Signed-off-by: Haotian Li <lihaotian9@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 e2fsck/unix.c           |   7 +++
 lib/ext2fs/ext2_io.h    |   3 ++
 lib/ext2fs/ext2fs.h     |   2 +-
 lib/ext2fs/io_manager.c | 102 ++++++++++++++++++++++++++++++----------
 lib/ext2fs/openfs.c     |   2 +
 lib/ext2fs/unix_io.c    |   4 ++
 6 files changed, 93 insertions(+), 27 deletions(-)

diff --git a/e2fsck/unix.c b/e2fsck/unix.c
index c5f9e441..faa2f77d 100644
--- a/e2fsck/unix.c
+++ b/e2fsck/unix.c
@@ -1398,6 +1398,7 @@ int main (int argc, char *argv[])
 	int journal_size;
 	int sysval, sys_page_size = 4096;
 	int old_bitmaps;
+	int unreliable_io;
 	__u32 features[3];
 	char *cp;
 	enum quota_type qtype;
@@ -1496,6 +1497,12 @@ restart:
 			    &old_bitmaps);
 	if (!old_bitmaps)
 		flags |= EXT2_FLAG_64BITS;
+
+	profile_get_boolean(ctx->profile, "options", "unreliable_io", 0, 0,
+			    &unreliable_io);
+	if (unreliable_io)
+		flags |= EXT2_FLAG_UNRELIABLE_IO;
+
 	if ((ctx->options & E2F_OPT_READONLY) == 0) {
 		flags |= EXT2_FLAG_RW;
 		if (!(ctx->mount_flags & EXT2_MF_ISROOT &&
diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
index 8fe5b323..38d23a1b 100644
--- a/lib/ext2fs/ext2_io.h
+++ b/lib/ext2fs/ext2_io.h
@@ -34,8 +34,10 @@ typedef struct struct_io_stats *io_stats;
 #define CHANNEL_FLAGS_DISCARD_ZEROES	0x02
 #define CHANNEL_FLAGS_BLOCK_DEVICE	0x04
 #define CHANNEL_FLAGS_THREADS		0x08
+#define CHANNEL_FLAGS_UNRELIABLE_IO     0x10

 #define io_channel_discard_zeroes_data(i) (i->flags & CHANNEL_FLAGS_DISCARD_ZEROES)
+#define UNRELIABLE_IO_RETRY_TIMES 5

 struct struct_io_channel {
 	errcode_t	magic;
@@ -107,6 +109,7 @@ struct struct_io_manager {
 #define IO_FLAG_FORCE_BOUNCE	0x0008
 #define IO_FLAG_THREADS		0x0010
 #define IO_FLAG_NOCACHE		0x0020
+#define IO_FLAG_UNRELIABLE_IO   0x0040

 /*
  * Convenience functions....
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index df150f00..4ce5e311 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -207,7 +207,7 @@ typedef struct ext2_file *ext2_file_t;
 #define EXT2_FLAG_BBITMAP_TAIL_PROBLEM	0x1000000
 #define EXT2_FLAG_IBITMAP_TAIL_PROBLEM	0x2000000
 #define EXT2_FLAG_THREADS		0x4000000
-
+#define EXT2_FLAG_UNRELIABLE_IO         0x8000000
 /*
  * Special flag in the ext2 inode i_flag field that means that this is
  * a new inode.  (So that ext2_write_inode() can clear extra fields.)
diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index 996c31a1..f062ef0a 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -61,66 +61,116 @@ errcode_t io_channel_write_byte(io_channel channel, unsigned long offset,
 				int count, const void *data)
 {
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
-
-	if (channel->manager->write_byte)
-		return channel->manager->write_byte(channel, offset,
+	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
+       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
+               UNRELIABLE_IO_RETRY_TIMES : 0;
+retry:
+	if (channel->manager->write_byte) {
+		ret = channel->manager->write_byte(channel, offset,
 						    count, data);
-
-	return EXT2_ET_UNIMPLEMENTED;
+		if (ret && retry_times) {
+                        retry_times--;
+                        goto retry;
+               }
+	}
+	return ret;
 }

 errcode_t io_channel_read_blk64(io_channel channel, unsigned long long block,
 				 int count, void *data)
 {
-	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
-
-	if (channel->manager->read_blk64)
-		return (channel->manager->read_blk64)(channel, block,
+       EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
+	errcode_t ret = 0;
+	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
+		UNRELIABLE_IO_RETRY_TIMES : 0;
+retry_read_blk64:
+	if (channel->manager->read_blk64) {
+		ret = (channel->manager->read_blk64)(channel, block,
 						      count, data);
+		if (ret && retry_times) {
+			retry_times--;
+			goto retry_read_blk64;
+		}
+		return ret;
+	}

 	if ((block >> 32) != 0)
 		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
-
-	return (channel->manager->read_blk)(channel, (unsigned long) block,
+retry_read_blk:
+	ret = (channel->manager->read_blk)(channel, (unsigned long) block,
 					     count, data);
+	if (ret && retry_times) {
+		retry_times--;
+               goto retry_read_blk;
+       }
+	return ret;
 }

+
 errcode_t io_channel_write_blk64(io_channel channel, unsigned long long block,
 				 int count, const void *data)
 {
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
-
-	if (channel->manager->write_blk64)
-		return (channel->manager->write_blk64)(channel, block,
+	errcode_t ret = 0;
+       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
+                UNRELIABLE_IO_RETRY_TIMES : 0;
+retry_write_blk64:
+	if (channel->manager->write_blk64) {
+		ret = (channel->manager->write_blk64)(channel, block,
 						       count, data);
+		if (ret && retry_times) {
+                        retry_times--;
+                        goto retry_write_blk64;
+               }
+               return ret;
+        }

 	if ((block >> 32) != 0)
 		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
-
-	return (channel->manager->write_blk)(channel, (unsigned long) block,
+retry_write_blk:
+	ret = (channel->manager->write_blk)(channel, (unsigned long) block,
 					     count, data);
+	if (ret && retry_times) {
+               retry_times--;
+               goto retry_write_blk;
+       }
+	return ret;
 }

 errcode_t io_channel_discard(io_channel channel, unsigned long long block,
 			     unsigned long long count)
 {
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
-
-	if (channel->manager->discard)
-		return (channel->manager->discard)(channel, block, count);
-
-	return EXT2_ET_UNIMPLEMENTED;
+	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
+	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
+	                UNRELIABLE_IO_RETRY_TIMES : 0;
+retry:
+	if (channel->manager->discard) {
+		ret = (channel->manager->discard)(channel, block, count);
+		if (ret && retry_times) {
+                       retry_times--;
+                       goto retry;
+               }
+	}
+	return ret;
 }

 errcode_t io_channel_zeroout(io_channel channel, unsigned long long block,
 			     unsigned long long count)
 {
 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
-
-	if (channel->manager->zeroout)
-		return (channel->manager->zeroout)(channel, block, count);
-
-	return EXT2_ET_UNIMPLEMENTED;
+	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
+       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
+                        UNRELIABLE_IO_RETRY_TIMES : 0;
+retry:
+	if (channel->manager->zeroout) {
+		ret = (channel->manager->zeroout)(channel, block, count);
+		if (ret && retry_times) {
+                        retry_times--;
+                        goto retry;
+               }
+       }
+	return ret;
 }

 errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
index 5ec8ed5c..a98a5815 100644
--- a/lib/ext2fs/openfs.c
+++ b/lib/ext2fs/openfs.c
@@ -172,6 +172,8 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
 		io_flags |= IO_FLAG_DIRECT_IO;
 	if (flags & EXT2_FLAG_THREADS)
 		io_flags |= IO_FLAG_THREADS;
+	if (flags & EXT2_FLAG_UNRELIABLE_IO)
+		io_flags |= IO_FLAG_UNRELIABLE_IO;
 	retval = manager->open(fs->device_name, io_flags, &fs->io);
 	if (retval)
 		goto cleanup;
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 64eee342..845a1e16 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -837,6 +837,10 @@ static errcode_t unix_open_channel(const char *name, int fd,
 		}
 	}
 #endif
+	if (flags & IO_FLAG_UNRELIABLE_IO) {
+		io->flags |= CHANNEL_FLAGS_UNRELIABLE_IO;
+	}
+
 	*channel = io;
 	return 0;

-- 
2.23.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-20  7:18 [PATCH] e2fsprogs: Try again to solve unreliable io case Haotian Li
@ 2021-04-20 15:08 ` Darrick J. Wong
  2021-04-22 13:03   ` Zhiqiang Liu
  2021-04-20 16:19 ` Theodore Ts'o
  1 sibling, 1 reply; 7+ messages in thread
From: Darrick J. Wong @ 2021-04-20 15:08 UTC (permalink / raw)
  To: Haotian Li
  Cc: Ext4 Developers List, Theodore Y. Ts'o, harshad shirwadkar,,
	liuzhiqiang (I),
	linfeilong

On Tue, Apr 20, 2021 at 03:18:05PM +0800, Haotian Li wrote:
> If some I/O error occured during e2fsck, for example the
> fibre channel connections are flasky, the e2fsck may exit.
> Try again in these I/O error cases may help e2fsck
> successfully execute and fix the disk correctly.
> 
> We may try five times when io_channel_write/read failed.
> Enable this option by setting true on unreliable_io in
> configure file "e2fsck.conf".

Why not set 'io_max_retries = 5' in e2fsck.conf and feed that to the io
manager directly?  That would be more flexible than a boolean that
triggers a hardwired retry counter.

--D

> 
> Signed-off-by: Haotian Li <lihaotian9@huawei.com>
> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
> ---
>  e2fsck/unix.c           |   7 +++
>  lib/ext2fs/ext2_io.h    |   3 ++
>  lib/ext2fs/ext2fs.h     |   2 +-
>  lib/ext2fs/io_manager.c | 102 ++++++++++++++++++++++++++++++----------
>  lib/ext2fs/openfs.c     |   2 +
>  lib/ext2fs/unix_io.c    |   4 ++
>  6 files changed, 93 insertions(+), 27 deletions(-)
> 
> diff --git a/e2fsck/unix.c b/e2fsck/unix.c
> index c5f9e441..faa2f77d 100644
> --- a/e2fsck/unix.c
> +++ b/e2fsck/unix.c
> @@ -1398,6 +1398,7 @@ int main (int argc, char *argv[])
>  	int journal_size;
>  	int sysval, sys_page_size = 4096;
>  	int old_bitmaps;
> +	int unreliable_io;
>  	__u32 features[3];
>  	char *cp;
>  	enum quota_type qtype;
> @@ -1496,6 +1497,12 @@ restart:
>  			    &old_bitmaps);
>  	if (!old_bitmaps)
>  		flags |= EXT2_FLAG_64BITS;
> +
> +	profile_get_boolean(ctx->profile, "options", "unreliable_io", 0, 0,
> +			    &unreliable_io);
> +	if (unreliable_io)
> +		flags |= EXT2_FLAG_UNRELIABLE_IO;
> +
>  	if ((ctx->options & E2F_OPT_READONLY) == 0) {
>  		flags |= EXT2_FLAG_RW;
>  		if (!(ctx->mount_flags & EXT2_MF_ISROOT &&
> diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
> index 8fe5b323..38d23a1b 100644
> --- a/lib/ext2fs/ext2_io.h
> +++ b/lib/ext2fs/ext2_io.h
> @@ -34,8 +34,10 @@ typedef struct struct_io_stats *io_stats;
>  #define CHANNEL_FLAGS_DISCARD_ZEROES	0x02
>  #define CHANNEL_FLAGS_BLOCK_DEVICE	0x04
>  #define CHANNEL_FLAGS_THREADS		0x08
> +#define CHANNEL_FLAGS_UNRELIABLE_IO     0x10
> 
>  #define io_channel_discard_zeroes_data(i) (i->flags & CHANNEL_FLAGS_DISCARD_ZEROES)
> +#define UNRELIABLE_IO_RETRY_TIMES 5
> 
>  struct struct_io_channel {
>  	errcode_t	magic;
> @@ -107,6 +109,7 @@ struct struct_io_manager {
>  #define IO_FLAG_FORCE_BOUNCE	0x0008
>  #define IO_FLAG_THREADS		0x0010
>  #define IO_FLAG_NOCACHE		0x0020
> +#define IO_FLAG_UNRELIABLE_IO   0x0040
> 
>  /*
>   * Convenience functions....
> diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> index df150f00..4ce5e311 100644
> --- a/lib/ext2fs/ext2fs.h
> +++ b/lib/ext2fs/ext2fs.h
> @@ -207,7 +207,7 @@ typedef struct ext2_file *ext2_file_t;
>  #define EXT2_FLAG_BBITMAP_TAIL_PROBLEM	0x1000000
>  #define EXT2_FLAG_IBITMAP_TAIL_PROBLEM	0x2000000
>  #define EXT2_FLAG_THREADS		0x4000000
> -
> +#define EXT2_FLAG_UNRELIABLE_IO         0x8000000
>  /*
>   * Special flag in the ext2 inode i_flag field that means that this is
>   * a new inode.  (So that ext2_write_inode() can clear extra fields.)
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index 996c31a1..f062ef0a 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -61,66 +61,116 @@ errcode_t io_channel_write_byte(io_channel channel, unsigned long offset,
>  				int count, const void *data)
>  {
>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> -
> -	if (channel->manager->write_byte)
> -		return channel->manager->write_byte(channel, offset,
> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
> +               UNRELIABLE_IO_RETRY_TIMES : 0;
> +retry:
> +	if (channel->manager->write_byte) {
> +		ret = channel->manager->write_byte(channel, offset,
>  						    count, data);
> -
> -	return EXT2_ET_UNIMPLEMENTED;
> +		if (ret && retry_times) {
> +                        retry_times--;
> +                        goto retry;
> +               }
> +	}
> +	return ret;
>  }
> 
>  errcode_t io_channel_read_blk64(io_channel channel, unsigned long long block,
>  				 int count, void *data)
>  {
> -	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> -
> -	if (channel->manager->read_blk64)
> -		return (channel->manager->read_blk64)(channel, block,
> +       EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> +	errcode_t ret = 0;
> +	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
> +		UNRELIABLE_IO_RETRY_TIMES : 0;
> +retry_read_blk64:
> +	if (channel->manager->read_blk64) {
> +		ret = (channel->manager->read_blk64)(channel, block,
>  						      count, data);
> +		if (ret && retry_times) {
> +			retry_times--;
> +			goto retry_read_blk64;
> +		}
> +		return ret;
> +	}
> 
>  	if ((block >> 32) != 0)
>  		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
> -
> -	return (channel->manager->read_blk)(channel, (unsigned long) block,
> +retry_read_blk:
> +	ret = (channel->manager->read_blk)(channel, (unsigned long) block,
>  					     count, data);
> +	if (ret && retry_times) {
> +		retry_times--;
> +               goto retry_read_blk;
> +       }
> +	return ret;
>  }
> 
> +
>  errcode_t io_channel_write_blk64(io_channel channel, unsigned long long block,
>  				 int count, const void *data)
>  {
>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> -
> -	if (channel->manager->write_blk64)
> -		return (channel->manager->write_blk64)(channel, block,
> +	errcode_t ret = 0;
> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
> +                UNRELIABLE_IO_RETRY_TIMES : 0;
> +retry_write_blk64:
> +	if (channel->manager->write_blk64) {
> +		ret = (channel->manager->write_blk64)(channel, block,
>  						       count, data);
> +		if (ret && retry_times) {
> +                        retry_times--;
> +                        goto retry_write_blk64;
> +               }
> +               return ret;
> +        }
> 
>  	if ((block >> 32) != 0)
>  		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
> -
> -	return (channel->manager->write_blk)(channel, (unsigned long) block,
> +retry_write_blk:
> +	ret = (channel->manager->write_blk)(channel, (unsigned long) block,
>  					     count, data);
> +	if (ret && retry_times) {
> +               retry_times--;
> +               goto retry_write_blk;
> +       }
> +	return ret;
>  }
> 
>  errcode_t io_channel_discard(io_channel channel, unsigned long long block,
>  			     unsigned long long count)
>  {
>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> -
> -	if (channel->manager->discard)
> -		return (channel->manager->discard)(channel, block, count);
> -
> -	return EXT2_ET_UNIMPLEMENTED;
> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
> +	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
> +	                UNRELIABLE_IO_RETRY_TIMES : 0;
> +retry:
> +	if (channel->manager->discard) {
> +		ret = (channel->manager->discard)(channel, block, count);
> +		if (ret && retry_times) {
> +                       retry_times--;
> +                       goto retry;
> +               }
> +	}
> +	return ret;
>  }
> 
>  errcode_t io_channel_zeroout(io_channel channel, unsigned long long block,
>  			     unsigned long long count)
>  {
>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
> -
> -	if (channel->manager->zeroout)
> -		return (channel->manager->zeroout)(channel, block, count);
> -
> -	return EXT2_ET_UNIMPLEMENTED;
> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
> +                        UNRELIABLE_IO_RETRY_TIMES : 0;
> +retry:
> +	if (channel->manager->zeroout) {
> +		ret = (channel->manager->zeroout)(channel, block, count);
> +		if (ret && retry_times) {
> +                        retry_times--;
> +                        goto retry;
> +               }
> +       }
> +	return ret;
>  }
> 
>  errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
> index 5ec8ed5c..a98a5815 100644
> --- a/lib/ext2fs/openfs.c
> +++ b/lib/ext2fs/openfs.c
> @@ -172,6 +172,8 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
>  		io_flags |= IO_FLAG_DIRECT_IO;
>  	if (flags & EXT2_FLAG_THREADS)
>  		io_flags |= IO_FLAG_THREADS;
> +	if (flags & EXT2_FLAG_UNRELIABLE_IO)
> +		io_flags |= IO_FLAG_UNRELIABLE_IO;
>  	retval = manager->open(fs->device_name, io_flags, &fs->io);
>  	if (retval)
>  		goto cleanup;
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 64eee342..845a1e16 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -837,6 +837,10 @@ static errcode_t unix_open_channel(const char *name, int fd,
>  		}
>  	}
>  #endif
> +	if (flags & IO_FLAG_UNRELIABLE_IO) {
> +		io->flags |= CHANNEL_FLAGS_UNRELIABLE_IO;
> +	}
> +
>  	*channel = io;
>  	return 0;
> 
> -- 
> 2.23.0
> 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-20  7:18 [PATCH] e2fsprogs: Try again to solve unreliable io case Haotian Li
  2021-04-20 15:08 ` Darrick J. Wong
@ 2021-04-20 16:19 ` Theodore Ts'o
  2021-04-23  2:18   ` Zhiqiang Liu
  1 sibling, 1 reply; 7+ messages in thread
From: Theodore Ts'o @ 2021-04-20 16:19 UTC (permalink / raw)
  To: Haotian Li
  Cc: Ext4 Developers List, harshad shirwadkar,, liuzhiqiang (I), linfeilong

On Tue, Apr 20, 2021 at 03:18:05PM +0800, Haotian Li wrote:
> If some I/O error occured during e2fsck, for example the
> fibre channel connections are flasky, the e2fsck may exit.
> Try again in these I/O error cases may help e2fsck
> successfully execute and fix the disk correctly.

Why not fix this by retrying in the device driver instead?  If the
Fibre Channel is that flaky, then it's going to be a problem when the
file system is mounted, so it would seem to me that fixing this in the
kernel makes a lot more sense.

    	   	       	    - Ted

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-20 15:08 ` Darrick J. Wong
@ 2021-04-22 13:03   ` Zhiqiang Liu
  0 siblings, 0 replies; 7+ messages in thread
From: Zhiqiang Liu @ 2021-04-22 13:03 UTC (permalink / raw)
  To: Darrick J. Wong, Haotian Li
  Cc: Ext4 Developers List, Theodore Y. Ts'o, harshad shirwadkar,,
	linfeilong



On 2021/4/20 23:08, Darrick J. Wong wrote:
> On Tue, Apr 20, 2021 at 03:18:05PM +0800, Haotian Li wrote:
>> If some I/O error occured during e2fsck, for example the
>> fibre channel connections are flasky, the e2fsck may exit.
>> Try again in these I/O error cases may help e2fsck
>> successfully execute and fix the disk correctly.
>>
>> We may try five times when io_channel_write/read failed.
>> Enable this option by setting true on unreliable_io in
>> configure file "e2fsck.conf".
> 
> Why not set 'io_max_retries = 5' in e2fsck.conf and feed that to the io
> manager directly?  That would be more flexible than a boolean that
> triggers a hardwired retry counter.
> 
> --D
>
Thanks for your suggestion.

We will have a try, and send the v2 patch.

Regards
Zhiqiang Liu
>>
>> Signed-off-by: Haotian Li <lihaotian9@huawei.com>
>> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
>> ---
>>  e2fsck/unix.c           |   7 +++
>>  lib/ext2fs/ext2_io.h    |   3 ++
>>  lib/ext2fs/ext2fs.h     |   2 +-
>>  lib/ext2fs/io_manager.c | 102 ++++++++++++++++++++++++++++++----------
>>  lib/ext2fs/openfs.c     |   2 +
>>  lib/ext2fs/unix_io.c    |   4 ++
>>  6 files changed, 93 insertions(+), 27 deletions(-)
>>
>> diff --git a/e2fsck/unix.c b/e2fsck/unix.c
>> index c5f9e441..faa2f77d 100644
>> --- a/e2fsck/unix.c
>> +++ b/e2fsck/unix.c
>> @@ -1398,6 +1398,7 @@ int main (int argc, char *argv[])
>>  	int journal_size;
>>  	int sysval, sys_page_size = 4096;
>>  	int old_bitmaps;
>> +	int unreliable_io;
>>  	__u32 features[3];
>>  	char *cp;
>>  	enum quota_type qtype;
>> @@ -1496,6 +1497,12 @@ restart:
>>  			    &old_bitmaps);
>>  	if (!old_bitmaps)
>>  		flags |= EXT2_FLAG_64BITS;
>> +
>> +	profile_get_boolean(ctx->profile, "options", "unreliable_io", 0, 0,
>> +			    &unreliable_io);
>> +	if (unreliable_io)
>> +		flags |= EXT2_FLAG_UNRELIABLE_IO;
>> +
>>  	if ((ctx->options & E2F_OPT_READONLY) == 0) {
>>  		flags |= EXT2_FLAG_RW;
>>  		if (!(ctx->mount_flags & EXT2_MF_ISROOT &&
>> diff --git a/lib/ext2fs/ext2_io.h b/lib/ext2fs/ext2_io.h
>> index 8fe5b323..38d23a1b 100644
>> --- a/lib/ext2fs/ext2_io.h
>> +++ b/lib/ext2fs/ext2_io.h
>> @@ -34,8 +34,10 @@ typedef struct struct_io_stats *io_stats;
>>  #define CHANNEL_FLAGS_DISCARD_ZEROES	0x02
>>  #define CHANNEL_FLAGS_BLOCK_DEVICE	0x04
>>  #define CHANNEL_FLAGS_THREADS		0x08
>> +#define CHANNEL_FLAGS_UNRELIABLE_IO     0x10
>>
>>  #define io_channel_discard_zeroes_data(i) (i->flags & CHANNEL_FLAGS_DISCARD_ZEROES)
>> +#define UNRELIABLE_IO_RETRY_TIMES 5
>>
>>  struct struct_io_channel {
>>  	errcode_t	magic;
>> @@ -107,6 +109,7 @@ struct struct_io_manager {
>>  #define IO_FLAG_FORCE_BOUNCE	0x0008
>>  #define IO_FLAG_THREADS		0x0010
>>  #define IO_FLAG_NOCACHE		0x0020
>> +#define IO_FLAG_UNRELIABLE_IO   0x0040
>>
>>  /*
>>   * Convenience functions....
>> diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
>> index df150f00..4ce5e311 100644
>> --- a/lib/ext2fs/ext2fs.h
>> +++ b/lib/ext2fs/ext2fs.h
>> @@ -207,7 +207,7 @@ typedef struct ext2_file *ext2_file_t;
>>  #define EXT2_FLAG_BBITMAP_TAIL_PROBLEM	0x1000000
>>  #define EXT2_FLAG_IBITMAP_TAIL_PROBLEM	0x2000000
>>  #define EXT2_FLAG_THREADS		0x4000000
>> -
>> +#define EXT2_FLAG_UNRELIABLE_IO         0x8000000
>>  /*
>>   * Special flag in the ext2 inode i_flag field that means that this is
>>   * a new inode.  (So that ext2_write_inode() can clear extra fields.)
>> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
>> index 996c31a1..f062ef0a 100644
>> --- a/lib/ext2fs/io_manager.c
>> +++ b/lib/ext2fs/io_manager.c
>> @@ -61,66 +61,116 @@ errcode_t io_channel_write_byte(io_channel channel, unsigned long offset,
>>  				int count, const void *data)
>>  {
>>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> -
>> -	if (channel->manager->write_byte)
>> -		return channel->manager->write_byte(channel, offset,
>> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
>> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
>> +               UNRELIABLE_IO_RETRY_TIMES : 0;
>> +retry:
>> +	if (channel->manager->write_byte) {
>> +		ret = channel->manager->write_byte(channel, offset,
>>  						    count, data);
>> -
>> -	return EXT2_ET_UNIMPLEMENTED;
>> +		if (ret && retry_times) {
>> +                        retry_times--;
>> +                        goto retry;
>> +               }
>> +	}
>> +	return ret;
>>  }
>>
>>  errcode_t io_channel_read_blk64(io_channel channel, unsigned long long block,
>>  				 int count, void *data)
>>  {
>> -	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> -
>> -	if (channel->manager->read_blk64)
>> -		return (channel->manager->read_blk64)(channel, block,
>> +       EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> +	errcode_t ret = 0;
>> +	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
>> +		UNRELIABLE_IO_RETRY_TIMES : 0;
>> +retry_read_blk64:
>> +	if (channel->manager->read_blk64) {
>> +		ret = (channel->manager->read_blk64)(channel, block,
>>  						      count, data);
>> +		if (ret && retry_times) {
>> +			retry_times--;
>> +			goto retry_read_blk64;
>> +		}
>> +		return ret;
>> +	}
>>
>>  	if ((block >> 32) != 0)
>>  		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
>> -
>> -	return (channel->manager->read_blk)(channel, (unsigned long) block,
>> +retry_read_blk:
>> +	ret = (channel->manager->read_blk)(channel, (unsigned long) block,
>>  					     count, data);
>> +	if (ret && retry_times) {
>> +		retry_times--;
>> +               goto retry_read_blk;
>> +       }
>> +	return ret;
>>  }
>>
>> +
>>  errcode_t io_channel_write_blk64(io_channel channel, unsigned long long block,
>>  				 int count, const void *data)
>>  {
>>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> -
>> -	if (channel->manager->write_blk64)
>> -		return (channel->manager->write_blk64)(channel, block,
>> +	errcode_t ret = 0;
>> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
>> +                UNRELIABLE_IO_RETRY_TIMES : 0;
>> +retry_write_blk64:
>> +	if (channel->manager->write_blk64) {
>> +		ret = (channel->manager->write_blk64)(channel, block,
>>  						       count, data);
>> +		if (ret && retry_times) {
>> +                        retry_times--;
>> +                        goto retry_write_blk64;
>> +               }
>> +               return ret;
>> +        }
>>
>>  	if ((block >> 32) != 0)
>>  		return EXT2_ET_IO_CHANNEL_NO_SUPPORT_64;
>> -
>> -	return (channel->manager->write_blk)(channel, (unsigned long) block,
>> +retry_write_blk:
>> +	ret = (channel->manager->write_blk)(channel, (unsigned long) block,
>>  					     count, data);
>> +	if (ret && retry_times) {
>> +               retry_times--;
>> +               goto retry_write_blk;
>> +       }
>> +	return ret;
>>  }
>>
>>  errcode_t io_channel_discard(io_channel channel, unsigned long long block,
>>  			     unsigned long long count)
>>  {
>>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> -
>> -	if (channel->manager->discard)
>> -		return (channel->manager->discard)(channel, block, count);
>> -
>> -	return EXT2_ET_UNIMPLEMENTED;
>> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
>> +	int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
>> +	                UNRELIABLE_IO_RETRY_TIMES : 0;
>> +retry:
>> +	if (channel->manager->discard) {
>> +		ret = (channel->manager->discard)(channel, block, count);
>> +		if (ret && retry_times) {
>> +                       retry_times--;
>> +                       goto retry;
>> +               }
>> +	}
>> +	return ret;
>>  }
>>
>>  errcode_t io_channel_zeroout(io_channel channel, unsigned long long block,
>>  			     unsigned long long count)
>>  {
>>  	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
>> -
>> -	if (channel->manager->zeroout)
>> -		return (channel->manager->zeroout)(channel, block, count);
>> -
>> -	return EXT2_ET_UNIMPLEMENTED;
>> +	errcode_t ret = EXT2_ET_UNIMPLEMENTED;
>> +       int retry_times = (channel->flags & CHANNEL_FLAGS_UNRELIABLE_IO) ?
>> +                        UNRELIABLE_IO_RETRY_TIMES : 0;
>> +retry:
>> +	if (channel->manager->zeroout) {
>> +		ret = (channel->manager->zeroout)(channel, block, count);
>> +		if (ret && retry_times) {
>> +                        retry_times--;
>> +                        goto retry;
>> +               }
>> +       }
>> +	return ret;
>>  }
>>
>>  errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
>> diff --git a/lib/ext2fs/openfs.c b/lib/ext2fs/openfs.c
>> index 5ec8ed5c..a98a5815 100644
>> --- a/lib/ext2fs/openfs.c
>> +++ b/lib/ext2fs/openfs.c
>> @@ -172,6 +172,8 @@ errcode_t ext2fs_open2(const char *name, const char *io_options,
>>  		io_flags |= IO_FLAG_DIRECT_IO;
>>  	if (flags & EXT2_FLAG_THREADS)
>>  		io_flags |= IO_FLAG_THREADS;
>> +	if (flags & EXT2_FLAG_UNRELIABLE_IO)
>> +		io_flags |= IO_FLAG_UNRELIABLE_IO;
>>  	retval = manager->open(fs->device_name, io_flags, &fs->io);
>>  	if (retval)
>>  		goto cleanup;
>> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
>> index 64eee342..845a1e16 100644
>> --- a/lib/ext2fs/unix_io.c
>> +++ b/lib/ext2fs/unix_io.c
>> @@ -837,6 +837,10 @@ static errcode_t unix_open_channel(const char *name, int fd,
>>  		}
>>  	}
>>  #endif
>> +	if (flags & IO_FLAG_UNRELIABLE_IO) {
>> +		io->flags |= CHANNEL_FLAGS_UNRELIABLE_IO;
>> +	}
>> +
>>  	*channel = io;
>>  	return 0;
>>
>> -- 
>> 2.23.0
>>
> 
> .
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-20 16:19 ` Theodore Ts'o
@ 2021-04-23  2:18   ` Zhiqiang Liu
  2021-04-23 15:46     ` Theodore Ts'o
  0 siblings, 1 reply; 7+ messages in thread
From: Zhiqiang Liu @ 2021-04-23  2:18 UTC (permalink / raw)
  To: Theodore Ts'o, Haotian Li
  Cc: Ext4 Developers List, harshad shirwadkar,, linfeilong

On 2021/4/21 0:19, Theodore Ts'o wrote:
> On Tue, Apr 20, 2021 at 03:18:05PM +0800, Haotian Li wrote:
>> If some I/O error occured during e2fsck, for example the
>> fibre channel connections are flasky, the e2fsck may exit.
>> Try again in these I/O error cases may help e2fsck
>> successfully execute and fix the disk correctly.
> 
> Why not fix this by retrying in the device driver instead?  If the
> Fibre Channel is that flaky, then it's going to be a problem when the
> file system is mounted, so it would seem to me that fixing this in the
> kernel makes a lot more sense.
> 
>     	   	       	    - Ted
>
Thanks for your reply.
Actually, we have met the problem in ipsan situation.
When exec 'fsck -a <remote-device>', short-term fluctuations or
abnormalities may occur on the network. Despite the driver has
do the best effort, some IO errors may occur. So add retrying in
e2fsprogs can further improve the reliability of the repair
process.

Regards
Zhiqiang Liu
> .
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-23  2:18   ` Zhiqiang Liu
@ 2021-04-23 15:46     ` Theodore Ts'o
  2021-04-24  4:46       ` Zhiqiang Liu
  0 siblings, 1 reply; 7+ messages in thread
From: Theodore Ts'o @ 2021-04-23 15:46 UTC (permalink / raw)
  To: Zhiqiang Liu
  Cc: Haotian Li, Ext4 Developers List, harshad shirwadkar,, linfeilong

On Fri, Apr 23, 2021 at 10:18:09AM +0800, Zhiqiang Liu wrote:
> Thanks for your reply.
> Actually, we have met the problem in ipsan situation.
> When exec 'fsck -a <remote-device>', short-term fluctuations or
> abnormalities may occur on the network. Despite the driver has
> do the best effort, some IO errors may occur. So add retrying in
> e2fsprogs can further improve the reliability of the repair
> process.

But why doesn't this happen when the file system is mounted, and why
is that acceptable?   And why not change the driver to do more retries?

   		      	      	  - Ted

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] e2fsprogs: Try again to solve unreliable io case
  2021-04-23 15:46     ` Theodore Ts'o
@ 2021-04-24  4:46       ` Zhiqiang Liu
  0 siblings, 0 replies; 7+ messages in thread
From: Zhiqiang Liu @ 2021-04-24  4:46 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: Haotian Li, Ext4 Developers List, harshad shirwadkar,, linfeilong



On 2021/4/23 23:46, Theodore Ts'o wrote:
> On Fri, Apr 23, 2021 at 10:18:09AM +0800, Zhiqiang Liu wrote:
>> Thanks for your reply.
>> Actually, we have met the problem in ipsan situation.
>> When exec 'fsck -a <remote-device>', short-term fluctuations or
>> abnormalities may occur on the network. Despite the driver has
>> do the best effort, some IO errors may occur. So add retrying in
>> e2fsprogs can further improve the reliability of the repair
>> process.
> 
> But why doesn't this happen when the file system is mounted, and why
> is that acceptable?   And why not change the driver to do more retries?
> 
>    		      	      	  - Ted
> 
Actually, this may happen when the filesystem is mounted. The difference
is that the mounted filesystem can ensure the consistency with journal.

For example, if the IO error occurs when calling io_channel_write_byte()
to update superblock, the checksum may be not written to the disk successfully.
Then the checksum error will occur, and the filesystem cannot be
repaired with 'fsck -y|a|f'.

This situation has a very low probability. For improving the reliability of
the repair process, the retries in e2fsprogs may be necessary.

Regards
Zhiqiang Liu.

> .
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, back to index

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-20  7:18 [PATCH] e2fsprogs: Try again to solve unreliable io case Haotian Li
2021-04-20 15:08 ` Darrick J. Wong
2021-04-22 13:03   ` Zhiqiang Liu
2021-04-20 16:19 ` Theodore Ts'o
2021-04-23  2:18   ` Zhiqiang Liu
2021-04-23 15:46     ` Theodore Ts'o
2021-04-24  4:46       ` Zhiqiang Liu

Linux-ext4 Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-ext4/0 linux-ext4/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-ext4 linux-ext4/ https://lore.kernel.org/linux-ext4 \
		linux-ext4@vger.kernel.org
	public-inbox-index linux-ext4

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-ext4


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git