linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] libfs: Fix DIO mode aligment
@ 2020-10-23 11:26 Artem Blagodarenko
  2020-11-17 15:30 ` Благодаренко Артём
  2021-02-26 23:17 ` Theodore Ts'o
  0 siblings, 2 replies; 10+ messages in thread
From: Artem Blagodarenko @ 2020-10-23 11:26 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger.kernel, Alexey Lyashkov

From: Alexey Lyashkov <alexey.lyashkov@hpe.com>

Bounce buffer must be extended to hold a whole disk block size.
Read/write operations with unaligned offsets is prohobined
in the DIO mode, so read/write blocks should be adjusted to
reflect it.

Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
HPE-bug-id: LUS-9241
---
 lib/ext2fs/io_manager.c |   5 +-
 lib/ext2fs/unix_io.c    | 296 +++++++++++++++++++++++++---------------
 2 files changed, 190 insertions(+), 111 deletions(-)

diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index c395d615..84399a12 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -20,6 +20,9 @@
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
+#define max(a, b) ((a) > (b) ? (a) : (b))
+
+
 errcode_t io_channel_set_options(io_channel channel, const char *opts)
 {
 	errcode_t retval = 0;
@@ -128,7 +131,7 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
 	size_t	size;
 
 	if (count == 0)
-		size = io->block_size;
+		size = max(io->block_size, io->align);
 	else if (count > 0)
 		size = io->block_size * count;
 	else
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 628e60c3..53647a22 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -154,46 +154,30 @@ static char *safe_getenv(const char *arg)
 /*
  * Here are the raw I/O functions
  */
-static errcode_t raw_read_blk(io_channel channel,
+static errcode_t raw_aligned_read_blk(io_channel channel,
 			      struct unix_private_data *data,
-			      unsigned long long block,
-			      int count, void *bufv)
+			      ext2_loff_t location,
+			      ssize_t size, void *bufv,
+			      int *asize)
 {
-	errcode_t	retval;
-	ssize_t		size;
-	ext2_loff_t	location;
 	int		actual = 0;
+	errcode_t	retval;
 	unsigned char	*buf = bufv;
-	ssize_t		really_read = 0;
-
-	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
-	data->io_stats.bytes_read += size;
-	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
-	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-			goto error_out;
-		}
-		goto bounce_read;
-	}
+#ifdef ALIGN_DEBUG
+	printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
+		location, (unsigned long) size);
+#endif
 
 #ifdef HAVE_PREAD64
 	/* Try an aligned pread */
-	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
-	     IS_ALIGNED(size, channel->align))) {
-		actual = pread64(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-		actual = 0;
-	}
+	actual = pread64(data->dev, buf, size, location);
+	if (actual == size)
+		return 0;
+	actual = 0;
 #elif HAVE_PREAD
 	/* Try an aligned pread */
-	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
-	    ((channel->align == 0) ||
-	     (IS_ALIGNED(buf, channel->align) &&
-	      IS_ALIGNED(size, channel->align)))) {
+	if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
 		actual = pread(data->dev, buf, size, location);
 		if (actual == size)
 			return 0;
@@ -205,47 +189,100 @@ static errcode_t raw_read_blk(io_channel channel,
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
+
+	actual = read(data->dev, buf, size);
+	if (actual != size) {
+		if (actual < 0) {
+			retval = errno;
+			actual = 0;
+		} else {
+			retval = EXT2_ET_SHORT_READ;
+		}
+	}
+
+error_out:
+	*asize = actual;
+	return retval;
+}
+
+
+/*
+ * Here are the raw I/O functions
+ */
+static errcode_t raw_read_blk(io_channel channel,
+			      struct unix_private_data *data,
+			      unsigned long long block,
+			      int count, void *bufv)
+{
+	errcode_t	retval;
+	ssize_t		size;
+	ext2_loff_t	location;
+	int		actual = 0;
+	unsigned char	*buf = bufv;
+	unsigned int	align = channel->align;
+	ssize_t		really_read = 0;
+	blk64_t		blk;
+	loff_t		offset;
+
+	size = (count < 0) ? -count : count * channel->block_size;
+	data->io_stats.bytes_read += size;
+	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
+
+	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
+		align = channel->block_size;
+		goto bounce_read;
+	}
+
 	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
+	    (IS_ALIGNED(location, channel->align) &&
+	     IS_ALIGNED(buf, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
-		actual = read(data->dev, buf, size);
-		if (actual != size) {
-		short_read:
-			if (actual < 0) {
-				retval = errno;
-				actual = 0;
-			} else
-				retval = EXT2_ET_SHORT_READ;
+		retval = raw_aligned_read_blk(channel, data, location,
+						size, bufv, &actual);
+		if (retval != 0)
 			goto error_out;
-		}
-		return 0;
+
+		return retval;
 	}
 
+bounce_read:
 #ifdef ALIGN_DEBUG
-	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
-	       (unsigned long) size);
+	printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
+		location, (unsigned long) size);
 #endif
-
 	/*
 	 * The buffer or size which we're trying to read isn't aligned
 	 * to the O_DIRECT rules, so we need to do this the hard way...
+	 * read / write must be aligned to the block device sector size
 	 */
-bounce_read:
+
+	blk = location / align;
+	offset = location % align;
+
+	if (lseek(data->dev, blk * align, SEEK_SET) < 0)
+		return errno;
+
 	while (size > 0) {
-		actual = read(data->dev, data->bounce, channel->block_size);
-		if (actual != channel->block_size) {
+		actual = read(data->dev, data->bounce, align);
+		if (actual != align) {
 			actual = really_read;
 			buf -= really_read;
 			size += really_read;
-			goto short_read;
+			retval = EXT2_ET_SHORT_READ;
+			goto error_out;
 		}
 		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(buf, data->bounce, actual);
+		if ((actual + offset) > align)
+			actual = align - offset;
+		if (actual > size)
+			actual = size;
+
+		memcpy(buf, data->bounce + offset, actual);
 		really_read += actual;
 		size -= actual;
 		buf += actual;
+		offset = 0;
+		blk++;
 	}
 	return 0;
 
@@ -258,6 +295,58 @@ error_out:
 	return retval;
 }
 
+static errcode_t raw_aligned_write_blk(io_channel channel,
+			       struct unix_private_data *data,
+			       ext2_loff_t  location,
+			       ssize_t size, const void *bufv,
+			       int *asize)
+
+{
+	int		actual = 0;
+	errcode_t	retval;
+	const unsigned char *buf = (void *)bufv;
+
+#ifdef ALIGN_DEBUG
+	printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
+	       location, (unsigned long) size);
+#endif
+
+#ifdef HAVE_PWRITE64
+	/* Try an aligned pwrite */
+	actual = pwrite64(data->dev, buf, size, location);
+	if (actual == size)
+		return 0;
+#elif HAVE_PWRITE
+	/* Try an aligned pwrite */
+	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
+		actual = pwrite(data->dev, buf, size, location);
+		if (actual == size)
+			return 0;
+	}
+#endif /* HAVE_PWRITE */
+
+	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+		goto error_out;
+	}
+
+	actual = write(data->dev, buf, size);
+	if (actual < 0) {
+		retval = errno;
+		goto error_out;
+	}
+	if (actual != size) {
+		retval = EXT2_ET_SHORT_WRITE;
+		goto error_out;
+	}
+	retval = 0;
+error_out:
+	*asize = actual;
+	return retval;
+
+}
+
+
 static errcode_t raw_write_blk(io_channel channel,
 			       struct unix_private_data *data,
 			       unsigned long long block,
@@ -268,6 +357,9 @@ static errcode_t raw_write_blk(io_channel channel,
 	int		actual = 0;
 	errcode_t	retval;
 	const unsigned char *buf = bufv;
+	unsigned int	align = channel->align;
+	blk64_t		blk;
+	loff_t		offset;
 
 	if (count == 1)
 		size = channel->block_size;
@@ -282,95 +374,79 @@ static errcode_t raw_write_blk(io_channel channel,
 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-			goto error_out;
-		}
+		align = channel->block_size;
 		goto bounce_write;
 	}
 
-#ifdef HAVE_PWRITE64
-	/* Try an aligned pwrite */
 	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
+	    (IS_ALIGNED(location, channel->align) &&
+	     IS_ALIGNED(buf, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
-		actual = pwrite64(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-	}
-#elif HAVE_PWRITE
-	/* Try an aligned pwrite */
-	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
-	    ((channel->align == 0) ||
-	     (IS_ALIGNED(buf, channel->align) &&
-	      IS_ALIGNED(size, channel->align)))) {
-		actual = pwrite(data->dev, buf, size, location);
-		if (actual == size)
-			return 0;
-	}
-#endif /* HAVE_PWRITE */
+		retval = raw_aligned_write_blk(channel, data, location,
+						size, bufv, &actual);
+		if (retval != 0)
+			goto error_out;
 
-	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-		goto error_out;
+		return retval;
 	}
 
-	if ((channel->align == 0) ||
-	    (IS_ALIGNED(buf, channel->align) &&
-	     IS_ALIGNED(size, channel->align))) {
-		actual = write(data->dev, buf, size);
-		if (actual < 0) {
-			retval = errno;
-			goto error_out;
-		}
-		if (actual != size) {
-		short_write:
-			retval = EXT2_ET_SHORT_WRITE;
-			goto error_out;
-		}
-		return 0;
-	}
 
+bounce_write:
 #ifdef ALIGN_DEBUG
 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
 	       (unsigned long) size);
 #endif
+
+	/* logical offset may don't aligned with block device block size */
+	blk = location / align;
+	offset = location % align;
+
+	if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
+		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+		goto error_out;
+	}
+
 	/*
 	 * The buffer or size which we're trying to write isn't aligned
 	 * to the O_DIRECT rules, so we need to do this the hard way...
 	 */
-bounce_write:
 	while (size > 0) {
-		if (size < channel->block_size) {
-			actual = read(data->dev, data->bounce,
-				      channel->block_size);
-			if (actual != channel->block_size) {
-				if (actual < 0) {
-					retval = errno;
-					goto error_out;
-				}
-				memset((char *) data->bounce + actual, 0,
-				       channel->block_size - actual);
+		int actual_w;
+
+		memset((char *) data->bounce, 0, align);
+		if (offset || (size < align)) {
+			actual = read(data->dev, data->bounce, align);
+			if (actual < 0) {
+				retval = errno;
+				goto error_out;
 			}
 		}
 		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(data->bounce, buf, actual);
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		if ((actual + offset) > align)
+			actual = align - offset;
+		if (actual > size)
+			actual = size;
+		memcpy(((char *)data->bounce) + offset, buf, actual);
+
+		if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 			goto error_out;
 		}
-		actual = write(data->dev, data->bounce, channel->block_size);
-		if (actual < 0) {
+
+		actual_w = write(data->dev, data->bounce, align);
+		if (actual_w < 0) {
 			retval = errno;
 			goto error_out;
 		}
-		if (actual != channel->block_size)
-			goto short_write;
+		if (actual_w != align) {
+			retval = EXT2_ET_SHORT_WRITE;
+			goto error_out;
+		}
 		size -= actual;
 		buf += actual;
 		location += actual;
+		blk ++;
+		offset = 0;
 	}
 	return 0;
 
-- 
2.18.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-10-23 11:26 [PATCH] libfs: Fix DIO mode aligment Artem Blagodarenko
@ 2020-11-17 15:30 ` Благодаренко Артём
  2020-11-17 19:19   ` Theodore Y. Ts'o
  2021-02-26 23:17 ` Theodore Ts'o
  1 sibling, 1 reply; 10+ messages in thread
From: Благодаренко Артём @ 2020-11-17 15:30 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger.kernel, Alexey Lyashkov, Theodore Tso

Hello,

Any thoughts about this change? Thanks.

Best regards,
Artem Blagodarenko.

> On 23 Oct 2020, at 14:26, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> From: Alexey Lyashkov <alexey.lyashkov@hpe.com>
> 
> Bounce buffer must be extended to hold a whole disk block size.
> Read/write operations with unaligned offsets is prohobined
> in the DIO mode, so read/write blocks should be adjusted to
> reflect it.
> 
> Change-Id: Ic573c9ff0d476028dd2293f8b814c6112705db0e
> Signed-off-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
> HPE-bug-id: LUS-9241
> ---
> lib/ext2fs/io_manager.c |   5 +-
> lib/ext2fs/unix_io.c    | 296 +++++++++++++++++++++++++---------------
> 2 files changed, 190 insertions(+), 111 deletions(-)
> 
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index c395d615..84399a12 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -20,6 +20,9 @@
> #include "ext2_fs.h"
> #include "ext2fs.h"
> 
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +
> +
> errcode_t io_channel_set_options(io_channel channel, const char *opts)
> {
> 	errcode_t retval = 0;
> @@ -128,7 +131,7 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> 	size_t	size;
> 
> 	if (count == 0)
> -		size = io->block_size;
> +		size = max(io->block_size, io->align);
> 	else if (count > 0)
> 		size = io->block_size * count;
> 	else
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 628e60c3..53647a22 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -154,46 +154,30 @@ static char *safe_getenv(const char *arg)
> /*
>  * Here are the raw I/O functions
>  */
> -static errcode_t raw_read_blk(io_channel channel,
> +static errcode_t raw_aligned_read_blk(io_channel channel,
> 			      struct unix_private_data *data,
> -			      unsigned long long block,
> -			      int count, void *bufv)
> +			      ext2_loff_t location,
> +			      ssize_t size, void *bufv,
> +			      int *asize)
> {
> -	errcode_t	retval;
> -	ssize_t		size;
> -	ext2_loff_t	location;
> 	int		actual = 0;
> +	errcode_t	retval;
> 	unsigned char	*buf = bufv;
> -	ssize_t		really_read = 0;
> -
> -	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
> -	data->io_stats.bytes_read += size;
> -	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> -	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -			goto error_out;
> -		}
> -		goto bounce_read;
> -	}
> +#ifdef ALIGN_DEBUG
> +	printf("raw_aligned_read_blk: %p %lu<>%lu\n", buf,
> +		location, (unsigned long) size);
> +#endif
> 
> #ifdef HAVE_PREAD64
> 	/* Try an aligned pread */
> -	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> -	     IS_ALIGNED(size, channel->align))) {
> -		actual = pread64(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -		actual = 0;
> -	}
> +	actual = pread64(data->dev, buf, size, location);
> +	if (actual == size)
> +		return 0;
> +	actual = 0;
> #elif HAVE_PREAD
> 	/* Try an aligned pread */
> -	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> -	    ((channel->align == 0) ||
> -	     (IS_ALIGNED(buf, channel->align) &&
> -	      IS_ALIGNED(size, channel->align)))) {
> +	if (sizeof(off_t) >= sizeof(ext2_loff_t)) {
> 		actual = pread(data->dev, buf, size, location);
> 		if (actual == size)
> 			return 0;
> @@ -205,47 +189,100 @@ static errcode_t raw_read_blk(io_channel channel,
> 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 		goto error_out;
> 	}
> +
> +	actual = read(data->dev, buf, size);
> +	if (actual != size) {
> +		if (actual < 0) {
> +			retval = errno;
> +			actual = 0;
> +		} else {
> +			retval = EXT2_ET_SHORT_READ;
> +		}
> +	}
> +
> +error_out:
> +	*asize = actual;
> +	return retval;
> +}
> +
> +
> +/*
> + * Here are the raw I/O functions
> + */
> +static errcode_t raw_read_blk(io_channel channel,
> +			      struct unix_private_data *data,
> +			      unsigned long long block,
> +			      int count, void *bufv)
> +{
> +	errcode_t	retval;
> +	ssize_t		size;
> +	ext2_loff_t	location;
> +	int		actual = 0;
> +	unsigned char	*buf = bufv;
> +	unsigned int	align = channel->align;
> +	ssize_t		really_read = 0;
> +	blk64_t		blk;
> +	loff_t		offset;
> +
> +	size = (count < 0) ? -count : count * channel->block_size;
> +	data->io_stats.bytes_read += size;
> +	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> +
> +	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> +		align = channel->block_size;
> +		goto bounce_read;
> +	}
> +
> 	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> +	    (IS_ALIGNED(location, channel->align) &&
> +	     IS_ALIGNED(buf, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> -		actual = read(data->dev, buf, size);
> -		if (actual != size) {
> -		short_read:
> -			if (actual < 0) {
> -				retval = errno;
> -				actual = 0;
> -			} else
> -				retval = EXT2_ET_SHORT_READ;
> +		retval = raw_aligned_read_blk(channel, data, location,
> +						size, bufv, &actual);
> +		if (retval != 0)
> 			goto error_out;
> -		}
> -		return 0;
> +
> +		return retval;
> 	}
> 
> +bounce_read:
> #ifdef ALIGN_DEBUG
> -	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
> -	       (unsigned long) size);
> +	printf("raw_read_blk: O_DIRECT fallback: %p %lu<>%lu\n", buf,
> +		location, (unsigned long) size);
> #endif
> -
> 	/*
> 	 * The buffer or size which we're trying to read isn't aligned
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> +	 * read / write must be aligned to the block device sector size
> 	 */
> -bounce_read:
> +
> +	blk = location / align;
> +	offset = location % align;
> +
> +	if (lseek(data->dev, blk * align, SEEK_SET) < 0)
> +		return errno;
> +
> 	while (size > 0) {
> -		actual = read(data->dev, data->bounce, channel->block_size);
> -		if (actual != channel->block_size) {
> +		actual = read(data->dev, data->bounce, align);
> +		if (actual != align) {
> 			actual = really_read;
> 			buf -= really_read;
> 			size += really_read;
> -			goto short_read;
> +			retval = EXT2_ET_SHORT_READ;
> +			goto error_out;
> 		}
> 		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(buf, data->bounce, actual);
> +		if ((actual + offset) > align)
> +			actual = align - offset;
> +		if (actual > size)
> +			actual = size;
> +
> +		memcpy(buf, data->bounce + offset, actual);
> 		really_read += actual;
> 		size -= actual;
> 		buf += actual;
> +		offset = 0;
> +		blk++;
> 	}
> 	return 0;
> 
> @@ -258,6 +295,58 @@ error_out:
> 	return retval;
> }
> 
> +static errcode_t raw_aligned_write_blk(io_channel channel,
> +			       struct unix_private_data *data,
> +			       ext2_loff_t  location,
> +			       ssize_t size, const void *bufv,
> +			       int *asize)
> +
> +{
> +	int		actual = 0;
> +	errcode_t	retval;
> +	const unsigned char *buf = (void *)bufv;
> +
> +#ifdef ALIGN_DEBUG
> +	printf("raw_aligned_write_blk: %p %lu %lu\n", buf,
> +	       location, (unsigned long) size);
> +#endif
> +
> +#ifdef HAVE_PWRITE64
> +	/* Try an aligned pwrite */
> +	actual = pwrite64(data->dev, buf, size, location);
> +	if (actual == size)
> +		return 0;
> +#elif HAVE_PWRITE
> +	/* Try an aligned pwrite */
> +	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) {
> +		actual = pwrite(data->dev, buf, size, location);
> +		if (actual == size)
> +			return 0;
> +	}
> +#endif /* HAVE_PWRITE */
> +
> +	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +		goto error_out;
> +	}
> +
> +	actual = write(data->dev, buf, size);
> +	if (actual < 0) {
> +		retval = errno;
> +		goto error_out;
> +	}
> +	if (actual != size) {
> +		retval = EXT2_ET_SHORT_WRITE;
> +		goto error_out;
> +	}
> +	retval = 0;
> +error_out:
> +	*asize = actual;
> +	return retval;
> +
> +}
> +
> +
> static errcode_t raw_write_blk(io_channel channel,
> 			       struct unix_private_data *data,
> 			       unsigned long long block,
> @@ -268,6 +357,9 @@ static errcode_t raw_write_blk(io_channel channel,
> 	int		actual = 0;
> 	errcode_t	retval;
> 	const unsigned char *buf = bufv;
> +	unsigned int	align = channel->align;
> +	blk64_t		blk;
> +	loff_t		offset;
> 
> 	if (count == 1)
> 		size = channel->block_size;
> @@ -282,95 +374,79 @@ static errcode_t raw_write_blk(io_channel channel,
> 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -			goto error_out;
> -		}
> +		align = channel->block_size;
> 		goto bounce_write;
> 	}
> 
> -#ifdef HAVE_PWRITE64
> -	/* Try an aligned pwrite */
> 	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> +	    (IS_ALIGNED(location, channel->align) &&
> +	     IS_ALIGNED(buf, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> -		actual = pwrite64(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -	}
> -#elif HAVE_PWRITE
> -	/* Try an aligned pwrite */
> -	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> -	    ((channel->align == 0) ||
> -	     (IS_ALIGNED(buf, channel->align) &&
> -	      IS_ALIGNED(size, channel->align)))) {
> -		actual = pwrite(data->dev, buf, size, location);
> -		if (actual == size)
> -			return 0;
> -	}
> -#endif /* HAVE_PWRITE */
> +		retval = raw_aligned_write_blk(channel, data, location,
> +						size, bufv, &actual);
> +		if (retval != 0)
> +			goto error_out;
> 
> -	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> -		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> -		goto error_out;
> +		return retval;
> 	}
> 
> -	if ((channel->align == 0) ||
> -	    (IS_ALIGNED(buf, channel->align) &&
> -	     IS_ALIGNED(size, channel->align))) {
> -		actual = write(data->dev, buf, size);
> -		if (actual < 0) {
> -			retval = errno;
> -			goto error_out;
> -		}
> -		if (actual != size) {
> -		short_write:
> -			retval = EXT2_ET_SHORT_WRITE;
> -			goto error_out;
> -		}
> -		return 0;
> -	}
> 
> +bounce_write:
> #ifdef ALIGN_DEBUG
> 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
> 	       (unsigned long) size);
> #endif
> +
> +	/* logical offset may don't aligned with block device block size */
> +	blk = location / align;
> +	offset = location % align;
> +
> +	if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> +		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +		goto error_out;
> +	}
> +
> 	/*
> 	 * The buffer or size which we're trying to write isn't aligned
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> 	 */
> -bounce_write:
> 	while (size > 0) {
> -		if (size < channel->block_size) {
> -			actual = read(data->dev, data->bounce,
> -				      channel->block_size);
> -			if (actual != channel->block_size) {
> -				if (actual < 0) {
> -					retval = errno;
> -					goto error_out;
> -				}
> -				memset((char *) data->bounce + actual, 0,
> -				       channel->block_size - actual);
> +		int actual_w;
> +
> +		memset((char *) data->bounce, 0, align);
> +		if (offset || (size < align)) {
> +			actual = read(data->dev, data->bounce, align);
> +			if (actual < 0) {
> +				retval = errno;
> +				goto error_out;
> 			}
> 		}
> 		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(data->bounce, buf, actual);
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		if ((actual + offset) > align)
> +			actual = align - offset;
> +		if (actual > size)
> +			actual = size;
> +		memcpy(((char *)data->bounce) + offset, buf, actual);
> +
> +		if (lseek(data->dev, blk * align, SEEK_SET) != blk * align) {
> 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 			goto error_out;
> 		}
> -		actual = write(data->dev, data->bounce, channel->block_size);
> -		if (actual < 0) {
> +
> +		actual_w = write(data->dev, data->bounce, align);
> +		if (actual_w < 0) {
> 			retval = errno;
> 			goto error_out;
> 		}
> -		if (actual != channel->block_size)
> -			goto short_write;
> +		if (actual_w != align) {
> +			retval = EXT2_ET_SHORT_WRITE;
> +			goto error_out;
> +		}
> 		size -= actual;
> 		buf += actual;
> 		location += actual;
> +		blk ++;
> +		offset = 0;
> 	}
> 	return 0;
> 
> -- 
> 2.18.4
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-11-17 15:30 ` Благодаренко Артём
@ 2020-11-17 19:19   ` Theodore Y. Ts'o
  2020-11-19 12:26     ` Lyashkov, Alexey
  0 siblings, 1 reply; 10+ messages in thread
From: Theodore Y. Ts'o @ 2020-11-17 19:19 UTC (permalink / raw)
  To: Благодаренко
	Артём
  Cc: linux-ext4, adilger.kernel, Alexey Lyashkov, Theodore Tso

On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
> Hello,
> 
> Any thoughts about this change? Thanks.

I'm trying to think of situations where this could actually trigger in
real life.  The only one I can think of is if a file system with a 1k
block file system is located on an an Advanced FormatDrive with a 4k
sector size.

What was the use case where this was actually an issue?

     	     	      	    	     - Ted

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-11-17 19:19   ` Theodore Y. Ts'o
@ 2020-11-19 12:26     ` Lyashkov, Alexey
  2020-12-08  8:33       ` Благодаренко Артём
  2020-12-18 22:03       ` Andreas Dilger
  0 siblings, 2 replies; 10+ messages in thread
From: Lyashkov, Alexey @ 2020-11-19 12:26 UTC (permalink / raw)
  To: Theodore Y. Ts'o,
	Благодаренко
	Артём
  Cc: linux-ext4, adilger.kernel, Theodore Tso

Tso,

This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.
e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
(many other places exist, but it simplest).
>>
        if (superblock) {
                if (!block_size) {
                        retval = EXT2_ET_INVALID_ARGUMENT;
                        goto cleanup;
                }
                io_channel_set_blksize(fs->io, block_size);
                group_block = superblock;
                fs->orig_super = 0;
        } else {
                io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
                superblock = 1;
                group_block = 0;
                retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
                if (retval)
                        goto cleanup;
        }
        retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
                                     fs->super);
>>
It caused errors like
# e2image -Q /dev/md65 /tmp/node05_image_out
e2image 1.45.6.cr1 (14-Aug-2020)
e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
Couldn’t find valid filesystem superblock.

It looks like I don't first person to found a bug, as someone was add 

Alex

On 17/11/2020, 22:19, "Theodore Y. Ts'o" <tytso@mit.edu> wrote:

    On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
    > Hello,
    > 
    > Any thoughts about this change? Thanks.

    I'm trying to think of situations where this could actually trigger in
    real life.  The only one I can think of is if a file system with a 1k
    block file system is located on an an Advanced FormatDrive with a 4k
    sector size.

    What was the use case where this was actually an issue?

         	     	      	    	     - Ted


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-11-19 12:26     ` Lyashkov, Alexey
@ 2020-12-08  8:33       ` Благодаренко Артём
  2020-12-18 22:03       ` Andreas Dilger
  1 sibling, 0 replies; 10+ messages in thread
From: Благодаренко Артём @ 2020-12-08  8:33 UTC (permalink / raw)
  To: Lyashkov, Alexey
  Cc: Theodore Y. Ts'o, linux-ext4, adilger.kernel, Theodore Tso

Hello Theodore,

Is this case important enough to fix the code?

Thanks.
Best regards,
Artem Blagodarenko.


> On 19 Nov 2020, at 15:26, Lyashkov, Alexey <alexey.lyashkov@hpe.com> wrote:
> 
> Tso,
> 
> This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.
> e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
> (many other places exist, but it simplest).
>>> 
>        if (superblock) {
>                if (!block_size) {
>                        retval = EXT2_ET_INVALID_ARGUMENT;
>                        goto cleanup;
>                }
>                io_channel_set_blksize(fs->io, block_size);
>                group_block = superblock;
>                fs->orig_super = 0;
>        } else {
>                io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
>                superblock = 1;
>                group_block = 0;
>                retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
>                if (retval)
>                        goto cleanup;
>        }
>        retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
>                                     fs->super);
>>> 
> It caused errors like
> # e2image -Q /dev/md65 /tmp/node05_image_out
> e2image 1.45.6.cr1 (14-Aug-2020)
> e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
> Couldn’t find valid filesystem superblock.
> 
> It looks like I don't first person to found a bug, as someone was add 
> 
> Alex
> 
> On 17/11/2020, 22:19, "Theodore Y. Ts'o" <tytso@mit.edu> wrote:
> 
>    On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
>> Hello,
>> 
>> Any thoughts about this change? Thanks.
> 
>    I'm trying to think of situations where this could actually trigger in
>    real life.  The only one I can think of is if a file system with a 1k
>    block file system is located on an an Advanced FormatDrive with a 4k
>    sector size.
> 
>    What was the use case where this was actually an issue?
> 
>         	     	      	    	     - Ted
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-11-19 12:26     ` Lyashkov, Alexey
  2020-12-08  8:33       ` Благодаренко Артём
@ 2020-12-18 22:03       ` Andreas Dilger
  2020-12-19  4:31         ` Lyashkov, Alexey
  1 sibling, 1 reply; 10+ messages in thread
From: Andreas Dilger @ 2020-12-18 22:03 UTC (permalink / raw)
  To: Lyashkov, Alexey
  Cc: Theodore Y. Ts'o,
	Благодаренко
	Артём,
	linux-ext4

[-- Attachment #1: Type: text/plain, Size: 3221 bytes --]

On Nov 19, 2020, at 5:26 AM, Lyashkov, Alexey <alexey.lyashkov@hpe.com> wrote:
> 
> Tso,
> 
> This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.

It would be useful to include this patch for e2image as part of this submission,
so that this can be tested.  I suspect that O_DIRECT would be useful for other
tools (e.g. e2fsck, debugfs, etc.) since the IO manager would avoid double
buffering the data in both the kernel and userspace.

> e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
> (many other places exist, but it simplest).

Are there actually other places where it is doing sub-block-size reads from disk?

It seems simpler to fix the superblock read at open to always read the first 4KB
into a buffer (and to make it easy to extend to 16KB or 64KB if sector sizes get
even larger), then find the superblock within the buffer to decide the blocksize.

That avoids the short/unaligned read from disk when opening the filesystem, without
the need to add complexity to the reading code to buffer all unaligned reads, for
a case that doesn't seem likely otherwise.  The only other possibility I can think
that would need this is a small-block filesystem image (e.g. 1KB) copied to a
large-sector device?  It isn't clear if the kernel would be able to mount that...

Cheers, Andreas

>        if (superblock) {
>                if (!block_size) {
>                        retval = EXT2_ET_INVALID_ARGUMENT;
>                        goto cleanup;
>                }
>                io_channel_set_blksize(fs->io, block_size);
>                group_block = superblock;
>                fs->orig_super = 0;
>        } else {
>                io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET); <<<<< this is problem
>                superblock = 1;
>                group_block = 0;
>                retval = ext2fs_get_mem(SUPERBLOCK_SIZE, &fs->orig_super);
>                if (retval)
>                        goto cleanup;
>        }
>        retval = io_channel_read_blk(fs->io, superblock, -SUPERBLOCK_SIZE,
>                                     fs->super);
> 
> It caused errors like
> # e2image -Q /dev/md65 /tmp/node05_image_out
> e2image 1.45.6.cr1 (14-Aug-2020)
> e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/md65
> Couldn’t find valid filesystem superblock.
> 
> It looks like I don't first person to found a bug, as someone was add
> 
> Alex
> 
> On 17/11/2020, 22:19, "Theodore Y. Ts'o" <tytso@mit.edu> wrote:
> 
>    On Tue, Nov 17, 2020 at 06:30:11PM +0300, Благодаренко Артём wrote:
>> Hello,
>> 
>> Any thoughts about this change? Thanks.
> 
>    I'm trying to think of situations where this could actually trigger in
>    real life.  The only one I can think of is if a file system with a 1k
>    block file system is located on an an Advanced FormatDrive with a 4k
>    sector size.
> 
>    What was the use case where this was actually an issue?
> 
>         	     	      	    	     - Ted
> 


Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-12-18 22:03       ` Andreas Dilger
@ 2020-12-19  4:31         ` Lyashkov, Alexey
  0 siblings, 0 replies; 10+ messages in thread
From: Lyashkov, Alexey @ 2020-12-19  4:31 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Theodore Y. Ts'o,
	Благодаренко
	Артём,
	linux-ext4



On 19/12/2020, 01:03, "Andreas Dilger" <adilger@dilger.ca> wrote:

    On Nov 19, 2020, at 5:26 AM, Lyashkov, Alexey <alexey.lyashkov@hpe.com> wrote:
    > 
    > Tso,
    > 
    > This situation hit with modern hdd with 4k block size and e2image changed to use DIRECT IO instead of buffered.

   >  It would be useful to include this patch for e2image as part of this submission,
    > so that this can be tested.  I suspect that O_DIRECT would be useful for other
    > tools (e.g. e2fsck, debugfs, etc.) since the IO manager would avoid double
    > buffering the data in both the kernel and userspace.

debugfs have a -D option already. As about e2fsck have run in single user and several loops over FS exist.
So caching is good to have there. Don't forget - caching permits an readahead works - which is very usefull for the large filesystem open.



    > e2fsprogs tries to read a super lock on offset 1k and it caused to set FS block size to 1k and second block reading.
    > (many other places exist, but it simplest).

 >    Are there actually other places where it is doing sub-block-size reads from disk?
Many places. 

bash-3.2$ grep -rn io_channel_set_blksize * | grep SUPERBLOCK
lib/ext2fs/undo_io.c:223:	io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
lib/ext2fs/undo_io.c:506:	io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
lib/ext2fs/closefs.c:201:		io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET);
lib/ext2fs/openfs.c:218:		io_channel_set_blksize(fs->io, SUPERBLOCK_OFFSET);
misc/mke2fs.c:2573:	io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);
misc/e2undo.c:168:	io_channel_set_blksize(channel, SUPERBLOCK_OFFSET);

and some places where set_blksize was called with other size different than block device size.
In theory we can create an FS with 1K block size, and tools should able to work with it.


>    It seems simpler to fix the superblock read at open to always read the first 4KB
>    into a buffer (and to make it easy to extend to 16KB or 64KB if sector sizes get
>    even larger), then find the superblock within the buffer to decide the blocksize.

And make it on many places including an metadata reading in case FS block size is 1k.





^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2020-10-23 11:26 [PATCH] libfs: Fix DIO mode aligment Artem Blagodarenko
  2020-11-17 15:30 ` Благодаренко Артём
@ 2021-02-26 23:17 ` Theodore Ts'o
  2021-03-01 16:14   ` Благодаренко Артём
  1 sibling, 1 reply; 10+ messages in thread
From: Theodore Ts'o @ 2021-02-26 23:17 UTC (permalink / raw)
  To: Artem Blagodarenko; +Cc: linux-ext4, adilger.kernel, Alexey Lyashkov

I've rewritten the patch because it was simplest way to review the
code.  The resulting code is simpler and has a smaller number of lines
of code.  I don't have any 4k advanced format disks handy, so I'd
appreciate it if someone can test it.  It passes the existing
regression tests, and I've run a number of manual tests to simulate a
advanced format HDD, with the tests being run with clang and UBSAN and
ADDRSAN enabled.

If someone with access to an advanced format disk can test running
debugfs -D on an advanced format disk, that would be great, thanks.

	      	 	  	       - Ted

commit c001596110e834a85b01a47a20811b318cb3b9e4
Author: Theodore Ts'o <tytso@mit.edu>
Date:   Fri Feb 26 17:41:06 2021 -0500

    libext2fs: fix unix_io's Direct I/O support
    
    The previous Direct I/O support worked on HDD's with 512 byte logical
    sector sizes, and on FreeBSD which required 4k aligned memory buffers.
    However, it was incomplete and was not correctly working on HDD's with
    4k logical sector sizes (aka Advanced Format Disks).
    
    Based on a patch from Alexey Lyashkov <alexey.lyashkov@hpe.com> but
    rewritten to work with the latest e2fsprogs and to minimize changes to
    make it easier to review.
    
    Signed-off-by: Theodore Ts'o <tytso@mit.edu>
    Reported-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>

diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
index c395d615..996c31a1 100644
--- a/lib/ext2fs/io_manager.c
+++ b/lib/ext2fs/io_manager.c
@@ -134,9 +134,11 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
 	else
 		size = -count;
 
-	if (io->align)
+	if (io->align) {
+		if (io->align > size)
+			size = io->align;
 		return ext2fs_get_memalign(size, io->align, ptr);
-	else
+	} else
 		return ext2fs_get_mem(size, ptr);
 }
 
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 73f5667e..8965535c 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -218,6 +218,8 @@ static errcode_t raw_read_blk(io_channel channel,
 	int		actual = 0;
 	unsigned char	*buf = bufv;
 	ssize_t		really_read = 0;
+	unsigned long long aligned_blk;
+	int		align_size, offset;
 
 	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
 	mutex_lock(data, STATS_MTX);
@@ -226,7 +228,7 @@ static errcode_t raw_read_blk(io_channel channel,
 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 			goto error_out;
 		}
@@ -237,6 +239,7 @@ static errcode_t raw_read_blk(io_channel channel,
 	/* Try an aligned pread */
 	if ((channel->align == 0) ||
 	    (IS_ALIGNED(buf, channel->align) &&
+	     IS_ALIGNED(location, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
 		actual = pread64(data->dev, buf, size, location);
 		if (actual == size)
@@ -248,6 +251,7 @@ static errcode_t raw_read_blk(io_channel channel,
 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
 	    ((channel->align == 0) ||
 	     (IS_ALIGNED(buf, channel->align) &&
+	      IS_ALIGNED(location, channel->align) &&
 	      IS_ALIGNED(size, channel->align)))) {
 		actual = pread(data->dev, buf, size, location);
 		if (actual == size)
@@ -256,12 +260,13 @@ static errcode_t raw_read_blk(io_channel channel,
 	}
 #endif /* HAVE_PREAD */
 
-	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+	if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
 	if ((channel->align == 0) ||
 	    (IS_ALIGNED(buf, channel->align) &&
+	     IS_ALIGNED(location, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
 		actual = read(data->dev, buf, size);
 		if (actual != size) {
@@ -286,23 +291,39 @@ static errcode_t raw_read_blk(io_channel channel,
 	 * to the O_DIRECT rules, so we need to do this the hard way...
 	 */
 bounce_read:
+	if ((channel->block_size > channel->align) &&
+	    (channel->block_size % channel->align) == 0)
+		align_size = channel->block_size;
+	else
+		align_size = channel->align;
+	aligned_blk = location / align_size;
+	offset = location % align_size;
+
+	if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
+		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+		goto error_out;
+	}
 	while (size > 0) {
 		mutex_lock(data, BOUNCE_MTX);
-		actual = read(data->dev, data->bounce, channel->block_size);
-		if (actual != channel->block_size) {
+		actual = read(data->dev, data->bounce, align_size);
+		if (actual != align_size) {
 			mutex_unlock(data, BOUNCE_MTX);
 			actual = really_read;
 			buf -= really_read;
 			size += really_read;
 			goto short_read;
 		}
-		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(buf, data->bounce, actual);
+		if ((actual + offset) > align_size)
+			actual = align_size - offset;
+		if (actual > size)
+			actual = size;
+		memcpy(buf, data->bounce + offset, actual);
+
 		really_read += actual;
 		size -= actual;
 		buf += actual;
+		offset = 0;
+		aligned_blk++;
 		mutex_unlock(data, BOUNCE_MTX);
 	}
 	return 0;
@@ -326,6 +347,8 @@ static errcode_t raw_write_blk(io_channel channel,
 	int		actual = 0;
 	errcode_t	retval;
 	const unsigned char *buf = bufv;
+	unsigned long long aligned_blk;
+	int		align_size, offset;
 
 	if (count == 1)
 		size = channel->block_size;
@@ -342,7 +365,7 @@ static errcode_t raw_write_blk(io_channel channel,
 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
 
 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 			goto error_out;
 		}
@@ -353,6 +376,7 @@ static errcode_t raw_write_blk(io_channel channel,
 	/* Try an aligned pwrite */
 	if ((channel->align == 0) ||
 	    (IS_ALIGNED(buf, channel->align) &&
+	     IS_ALIGNED(location, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
 		actual = pwrite64(data->dev, buf, size, location);
 		if (actual == size)
@@ -363,6 +387,7 @@ static errcode_t raw_write_blk(io_channel channel,
 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
 	    ((channel->align == 0) ||
 	     (IS_ALIGNED(buf, channel->align) &&
+	      IS_ALIGNED(location, channel->align) &&
 	      IS_ALIGNED(size, channel->align)))) {
 		actual = pwrite(data->dev, buf, size, location);
 		if (actual == size)
@@ -370,13 +395,14 @@ static errcode_t raw_write_blk(io_channel channel,
 	}
 #endif /* HAVE_PWRITE */
 
-	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+	if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
 
 	if ((channel->align == 0) ||
 	    (IS_ALIGNED(buf, channel->align) &&
+	     IS_ALIGNED(location, channel->align) &&
 	     IS_ALIGNED(size, channel->align))) {
 		actual = write(data->dev, buf, size);
 		if (actual < 0) {
@@ -400,40 +426,59 @@ static errcode_t raw_write_blk(io_channel channel,
 	 * to the O_DIRECT rules, so we need to do this the hard way...
 	 */
 bounce_write:
+	if ((channel->block_size > channel->align) &&
+	    (channel->block_size % channel->align) == 0)
+		align_size = channel->block_size;
+	else
+		align_size = channel->align;
+	aligned_blk = location / align_size;
+	offset = location % align_size;
+
 	while (size > 0) {
+		int actual_w;
+
 		mutex_lock(data, BOUNCE_MTX);
-		if (size < channel->block_size) {
+		if (size < align_size || offset) {
+			if (ext2fs_llseek(data->dev, aligned_blk * align_size,
+					  SEEK_SET) < 0) {
+				retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
+				goto error_out;
+			}
 			actual = read(data->dev, data->bounce,
-				      channel->block_size);
-			if (actual != channel->block_size) {
+				      align_size);
+			if (actual != align_size) {
 				if (actual < 0) {
 					mutex_unlock(data, BOUNCE_MTX);
 					retval = errno;
 					goto error_out;
 				}
 				memset((char *) data->bounce + actual, 0,
-				       channel->block_size - actual);
+				       align_size - actual);
 			}
 		}
 		actual = size;
-		if (size > channel->block_size)
-			actual = channel->block_size;
-		memcpy(data->bounce, buf, actual);
-		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
+		if ((actual + offset) > align_size)
+			actual = align_size - offset;
+		if (actual > size)
+			actual = size;
+		memcpy(((char *)data->bounce) + offset, buf, actual);
+		if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 			goto error_out;
 		}
-		actual = write(data->dev, data->bounce, channel->block_size);
+		actual_w = write(data->dev, data->bounce, align_size);
 		mutex_unlock(data, BOUNCE_MTX);
-		if (actual < 0) {
+		if (actual_w < 0) {
 			retval = errno;
 			goto error_out;
 		}
-		if (actual != channel->block_size)
+		if (actual_w != align_size)
 			goto short_write;
 		size -= actual;
 		buf += actual;
 		location += actual;
+		aligned_blk++;
+		offset = 0;
 	}
 	return 0;
 

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2021-02-26 23:17 ` Theodore Ts'o
@ 2021-03-01 16:14   ` Благодаренко Артём
  2021-03-01 17:42     ` Theodore Ts'o
  0 siblings, 1 reply; 10+ messages in thread
From: Благодаренко Артём @ 2021-03-01 16:14 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: linux-ext4, adilger.kernel, Alexey Lyashkov

Hello Ted!

Thank you very much for the new version of the fix.

The problem can be easily reproduced, using loop device.

Here is how I reproduced it originally (without the patch)

# truncate -s 512MB /tmp/lustre-ost
# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done                            
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 541ad524-556a-45be-84fe-5b68c1ca4562
Superblock backups stored on blocks:
    32768, 98304
 
Allocating group tables: done                            
Writing inode tables: done                            
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done

# git rev-parse HEAD
0f880f553dfa09dabe4cb03752b97a07b66eb998

[root@CO82 e2fsprogs-hpe]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.45.2.cr2 (09-Apr-2020)
misc/e2image: Attempt to read block from filesystem resulted in short read while trying to open /dev/loop0
Couldn't find valid filesystem superblock.


With your patch e2image works fine.


[root@CO82 e2fsprogs-kernel]# truncate -s 512MB /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# losetup -b 4096 /dev/loop0 /tmp/lustre-ost
[root@CO82 e2fsprogs-kernel]# mkfs.ext4 /dev/loop0
mke2fs 1.45.6.wc1 (20-Mar-2020)
detected raid stride 4096 too large, use optimum 512
Discarding device blocks: done                            
Creating filesystem with 125000 4k blocks and 125056 inodes
Filesystem UUID: 42f6fc7d-382a-4681-99b8-79f3fcd2b4bf
Superblock backups stored on blocks: 
	32768, 98304

Allocating group tables: done                            
Writing inode tables: done                            
Creating journal (4096 blocks): done
Writing superblocks and filesystem accounting information: done

[root@CO82 e2fsprogs-kernel]# git rev-parse HEAD
67f2b54667e65cf5a478fcea8b85722be9ee6e8d
[root@CO82 e2fsprogs-kernel]# misc/e2image /dev/loop0 /tmp/ost-image
e2image 1.46.1 (9-Feb-2021)

Thanks again.
Best regards,
Artem Blagodarenko.


> On 27 Feb 2021, at 02:17, Theodore Ts'o <tytso@mit.edu> wrote:
> 
> I've rewritten the patch because it was simplest way to review the
> code.  The resulting code is simpler and has a smaller number of lines
> of code.  I don't have any 4k advanced format disks handy, so I'd
> appreciate it if someone can test it.  It passes the existing
> regression tests, and I've run a number of manual tests to simulate a
> advanced format HDD, with the tests being run with clang and UBSAN and
> ADDRSAN enabled.
> 
> If someone with access to an advanced format disk can test running
> debugfs -D on an advanced format disk, that would be great, thanks.
> 
> 	      	 	  	       - Ted
> 
> commit c001596110e834a85b01a47a20811b318cb3b9e4
> Author: Theodore Ts'o <tytso@mit.edu>
> Date:   Fri Feb 26 17:41:06 2021 -0500
> 
>    libext2fs: fix unix_io's Direct I/O support
> 
>    The previous Direct I/O support worked on HDD's with 512 byte logical
>    sector sizes, and on FreeBSD which required 4k aligned memory buffers.
>    However, it was incomplete and was not correctly working on HDD's with
>    4k logical sector sizes (aka Advanced Format Disks).
> 
>    Based on a patch from Alexey Lyashkov <alexey.lyashkov@hpe.com> but
>    rewritten to work with the latest e2fsprogs and to minimize changes to
>    make it easier to review.
> 
>    Signed-off-by: Theodore Ts'o <tytso@mit.edu>
>    Reported-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> 
> diff --git a/lib/ext2fs/io_manager.c b/lib/ext2fs/io_manager.c
> index c395d615..996c31a1 100644
> --- a/lib/ext2fs/io_manager.c
> +++ b/lib/ext2fs/io_manager.c
> @@ -134,9 +134,11 @@ errcode_t io_channel_alloc_buf(io_channel io, int count, void *ptr)
> 	else
> 		size = -count;
> 
> -	if (io->align)
> +	if (io->align) {
> +		if (io->align > size)
> +			size = io->align;
> 		return ext2fs_get_memalign(size, io->align, ptr);
> -	else
> +	} else
> 		return ext2fs_get_mem(size, ptr);
> }
> 
> diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
> index 73f5667e..8965535c 100644
> --- a/lib/ext2fs/unix_io.c
> +++ b/lib/ext2fs/unix_io.c
> @@ -218,6 +218,8 @@ static errcode_t raw_read_blk(io_channel channel,
> 	int		actual = 0;
> 	unsigned char	*buf = bufv;
> 	ssize_t		really_read = 0;
> +	unsigned long long aligned_blk;
> +	int		align_size, offset;
> 
> 	size = (count < 0) ? -count : (ext2_loff_t) count * channel->block_size;
> 	mutex_lock(data, STATS_MTX);
> @@ -226,7 +228,7 @@ static errcode_t raw_read_blk(io_channel channel,
> 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 			goto error_out;
> 		}
> @@ -237,6 +239,7 @@ static errcode_t raw_read_blk(io_channel channel,
> 	/* Try an aligned pread */
> 	if ((channel->align == 0) ||
> 	    (IS_ALIGNED(buf, channel->align) &&
> +	     IS_ALIGNED(location, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> 		actual = pread64(data->dev, buf, size, location);
> 		if (actual == size)
> @@ -248,6 +251,7 @@ static errcode_t raw_read_blk(io_channel channel,
> 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> 	    ((channel->align == 0) ||
> 	     (IS_ALIGNED(buf, channel->align) &&
> +	      IS_ALIGNED(location, channel->align) &&
> 	      IS_ALIGNED(size, channel->align)))) {
> 		actual = pread(data->dev, buf, size, location);
> 		if (actual == size)
> @@ -256,12 +260,13 @@ static errcode_t raw_read_blk(io_channel channel,
> 	}
> #endif /* HAVE_PREAD */
> 
> -	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +	if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 		goto error_out;
> 	}
> 	if ((channel->align == 0) ||
> 	    (IS_ALIGNED(buf, channel->align) &&
> +	     IS_ALIGNED(location, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> 		actual = read(data->dev, buf, size);
> 		if (actual != size) {
> @@ -286,23 +291,39 @@ static errcode_t raw_read_blk(io_channel channel,
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> 	 */
> bounce_read:
> +	if ((channel->block_size > channel->align) &&
> +	    (channel->block_size % channel->align) == 0)
> +		align_size = channel->block_size;
> +	else
> +		align_size = channel->align;
> +	aligned_blk = location / align_size;
> +	offset = location % align_size;
> +
> +	if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
> +		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +		goto error_out;
> +	}
> 	while (size > 0) {
> 		mutex_lock(data, BOUNCE_MTX);
> -		actual = read(data->dev, data->bounce, channel->block_size);
> -		if (actual != channel->block_size) {
> +		actual = read(data->dev, data->bounce, align_size);
> +		if (actual != align_size) {
> 			mutex_unlock(data, BOUNCE_MTX);
> 			actual = really_read;
> 			buf -= really_read;
> 			size += really_read;
> 			goto short_read;
> 		}
> -		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(buf, data->bounce, actual);
> +		if ((actual + offset) > align_size)
> +			actual = align_size - offset;
> +		if (actual > size)
> +			actual = size;
> +		memcpy(buf, data->bounce + offset, actual);
> +
> 		really_read += actual;
> 		size -= actual;
> 		buf += actual;
> +		offset = 0;
> +		aligned_blk++;
> 		mutex_unlock(data, BOUNCE_MTX);
> 	}
> 	return 0;
> @@ -326,6 +347,8 @@ static errcode_t raw_write_blk(io_channel channel,
> 	int		actual = 0;
> 	errcode_t	retval;
> 	const unsigned char *buf = bufv;
> +	unsigned long long aligned_blk;
> +	int		align_size, offset;
> 
> 	if (count == 1)
> 		size = channel->block_size;
> @@ -342,7 +365,7 @@ static errcode_t raw_write_blk(io_channel channel,
> 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
> 
> 	if (data->flags & IO_FLAG_FORCE_BOUNCE) {
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 			goto error_out;
> 		}
> @@ -353,6 +376,7 @@ static errcode_t raw_write_blk(io_channel channel,
> 	/* Try an aligned pwrite */
> 	if ((channel->align == 0) ||
> 	    (IS_ALIGNED(buf, channel->align) &&
> +	     IS_ALIGNED(location, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> 		actual = pwrite64(data->dev, buf, size, location);
> 		if (actual == size)
> @@ -363,6 +387,7 @@ static errcode_t raw_write_blk(io_channel channel,
> 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
> 	    ((channel->align == 0) ||
> 	     (IS_ALIGNED(buf, channel->align) &&
> +	      IS_ALIGNED(location, channel->align) &&
> 	      IS_ALIGNED(size, channel->align)))) {
> 		actual = pwrite(data->dev, buf, size, location);
> 		if (actual == size)
> @@ -370,13 +395,14 @@ static errcode_t raw_write_blk(io_channel channel,
> 	}
> #endif /* HAVE_PWRITE */
> 
> -	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +	if (ext2fs_llseek(data->dev, location, SEEK_SET) < 0) {
> 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 		goto error_out;
> 	}
> 
> 	if ((channel->align == 0) ||
> 	    (IS_ALIGNED(buf, channel->align) &&
> +	     IS_ALIGNED(location, channel->align) &&
> 	     IS_ALIGNED(size, channel->align))) {
> 		actual = write(data->dev, buf, size);
> 		if (actual < 0) {
> @@ -400,40 +426,59 @@ static errcode_t raw_write_blk(io_channel channel,
> 	 * to the O_DIRECT rules, so we need to do this the hard way...
> 	 */
> bounce_write:
> +	if ((channel->block_size > channel->align) &&
> +	    (channel->block_size % channel->align) == 0)
> +		align_size = channel->block_size;
> +	else
> +		align_size = channel->align;
> +	aligned_blk = location / align_size;
> +	offset = location % align_size;
> +
> 	while (size > 0) {
> +		int actual_w;
> +
> 		mutex_lock(data, BOUNCE_MTX);
> -		if (size < channel->block_size) {
> +		if (size < align_size || offset) {
> +			if (ext2fs_llseek(data->dev, aligned_blk * align_size,
> +					  SEEK_SET) < 0) {
> +				retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> +				goto error_out;
> +			}
> 			actual = read(data->dev, data->bounce,
> -				      channel->block_size);
> -			if (actual != channel->block_size) {
> +				      align_size);
> +			if (actual != align_size) {
> 				if (actual < 0) {
> 					mutex_unlock(data, BOUNCE_MTX);
> 					retval = errno;
> 					goto error_out;
> 				}
> 				memset((char *) data->bounce + actual, 0,
> -				       channel->block_size - actual);
> +				       align_size - actual);
> 			}
> 		}
> 		actual = size;
> -		if (size > channel->block_size)
> -			actual = channel->block_size;
> -		memcpy(data->bounce, buf, actual);
> -		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
> +		if ((actual + offset) > align_size)
> +			actual = align_size - offset;
> +		if (actual > size)
> +			actual = size;
> +		memcpy(((char *)data->bounce) + offset, buf, actual);
> +		if (ext2fs_llseek(data->dev, aligned_blk * align_size, SEEK_SET) < 0) {
> 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
> 			goto error_out;
> 		}
> -		actual = write(data->dev, data->bounce, channel->block_size);
> +		actual_w = write(data->dev, data->bounce, align_size);
> 		mutex_unlock(data, BOUNCE_MTX);
> -		if (actual < 0) {
> +		if (actual_w < 0) {
> 			retval = errno;
> 			goto error_out;
> 		}
> -		if (actual != channel->block_size)
> +		if (actual_w != align_size)
> 			goto short_write;
> 		size -= actual;
> 		buf += actual;
> 		location += actual;
> +		aligned_blk++;
> +		offset = 0;
> 	}
> 	return 0;
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] libfs: Fix DIO mode aligment
  2021-03-01 16:14   ` Благодаренко Артём
@ 2021-03-01 17:42     ` Theodore Ts'o
  0 siblings, 0 replies; 10+ messages in thread
From: Theodore Ts'o @ 2021-03-01 17:42 UTC (permalink / raw)
  To: Благодаренко
	Артём
  Cc: linux-ext4, adilger.kernel, Alexey Lyashkov

On Mon, Mar 01, 2021 at 07:14:00PM +0300, Благодаренко Артём wrote:
> Here is how I reproduced it originally (without the patch)
> 
> # truncate -s 512MB /tmp/lustre-ost
> # losetup -b 4096 /dev/loop0 /tmp/lustre-ost
> # mkfs.ext4 /dev/loop0

Thanks for pointing out that this can be tested using losetup.

> With your patch e2image works fine.
> 
> [root@CO82 e2fsprogs-kernel]# git rev-parse HEAD
> 67f2b54667e65cf5a478fcea8b85722be9ee6e8d
> [root@CO82 e2fsprogs-kernel]# misc/e2image /dev/loop0 /tmp/ost-image
> e2image 1.46.1 (9-Feb-2021)

So commit 67f2b54667e6 is 1.46.2, so I'm not sure that you recompiled
e2image.  More importantly, e2image in upstream doesn't use Direct
I/O.  When I tried using debugfs -D, it looks like Direct I/O on an
Advanced Format HDD isn't working correctly:

# debugfs -D /dev/loop0
debugfs 1.46.2 (28-Feb-2021)
debugfs: Bad magic number in super-block while trying to open /dev/loop0
/dev/loop0 contains a ext4 file system
        created on Mon Mar  1 12:31:43 2021

This isn't a regression since it was broken in upstream before, but it
would be good to get this fixed before e2fsprogs 1.46.3.

Cheers,

						- Ted

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-03-01 17:46 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-23 11:26 [PATCH] libfs: Fix DIO mode aligment Artem Blagodarenko
2020-11-17 15:30 ` Благодаренко Артём
2020-11-17 19:19   ` Theodore Y. Ts'o
2020-11-19 12:26     ` Lyashkov, Alexey
2020-12-08  8:33       ` Благодаренко Артём
2020-12-18 22:03       ` Andreas Dilger
2020-12-19  4:31         ` Lyashkov, Alexey
2021-02-26 23:17 ` Theodore Ts'o
2021-03-01 16:14   ` Благодаренко Артём
2021-03-01 17:42     ` Theodore Ts'o

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).