All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 1/2] Return bytes transferred for partial direct I/O
@ 2018-01-19  0:57 Goldwyn Rodrigues
  2018-01-19  0:57 ` [PATCH 2/2] xfs: remove assert to check bytes returned Goldwyn Rodrigues
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-19  0:57 UTC (permalink / raw)
  To: linux-block; +Cc: linux-fsdevel, darrick.wong, Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

In case direct I/O encounters an error midway, it returns the error.
Instead it should be returning the number of bytes transferred so far.

Test case for filesystems (with ENOSPC):
1. Create an almost full filesystem
2. Create a file, say /mnt/lastfile, until the filesystem is full.
3. Direct write() with count > sizeof /mnt/lastfile.

Result: write() returns -ENOSPC. However, file content has data written
in step 3.

This fixes fstest generic/472.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/block_dev.c |  2 +-
 fs/direct-io.c |  4 +---
 fs/iomap.c     | 20 ++++++++++----------
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..49d94360bb51 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -409,7 +409,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 	if (!ret)
 		ret = blk_status_to_errno(dio->bio.bi_status);
-	if (likely(!ret))
+	if (likely(dio->size))
 		ret = dio->size;
 
 	bio_put(&dio->bio);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3aafb3343a65..0c98b6e65d7c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -262,8 +262,6 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 		ret = dio->page_errors;
 	if (ret == 0)
 		ret = dio->io_error;
-	if (ret == 0)
-		ret = transferred;
 
 	if (dio->end_io) {
 		// XXX: ki_pos??
@@ -310,7 +308,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
 	}
 
 	kmem_cache_free(dio_cache, dio);
-	return ret;
+	return transferred ? transferred : ret;
 }
 
 static void dio_aio_complete_work(struct work_struct *work)
diff --git a/fs/iomap.c b/fs/iomap.c
index 47d29ccffaef..cab57d85404e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -716,23 +716,23 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	struct kiocb *iocb = dio->iocb;
 	struct inode *inode = file_inode(iocb->ki_filp);
 	loff_t offset = iocb->ki_pos;
-	ssize_t ret;
+	ssize_t err;
+	ssize_t transferred = dio->size;
 
 	if (dio->end_io) {
-		ret = dio->end_io(iocb,
-				dio->error ? dio->error : dio->size,
+		err = dio->end_io(iocb,
+				transferred ? transferred : dio->error,
 				dio->flags);
 	} else {
-		ret = dio->error;
+		err = dio->error;
 	}
 
-	if (likely(!ret)) {
-		ret = dio->size;
+	if (likely(transferred)) {
 		/* check for short read */
-		if (offset + ret > dio->i_size &&
+		if (offset + transferred > dio->i_size &&
 		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - offset;
-		iocb->ki_pos += ret;
+			transferred = dio->i_size - offset;
+		iocb->ki_pos += transferred;
 	}
 
 	/*
@@ -759,7 +759,7 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
-	return ret;
+	return transferred ? transferred : err;
 }
 
 static void iomap_dio_complete_work(struct work_struct *work)
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/2] xfs: remove assert to check bytes returned
  2018-01-19  0:57 [PATCH v5 1/2] Return bytes transferred for partial direct I/O Goldwyn Rodrigues
@ 2018-01-19  0:57 ` Goldwyn Rodrigues
  2018-01-19  3:57   ` Dave Chinner
  2018-01-19  2:13 ` [PATCH v5 1/2] Return bytes transferred for partial direct I/O Al Viro
  2018-01-21  2:11 ` Andi Kleen
  2 siblings, 1 reply; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-19  0:57 UTC (permalink / raw)
  To: linux-block; +Cc: linux-fsdevel, darrick.wong, Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Since we can return less than count in case of partial direct
writes, remove the ASSERT.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/xfs/xfs_file.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8601275cc5e6..8fc4dbf66910 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -590,12 +590,6 @@ xfs_file_dio_aio_write(
 	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
 out:
 	xfs_iunlock(ip, iolock);
-
-	/*
-	 * No fallback to buffered IO on errors for XFS, direct IO will either
-	 * complete fully or fail.
-	 */
-	ASSERT(ret < 0 || ret == count);
 	return ret;
 }
 
-- 
2.15.1

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  0:57 [PATCH v5 1/2] Return bytes transferred for partial direct I/O Goldwyn Rodrigues
  2018-01-19  0:57 ` [PATCH 2/2] xfs: remove assert to check bytes returned Goldwyn Rodrigues
@ 2018-01-19  2:13 ` Al Viro
  2018-01-19  3:59   ` Dave Chinner
  2018-01-21  2:11 ` Andi Kleen
  2 siblings, 1 reply; 29+ messages in thread
From: Al Viro @ 2018-01-19  2:13 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues

On Thu, Jan 18, 2018 at 06:57:40PM -0600, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> In case direct I/O encounters an error midway, it returns the error.
> Instead it should be returning the number of bytes transferred so far.
> 
> Test case for filesystems (with ENOSPC):
> 1. Create an almost full filesystem
> 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> 3. Direct write() with count > sizeof /mnt/lastfile.
> 
> Result: write() returns -ENOSPC. However, file content has data written
> in step 3.
> 
> This fixes fstest generic/472.

OK...  I can live with that.  What about the XFS side?  It should be
a prereq, to avoid bisection hazard; I can throw both into vfs.git,
if XFS folks are OK with that.  Objections?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] xfs: remove assert to check bytes returned
  2018-01-19  0:57 ` [PATCH 2/2] xfs: remove assert to check bytes returned Goldwyn Rodrigues
@ 2018-01-19  3:57   ` Dave Chinner
  2018-01-19  4:23     ` Raphael Carvalho
  0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2018-01-19  3:57 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues

On Thu, Jan 18, 2018 at 06:57:41PM -0600, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> Since we can return less than count in case of partial direct
> writes, remove the ASSERT.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/xfs/xfs_file.c | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 8601275cc5e6..8fc4dbf66910 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -590,12 +590,6 @@ xfs_file_dio_aio_write(
>  	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
>  out:
>  	xfs_iunlock(ip, iolock);
> -
> -	/*
> -	 * No fallback to buffered IO on errors for XFS, direct IO will either
> -	 * complete fully or fail.
> -	 */
> -	ASSERT(ret < 0 || ret == count);
>  	return ret;
>  }

Acked-by: Dave Chinner <dchinner@redhat.com>

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  2:13 ` [PATCH v5 1/2] Return bytes transferred for partial direct I/O Al Viro
@ 2018-01-19  3:59   ` Dave Chinner
  2018-01-19  6:31     ` Darrick J. Wong
  0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2018-01-19  3:59 UTC (permalink / raw)
  To: Al Viro
  Cc: Goldwyn Rodrigues, linux-block, linux-fsdevel, darrick.wong,
	Goldwyn Rodrigues

On Fri, Jan 19, 2018 at 02:13:53AM +0000, Al Viro wrote:
> On Thu, Jan 18, 2018 at 06:57:40PM -0600, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > In case direct I/O encounters an error midway, it returns the error.
> > Instead it should be returning the number of bytes transferred so far.
> > 
> > Test case for filesystems (with ENOSPC):
> > 1. Create an almost full filesystem
> > 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> > 3. Direct write() with count > sizeof /mnt/lastfile.
> > 
> > Result: write() returns -ENOSPC. However, file content has data written
> > in step 3.
> > 
> > This fixes fstest generic/472.
> 
> OK...  I can live with that.  What about the XFS side?  It should be
> a prereq, to avoid bisection hazard; I can throw both into vfs.git,
> if XFS folks are OK with that.  Objections?

Going through the VFS tree seesm the best approach to me - it's a
trivial change. I'm sure Darrick will shout if it's going to be a
problem, though.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] xfs: remove assert to check bytes returned
  2018-01-19  3:57   ` Dave Chinner
@ 2018-01-19  4:23     ` Raphael Carvalho
  2018-01-19  4:51       ` Dave Chinner
  0 siblings, 1 reply; 29+ messages in thread
From: Raphael Carvalho @ 2018-01-19  4:23 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Goldwyn Rodrigues, linux-block, linux-fsdevel, darrick.wong,
	Goldwyn Rodrigues

On Fri, Jan 19, 2018 at 1:57 AM, Dave Chinner <david@fromorbit.com> wrote:
>
> On Thu, Jan 18, 2018 at 06:57:41PM -0600, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> >
> > Since we can return less than count in case of partial direct
> > writes, remove the ASSERT.
> >
> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > ---
> >  fs/xfs/xfs_file.c | 6 ------
> >  1 file changed, 6 deletions(-)
> >
> > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > index 8601275cc5e6..8fc4dbf66910 100644
> > --- a/fs/xfs/xfs_file.c
> > +++ b/fs/xfs/xfs_file.c
> > @@ -590,12 +590,6 @@ xfs_file_dio_aio_write(
> >       ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
> >  out:
> >       xfs_iunlock(ip, iolock);
> > -
> > -     /*
> > -      * No fallback to buffered IO on errors for XFS, direct IO will either
> > -      * complete fully or fail.
> > -      */
> > -     ASSERT(ret < 0 || ret == count);
> >       return ret;
> >  }
>
> Acked-by: Dave Chinner <dchinner@redhat.com>


Is this really correct? Isn't this check with regards to DIO
submission? The bytes written is returned in a different asynchronous
path due to AIO support, no?!

>
>
> --
> Dave Chinner
> david@fromorbit.com


Regards,
Raphael S. Carvalho

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 2/2] xfs: remove assert to check bytes returned
  2018-01-19  4:23     ` Raphael Carvalho
@ 2018-01-19  4:51       ` Dave Chinner
  0 siblings, 0 replies; 29+ messages in thread
From: Dave Chinner @ 2018-01-19  4:51 UTC (permalink / raw)
  To: Raphael Carvalho
  Cc: Goldwyn Rodrigues, linux-block, linux-fsdevel, darrick.wong,
	Goldwyn Rodrigues

On Fri, Jan 19, 2018 at 02:23:16AM -0200, Raphael Carvalho wrote:
> On Fri, Jan 19, 2018 at 1:57 AM, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Thu, Jan 18, 2018 at 06:57:41PM -0600, Goldwyn Rodrigues wrote:
> > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > >
> > > Since we can return less than count in case of partial direct
> > > writes, remove the ASSERT.
> > >
> > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > ---
> > >  fs/xfs/xfs_file.c | 6 ------
> > >  1 file changed, 6 deletions(-)
> > >
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 8601275cc5e6..8fc4dbf66910 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -590,12 +590,6 @@ xfs_file_dio_aio_write(
> > >       ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
> > >  out:
> > >       xfs_iunlock(ip, iolock);
> > > -
> > > -     /*
> > > -      * No fallback to buffered IO on errors for XFS, direct IO will either
> > > -      * complete fully or fail.
> > > -      */
> > > -     ASSERT(ret < 0 || ret == count);
> > >       return ret;
> > >  }
> >
> > Acked-by: Dave Chinner <dchinner@redhat.com>
> 
> 
> Is this really correct?

Yes.

> Isn't this check with regards to DIO
> submission?

Yes, if there is an error during submission.

But it also checked synchronous IO completion (i.e. error or bytes
written), because iomap_dio_rw() waits for non-AIO DIO to complete
and returns the IO completion status in that case.

> The bytes written is returned in a different asynchronous
> path due to AIO support, no?!

That is correct. For AIO we'll get -EIOCBQUEUED here on successful
submission.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  3:59   ` Dave Chinner
@ 2018-01-19  6:31     ` Darrick J. Wong
  2018-01-19  6:33       ` Al Viro
  0 siblings, 1 reply; 29+ messages in thread
From: Darrick J. Wong @ 2018-01-19  6:31 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Al Viro, Goldwyn Rodrigues, linux-block, linux-fsdevel,
	Goldwyn Rodrigues

On Fri, Jan 19, 2018 at 02:59:51PM +1100, Dave Chinner wrote:
> On Fri, Jan 19, 2018 at 02:13:53AM +0000, Al Viro wrote:
> > On Thu, Jan 18, 2018 at 06:57:40PM -0600, Goldwyn Rodrigues wrote:
> > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > 
> > > In case direct I/O encounters an error midway, it returns the error.
> > > Instead it should be returning the number of bytes transferred so far.
> > > 
> > > Test case for filesystems (with ENOSPC):
> > > 1. Create an almost full filesystem
> > > 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> > > 3. Direct write() with count > sizeof /mnt/lastfile.
> > > 
> > > Result: write() returns -ENOSPC. However, file content has data written
> > > in step 3.
> > > 
> > > This fixes fstest generic/472.
> > 
> > OK...  I can live with that.  What about the XFS side?  It should be
> > a prereq, to avoid bisection hazard; I can throw both into vfs.git,
> > if XFS folks are OK with that.  Objections?
> 
> Going through the VFS tree seesm the best approach to me - it's a
> trivial change. I'm sure Darrick will shout if it's going to be a
> problem, though.

vfs.git is fine, though the second patch to remove the xfs assert should
go first, as Al points out.

For both patches,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  6:31     ` Darrick J. Wong
@ 2018-01-19  6:33       ` Al Viro
  2018-01-20 19:47         ` Al Viro
  0 siblings, 1 reply; 29+ messages in thread
From: Al Viro @ 2018-01-19  6:33 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Dave Chinner, Goldwyn Rodrigues, linux-block, linux-fsdevel,
	Goldwyn Rodrigues

On Thu, Jan 18, 2018 at 10:31:18PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 19, 2018 at 02:59:51PM +1100, Dave Chinner wrote:
> > On Fri, Jan 19, 2018 at 02:13:53AM +0000, Al Viro wrote:
> > > On Thu, Jan 18, 2018 at 06:57:40PM -0600, Goldwyn Rodrigues wrote:
> > > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > > 
> > > > In case direct I/O encounters an error midway, it returns the error.
> > > > Instead it should be returning the number of bytes transferred so far.
> > > > 
> > > > Test case for filesystems (with ENOSPC):
> > > > 1. Create an almost full filesystem
> > > > 2. Create a file, say /mnt/lastfile, until the filesystem is full.
> > > > 3. Direct write() with count > sizeof /mnt/lastfile.
> > > > 
> > > > Result: write() returns -ENOSPC. However, file content has data written
> > > > in step 3.
> > > > 
> > > > This fixes fstest generic/472.
> > > 
> > > OK...  I can live with that.  What about the XFS side?  It should be
> > > a prereq, to avoid bisection hazard; I can throw both into vfs.git,
> > > if XFS folks are OK with that.  Objections?
> > 
> > Going through the VFS tree seesm the best approach to me - it's a
> > trivial change. I'm sure Darrick will shout if it's going to be a
> > problem, though.
> 
> vfs.git is fine, though the second patch to remove the xfs assert should
> go first, as Al points out.
> 
> For both patches,
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

Applied; will be in -next tomorrow morning after the tree I'm putting together
gets through local beating.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  6:33       ` Al Viro
@ 2018-01-20 19:47         ` Al Viro
  2018-01-21  2:57           ` Goldwyn Rodrigues
  0 siblings, 1 reply; 29+ messages in thread
From: Al Viro @ 2018-01-20 19:47 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Dave Chinner, Goldwyn Rodrigues, linux-block, linux-fsdevel,
	Goldwyn Rodrigues

On Fri, Jan 19, 2018 at 06:33:56AM +0000, Al Viro wrote:
> > For both patches,
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Applied; will be in -next tomorrow morning after the tree I'm putting together
> gets through local beating.

FWIW, it consistently blows generic/250 and generic/252.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-19  0:57 [PATCH v5 1/2] Return bytes transferred for partial direct I/O Goldwyn Rodrigues
  2018-01-19  0:57 ` [PATCH 2/2] xfs: remove assert to check bytes returned Goldwyn Rodrigues
  2018-01-19  2:13 ` [PATCH v5 1/2] Return bytes transferred for partial direct I/O Al Viro
@ 2018-01-21  2:11 ` Andi Kleen
  2018-01-21  2:23   ` Goldwyn Rodrigues
  2 siblings, 1 reply; 29+ messages in thread
From: Andi Kleen @ 2018-01-21  2:11 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues

Goldwyn Rodrigues <rgoldwyn@suse.de> writes:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> In case direct I/O encounters an error midway, it returns the error.
> Instead it should be returning the number of bytes transferred so far.

It's likely there's a lot of code in user space that does

     if (write(..., N) < 0) handle error

With your change it would need to be

     if (write(..., N) != N) handle error

How much code is actually doing that?

I can understand it fixes your artifical test suite, but it seems to me your
change has a high potential to break a lot of existing user code
in subtle ways. So it seems to be a bad idea.

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21  2:11 ` Andi Kleen
@ 2018-01-21  2:23   ` Goldwyn Rodrigues
  2018-01-21  3:07     ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-21  2:23 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues



On 01/20/2018 08:11 PM, Andi Kleen wrote:
> Goldwyn Rodrigues <rgoldwyn@suse.de> writes:
> 
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> In case direct I/O encounters an error midway, it returns the error.
>> Instead it should be returning the number of bytes transferred so far.
> 
> It's likely there's a lot of code in user space that does
> 
>      if (write(..., N) < 0) handle error
> 
> With your change it would need to be
> 
>      if (write(..., N) != N) handle error
> 
> How much code is actually doing that?
> 
> I can understand it fixes your artifical test suite, but it seems to me your
> change has a high potential to break a lot of existing user code
> in subtle ways. So it seems to be a bad idea.
> 
> -Andi
> 


Quoting 'man 2 write':

RETURN VALUE
 On  success,  the  number  of bytes written is returned (zero indicates
 nothing was written).  It is not an error if  this  number  is  smaller
 than the number of bytes requested; this may happen for example because
 the disk device was filled.  See also NOTES.

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-20 19:47         ` Al Viro
@ 2018-01-21  2:57           ` Goldwyn Rodrigues
  0 siblings, 0 replies; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-21  2:57 UTC (permalink / raw)
  To: Al Viro, Darrick J. Wong
  Cc: Dave Chinner, linux-block, linux-fsdevel, Goldwyn Rodrigues



On 01/20/2018 01:47 PM, Al Viro wrote:
> On Fri, Jan 19, 2018 at 06:33:56AM +0000, Al Viro wrote:
>>> For both patches,
>>> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
>>
>> Applied; will be in -next tomorrow morning after the tree I'm putting together
>> gets through local beating.
> 
> FWIW, it consistently blows generic/250 and generic/252.
> 

Thanks. I will look into it. It is performing direct writes with dm_error.
Hopefully it should be just updating the md5sum in the output files.

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21  2:23   ` Goldwyn Rodrigues
@ 2018-01-21  3:07     ` Jens Axboe
  2018-01-21 12:06       ` Goldwyn Rodrigues
                         ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Jens Axboe @ 2018-01-21  3:07 UTC (permalink / raw)
  To: Goldwyn Rodrigues, Andi Kleen
  Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues

On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
> 
> 
> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>> Goldwyn Rodrigues <rgoldwyn@suse.de> writes:
>>
>>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>>
>>> In case direct I/O encounters an error midway, it returns the error.
>>> Instead it should be returning the number of bytes transferred so far.
>>
>> It's likely there's a lot of code in user space that does
>>
>>      if (write(..., N) < 0) handle error
>>
>> With your change it would need to be
>>
>>      if (write(..., N) != N) handle error
>>
>> How much code is actually doing that?
>>
>> I can understand it fixes your artifical test suite, but it seems to me your
>> change has a high potential to break a lot of existing user code
>> in subtle ways. So it seems to be a bad idea.
>>
>> -Andi
>>
> 
> 
> Quoting 'man 2 write':
> 
> RETURN VALUE
>  On  success,  the  number  of bytes written is returned (zero indicates
>  nothing was written).  It is not an error if  this  number  is  smaller
>  than the number of bytes requested; this may happen for example because
>  the disk device was filled.  See also NOTES.

You can quote as much man page as you want - Andi is well aware of how
read/write system call works, as I'm sure all of us are, that is not the
issue. The issue is that there are potentially LOTS of applications out
there that do not check for short writes, they do exactly what Andi
speculated above. If you break it with this change, it doesn't matter
what's in the man page. What matters is previous behavior, and that
you are breaking user space. At that point nobody cares what's in the
man page.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21  3:07     ` Jens Axboe
@ 2018-01-21 12:06       ` Goldwyn Rodrigues
  2018-01-22 18:08         ` Andi Kleen
  2018-01-22 19:10       ` Andreas Dilger
  2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  2 siblings, 1 reply; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-21 12:06 UTC (permalink / raw)
  To: Jens Axboe, Andi Kleen
  Cc: linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues



On 01/20/2018 09:07 PM, Jens Axboe wrote:
> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
>>
>>
>> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>>> Goldwyn Rodrigues <rgoldwyn@suse.de> writes:
>>>
>>>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>>>
>>>> In case direct I/O encounters an error midway, it returns the error.
>>>> Instead it should be returning the number of bytes transferred so far.
>>>
>>> It's likely there's a lot of code in user space that does
>>>
>>>      if (write(..., N) < 0) handle error
>>>
>>> With your change it would need to be
>>>
>>>      if (write(..., N) != N) handle error
>>>
>>> How much code is actually doing that?
>>>
>>> I can understand it fixes your artifical test suite, but it seems to me your
>>> change has a high potential to break a lot of existing user code
>>> in subtle ways. So it seems to be a bad idea.
>>>
>>> -Andi
>>>
>>
>>
>> Quoting 'man 2 write':
>>
>> RETURN VALUE
>>  On  success,  the  number  of bytes written is returned (zero indicates
>>  nothing was written).  It is not an error if  this  number  is  smaller
>>  than the number of bytes requested; this may happen for example because
>>  the disk device was filled.  See also NOTES.
> 
> You can quote as much man page as you want - Andi is well aware of how
> read/write system call works, as I'm sure all of us are, that is not the
> issue. The issue is that there are potentially LOTS of applications out
> there that do not check for short writes, they do exactly what Andi
> speculated above. If you break it with this change, it doesn't matter
> what's in the man page. What matters is previous behavior, and that
> you are breaking user space. At that point nobody cares what's in the
> man page.
> 

Agree. So how do you think we should fix this to accommodate userspace
application who did not cater to the fact that write can return short
write, and still be consistent?

The only way I can think is that a DIO write should check early enough
that the write(N) will complete with N bytes without an error. Is it
possible to completely guarantee that?

Leaving it as it is incorrect as quoted in the artificial test case. You
should not be changing the file and yet conveying to the user an error
for the same write() call. It should either be an error and the file
contents are unchanged, or it should be change in contents and the write
size returned.


-- 
Goldwyn

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21 12:06       ` Goldwyn Rodrigues
@ 2018-01-22 18:08         ` Andi Kleen
  0 siblings, 0 replies; 29+ messages in thread
From: Andi Kleen @ 2018-01-22 18:08 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: Jens Axboe, linux-block, linux-fsdevel, darrick.wong, Goldwyn Rodrigues

> The only way I can think is that a DIO write should check early enough
> that the write(N) will complete with N bytes without an error. Is it
> possible to completely guarantee that?

Probably not.

> 
> Leaving it as it is incorrect as quoted in the artificial test case. You
> should not be changing the file and yet conveying to the user an error
> for the same write() call. It should either be an error and the file
> contents are unchanged, or it should be change in contents and the write
> size returned.

There are already lots of syscall cases that don't completely
undo changes on error handling.  Fixing that would likely require
a transaction system higher level in the kernel, or lots of 
complicated code everywhere, which is unlikely to happen. 

Also the complicated code would be difficult to test,
and likely bit rot over time, because it would be only an error handling
path that is infrequently exercised.

So yes it's not nice, but the alternative would be worse.

I think we're best of leaving it as it is now.

Adding some comments/documentation to explain this would be good though.
Perhaps you could submit a patch to the manpage?

-Andi

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21  3:07     ` Jens Axboe
  2018-01-21 12:06       ` Goldwyn Rodrigues
@ 2018-01-22 19:10       ` Andreas Dilger
  2018-01-22 19:13         ` Jens Axboe
  2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  2 siblings, 1 reply; 29+ messages in thread
From: Andreas Dilger @ 2018-01-22 19:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Goldwyn Rodrigues, Andi Kleen, linux-block, linux-fsdevel,
	darrick.wong, Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 1689 bytes --]


> On Jan 20, 2018, at 8:07 PM, Jens Axboe <axboe@kernel.dk> wrote:
> 
> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
>> 
>> 
>> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>>> 
>>> It's likely there's a lot of code in user space that does
>>> 
>>>     if (write(..., N) < 0) handle error
>>> 
>>> With your change it would need to be
>>> 
>>>     if (write(..., N) != N) handle error
>>> 
>>> How much code is actually doing that?
>>> 
>>> I can understand it fixes your artifical test suite, but it seems to me your
>>> change has a high potential to break a lot of existing user code
>>> in subtle ways. So it seems to be a bad idea.
>> 
>> 
>> Quoting 'man 2 write':
>> 
>> RETURN VALUE
>> On  success,  the  number  of bytes written is returned (zero indicates
>> nothing was written).  It is not an error if  this  number  is  smaller
>> than the number of bytes requested; this may happen for example because
>> the disk device was filled.  See also NOTES.
> 
> You can quote as much man page as you want - Andi is well aware of how
> read/write system call works, as I'm sure all of us are, that is not the
> issue. The issue is that there are potentially LOTS of applications out
> there that do not check for short writes, they do exactly what Andi
> speculated above. If you break it with this change, it doesn't matter
> what's in the man page. What matters is previous behavior, and that
> you are breaking user space. At that point nobody cares what's in the
> man page.

Applications that don't handle partial writes are already broken with
buffered I/O, this change doesn't really make them more broken.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-22 19:10       ` Andreas Dilger
@ 2018-01-22 19:13         ` Jens Axboe
  2018-01-23  3:18           ` Goldwyn Rodrigues
  0 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2018-01-22 19:13 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Goldwyn Rodrigues, Andi Kleen, linux-block, linux-fsdevel,
	darrick.wong, Goldwyn Rodrigues

On 1/22/18 12:10 PM, Andreas Dilger wrote:
> 
>> On Jan 20, 2018, at 8:07 PM, Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
>>>
>>>
>>> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>>>>
>>>> It's likely there's a lot of code in user space that does
>>>>
>>>>     if (write(..., N) < 0) handle error
>>>>
>>>> With your change it would need to be
>>>>
>>>>     if (write(..., N) != N) handle error
>>>>
>>>> How much code is actually doing that?
>>>>
>>>> I can understand it fixes your artifical test suite, but it seems to me your
>>>> change has a high potential to break a lot of existing user code
>>>> in subtle ways. So it seems to be a bad idea.
>>>
>>>
>>> Quoting 'man 2 write':
>>>
>>> RETURN VALUE
>>> On  success,  the  number  of bytes written is returned (zero indicates
>>> nothing was written).  It is not an error if  this  number  is  smaller
>>> than the number of bytes requested; this may happen for example because
>>> the disk device was filled.  See also NOTES.
>>
>> You can quote as much man page as you want - Andi is well aware of how
>> read/write system call works, as I'm sure all of us are, that is not the
>> issue. The issue is that there are potentially LOTS of applications out
>> there that do not check for short writes, they do exactly what Andi
>> speculated above. If you break it with this change, it doesn't matter
>> what's in the man page. What matters is previous behavior, and that
>> you are breaking user space. At that point nobody cares what's in the
>> man page.
> 
> Applications that don't handle partial writes are already broken with
> buffered I/O, this change doesn't really make them more broken.

Not disagreeing that they are broken, of course they are. But if you've
been doing direct IO and not seeing short writes, then this could break
your application. And if that's the case, it doesn't really help to say
that their application was "already broken". I'd hate for a kernel
upgrade to break them.

I do wish we could make the change, and maybe we can. But it probably
needs some safe guard proc entry to toggle the behavior, something we
can drop in a few years when we're confident it won't break real
applications.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-21  3:07     ` Jens Axboe
@ 2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  2018-01-22 19:10       ` Andreas Dilger
  2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  2 siblings, 0 replies; 29+ messages in thread
From: Elliott, Robert (Persistent Memory) @ 2018-01-22 22:25 UTC (permalink / raw)
  To: Jens Axboe, Goldwyn Rodrigues, Andi Kleen
  Cc: linux-block, linux-fsdevel, darrick.wong, Rodrigues, Goldwyn

DQoNCj4gT24gMS8yMC8xOCA3OjIzIFBNLCBHb2xkd3luIFJvZHJpZ3VlcyB3cm90ZToNCj4gPiBP
biAwMS8yMC8yMDE4IDA4OjExIFBNLCBBbmRpIEtsZWVuIHdyb3RlOg0KPiA+PiBHb2xkd3luIFJv
ZHJpZ3VlcyA8cmdvbGR3eW5Ac3VzZS5kZT4gd3JpdGVzOg0KPiA+Pj4gRnJvbTogR29sZHd5biBS
b2RyaWd1ZXMgPHJnb2xkd3luQHN1c2UuY29tPg0KPiA+Pj4gSW4gY2FzZSBkaXJlY3QgSS9PIGVu
Y291bnRlcnMgYW4gZXJyb3IgbWlkd2F5LCBpdCByZXR1cm5zIHRoZSBlcnJvci4NCj4gPj4+IElu
c3RlYWQgaXQgc2hvdWxkIGJlIHJldHVybmluZyB0aGUgbnVtYmVyIG9mIGJ5dGVzIHRyYW5zZmVy
cmVkIHNvIGZhci4NCj4gPj4NCj4gPj4gSXQncyBsaWtlbHkgdGhlcmUncyBhIGxvdCBvZiBjb2Rl
IGluIHVzZXIgc3BhY2UgdGhhdCBkb2VzDQo+ID4+DQo+ID4+ICAgICAgaWYgKHdyaXRlKC4uLiwg
TikgPCAwKSBoYW5kbGUgZXJyb3INCj4gPj4NCj4gPj4gV2l0aCB5b3VyIGNoYW5nZSBpdCB3b3Vs
ZCBuZWVkIHRvIGJlDQo+ID4+DQo+ID4+ICAgICAgaWYgKHdyaXRlKC4uLiwgTikgIT0gTikgaGFu
ZGxlIGVycm9yDQo+ID4+DQo+ID4+IEhvdyBtdWNoIGNvZGUgaXMgYWN0dWFsbHkgZG9pbmcgdGhh
dD8NCj4gPj4NCj4gPj4gSSBjYW4gdW5kZXJzdGFuZCBpdCBmaXhlcyB5b3VyIGFydGlmaWNhbCB0
ZXN0IHN1aXRlLCBidXQgaXQgc2VlbXMgdG8gbWUgeW91cg0KPiA+PiBjaGFuZ2UgaGFzIGEgaGln
aCBwb3RlbnRpYWwgdG8gYnJlYWsgYSBsb3Qgb2YgZXhpc3RpbmcgdXNlciBjb2RlDQo+ID4+IGlu
IHN1YnRsZSB3YXlzLiBTbyBpdCBzZWVtcyB0byBiZSBhIGJhZCBpZGVhLg0KPiA+Pg0KPiA+PiAt
QW5kaQ0KPiA+DQo+ID4gUXVvdGluZyAnbWFuIDIgd3JpdGUnOg0KPiA+DQo+ID4gUkVUVVJOIFZB
TFVFDQo+ID4gIE9uICBzdWNjZXNzLCAgdGhlICBudW1iZXIgIG9mIGJ5dGVzIHdyaXR0ZW4gaXMg
cmV0dXJuZWQgKHplcm8gaW5kaWNhdGVzDQo+ID4gIG5vdGhpbmcgd2FzIHdyaXR0ZW4pLiAgSXQg
aXMgbm90IGFuIGVycm9yIGlmICB0aGlzICBudW1iZXIgIGlzIHNtYWxsZXINCj4gPiAgdGhhbiB0
aGUgbnVtYmVyIG9mIGJ5dGVzIHJlcXVlc3RlZDsgdGhpcyBtYXkgaGFwcGVuIGZvciBleGFtcGxl
IGJlY2F1c2UNCj4gPiAgdGhlIGRpc2sgZGV2aWNlIHdhcyBmaWxsZWQuICBTZWUgYWxzbyBOT1RF
Uy4NCj4gDQo+IFlvdSBjYW4gcXVvdGUgYXMgbXVjaCBtYW4gcGFnZSBhcyB5b3Ugd2FudCAtIEFu
ZGkgaXMgd2VsbCBhd2FyZSBvZiBob3cNCj4gcmVhZC93cml0ZSBzeXN0ZW0gY2FsbCB3b3Jrcywg
YXMgSSdtIHN1cmUgYWxsIG9mIHVzIGFyZSwgdGhhdCBpcyBub3QgdGhlDQo+IGlzc3VlLiBUaGUg
aXNzdWUgaXMgdGhhdCB0aGVyZSBhcmUgcG90ZW50aWFsbHkgTE9UUyBvZiBhcHBsaWNhdGlvbnMg
b3V0DQo+IHRoZXJlIHRoYXQgZG8gbm90IGNoZWNrIGZvciBzaG9ydCB3cml0ZXMsIHRoZXkgZG8g
ZXhhY3RseSB3aGF0IEFuZGkNCj4gc3BlY3VsYXRlZCBhYm92ZS4gSWYgeW91IGJyZWFrIGl0IHdp
dGggdGhpcyBjaGFuZ2UsIGl0IGRvZXNuJ3QgbWF0dGVyDQo+IHdoYXQncyBpbiB0aGUgbWFuIHBh
Z2UuIFdoYXQgbWF0dGVycyBpcyBwcmV2aW91cyBiZWhhdmlvciwgYW5kIHRoYXQNCj4geW91IGFy
ZSBicmVha2luZyB1c2VyIHNwYWNlLiBBdCB0aGF0IHBvaW50IG5vYm9keSBjYXJlcyB3aGF0J3Mg
aW4gdGhlDQo+IG1hbiBwYWdlLg0KIA0KDQpmaW8gZW5naW5lcy9zZy5jIGZpb19zZ2lvX3J3X2Rv
aW8oKSBoYXMgdGhhdCBwYXR0ZXJuOg0KDQoJcmV0ID0gd3JpdGUoZi0+ZmQsIGhkciwgc2l6ZW9m
KCpoZHIpKTsNCglpZiAocmV0IDwgMCkNCgkJcmV0dXJuIHJldDsNCgkuLi4NCglyZXR1cm4gRklP
X1FfUVVFVUVEOyAgIFt3aGljaCBpcyAxXQ0KDQphbHRob3VnaCB0aGVyZSBtaWdodCBiZSBzcGVj
aWFsIGNpcmN1bXN0YW5jZXMgZm9yIHRoZSBzZyBpbnRlcmZhY2UNCm1ha2luZyB0aGF0IHNhZmUu
DQoNCg0KDQotLS0NClJvYmVydCBFbGxpb3R0LCBIUEUgUGVyc2lzdGVudCBNZW1vcnkNCg0KDQo=

^ permalink raw reply	[flat|nested] 29+ messages in thread

* RE: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
@ 2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  0 siblings, 0 replies; 29+ messages in thread
From: Elliott, Robert (Persistent Memory) @ 2018-01-22 22:25 UTC (permalink / raw)
  To: Jens Axboe, Goldwyn Rodrigues, Andi Kleen
  Cc: linux-block, linux-fsdevel, darrick.wong, Rodrigues, Goldwyn



> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
> > On 01/20/2018 08:11 PM, Andi Kleen wrote:
> >> Goldwyn Rodrigues <rgoldwyn@suse.de> writes:
> >>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> >>> In case direct I/O encounters an error midway, it returns the error.
> >>> Instead it should be returning the number of bytes transferred so far.
> >>
> >> It's likely there's a lot of code in user space that does
> >>
> >>      if (write(..., N) < 0) handle error
> >>
> >> With your change it would need to be
> >>
> >>      if (write(..., N) != N) handle error
> >>
> >> How much code is actually doing that?
> >>
> >> I can understand it fixes your artifical test suite, but it seems to me your
> >> change has a high potential to break a lot of existing user code
> >> in subtle ways. So it seems to be a bad idea.
> >>
> >> -Andi
> >
> > Quoting 'man 2 write':
> >
> > RETURN VALUE
> >  On  success,  the  number  of bytes written is returned (zero indicates
> >  nothing was written).  It is not an error if  this  number  is smaller
> >  than the number of bytes requested; this may happen for example because
> >  the disk device was filled.  See also NOTES.
> 
> You can quote as much man page as you want - Andi is well aware of how
> read/write system call works, as I'm sure all of us are, that is not the
> issue. The issue is that there are potentially LOTS of applications out
> there that do not check for short writes, they do exactly what Andi
> speculated above. If you break it with this change, it doesn't matter
> what's in the man page. What matters is previous behavior, and that
> you are breaking user space. At that point nobody cares what's in the
> man page.
 

fio engines/sg.c fio_sgio_rw_doio() has that pattern:

	ret = write(f->fd, hdr, sizeof(*hdr));
	if (ret < 0)
		return ret;
	...
	return FIO_Q_QUEUED;   [which is 1]

although there might be special circumstances for the sg interface
making that safe.



---
Robert Elliott, HPE Persistent Memory



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
  (?)
@ 2018-01-22 23:14         ` Jens Axboe
  2018-01-22 23:24             ` Bart Van Assche
  -1 siblings, 1 reply; 29+ messages in thread
From: Jens Axboe @ 2018-01-22 23:14 UTC (permalink / raw)
  To: Elliott, Robert (Persistent Memory), Goldwyn Rodrigues, Andi Kleen
  Cc: linux-block, linux-fsdevel, darrick.wong, Rodrigues, Goldwyn

On 1/22/18 3:25 PM, Elliott, Robert (Persistent Memory) wrote:
> 
> 
>> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
>>> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>>>> Goldwyn Rodrigues <rgoldwyn@suse.de> writes:
>>>>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>>>> In case direct I/O encounters an error midway, it returns the error.
>>>>> Instead it should be returning the number of bytes transferred so far.
>>>>
>>>> It's likely there's a lot of code in user space that does
>>>>
>>>>      if (write(..., N) < 0) handle error
>>>>
>>>> With your change it would need to be
>>>>
>>>>      if (write(..., N) != N) handle error
>>>>
>>>> How much code is actually doing that?
>>>>
>>>> I can understand it fixes your artifical test suite, but it seems to me your
>>>> change has a high potential to break a lot of existing user code
>>>> in subtle ways. So it seems to be a bad idea.
>>>>
>>>> -Andi
>>>
>>> Quoting 'man 2 write':
>>>
>>> RETURN VALUE
>>>  On  success,  the  number  of bytes written is returned (zero indicates
>>>  nothing was written).  It is not an error if  this  number  is smaller
>>>  than the number of bytes requested; this may happen for example because
>>>  the disk device was filled.  See also NOTES.
>>
>> You can quote as much man page as you want - Andi is well aware of how
>> read/write system call works, as I'm sure all of us are, that is not the
>> issue. The issue is that there are potentially LOTS of applications out
>> there that do not check for short writes, they do exactly what Andi
>> speculated above. If you break it with this change, it doesn't matter
>> what's in the man page. What matters is previous behavior, and that
>> you are breaking user space. At that point nobody cares what's in the
>> man page.
>  
> 
> fio engines/sg.c fio_sgio_rw_doio() has that pattern:
> 
> 	ret = write(f->fd, hdr, sizeof(*hdr));
> 	if (ret < 0)
> 		return ret;
> 	...
> 	return FIO_Q_QUEUED;   [which is 1]
> 
> although there might be special circumstances for the sg interface
> making that safe.

That's for SG scsi direct IO, I don't think that supports partial
IO since it's sending raw SCSI commands.

For the regular libaio or sync IO system calls, fio of course checks
and handles short IOs correctly. It even logs if it got any.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-22 23:14         ` Jens Axboe
@ 2018-01-22 23:24             ` Bart Van Assche
  0 siblings, 0 replies; 29+ messages in thread
From: Bart Van Assche @ 2018-01-22 23:24 UTC (permalink / raw)
  To: ak, rgoldwyn, axboe, elliott
  Cc: darrick.wong, linux-block, RGoldwyn, linux-fsdevel

T24gTW9uLCAyMDE4LTAxLTIyIGF0IDE2OjE0IC0wNzAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBP
biAxLzIyLzE4IDM6MjUgUE0sIEVsbGlvdHQsIFJvYmVydCAoUGVyc2lzdGVudCBNZW1vcnkpIHdy
b3RlOg0KPiA+IGZpbyBlbmdpbmVzL3NnLmMgZmlvX3NnaW9fcndfZG9pbygpIGhhcyB0aGF0IHBh
dHRlcm46DQo+ID4gDQo+ID4gCXJldCA9IHdyaXRlKGYtPmZkLCBoZHIsIHNpemVvZigqaGRyKSk7
DQo+ID4gCWlmIChyZXQgPCAwKQ0KPiA+IAkJcmV0dXJuIHJldDsNCj4gPiAJLi4uDQo+ID4gCXJl
dHVybiBGSU9fUV9RVUVVRUQ7ICAgW3doaWNoIGlzIDFdDQo+ID4gDQo+ID4gYWx0aG91Z2ggdGhl
cmUgbWlnaHQgYmUgc3BlY2lhbCBjaXJjdW1zdGFuY2VzIGZvciB0aGUgc2cgaW50ZXJmYWNlDQo+
ID4gbWFraW5nIHRoYXQgc2FmZS4NCj4gDQo+IFRoYXQncyBmb3IgU0cgc2NzaSBkaXJlY3QgSU8s
IEkgZG9uJ3QgdGhpbmsgdGhhdCBzdXBwb3J0cyBwYXJ0aWFsDQo+IElPIHNpbmNlIGl0J3Mgc2Vu
ZGluZyByYXcgU0NTSSBjb21tYW5kcy4NCj4gDQo+IEZvciB0aGUgcmVndWxhciBsaWJhaW8gb3Ig
c3luYyBJTyBzeXN0ZW0gY2FsbHMsIGZpbyBvZiBjb3Vyc2UgY2hlY2tzDQo+IGFuZCBoYW5kbGVz
IHNob3J0IElPcyBjb3JyZWN0bHkuIEl0IGV2ZW4gbG9ncyBpZiBpdCBnb3QgYW55Lg0KDQpUaGUg
ZW50aXJlIGZpb19zZ2lvX3J3X2RvaW8oKSBmdW5jdGlvbiBpcyBhcyBmb2xsb3dzOg0KDQpzdGF0
aWMgaW50IGZpb19zZ2lvX3J3X2RvaW8oc3RydWN0IGZpb19maWxlICpmLCBzdHJ1Y3QgaW9fdSAq
aW9fdSwgaW50IGRvX3N5bmMpDQp7DQoJc3RydWN0IHNnX2lvX2hkciAqaGRyID0gJmlvX3UtPmhk
cjsNCglpbnQgcmV0Ow0KDQoJcmV0ID0gd3JpdGUoZi0+ZmQsIGhkciwgc2l6ZW9mKCpoZHIpKTsN
CglpZiAocmV0IDwgMCkNCgkJcmV0dXJuIHJldDsNCg0KCWlmIChkb19zeW5jKSB7DQoJCXJldCA9
IHJlYWQoZi0+ZmQsIGhkciwgc2l6ZW9mKCpoZHIpKTsNCgkJaWYgKHJldCA8IDApDQoJCQlyZXR1
cm4gcmV0Ow0KDQoJCS8qIHJlY29yZCBpZiBhbiBpbyBlcnJvciBvY2N1cnJlZCAqLw0KCQlpZiAo
aGRyLT5pbmZvICYgU0dfSU5GT19DSEVDSykNCgkJCWlvX3UtPmVycm9yID0gRUlPOw0KDQoJCXJl
dHVybiBGSU9fUV9DT01QTEVURUQ7DQoJfQ0KDQoJcmV0dXJuIEZJT19RX1FVRVVFRDsNCn0NCg0K
SSB0aGluayB0aGUgJ3Jlc2lkJyBtZW1iZXIgb2YgdGhlIHN0cnVjdCBzZ19pb19oZHIgdGhhdCBp
cyBwcm92aWRlZCBieSB0aGUNCnNnX2lvIGtlcm5lbCBkcml2ZXIgYXMgYSByZXNwb25zZSByZXBy
ZXNlbnRzIHRoZSBudW1iZXIgb2YgYnl0ZXMgdGhhdCBoYXMgbm90DQpiZWVuIHdyaXR0ZW4uIFNv
IGl0IHNob3VsZCBiZSBwb3NzaWJsZSB0byByZWNvZ25pemUgYW5kIGhhbmRsZSBzaG9ydCBJL09z
IGluDQp0aGF0IGZ1bmN0aW9uLiBGcm9tIGluY2x1ZGUvc2NzaS9zZy5oOg0KDQogICAgaW50IHJl
c2lkOyAgICAgICAgICAgICAgICAgIC8qIFtvXSBkeGZlcl9sZW4gLSBhY3R1YWxfdHJhbnNmZXJy
ZWQgKi8NCg0KQmFydC4=

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
@ 2018-01-22 23:24             ` Bart Van Assche
  0 siblings, 0 replies; 29+ messages in thread
From: Bart Van Assche @ 2018-01-22 23:24 UTC (permalink / raw)
  To: ak, rgoldwyn, axboe, elliott
  Cc: darrick.wong, linux-block, RGoldwyn, linux-fsdevel

On Mon, 2018-01-22 at 16:14 -0700, Jens Axboe wrote:
> On 1/22/18 3:25 PM, Elliott, Robert (Persistent Memory) wrote:
> > fio engines/sg.c fio_sgio_rw_doio() has that pattern:
> > 
> > 	ret = write(f->fd, hdr, sizeof(*hdr));
> > 	if (ret < 0)
> > 		return ret;
> > 	...
> > 	return FIO_Q_QUEUED;   [which is 1]
> > 
> > although there might be special circumstances for the sg interface
> > making that safe.
> 
> That's for SG scsi direct IO, I don't think that supports partial
> IO since it's sending raw SCSI commands.
> 
> For the regular libaio or sync IO system calls, fio of course checks
> and handles short IOs correctly. It even logs if it got any.

The entire fio_sgio_rw_doio() function is as follows:

static int fio_sgio_rw_doio(struct fio_file *f, struct io_u *io_u, int do_sync)
{
	struct sg_io_hdr *hdr = &io_u->hdr;
	int ret;

	ret = write(f->fd, hdr, sizeof(*hdr));
	if (ret < 0)
		return ret;

	if (do_sync) {
		ret = read(f->fd, hdr, sizeof(*hdr));
		if (ret < 0)
			return ret;

		/* record if an io error occurred */
		if (hdr->info & SG_INFO_CHECK)
			io_u->error = EIO;

		return FIO_Q_COMPLETED;
	}

	return FIO_Q_QUEUED;
}

I think the 'resid' member of the struct sg_io_hdr that is provided by the
sg_io kernel driver as a response represents the number of bytes that has not
been written. So it should be possible to recognize and handle short I/Os in
that function. From include/scsi/sg.h:

    int resid;                  /* [o] dxfer_len - actual_transferred */

Bart.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-22 23:24             ` Bart Van Assche
  (?)
@ 2018-01-22 23:27             ` Jens Axboe
  -1 siblings, 0 replies; 29+ messages in thread
From: Jens Axboe @ 2018-01-22 23:27 UTC (permalink / raw)
  To: Bart Van Assche, ak, rgoldwyn, elliott
  Cc: darrick.wong, linux-block, RGoldwyn, linux-fsdevel

On 1/22/18 4:24 PM, Bart Van Assche wrote:
> On Mon, 2018-01-22 at 16:14 -0700, Jens Axboe wrote:
>> On 1/22/18 3:25 PM, Elliott, Robert (Persistent Memory) wrote:
>>> fio engines/sg.c fio_sgio_rw_doio() has that pattern:
>>>
>>> 	ret = write(f->fd, hdr, sizeof(*hdr));
>>> 	if (ret < 0)
>>> 		return ret;
>>> 	...
>>> 	return FIO_Q_QUEUED;   [which is 1]
>>>
>>> although there might be special circumstances for the sg interface
>>> making that safe.
>>
>> That's for SG scsi direct IO, I don't think that supports partial
>> IO since it's sending raw SCSI commands.
>>
>> For the regular libaio or sync IO system calls, fio of course checks
>> and handles short IOs correctly. It even logs if it got any.
> 
> The entire fio_sgio_rw_doio() function is as follows:
> 
> static int fio_sgio_rw_doio(struct fio_file *f, struct io_u *io_u, int do_sync)
> {
> 	struct sg_io_hdr *hdr = &io_u->hdr;
> 	int ret;
> 
> 	ret = write(f->fd, hdr, sizeof(*hdr));
> 	if (ret < 0)
> 		return ret;
> 
> 	if (do_sync) {
> 		ret = read(f->fd, hdr, sizeof(*hdr));
> 		if (ret < 0)
> 			return ret;
> 
> 		/* record if an io error occurred */
> 		if (hdr->info & SG_INFO_CHECK)
> 			io_u->error = EIO;
> 
> 		return FIO_Q_COMPLETED;
> 	}
> 
> 	return FIO_Q_QUEUED;
> }
> 
> I think the 'resid' member of the struct sg_io_hdr that is provided by the
> sg_io kernel driver as a response represents the number of bytes that has not
> been written. So it should be possible to recognize and handle short I/Os in
> that function. From include/scsi/sg.h:
> 
>     int resid;                  /* [o] dxfer_len - actual_transferred */

Yeah that's true.

But let's not side track the discussion here, fio+sg isn't relevant to the
topic at hand. Fio and other engines would be (like libaio, or sync and
friends), but those handle short/partial IOs just fine.

That said, if someone wants to submit a patch for the sg engine, I would
of course take it.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-22 19:13         ` Jens Axboe
@ 2018-01-23  3:18           ` Goldwyn Rodrigues
  2018-01-23  3:28             ` Jens Axboe
  0 siblings, 1 reply; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-23  3:18 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Andreas Dilger, Goldwyn Rodrigues, Andi Kleen, linux-block,
	linux-fsdevel, darrick.wong



On 01/22/2018 01:13 PM, Jens Axboe wrote:
> On 1/22/18 12:10 PM, Andreas Dilger wrote:
>>
>>> On Jan 20, 2018, at 8:07 PM, Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 1/20/18 7:23 PM, Goldwyn Rodrigues wrote:
>>>>
>>>>
>>>> On 01/20/2018 08:11 PM, Andi Kleen wrote:
>>>>>
>>>>> It's likely there's a lot of code in user space that does
>>>>>
>>>>>     if (write(..., N) < 0) handle error
>>>>>
>>>>> With your change it would need to be
>>>>>
>>>>>     if (write(..., N) != N) handle error
>>>>>
>>>>> How much code is actually doing that?
>>>>>
>>>>> I can understand it fixes your artifical test suite, but it seems to me your
>>>>> change has a high potential to break a lot of existing user code
>>>>> in subtle ways. So it seems to be a bad idea.
>>>>
>>>>
>>>> Quoting 'man 2 write':
>>>>
>>>> RETURN VALUE
>>>> On  success,  the  number  of bytes written is returned (zero indicates
>>>> nothing was written).  It is not an error if  this  number  is  smaller
>>>> than the number of bytes requested; this may happen for example because
>>>> the disk device was filled.  See also NOTES.
>>>
>>> You can quote as much man page as you want - Andi is well aware of how
>>> read/write system call works, as I'm sure all of us are, that is not the
>>> issue. The issue is that there are potentially LOTS of applications out
>>> there that do not check for short writes, they do exactly what Andi
>>> speculated above. If you break it with this change, it doesn't matter
>>> what's in the man page. What matters is previous behavior, and that
>>> you are breaking user space. At that point nobody cares what's in the
>>> man page.
>>
>> Applications that don't handle partial writes are already broken with
>> buffered I/O, this change doesn't really make them more broken.
> 
> Not disagreeing that they are broken, of course they are. But if you've
> been doing direct IO and not seeing short writes, then this could break
> your application. And if that's the case, it doesn't really help to say

I started exploring short writes and how direct I/O behaves on errors
after your suggestion to incorporate short writes in a previous
conversation [1]. I started looking into how midway errors with direct
I/O's are handled now and I stumble upon this issue.

> that their application was "already broken". I'd hate for a kernel
> upgrade to break them.
> 
> I do wish we could make the change, and maybe we can. But it probably
> needs some safe guard proc entry to toggle the behavior, something we
> can drop in a few years when we're confident it won't break real
> applications.

Assuming we call it /proc/sys/fs/dio_short_writes(better names/paths?),
should it be enabled or disabled by default?

[1] https://www.spinics.net/lists/linux-block/msg15910.html



-- 
Goldwyn

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-23  3:18           ` Goldwyn Rodrigues
@ 2018-01-23  3:28             ` Jens Axboe
  2018-01-23  6:35               ` Matthew Wilcox
  2018-01-24  0:19               ` Andreas Dilger
  0 siblings, 2 replies; 29+ messages in thread
From: Jens Axboe @ 2018-01-23  3:28 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: Andreas Dilger, Goldwyn Rodrigues, Andi Kleen, linux-block,
	linux-fsdevel, darrick.wong

On 1/22/18 8:18 PM, Goldwyn Rodrigues wrote:
>> that their application was "already broken". I'd hate for a kernel
>> upgrade to break them.
>>
>> I do wish we could make the change, and maybe we can. But it probably
>> needs some safe guard proc entry to toggle the behavior, something we
>> can drop in a few years when we're confident it won't break real
>> applications.
> 
> Assuming we call it /proc/sys/fs/dio_short_writes(better names/paths?),
> should it be enabled or disabled by default?

I'd enable it by default, if not, you are never going to be able to
remove it because you'll have no confidence that anyone actually flipped
the switch and ran with it enabled. The point of having it there and on
by default would be that if something does break, people have the option
of turning it off and restoring the previous behavior, without having to
change the kernel.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-23  3:28             ` Jens Axboe
@ 2018-01-23  6:35               ` Matthew Wilcox
  2018-01-25 18:01                 ` Goldwyn Rodrigues
  2018-01-24  0:19               ` Andreas Dilger
  1 sibling, 1 reply; 29+ messages in thread
From: Matthew Wilcox @ 2018-01-23  6:35 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Goldwyn Rodrigues, Andreas Dilger, Goldwyn Rodrigues, Andi Kleen,
	linux-block, linux-fsdevel, darrick.wong

On Mon, Jan 22, 2018 at 08:28:54PM -0700, Jens Axboe wrote:
> On 1/22/18 8:18 PM, Goldwyn Rodrigues wrote:
> >> that their application was "already broken". I'd hate for a kernel
> >> upgrade to break them.
> >>
> >> I do wish we could make the change, and maybe we can. But it probably
> >> needs some safe guard proc entry to toggle the behavior, something we
> >> can drop in a few years when we're confident it won't break real
> >> applications.
> > 
> > Assuming we call it /proc/sys/fs/dio_short_writes(better names/paths?),
> > should it be enabled or disabled by default?
> 
> I'd enable it by default, if not, you are never going to be able to
> remove it because you'll have no confidence that anyone actually flipped
> the switch and ran with it enabled. The point of having it there and on
> by default would be that if something does break, people have the option
> of turning it off and restoring the previous behavior, without having to
> change the kernel.

I think it's an opt-in prctl that's something like PRCTL_SHORT_WRITES_ALLOWED.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-23  3:28             ` Jens Axboe
  2018-01-23  6:35               ` Matthew Wilcox
@ 2018-01-24  0:19               ` Andreas Dilger
  1 sibling, 0 replies; 29+ messages in thread
From: Andreas Dilger @ 2018-01-24  0:19 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Goldwyn Rodrigues, Goldwyn Rodrigues, Andi Kleen, linux-block,
	linux-fsdevel, darrick.wong

[-- Attachment #1: Type: text/plain, Size: 1087 bytes --]

On Jan 22, 2018, at 8:28 PM, Jens Axboe <axboe@kernel.dk> wrote:
> 
> On 1/22/18 8:18 PM, Goldwyn Rodrigues wrote:
>>> that their application was "already broken". I'd hate for a kernel
>>> upgrade to break them.
>>> 
>>> I do wish we could make the change, and maybe we can. But it probably
>>> needs some safe guard proc entry to toggle the behavior, something we
>>> can drop in a few years when we're confident it won't break real
>>> applications.
>> 
>> Assuming we call it /proc/sys/fs/dio_short_writes(better names/paths?),
>> should it be enabled or disabled by default?
> 
> I'd enable it by default, if not, you are never going to be able to
> remove it because you'll have no confidence that anyone actually flipped
> the switch and ran with it enabled. The point of having it there and on
> by default would be that if something does break, people have the option
> of turning it off and restoring the previous behavior, without having to
> change the kernel.

... or fixing their application. :-)

But, yes, I agree that this should be on by default.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v5 1/2] Return bytes transferred for partial direct I/O
  2018-01-23  6:35               ` Matthew Wilcox
@ 2018-01-25 18:01                 ` Goldwyn Rodrigues
  0 siblings, 0 replies; 29+ messages in thread
From: Goldwyn Rodrigues @ 2018-01-25 18:01 UTC (permalink / raw)
  To: Matthew Wilcox, Jens Axboe
  Cc: Andreas Dilger, Goldwyn Rodrigues, Andi Kleen, linux-block,
	linux-fsdevel, darrick.wong



On 01/23/2018 12:35 AM, Matthew Wilcox wrote:
> On Mon, Jan 22, 2018 at 08:28:54PM -0700, Jens Axboe wrote:
>> On 1/22/18 8:18 PM, Goldwyn Rodrigues wrote:
>>>> that their application was "already broken". I'd hate for a kernel
>>>> upgrade to break them.
>>>>
>>>> I do wish we could make the change, and maybe we can. But it probably
>>>> needs some safe guard proc entry to toggle the behavior, something we
>>>> can drop in a few years when we're confident it won't break real
>>>> applications.
>>>
>>> Assuming we call it /proc/sys/fs/dio_short_writes(better names/paths?),
>>> should it be enabled or disabled by default?
>>
>> I'd enable it by default, if not, you are never going to be able to
>> remove it because you'll have no confidence that anyone actually flipped
>> the switch and ran with it enabled. The point of having it there and on
>> by default would be that if something does break, people have the option
>> of turning it off and restoring the previous behavior, without having to
>> change the kernel.
> 
> I think it's an opt-in prctl that's something like PRCTL_SHORT_WRITES_ALLOWED.
> 

I cannot decide where to stick this bit in the task_struct.
current->flags/PF_* does not seem right. Don't want to start a new
fs_flags field. Suggestions?

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2018-01-25 18:01 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-19  0:57 [PATCH v5 1/2] Return bytes transferred for partial direct I/O Goldwyn Rodrigues
2018-01-19  0:57 ` [PATCH 2/2] xfs: remove assert to check bytes returned Goldwyn Rodrigues
2018-01-19  3:57   ` Dave Chinner
2018-01-19  4:23     ` Raphael Carvalho
2018-01-19  4:51       ` Dave Chinner
2018-01-19  2:13 ` [PATCH v5 1/2] Return bytes transferred for partial direct I/O Al Viro
2018-01-19  3:59   ` Dave Chinner
2018-01-19  6:31     ` Darrick J. Wong
2018-01-19  6:33       ` Al Viro
2018-01-20 19:47         ` Al Viro
2018-01-21  2:57           ` Goldwyn Rodrigues
2018-01-21  2:11 ` Andi Kleen
2018-01-21  2:23   ` Goldwyn Rodrigues
2018-01-21  3:07     ` Jens Axboe
2018-01-21 12:06       ` Goldwyn Rodrigues
2018-01-22 18:08         ` Andi Kleen
2018-01-22 19:10       ` Andreas Dilger
2018-01-22 19:13         ` Jens Axboe
2018-01-23  3:18           ` Goldwyn Rodrigues
2018-01-23  3:28             ` Jens Axboe
2018-01-23  6:35               ` Matthew Wilcox
2018-01-25 18:01                 ` Goldwyn Rodrigues
2018-01-24  0:19               ` Andreas Dilger
2018-01-22 22:25       ` Elliott, Robert (Persistent Memory)
2018-01-22 22:25         ` Elliott, Robert (Persistent Memory)
2018-01-22 23:14         ` Jens Axboe
2018-01-22 23:24           ` Bart Van Assche
2018-01-22 23:24             ` Bart Van Assche
2018-01-22 23:27             ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.