All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nfs: track writeback errors with errseq_t
@ 2017-07-20 19:42 Jeff Layton
  2017-08-25 17:59 ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-07-20 19:42 UTC (permalink / raw)
  To: trond.myklebust, anna.schumaker; +Cc: linux-nfs, linux-fsdevel, NeilBrown

From: Jeff Layton <jlayton@redhat.com>

There is some ambiguity in nfs about how writeback errors are tracked.

For instance, nfs_pageio_add_request calls mapping_set_error when the
add fails, but we track errors that occur after adding the request
with a dedicated int error in the open context.

Now that we have better infrastructure for the vfs layer, this
latter int is unnecessary. Just have nfs_context_set_write_error set
the error in the mapping when one occurs.

Have NFS use file_write_and_wait_range to initiate and wait on writeback
of the data, and then check again after issuing the commit(s).

With this, we also don't need to pay attention to the ERROR_WRITE
flag for reporting, and can just clear it to indicate to subsequent
writers that they should try to go asynchronous again.

In nfs_page_async_flush, sample the error before locking and joining
the requests, and check for errors since that point.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/file.c          | 24 +++++++++++-------------
 fs/nfs/inode.c         |  3 +--
 fs/nfs/write.c         |  8 ++++++--
 include/linux/nfs_fs.h |  1 -
 4 files changed, 18 insertions(+), 18 deletions(-)

I have a baling wire and duct tape solution for testing this with
xfstests (using iptables REJECT targets and soft mounts). This seems to
make nfs do the right thing.

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5713eb32a45e..15d3c6faafd3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = file_inode(file);
-	int have_error, do_resend, status;
-	int ret = 0;
+	int do_resend, status;
+	int ret;
 
 	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
-	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	status = nfs_commit_inode(inode, FLUSH_SYNC);
-	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	if (have_error) {
-		ret = xchg(&ctx->error, 0);
-		if (ret)
-			goto out;
-	}
-	if (status < 0) {
+	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	ret = nfs_commit_inode(inode, FLUSH_SYNC);
+
+	/* Recheck and advance after the commit */
+	status = file_check_and_advance_wb_err(file);
+	if (!ret)
 		ret = status;
+	if (ret)
 		goto out;
-	}
+
 	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
 	if (do_resend)
 		ret = -EAGAIN;
@@ -247,7 +245,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_nfs_fsync_enter(inode);
 
 	do {
-		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+		ret = file_write_and_wait_range(file, start, end);
 		if (ret != 0)
 			break;
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 109279d6d91b..c48f673c5cc9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -900,7 +900,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
 	ctx->state = NULL;
 	ctx->mode = f_mode;
 	ctx->flags = 0;
-	ctx->error = 0;
 	ctx->flock_owner = (fl_owner_t)filp;
 	nfs_init_lock_context(&ctx->lock_context);
 	ctx->lock_context.open_context = ctx;
@@ -1009,7 +1008,7 @@ void nfs_file_clear_open_context(struct file *filp)
 		 * We fatal error on write before. Try to writeback
 		 * every page again.
 		 */
-		if (ctx->error < 0)
+		if (filemap_check_wb_err(inode->i_mapping, filp->f_wb_err))
 			invalidate_inode_pages2(inode->i_mapping);
 		filp->private_data = NULL;
 		spin_lock(&inode->i_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..c2fcaf07cd24 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -149,7 +149,9 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 {
-	ctx->error = error;
+	struct inode *inode = d_inode(ctx->dentry);
+
+	mapping_set_error(inode->i_mapping, error);
 	smp_wmb();
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
@@ -628,6 +630,8 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 {
 	struct nfs_page *req;
 	int ret = 0;
+	struct address_space *mapping = page_file_mapping(page);
+	errseq_t since = filemap_sample_wb_err(mapping);
 
 	req = nfs_lock_and_join_requests(page, nonblock);
 	if (!req)
@@ -641,7 +645,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 
 	ret = 0;
 	/* If there is a fatal error that covers this write, just exit */
-	if (nfs_error_is_fatal_on_server(req->wb_context->error))
+	if (nfs_error_is_fatal_on_server(filemap_check_wb_err(mapping, since)))
 		goto out_launder;
 
 	if (!nfs_pageio_add_request(pgio, req)) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e52cc55ac300..a96b0bd52b32 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -77,7 +77,6 @@ struct nfs_open_context {
 #define NFS_CONTEXT_RESEND_WRITES	(1)
 #define NFS_CONTEXT_BAD			(2)
 #define NFS_CONTEXT_UNLOCK	(3)
-	int error;
 
 	struct list_head list;
 	struct nfs4_threshold	*mdsthreshold;
-- 
2.13.3

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-07-20 19:42 [PATCH] nfs: track writeback errors with errseq_t Jeff Layton
@ 2017-08-25 17:59 ` Jeff Layton
  2017-08-27 23:24   ` NeilBrown
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-08-25 17:59 UTC (permalink / raw)
  To: Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel, NeilBrown

On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> From: Jeff Layton <jlayton@redhat.com>
> 
> There is some ambiguity in nfs about how writeback errors are tracked.
> 
> For instance, nfs_pageio_add_request calls mapping_set_error when the
> add fails, but we track errors that occur after adding the request
> with a dedicated int error in the open context.
> 
> Now that we have better infrastructure for the vfs layer, this
> latter int is now unnecessary. Just have nfs_context_set_write_error set
> the error in the mapping when one occurs.
> 
> Have NFS use file_write_and_wait_range to initiate and wait on writeback
> of the data, and then check again after issuing the commit(s).
> 
> With this, we also don't need to pay attention to the ERROR_WRITE
> flag for reporting, and just clear it to indicate to subsequent
> writers that they should try to go asynchronous again.
> 
> In nfs_page_async_flush, sample the error before locking and joining
> the requests, and check for errors since that point.
> 
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  fs/nfs/file.c          | 24 +++++++++++-------------
>  fs/nfs/inode.c         |  3 +--
>  fs/nfs/write.c         |  8 ++++++--
>  include/linux/nfs_fs.h |  1 -
>  4 files changed, 18 insertions(+), 18 deletions(-)
> 
> I have a baling wire and duct tape solution for testing this with
> xfstests (using iptables REJECT targets and soft mounts). This seems to
> make nfs do the right thing.
> 
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 5713eb32a45e..15d3c6faafd3 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>  {
>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>  	struct inode *inode = file_inode(file);
> -	int have_error, do_resend, status;
> -	int ret = 0;
> +	int do_resend, status;
> +	int ret;
>  
>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>  
>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -	if (have_error) {
> -		ret = xchg(&ctx->error, 0);
> -		if (ret)
> -			goto out;
> -	}
> -	if (status < 0) {
> +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> +
> +	/* Recheck and advance after the commit */
> +	status = file_check_and_advance_wb_err(file);
> +	if (!ret)
>  		ret = status;
> +	if (ret)
>  		goto out;
> -	}
> +
>  	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>  	if (do_resend)
>  		ret = -EAGAIN;
> @@ -247,7 +245,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>  	trace_nfs_fsync_enter(inode);
>  
>  	do {
> -		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> +		ret = file_write_and_wait_range(file, start, end);
>  		if (ret != 0)
>  			break;
>  		ret = nfs_file_fsync_commit(file, start, end, datasync);
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 109279d6d91b..c48f673c5cc9 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -900,7 +900,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
>  	ctx->state = NULL;
>  	ctx->mode = f_mode;
>  	ctx->flags = 0;
> -	ctx->error = 0;
>  	ctx->flock_owner = (fl_owner_t)filp;
>  	nfs_init_lock_context(&ctx->lock_context);
>  	ctx->lock_context.open_context = ctx;
> @@ -1009,7 +1008,7 @@ void nfs_file_clear_open_context(struct file *filp)
>  		 * We fatal error on write before. Try to writeback
>  		 * every page again.
>  		 */
> -		if (ctx->error < 0)
> +		if (filemap_check_wb_err(inode->i_mapping, filp->f_wb_err))
>  			invalidate_inode_pages2(inode->i_mapping);
>  		filp->private_data = NULL;
>  		spin_lock(&inode->i_lock);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b1af5dee5e0a..c2fcaf07cd24 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -149,7 +149,9 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>  
>  static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>  {
> -	ctx->error = error;
> +	struct inode *inode = d_inode(ctx->dentry);
> +
> +	mapping_set_error(inode->i_mapping, error);
>  	smp_wmb();
>  	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>  }
> @@ -628,6 +630,8 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
>  {
>  	struct nfs_page *req;
>  	int ret = 0;
> +	struct address_space *mapping = page_file_mapping(page);
> +	errseq_t since = filemap_sample_wb_err(mapping);
>  
>  	req = nfs_lock_and_join_requests(page, nonblock);
>  	if (!req)
> @@ -641,7 +645,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
>  
>  	ret = 0;
>  	/* If there is a fatal error that covers this write, just exit */
> -	if (nfs_error_is_fatal_on_server(req->wb_context->error))
> +	if (nfs_error_is_fatal_on_server(filemap_check_wb_err(mapping, since)))
>  		goto out_launder;
>  
>  	if (!nfs_pageio_add_request(pgio, req)) {
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index e52cc55ac300..a96b0bd52b32 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -77,7 +77,6 @@ struct nfs_open_context {
>  #define NFS_CONTEXT_RESEND_WRITES	(1)
>  #define NFS_CONTEXT_BAD			(2)
>  #define NFS_CONTEXT_UNLOCK	(3)
> -	int error;
>  
>  	struct list_head list;
>  	struct nfs4_threshold	*mdsthreshold;

Anna and Trond,

Ping? I haven't seen any word on this patch, and it hasn't shown up in
any branches. Do you have concerns with it, or is this good to go for
v4.14?

Thanks,
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-08-25 17:59 ` Jeff Layton
@ 2017-08-27 23:24   ` NeilBrown
  2017-08-28 11:47     ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: NeilBrown @ 2017-08-27 23:24 UTC (permalink / raw)
  To: Jeff Layton, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 3232 bytes --]

On Fri, Aug 25 2017, Jeff Layton wrote:

> On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> From: Jeff Layton <jlayton@redhat.com>
>> 
>> There is some ambiguity in nfs about how writeback errors are tracked.
>> 
>> For instance, nfs_pageio_add_request calls mapping_set_error when the
>> add fails, but we track errors that occur after adding the request
>> with a dedicated int error in the open context.
>> 
>> Now that we have better infrastructure for the vfs layer, this
>> latter int is now unnecessary. Just have nfs_context_set_write_error set
>> the error in the mapping when one occurs.
>> 
>> Have NFS use file_write_and_wait_range to initiate and wait on writeback
>> of the data, and then check again after issuing the commit(s).
>> 
>> With this, we also don't need to pay attention to the ERROR_WRITE
>> flag for reporting, and just clear it to indicate to subsequent
>> writers that they should try to go asynchronous again.
>> 
>> In nfs_page_async_flush, sample the error before locking and joining
>> the requests, and check for errors since that point.
>> 
>> Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> ---
>>  fs/nfs/file.c          | 24 +++++++++++-------------
>>  fs/nfs/inode.c         |  3 +--
>>  fs/nfs/write.c         |  8 ++++++--
>>  include/linux/nfs_fs.h |  1 -
>>  4 files changed, 18 insertions(+), 18 deletions(-)
>> 
>> I have a baling wire and duct tape solution for testing this with
>> xfstests (using iptables REJECT targets and soft mounts). This seems to
>> make nfs do the right thing.
>> 
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index 5713eb32a45e..15d3c6faafd3 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>>  {
>>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>>  	struct inode *inode = file_inode(file);
>> -	int have_error, do_resend, status;
>> -	int ret = 0;
>> +	int do_resend, status;
>> +	int ret;
>>  
>>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>>  
>>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -	if (have_error) {
>> -		ret = xchg(&ctx->error, 0);
>> -		if (ret)
>> -			goto out;
>> -	}
>> -	if (status < 0) {
>> +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> +
>> +	/* Recheck and advance after the commit */
>> +	status = file_check_and_advance_wb_err(file);

This change makes the code inconsistent with the comment above the
function, which still references ctx->error.  The intent of the comment
is still correct, but the details have changed.

Also, there is a call to mapping_set_error() in
nfs_pageio_add_request().
I wonder if that should be changed to
  nfs_context_set_write_error(req->wb_context, desc->pg_error)
??

Otherwise, patch looks good to me.
Thanks,
NeilBrown


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-08-27 23:24   ` NeilBrown
@ 2017-08-28 11:47     ` Jeff Layton
  2017-08-29  1:23       ` NeilBrown
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-08-28 11:47 UTC (permalink / raw)
  To: NeilBrown, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> On Fri, Aug 25 2017, Jeff Layton wrote:
> 
> > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > From: Jeff Layton <jlayton@redhat.com>
> > > 
> > > There is some ambiguity in nfs about how writeback errors are
> > > tracked.
> > > 
> > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > the
> > > add fails, but we track errors that occur after adding the
> > > request
> > > with a dedicated int error in the open context.
> > > 
> > > Now that we have better infrastructure for the vfs layer, this
> > > latter int is now unnecessary. Just have
> > > nfs_context_set_write_error set
> > > the error in the mapping when one occurs.
> > > 
> > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > writeback
> > > of the data, and then check again after issuing the commit(s).
> > > 
> > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > flag for reporting, and just clear it to indicate to subsequent
> > > writers that they should try to go asynchronous again.
> > > 
> > > In nfs_page_async_flush, sample the error before locking and
> > > joining
> > > the requests, and check for errors since that point.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > ---
> > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > >  fs/nfs/inode.c         |  3 +--
> > >  fs/nfs/write.c         |  8 ++++++--
> > >  include/linux/nfs_fs.h |  1 -
> > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > 
> > > I have a baling wire and duct tape solution for testing this with
> > > xfstests (using iptables REJECT targets and soft mounts). This
> > > seems to
> > > make nfs do the right thing.
> > > 
> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > index 5713eb32a45e..15d3c6faafd3 100644
> > > --- a/fs/nfs/file.c
> > > +++ b/fs/nfs/file.c
> > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > loff_t start, loff_t end, int datasync)
> > >  {
> > >  	struct nfs_open_context *ctx =
> > > nfs_file_open_context(file);
> > >  	struct inode *inode = file_inode(file);
> > > -	int have_error, do_resend, status;
> > > -	int ret = 0;
> > > +	int do_resend, status;
> > > +	int ret;
> > >  
> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > datasync);
> > >  
> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > >  	do_resend =
> > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > &ctx->flags);
> > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > >flags);
> > > -	if (have_error) {
> > > -		ret = xchg(&ctx->error, 0);
> > > -		if (ret)
> > > -			goto out;
> > > -	}
> > > -	if (status < 0) {
> > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > +
> > > +	/* Recheck and advance after the commit */
> > > +	status = file_check_and_advance_wb_err(file);
> 
> This change makes the code inconsistent with the comment above the
> function, which still references ctx->error.  The intent of the
> comment
> is still correct, but the details have changed.
> 

Good catch. I'll fix that up in a respin.

> Also, there is a call to mapping_set_error() in
> nfs_pageio_add_request().
> I wonder if that should be changed to
>   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> ??
> 

Trickier question...

I'm not quite sure what semantics we're looking for with
NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
synchronous, but I'm not quite sure why it gets cleared the way it
does. It's set on any error but cleared before issuing a commit.

I added a similar flag to Ceph inodes recently, but only clear it when
a write succeeds. Wouldn't that make more sense here as well?

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-08-28 11:47     ` Jeff Layton
@ 2017-08-29  1:23       ` NeilBrown
  2017-08-29 10:54         ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: NeilBrown @ 2017-08-29  1:23 UTC (permalink / raw)
  To: Jeff Layton, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 5412 bytes --]

On Mon, Aug 28 2017, Jeff Layton wrote:

> On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> On Fri, Aug 25 2017, Jeff Layton wrote:
>> 
>> > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > From: Jeff Layton <jlayton@redhat.com>
>> > > 
>> > > There is some ambiguity in nfs about how writeback errors are
>> > > tracked.
>> > > 
>> > > For instance, nfs_pageio_add_request calls mapping_set_error when
>> > > the
>> > > add fails, but we track errors that occur after adding the
>> > > request
>> > > with a dedicated int error in the open context.
>> > > 
>> > > Now that we have better infrastructure for the vfs layer, this
>> > > latter int is now unnecessary. Just have
>> > > nfs_context_set_write_error set
>> > > the error in the mapping when one occurs.
>> > > 
>> > > Have NFS use file_write_and_wait_range to initiate and wait on
>> > > writeback
>> > > of the data, and then check again after issuing the commit(s).
>> > > 
>> > > With this, we also don't need to pay attention to the ERROR_WRITE
>> > > flag for reporting, and just clear it to indicate to subsequent
>> > > writers that they should try to go asynchronous again.
>> > > 
>> > > In nfs_page_async_flush, sample the error before locking and
>> > > joining
>> > > the requests, and check for errors since that point.
>> > > 
>> > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > ---
>> > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > >  fs/nfs/inode.c         |  3 +--
>> > >  fs/nfs/write.c         |  8 ++++++--
>> > >  include/linux/nfs_fs.h |  1 -
>> > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > 
>> > > I have a baling wire and duct tape solution for testing this with
>> > > xfstests (using iptables REJECT targets and soft mounts). This
>> > > seems to
>> > > make nfs do the right thing.
>> > > 
>> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > --- a/fs/nfs/file.c
>> > > +++ b/fs/nfs/file.c
>> > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
>> > > loff_t start, loff_t end, int datasync)
>> > >  {
>> > >  	struct nfs_open_context *ctx =
>> > > nfs_file_open_context(file);
>> > >  	struct inode *inode = file_inode(file);
>> > > -	int have_error, do_resend, status;
>> > > -	int ret = 0;
>> > > +	int do_resend, status;
>> > > +	int ret;
>> > >  
>> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
>> > > datasync);
>> > >  
>> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > >  	do_resend =
>> > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > &ctx->flags);
>> > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > >flags);
>> > > -	if (have_error) {
>> > > -		ret = xchg(&ctx->error, 0);
>> > > -		if (ret)
>> > > -			goto out;
>> > > -	}
>> > > -	if (status < 0) {
>> > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > +
>> > > +	/* Recheck and advance after the commit */
>> > > +	status = file_check_and_advance_wb_err(file);
>> 
>> This change makes the code inconsistent with the comment above the
>> function, which still references ctx->error.  The intent of the
>> comment
>> is still correct, but the details have changed.
>> 
>
> Good catch. I'll fix that up in a respin.
>
>> Also, there is a call to mapping_set_error() in
>> nfs_pageio_add_request().
>> I wonder if that should be changed to
>>   nfs_context_set_write_error(req->wb_context, desc->pg_error)
>> ??
>> 
>
> Trickier question...
>
> I'm not quite sure what semantics we're looking for with
> NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> synchronous, but I'm not quite sure why it gets cleared the way it
> does. It's set on any error but cleared before issuing a commit.
>
> I added a similar flag to Ceph inodes recently, but only clear it when
> a write succeeds. Wouldn't that make more sense here as well?

It is a bit hard to wrap one's mind around.

In the original code (commit 7b159fc18d417980) it looks like:
 - test-and-clear bit
 - write and sync
 - test-bit

This does, I think, seem safer than "clear on successful write" as the
writes could complete out-of-order and I wouldn't be surprised if the
unsuccessful ones completed with an error before the successful one -
particularly with an error like EDQUOT.

However the current code does the writes before the test-and-clear, and
only does the commit afterwards.  That makes it less clear why the
current sequence is a good idea.

However ... nfs_file_fsync_commit() is only called if
filemap_write_and_wait_range() returned with success, so we only clear
the flag after successful writes(?).

Oh....
This patch from me:

Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")

seems to have been reverted by

Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")

which probably isn't good.  It appears that this code is very fragile
and easily broken.
Maybe we need to work out exactly what is required, and document it - so
we can stop breaking it.
Or maybe we need some unit tests.....

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-08-29  1:23       ` NeilBrown
@ 2017-08-29 10:54         ` Jeff Layton
  2017-09-07  3:37           ` NeilBrown
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-08-29 10:54 UTC (permalink / raw)
  To: NeilBrown, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> On Mon, Aug 28 2017, Jeff Layton wrote:
> 
> > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > 
> > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > 
> > > > > There is some ambiguity in nfs about how writeback errors are
> > > > > tracked.
> > > > > 
> > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > > > the
> > > > > add fails, but we track errors that occur after adding the
> > > > > request
> > > > > with a dedicated int error in the open context.
> > > > > 
> > > > > Now that we have better infrastructure for the vfs layer, this
> > > > > latter int is now unnecessary. Just have
> > > > > nfs_context_set_write_error set
> > > > > the error in the mapping when one occurs.
> > > > > 
> > > > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > > > writeback
> > > > > of the data, and then check again after issuing the commit(s).
> > > > > 
> > > > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > > > flag for reporting, and just clear it to indicate to subsequent
> > > > > writers that they should try to go asynchronous again.
> > > > > 
> > > > > In nfs_page_async_flush, sample the error before locking and
> > > > > joining
> > > > > the requests, and check for errors since that point.
> > > > > 
> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > ---
> > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > >  fs/nfs/inode.c         |  3 +--
> > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > >  include/linux/nfs_fs.h |  1 -
> > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > 
> > > > > I have a baling wire and duct tape solution for testing this with
> > > > > xfstests (using iptables REJECT targets and soft mounts). This
> > > > > seems to
> > > > > make nfs do the right thing.
> > > > > 
> > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > --- a/fs/nfs/file.c
> > > > > +++ b/fs/nfs/file.c
> > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > > > loff_t start, loff_t end, int datasync)
> > > > >  {
> > > > >  	struct nfs_open_context *ctx =
> > > > > nfs_file_open_context(file);
> > > > >  	struct inode *inode = file_inode(file);
> > > > > -	int have_error, do_resend, status;
> > > > > -	int ret = 0;
> > > > > +	int do_resend, status;
> > > > > +	int ret;
> > > > >  
> > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > > > datasync);
> > > > >  
> > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > >  	do_resend =
> > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > &ctx->flags);
> > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > flags);
> > > > > 
> > > > > -	if (have_error) {
> > > > > -		ret = xchg(&ctx->error, 0);
> > > > > -		if (ret)
> > > > > -			goto out;
> > > > > -	}
> > > > > -	if (status < 0) {
> > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > +
> > > > > +	/* Recheck and advance after the commit */
> > > > > +	status = file_check_and_advance_wb_err(file);
> > > 
> > > This change makes the code inconsistent with the comment above the
> > > function, which still references ctx->error.  The intent of the
> > > comment
> > > is still correct, but the details have changed.
> > > 
> > 
> > Good catch. I'll fix that up in a respin.
> > 
> > > Also, there is a call to mapping_set_error() in
> > > nfs_pageio_add_request().
> > > I wonder if that should be changed to
> > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> > > ??
> > > 
> > 
> > Trickier question...
> > 
> > I'm not quite sure what semantics we're looking for with
> > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > synchronous, but I'm not quite sure why it gets cleared the way it
> > does. It's set on any error but cleared before issuing a commit.
> > 
> > I added a similar flag to Ceph inodes recently, but only clear it when
> > a write succeeds. Wouldn't that make more sense here as well?
> 
> It is a bit hard to wrap one's mind around.
> 
> In the original code (commit 7b159fc18d417980) it looks like:
>  - test-and-clear bit
>  - write and sync
>  - test-bit
> 
> This does, I think, seem safer than "clear on successful write" as the
> writes could complete out-of-order and I wouldn't be surprised if the
> unsuccessful ones completed with an error before the successful one -
> particularly with an error like EDQUOT.
> 
> However the current code does the writes before the test-and-clear, and
> only does the commit afterwards.  That makes it less clear why the
> current sequence is a good idea.
> 
> However ... nfs_file_fsync_commit() is only called if
> filemap_write_and_wait_range() returned with success, so we only clear
> the flag after successful writes(?).
> 
> Oh....
> This patch from me:
> 
> Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
> 
> seems to have been reverted by
> 
> Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
> 
> which probably isn't good.  It appears that this code is very fragile
> and easily broken.
> Maybe we need to work out exactly what is required, and document it - so
> we can stop breaking it.
> Or maybe we need some unit tests.....
> 

Yes, laying out what's necessary for this would be very helpful. We
clearly want to set the flag when an error occurs. Under what
circumstances should we be clearing it?

I'm not sure we can really do much better than clearing it on a
successful write. With Ceph, my thinking was that this is just a hint to
the write submission mechanism and we generally aren't too concerned if a
few slip past in either direction.
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-08-29 10:54         ` Jeff Layton
@ 2017-09-07  3:37           ` NeilBrown
  2017-09-07 11:35             ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: NeilBrown @ 2017-09-07  3:37 UTC (permalink / raw)
  To: Jeff Layton, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 8507 bytes --]

On Tue, Aug 29 2017, Jeff Layton wrote:

> On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
>> On Mon, Aug 28 2017, Jeff Layton wrote:
>> 
>> > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> > > On Fri, Aug 25 2017, Jeff Layton wrote:
>> > > 
>> > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > > > From: Jeff Layton <jlayton@redhat.com>
>> > > > > 
>> > > > > There is some ambiguity in nfs about how writeback errors are
>> > > > > tracked.
>> > > > > 
>> > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
>> > > > > the
>> > > > > add fails, but we track errors that occur after adding the
>> > > > > request
>> > > > > with a dedicated int error in the open context.
>> > > > > 
>> > > > > Now that we have better infrastructure for the vfs layer, this
>> > > > > latter int is now unnecessary. Just have
>> > > > > nfs_context_set_write_error set
>> > > > > the error in the mapping when one occurs.
>> > > > > 
>> > > > > Have NFS use file_write_and_wait_range to initiate and wait on
>> > > > > writeback
>> > > > > of the data, and then check again after issuing the commit(s).
>> > > > > 
>> > > > > With this, we also don't need to pay attention to the ERROR_WRITE
>> > > > > flag for reporting, and just clear it to indicate to subsequent
>> > > > > writers that they should try to go asynchronous again.
>> > > > > 
>> > > > > In nfs_page_async_flush, sample the error before locking and
>> > > > > joining
>> > > > > the requests, and check for errors since that point.
>> > > > > 
>> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > > ---
>> > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > > > >  fs/nfs/inode.c         |  3 +--
>> > > > >  fs/nfs/write.c         |  8 ++++++--
>> > > > >  include/linux/nfs_fs.h |  1 -
>> > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > > > 
>> > > > > I have a baling wire and duct tape solution for testing this with
>> > > > > xfstests (using iptables REJECT targets and soft mounts). This
>> > > > > seems to
>> > > > > make nfs do the right thing.
>> > > > > 
>> > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > > > --- a/fs/nfs/file.c
>> > > > > +++ b/fs/nfs/file.c
>> > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
>> > > > > loff_t start, loff_t end, int datasync)
>> > > > >  {
>> > > > >  	struct nfs_open_context *ctx =
>> > > > > nfs_file_open_context(file);
>> > > > >  	struct inode *inode = file_inode(file);
>> > > > > -	int have_error, do_resend, status;
>> > > > > -	int ret = 0;
>> > > > > +	int do_resend, status;
>> > > > > +	int ret;
>> > > > >  
>> > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
>> > > > > datasync);
>> > > > >  
>> > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > > > >  	do_resend =
>> > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > > > &ctx->flags);
>> > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > flags);
>> > > > > 
>> > > > > -	if (have_error) {
>> > > > > -		ret = xchg(&ctx->error, 0);
>> > > > > -		if (ret)
>> > > > > -			goto out;
>> > > > > -	}
>> > > > > -	if (status < 0) {
>> > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > +
>> > > > > +	/* Recheck and advance after the commit */
>> > > > > +	status = file_check_and_advance_wb_err(file);
>> > > 
>> > > This change makes the code inconsistent with the comment above the
>> > > function, which still references ctx->error.  The intent of the
>> > > comment
>> > > is still correct, but the details have changed.
>> > > 
>> > 
>> > Good catch. I'll fix that up in a respin.
>> > 
>> > > Also, there is a call to mapping_set_error() in
>> > > nfs_pageio_add_request().
>> > > I wonder if that should be changed to
>> > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
>> > > ??
>> > > 
>> > 
>> > Trickier question...
>> > 
>> > I'm not quite sure what semantics we're looking for with
>> > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
>> > synchronous, but I'm not quite sure why it gets cleared the way it
>> > does. It's set on any error but cleared before issuing a commit.
>> > 
>> > I added a similar flag to Ceph inodes recently, but only clear it when
>> > a write succeeds. Wouldn't that make more sense here as well?
>> 
>> It is a bit hard to wrap one's mind around.
>> 
>> In the original code (commit 7b159fc18d417980) it looks like:
>>  - test-and-clear bit
>>  - write and sync
>>  - test-bit
>> 
>> This does, I think, seem safer than "clear on successful write" as the
>> writes could complete out-of-order and I wouldn't be surprised if the
>> unsuccessful ones completed with an error before the successful one -
>> particularly with an error like EDQUOT.
>> 
>> However the current code does the writes before the test-and-clear, and
>> only does the commit afterwards.  That makes it less clear why the
>> current sequence is a good idea.
>> 
>> However ... nfs_file_fsync_commit() is only called if
>> filemap_write_and_wait_range() returned with success, so we only clear
>> the flag after successful writes(?).
>> 
>> Oh....
>> This patch from me:
>> 
>> Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
>> 
>> seems to have been reverted by
>> 
>> Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
>> 
>> which probably isn't good.  It appears that this code is very fragile
>> and easily broken.

On further investigation, I think the problem that I fixed and then we
reintroduced will be fixed again - more permanently - by your patch.
The root problem is that nfs keeps error codes in a different way to the
MM core.  By unifying those, the problem goes away.
(The specific problem is that writes which hit EDQUOT on the server can
 report EIO on the client).


>> Maybe we need to work out exactly what is required, and document it - so
>> we can stop breaking it.
>> Or maybe we need some unit tests.....
>> 
>
> Yes, laying out what's necessary for this would be very helpful. We
> clearly want to set the flag when an error occurs. Under what
> circumstances should we be clearing it?

Well.... looking back at  7b159fc18d417980f57ae which introduced the
flag, prior to that write errors (ctx->error) were only reported by
nfs_file_flush and nfs_fsync, so only on close() and fsync().

After that commit, setting the flag would mean that errors could be
returned by 'write'.  So clearing as part of returning the error makes
perfect sense.

As long as the error gets recorded, and gets returned when it is
recorded, it doesn't much matter when the flag is cleared.  With your
patches we don't need the flag any more to get errors reliably reported.

Leaving the flag set means that writes go more slowly - we don't get
a large queue of background writes building up but destined for failure.
This is the main point made in the commit message when the flag was
introduced.
Of course, by the time we first get an error there could already
be a large queue, so we probably want that to drain completely before
allowing async writes again.

It might make sense to have 2 flags.  One which says "writes should be
synchronous", another that says "There was an error recently".
We clear the error flag before calling nfs_fsync, and if it is still
clear afterwards, we clear the sync-writes flag.  Maybe that is more
complex than needed though.

I'm leaning towards your suggestion that it doesn't matter very much
when it gets cleared, and clearing it on any successful write is
simplest.

So I'm still in favor of using nfs_context_set_write_error() in
nfs_pageio_add_request(), primarily because it is most consistent - we
don't need exceptions.

Thanks,
NeilBrown


>
> I'm not sure we can really do much better than clearing it on a
> successful write. With Ceph, was that this is just a hint to the write
> submission mechanism and we generally aren't too concerned if a few slip
> past in either direction.
> -- 
> Jeff Layton <jlayton@redhat.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-07  3:37           ` NeilBrown
@ 2017-09-07 11:35             ` Jeff Layton
  2017-09-07 14:54                 ` Trond Myklebust
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-09-07 11:35 UTC (permalink / raw)
  To: NeilBrown, Jeff Layton, trond.myklebust, anna.schumaker
  Cc: linux-nfs, linux-fsdevel

On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> On Tue, Aug 29 2017, Jeff Layton wrote:
> 
> > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > 
> > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > 
> > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > 
> > > > > > > There is some ambiguity in nfs about how writeback errors are
> > > > > > > tracked.
> > > > > > > 
> > > > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > > > > > the
> > > > > > > add fails, but we track errors that occur after adding the
> > > > > > > request
> > > > > > > with a dedicated int error in the open context.
> > > > > > > 
> > > > > > > Now that we have better infrastructure for the vfs layer, this
> > > > > > > latter int is now unnecessary. Just have
> > > > > > > nfs_context_set_write_error set
> > > > > > > the error in the mapping when one occurs.
> > > > > > > 
> > > > > > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > > > > > writeback
> > > > > > > of the data, and then check again after issuing the commit(s).
> > > > > > > 
> > > > > > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > > > > > flag for reporting, and just clear it to indicate to subsequent
> > > > > > > writers that they should try to go asynchronous again.
> > > > > > > 
> > > > > > > In nfs_page_async_flush, sample the error before locking and
> > > > > > > joining
> > > > > > > the requests, and check for errors since that point.
> > > > > > > 
> > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > ---
> > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > 
> > > > > > > I have a baling wire and duct tape solution for testing this with
> > > > > > > xfstests (using iptables REJECT targets and soft mounts). This
> > > > > > > seems to
> > > > > > > make nfs do the right thing.
> > > > > > > 
> > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > --- a/fs/nfs/file.c
> > > > > > > +++ b/fs/nfs/file.c
> > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > >  {
> > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > nfs_file_open_context(file);
> > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > -	int have_error, do_resend, status;
> > > > > > > -	int ret = 0;
> > > > > > > +	int do_resend, status;
> > > > > > > +	int ret;
> > > > > > >  
> > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > > > > > datasync);
> > > > > > >  
> > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > >  	do_resend =
> > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > &ctx->flags);
> > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > flags);
> > > > > > > 
> > > > > > > -	if (have_error) {
> > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > -		if (ret)
> > > > > > > -			goto out;
> > > > > > > -	}
> > > > > > > -	if (status < 0) {
> > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > +
> > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > 
> > > > > This change makes the code inconsistent with the comment above the
> > > > > function, which still references ctx->error.  The intent of the
> > > > > comment
> > > > > is still correct, but the details have changed.
> > > > > 
> > > > 
> > > > Good catch. I'll fix that up in a respin.
> > > > 
> > > > > Also, there is a call to mapping_set_error() in
> > > > > nfs_pageio_add_request().
> > > > > I wonder if that should be changed to
> > > > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> > > > > ??
> > > > > 
> > > > 
> > > > Trickier question...
> > > > 
> > > > I'm not quite sure what semantics we're looking for with
> > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > synchronous, but I'm not quite sure why it gets cleared the way it
> > > > does. It's set on any error but cleared before issuing a commit.
> > > > 
> > > > I added a similar flag to Ceph inodes recently, but only clear it when
> > > > a write succeeds. Wouldn't that make more sense here as well?
> > > 
> > > It is a bit hard to wrap one's mind around.
> > > 
> > > In the original code (commit 7b159fc18d417980) it looks like:
> > >  - test-and-clear bit
> > >  - write and sync
> > >  - test-bit
> > > 
> > > This does, I think, seem safer than "clear on successful write" as the
> > > writes could complete out-of-order and I wouldn't be surprised if the
> > > unsuccessful ones completed with an error before the successful one -
> > > particularly with an error like EDQUOT.
> > > 
> > > However the current code does the writes before the test-and-clear, and
> > > only does the commit afterwards.  That makes it less clear why the
> > > current sequence is a good idea.
> > > 
> > > However ... nfs_file_fsync_commit() is only called if
> > > filemap_write_and_wait_range() returned with success, so we only clear
> > > the flag after successful writes(?).
> > > 
> > > Oh....
> > > This patch from me:
> > > 
> > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
> > > 
> > > seems to have been reverted by
> > > 
> > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
> > > 
> > > which probably isn't good.  It appears that this code is very fragile
> > > and easily broken.
> 
> On further investigation, I think the problem that I fixed and then we
> reintroduced will be fixed again - more permanently - by your patch.
> The root problem is that nfs keeps error codes in a different way to the
> MM core.  By unifying those, the problem goes.
> (The specific problem is that writes which hit EDQUOT on the server can
>  report EIO on the client).
> 
> 
> > > Maybe we need to work out exactly what is required, and document it - so
> > > we can stop breaking it.
> > > Or maybe we need some unit tests.....
> > > 
> > 
> > Yes, laying out what's necessary for this would be very helpful. We
> > clearly want to set the flag when an error occurs. Under what
> > circumstances should we be clearing it?
> 
> Well.... looking back at  7b159fc18d417980f57ae which introduced the
> flag, prior to that write errors (ctx->error) were only reported by
> nfs_file_flush and nfs_fsync, so only one close() and fsync().
> 
> After that commit, setting the flag would mean that errors could be
> returned by 'write'.  So clearing as part of returning the error makes
> perfect sense.
> 
> As long as the error gets recorded, and gets returned when it is
> recorded, it doesn't much matter when the flag is cleared.  With your
> patches we don't need to flag any more to get errors reliably reported.
> 
> Leaving the flag set means that writes go more slowly - we don't get
> large queue of background rights building up but destined for failure.
> This is the main point made in the comment message when the flag was
> introduced.
> Of course, by the time we first get an error there could already
> by a large queue, so we probably want that to drain completely before
> allowing async writes again.
> 
> It might make sense to have 2 flags.  One which says "writes should be
> synchronous", another that says "There was an error recently".
> We clear the error flag before calling nfs_fsync, and if it is still
> clear afterwards, we clear the sync-writes flag.  Maybe that is more
> complex than needed though.
> 
> I'm leaning towards your suggestion that it doesn't matter very much
> when it gets cleared, and clearing it on any successful write is
> simplest.
> 
> So I'm still in favor of using nfs_context_set_write_error() in
> nfs_pageio_add_request(), primarily because it is most consistent - we
> don't need exceptions.

Thanks for taking a closer look. I can easily make the change above, and
I do think that keeping this mechanism as simple as possible will make
it easier to prevent bitrot.

That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx is a
per open file description object.

Is that the correct way to track this? All of the ctx's will share the
same inode. If we're getting writeback errors for one context, it's
quite likely that we'll be seeing them via others.

I suppose the counterargument is when we have things like expiring krb5
tickets. Write failures via an expiring set of creds may have no effect
on writeback via other creds.

Still, I think a per-inode flag might make more sense here.

Thoughts?
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-07 11:35             ` Jeff Layton
@ 2017-09-07 14:54                 ` Trond Myklebust
  0 siblings, 0 replies; 33+ messages in thread
From: Trond Myklebust @ 2017-09-07 14:54 UTC (permalink / raw)
  To: anna.schumaker, jlayton, neilb, jlayton; +Cc: linux-nfs, linux-fsdevel

On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
> On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> > On Tue, Aug 29 2017, Jeff Layton wrote:
> > 
> > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > > 
> > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > > 
> > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > > 
> > > > > > > > There is some ambiguity in nfs about how writeback
> > > > > > > > errors are
> > > > > > > > tracked.
> > > > > > > > 
> > > > > > > > For instance, nfs_pageio_add_request calls
> > > > > > > > mapping_set_error when
> > > > > > > > the
> > > > > > > > add fails, but we track errors that occur after adding
> > > > > > > > the
> > > > > > > > request
> > > > > > > > with a dedicated int error in the open context.
> > > > > > > > 
> > > > > > > > Now that we have better infrastructure for the vfs
> > > > > > > > layer, this
> > > > > > > > latter int is now unnecessary. Just have
> > > > > > > > nfs_context_set_write_error set
> > > > > > > > the error in the mapping when one occurs.
> > > > > > > > 
> > > > > > > > Have NFS use file_write_and_wait_range to initiate and
> > > > > > > > wait on
> > > > > > > > writeback
> > > > > > > > of the data, and then check again after issuing the
> > > > > > > > commit(s).
> > > > > > > > 
> > > > > > > > With this, we also don't need to pay attention to the
> > > > > > > > ERROR_WRITE
> > > > > > > > flag for reporting, and just clear it to indicate to
> > > > > > > > subsequent
> > > > > > > > writers that they should try to go asynchronous again.
> > > > > > > > 
> > > > > > > > In nfs_page_async_flush, sample the error before
> > > > > > > > locking and
> > > > > > > > joining
> > > > > > > > the requests, and check for errors since that point.
> > > > > > > > 
> > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > > ---
> > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > > 
> > > > > > > > I have a baling wire and duct tape solution for testing
> > > > > > > > this with
> > > > > > > > xfstests (using iptables REJECT targets and soft
> > > > > > > > mounts). This
> > > > > > > > seems to
> > > > > > > > make nfs do the right thing.
> > > > > > > > 
> > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > > --- a/fs/nfs/file.c
> > > > > > > > +++ b/fs/nfs/file.c
> > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
> > > > > > > > *file,
> > > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > > >  {
> > > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > > nfs_file_open_context(file);
> > > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > > -	int have_error, do_resend, status;
> > > > > > > > -	int ret = 0;
> > > > > > > > +	int do_resend, status;
> > > > > > > > +	int ret;
> > > > > > > >  
> > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
> > > > > > > > file,
> > > > > > > > datasync);
> > > > > > > >  
> > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > > >  	do_resend =
> > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
> > > > > > > > >flags);
> > > > > > > > -	have_error =
> > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > > &ctx->flags);
> > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > -	have_error |=
> > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > flags);
> > > > > > > > 
> > > > > > > > -	if (have_error) {
> > > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > > -		if (ret)
> > > > > > > > -			goto out;
> > > > > > > > -	}
> > > > > > > > -	if (status < 0) {
> > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > >flags);
> > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > +
> > > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > > 
> > > > > > This change makes the code inconsistent with the comment
> > > > > > above the
> > > > > > function, which still references ctx->error.  The intent of
> > > > > > the
> > > > > > comment
> > > > > > is still correct, but the details have changed.
> > > > > > 
> > > > > 
> > > > > Good catch. I'll fix that up in a respin.
> > > > > 
> > > > > > Also, there is a call to mapping_set_error() in
> > > > > > nfs_pageio_add_request().
> > > > > > I wonder if that should be changed to
> > > > > >   nfs_context_set_write_error(req->wb_context, desc-
> > > > > > >pg_error)
> > > > > > ??
> > > > > > 
> > > > > 
> > > > > Trickier question...
> > > > > 
> > > > > I'm not quite sure what semantics we're looking for with
> > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > > synchronous, but I'm not quite sure why it gets cleared the
> > > > > way it
> > > > > does. It's set on any error but cleared before issuing a
> > > > > commit.
> > > > > 
> > > > > I added a similar flag to Ceph inodes recently, but only
> > > > > clear it when
> > > > > a write succeeds. Wouldn't that make more sense here as well?
> > > > 
> > > > It is a bit hard to wrap one's mind around.
> > > > 
> > > > In the original code (commit 7b159fc18d417980) it looks like:
> > > >  - test-and-clear bit
> > > >  - write and sync
> > > >  - test-bit
> > > > 
> > > > This does, I think, seem safer than "clear on successful write"
> > > > as the
> > > > writes could complete out-of-order and I wouldn't be surprised
> > > > if the
> > > > unsuccessful ones completed with an error before the successful
> > > > one -
> > > > particularly with an error like EDQUOT.
> > > > 
> > > > However the current code does the writes before the test-and-
> > > > clear, and
> > > > only does the commit afterwards.  That makes it less clear why
> > > > the
> > > > current sequence is a good idea.
> > > > 
> > > > However ... nfs_file_fsync_commit() is only called if
> > > > filemap_write_and_wait_range() returned with success, so we
> > > > only clear
> > > > the flag after successful writes(?).
> > > > 
> > > > Oh....
> > > > This patch from me:
> > > > 
> > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
> > > > handling.")
> > > > 
> > > > seems to have been reverted by
> > > > 
> > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
> > > > page writeback failed")
> > > > 
> > > > which probably isn't good.  It appears that this code is very
> > > > fragile
> > > > and easily broken.
> > 
> > On further investigation, I think the problem that I fixed and then
> > we
> > reintroduced will be fixed again - more permanently - by your
> > patch.
> > The root problem is that nfs keeps error codes in a different way
> > to the
> > MM core.  By unifying those, the problem goes.
> > (The specific problem is that writes which hit EDQUOT on the server
> > can
> >  report EIO on the client).
> > 
> > 
> > > > Maybe we need to work out exactly what is required, and
> > > > document it - so
> > > > we can stop breaking it.
> > > > Or maybe we need some unit tests.....
> > > > 
> > > 
> > > Yes, laying out what's necessary for this would be very helpful.
> > > We
> > > clearly want to set the flag when an error occurs. Under what
> > > circumstances should we be clearing it?
> > 
> > Well.... looking back at  7b159fc18d417980f57ae which introduced
> > the
> > flag, prior to that write errors (ctx->error) were only reported by
> > nfs_file_flush and nfs_fsync, so only one close() and fsync().
> > 
> > After that commit, setting the flag would mean that errors could be
> > returned by 'write'.  So clearing as part of returning the error
> > makes
> > perfect sense.
> > 
> > As long as the error gets recorded, and gets returned when it is
> > recorded, it doesn't much matter when the flag is cleared.  With
> > your
> > patches we don't need to flag any more to get errors reliably
> > reported.
> > 
> > Leaving the flag set means that writes go more slowly - we don't
> > get
> > large queue of background rights building up but destined for
> > failure.
> > This is the main point made in the comment message when the flag
> > was
> > introduced.
> > Of course, by the time we first get an error there could already
> > by a large queue, so we probably want that to drain completely
> > before
> > allowing async writes again.

We already have this functionality implemented in the existing code.

> > 
> > It might make sense to have 2 flags.  One which says "writes should
> > be
> > synchronous", another that says "There was an error recently".
> > We clear the error flag before calling nfs_fsync, and if it is
> > still
> > clear afterwards, we clear the sync-writes flag.  Maybe that is
> > more
> > complex than needed though.
> > 

We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
see any global mechanism that will replace that.

> > I'm leaning towards your suggestion that it doesn't matter very
> > much
> > when it gets cleared, and clearing it on any successful write is
> > simplest.
> > 
> > So I'm still in favor of using nfs_context_set_write_error() in
> > nfs_pageio_add_request(), primarily because it is most consistent -
> > we
> > don't need exceptions.
> 
> Thanks for taking a closer look. I can easily make the change above,
> and
> I do think that keeping this mechanism as simple as possible will
> make
> it easier to prevent bitrot.
> 
> That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
> is a
> per open file description object.
> 
> Is that the correct way to track this? All of the ctx's will share
> the
> same inode. If we're getting writeback errors for one context, it's
> quite likely that we'll be seeing them via others.
> 
> I suppose the counterargument is when we have things like expiring
> krb5
> tickets. Write failures via an expiring set of creds may have no
> effect
> on writeback via other creds.
> 
> Still, I think a per-inode flag might make more sense here.
> 
> Thoughts?

As far as I'm concerned, that would be a regression. The most common
problem when flushing writeback data to the server aside from ENOSPC
(and possibly ESTALE) is EACCES, which is particular to the file
descriptor that opened the file.

File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
private to the file descriptor.

-- 
Trond Myklebust
Linux NFS client maintainer, PrimaryData
trond.myklebust@primarydata.com

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
@ 2017-09-07 14:54                 ` Trond Myklebust
  0 siblings, 0 replies; 33+ messages in thread
From: Trond Myklebust @ 2017-09-07 14:54 UTC (permalink / raw)
  To: anna.schumaker, jlayton, neilb, jlayton; +Cc: linux-nfs, linux-fsdevel

T24gVGh1LCAyMDE3LTA5LTA3IGF0IDA3OjM1IC0wNDAwLCBKZWZmIExheXRvbiB3cm90ZToNCj4g
T24gVGh1LCAyMDE3LTA5LTA3IGF0IDEzOjM3ICsxMDAwLCBOZWlsQnJvd24gd3JvdGU6DQo+ID4g
T24gVHVlLCBBdWcgMjkgMjAxNywgSmVmZiBMYXl0b24gd3JvdGU6DQo+ID4gDQo+ID4gPiBPbiBU
dWUsIDIwMTctMDgtMjkgYXQgMTE6MjMgKzEwMDAsIE5laWxCcm93biB3cm90ZToNCj4gPiA+ID4g
T24gTW9uLCBBdWcgMjggMjAxNywgSmVmZiBMYXl0b24gd3JvdGU6DQo+ID4gPiA+IA0KPiA+ID4g
PiA+IE9uIE1vbiwgMjAxNy0wOC0yOCBhdCAwOToyNCArMTAwMCwgTmVpbEJyb3duIHdyb3RlOg0K
PiA+ID4gPiA+ID4gT24gRnJpLCBBdWcgMjUgMjAxNywgSmVmZiBMYXl0b24gd3JvdGU6DQo+ID4g
PiA+ID4gPiANCj4gPiA+ID4gPiA+ID4gT24gVGh1LCAyMDE3LTA3LTIwIGF0IDE1OjQyIC0wNDAw
LCBKZWZmIExheXRvbiB3cm90ZToNCj4gPiA+ID4gPiA+ID4gPiBGcm9tOiBKZWZmIExheXRvbiA8
amxheXRvbkByZWRoYXQuY29tPg0KPiA+ID4gPiA+ID4gPiA+IA0KPiA+ID4gPiA+ID4gPiA+IFRo
ZXJlIGlzIHNvbWUgYW1iaWd1aXR5IGluIG5mcyBhYm91dCBob3cgd3JpdGViYWNrDQo+ID4gPiA+
ID4gPiA+ID4gZXJyb3JzIGFyZQ0KPiA+ID4gPiA+ID4gPiA+IHRyYWNrZWQuDQo+ID4gPiA+ID4g
PiA+ID4gDQo+ID4gPiA+ID4gPiA+ID4gRm9yIGluc3RhbmNlLCBuZnNfcGFnZWlvX2FkZF9yZXF1
ZXN0IGNhbGxzDQo+ID4gPiA+ID4gPiA+ID4gbWFwcGluZ19zZXRfZXJyb3Igd2hlbg0KPiA+ID4g
PiA+ID4gPiA+IHRoZQ0KPiA+ID4gPiA+ID4gPiA+IGFkZCBmYWlscywgYnV0IHdlIHRyYWNrIGVy
cm9ycyB0aGF0IG9jY3VyIGFmdGVyIGFkZGluZw0KPiA+ID4gPiA+ID4gPiA+IHRoZQ0KPiA+ID4g
PiA+ID4gPiA+IHJlcXVlc3QNCj4gPiA+ID4gPiA+ID4gPiB3aXRoIGEgZGVkaWNhdGVkIGludCBl
cnJvciBpbiB0aGUgb3BlbiBjb250ZXh0Lg0KPiA+ID4gPiA+ID4gPiA+IA0KPiA+ID4gPiA+ID4g
PiA+IE5vdyB0aGF0IHdlIGhhdmUgYmV0dGVyIGluZnJhc3RydWN0dXJlIGZvciB0aGUgdmZzDQo+
ID4gPiA+ID4gPiA+ID4gbGF5ZXIsIHRoaXMNCj4gPiA+ID4gPiA+ID4gPiBsYXR0ZXIgaW50IGlz
IG5vdyB1bm5lY2Vzc2FyeS4gSnVzdCBoYXZlDQo+ID4gPiA+ID4gPiA+ID4gbmZzX2NvbnRleHRf
c2V0X3dyaXRlX2Vycm9yIHNldA0KPiA+ID4gPiA+ID4gPiA+IHRoZSBlcnJvciBpbiB0aGUgbWFw
cGluZyB3aGVuIG9uZSBvY2N1cnMuDQo+ID4gPiA+ID4gPiA+ID4gDQo+ID4gPiA+ID4gPiA+ID4g
SGF2ZSBORlMgdXNlIGZpbGVfd3JpdGVfYW5kX3dhaXRfcmFuZ2UgdG8gaW5pdGlhdGUgYW5kDQo+
ID4gPiA+ID4gPiA+ID4gd2FpdCBvbg0KPiA+ID4gPiA+ID4gPiA+IHdyaXRlYmFjaw0KPiA+ID4g
PiA+ID4gPiA+IG9mIHRoZSBkYXRhLCBhbmQgdGhlbiBjaGVjayBhZ2FpbiBhZnRlciBpc3N1aW5n
IHRoZQ0KPiA+ID4gPiA+ID4gPiA+IGNvbW1pdChzKS4NCj4gPiA+ID4gPiA+ID4gPiANCj4gPiA+
ID4gPiA+ID4gPiBXaXRoIHRoaXMsIHdlIGFsc28gZG9uJ3QgbmVlZCB0byBwYXkgYXR0ZW50aW9u
IHRvIHRoZQ0KPiA+ID4gPiA+ID4gPiA+IEVSUk9SX1dSSVRFDQo+ID4gPiA+ID4gPiA+ID4gZmxh
ZyBmb3IgcmVwb3J0aW5nLCBhbmQganVzdCBjbGVhciBpdCB0byBpbmRpY2F0ZSB0bw0KPiA+ID4g
PiA+ID4gPiA+IHN1YnNlcXVlbnQNCj4gPiA+ID4gPiA+ID4gPiB3cml0ZXJzIHRoYXQgdGhleSBz
aG91bGQgdHJ5IHRvIGdvIGFzeW5jaHJvbm91cyBhZ2Fpbi4NCj4gPiA+ID4gPiA+ID4gPiANCj4g
PiA+ID4gPiA+ID4gPiBJbiBuZnNfcGFnZV9hc3luY19mbHVzaCwgc2FtcGxlIHRoZSBlcnJvciBi
ZWZvcmUNCj4gPiA+ID4gPiA+ID4gPiBsb2NraW5nIGFuZA0KPiA+ID4gPiA+ID4gPiA+IGpvaW5p
bmcNCj4gPiA+ID4gPiA+ID4gPiB0aGUgcmVxdWVzdHMsIGFuZCBjaGVjayBmb3IgZXJyb3JzIHNp
bmNlIHRoYXQgcG9pbnQuDQo+ID4gPiA+ID4gPiA+ID4gDQo+ID4gPiA+ID4gPiA+ID4gU2lnbmVk
LW9mZi1ieTogSmVmZiBMYXl0b24gPGpsYXl0b25AcmVkaGF0LmNvbT4NCj4gPiA+ID4gPiA+ID4g
PiAtLS0NCj4gPiA+ID4gPiA+ID4gPiAgZnMvbmZzL2ZpbGUuYyAgICAgICAgICB8IDI0ICsrKysr
KysrKysrLS0tLS0tLS0tLS0tLQ0KPiA+ID4gPiA+ID4gPiA+ICBmcy9uZnMvaW5vZGUuYyAgICAg
ICAgIHwgIDMgKy0tDQo+ID4gPiA+ID4gPiA+ID4gIGZzL25mcy93cml0ZS5jICAgICAgICAgfCAg
OCArKysrKystLQ0KPiA+ID4gPiA+ID4gPiA+ICBpbmNsdWRlL2xpbnV4L25mc19mcy5oIHwgIDEg
LQ0KPiA+ID4gPiA+ID4gPiA+ICA0IGZpbGVzIGNoYW5nZWQsIDE4IGluc2VydGlvbnMoKyksIDE4
IGRlbGV0aW9ucygtKQ0KPiA+ID4gPiA+ID4gPiA+IA0KPiA+ID4gPiA+ID4gPiA+IEkgaGF2ZSBh
IGJhbGluZyB3aXJlIGFuZCBkdWN0IHRhcGUgc29sdXRpb24gZm9yIHRlc3RpbmcNCj4gPiA+ID4g
PiA+ID4gPiB0aGlzIHdpdGgNCj4gPiA+ID4gPiA+ID4gPiB4ZnN0ZXN0cyAodXNpbmcgaXB0YWJs
ZXMgUkVKRUNUIHRhcmdldHMgYW5kIHNvZnQNCj4gPiA+ID4gPiA+ID4gPiBtb3VudHMpLiBUaGlz
DQo+ID4gPiA+ID4gPiA+ID4gc2VlbXMgdG8NCj4gPiA+ID4gPiA+ID4gPiBtYWtlIG5mcyBkbyB0
aGUgcmlnaHQgdGhpbmcuDQo+ID4gPiA+ID4gPiA+ID4gDQo+ID4gPiA+ID4gPiA+ID4gZGlmZiAt
LWdpdCBhL2ZzL25mcy9maWxlLmMgYi9mcy9uZnMvZmlsZS5jDQo+ID4gPiA+ID4gPiA+ID4gaW5k
ZXggNTcxM2ViMzJhNDVlLi4xNWQzYzZmYWFmZDMgMTAwNjQ0DQo+ID4gPiA+ID4gPiA+ID4gLS0t
IGEvZnMvbmZzL2ZpbGUuYw0KPiA+ID4gPiA+ID4gPiA+ICsrKyBiL2ZzL25mcy9maWxlLmMNCj4g
PiA+ID4gPiA+ID4gPiBAQCAtMjEyLDI1ICsyMTIsMjMgQEAgbmZzX2ZpbGVfZnN5bmNfY29tbWl0
KHN0cnVjdCBmaWxlDQo+ID4gPiA+ID4gPiA+ID4gKmZpbGUsDQo+ID4gPiA+ID4gPiA+ID4gbG9m
Zl90IHN0YXJ0LCBsb2ZmX3QgZW5kLCBpbnQgZGF0YXN5bmMpDQo+ID4gPiA+ID4gPiA+ID4gIHsN
Cj4gPiA+ID4gPiA+ID4gPiAgCXN0cnVjdCBuZnNfb3Blbl9jb250ZXh0ICpjdHggPQ0KPiA+ID4g
PiA+ID4gPiA+IG5mc19maWxlX29wZW5fY29udGV4dChmaWxlKTsNCj4gPiA+ID4gPiA+ID4gPiAg
CXN0cnVjdCBpbm9kZSAqaW5vZGUgPSBmaWxlX2lub2RlKGZpbGUpOw0KPiA+ID4gPiA+ID4gPiA+
IC0JaW50IGhhdmVfZXJyb3IsIGRvX3Jlc2VuZCwgc3RhdHVzOw0KPiA+ID4gPiA+ID4gPiA+IC0J
aW50IHJldCA9IDA7DQo+ID4gPiA+ID4gPiA+ID4gKwlpbnQgZG9fcmVzZW5kLCBzdGF0dXM7DQo+
ID4gPiA+ID4gPiA+ID4gKwlpbnQgcmV0Ow0KPiA+ID4gPiA+ID4gPiA+ICANCj4gPiA+ID4gPiA+
ID4gPiAgCWRwcmludGsoIk5GUzogZnN5bmMgZmlsZSglcEQyKSBkYXRhc3luYyAlZFxuIiwNCj4g
PiA+ID4gPiA+ID4gPiBmaWxlLA0KPiA+ID4gPiA+ID4gPiA+IGRhdGFzeW5jKTsNCj4gPiA+ID4g
PiA+ID4gPiAgDQo+ID4gPiA+ID4gPiA+ID4gIAluZnNfaW5jX3N0YXRzKGlub2RlLCBORlNJT1Nf
VkZTRlNZTkMpOw0KPiA+ID4gPiA+ID4gPiA+ICAJZG9fcmVzZW5kID0NCj4gPiA+ID4gPiA+ID4g
PiB0ZXN0X2FuZF9jbGVhcl9iaXQoTkZTX0NPTlRFWFRfUkVTRU5EX1dSSVRFUywgJmN0eC0NCj4g
PiA+ID4gPiA+ID4gPiA+ZmxhZ3MpOw0KPiA+ID4gPiA+ID4gPiA+IC0JaGF2ZV9lcnJvciA9DQo+
ID4gPiA+ID4gPiA+ID4gdGVzdF9hbmRfY2xlYXJfYml0KE5GU19DT05URVhUX0VSUk9SX1dSSVRF
LA0KPiA+ID4gPiA+ID4gPiA+ICZjdHgtPmZsYWdzKTsNCj4gPiA+ID4gPiA+ID4gPiAtCXN0YXR1
cyA9IG5mc19jb21taXRfaW5vZGUoaW5vZGUsIEZMVVNIX1NZTkMpOw0KPiA+ID4gPiA+ID4gPiA+
IC0JaGF2ZV9lcnJvciB8PQ0KPiA+ID4gPiA+ID4gPiA+IHRlc3RfYml0KE5GU19DT05URVhUX0VS
Uk9SX1dSSVRFLCAmY3R4LQ0KPiA+ID4gPiA+ID4gPiA+ID4gZmxhZ3MpOw0KPiA+ID4gPiA+ID4g
PiA+IA0KPiA+ID4gPiA+ID4gPiA+IC0JaWYgKGhhdmVfZXJyb3IpIHsNCj4gPiA+ID4gPiA+ID4g
PiAtCQlyZXQgPSB4Y2hnKCZjdHgtPmVycm9yLCAwKTsNCj4gPiA+ID4gPiA+ID4gPiAtCQlpZiAo
cmV0KQ0KPiA+ID4gPiA+ID4gPiA+IC0JCQlnb3RvIG91dDsNCj4gPiA+ID4gPiA+ID4gPiAtCX0N
Cj4gPiA+ID4gPiA+ID4gPiAtCWlmIChzdGF0dXMgPCAwKSB7DQo+ID4gPiA+ID4gPiA+ID4gKwlj
bGVhcl9iaXQoTkZTX0NPTlRFWFRfRVJST1JfV1JJVEUsICZjdHgtDQo+ID4gPiA+ID4gPiA+ID4g
PmZsYWdzKTsNCj4gPiA+ID4gPiA+ID4gPiArCXJldCA9IG5mc19jb21taXRfaW5vZGUoaW5vZGUs
IEZMVVNIX1NZTkMpOw0KPiA+ID4gPiA+ID4gPiA+ICsNCj4gPiA+ID4gPiA+ID4gPiArCS8qIFJl
Y2hlY2sgYW5kIGFkdmFuY2UgYWZ0ZXIgdGhlIGNvbW1pdCAqLw0KPiA+ID4gPiA+ID4gPiA+ICsJ
c3RhdHVzID0gZmlsZV9jaGVja19hbmRfYWR2YW5jZV93Yl9lcnIoZmlsZSk7DQo+ID4gPiA+ID4g
PiANCj4gPiA+ID4gPiA+IFRoaXMgY2hhbmdlIG1ha2VzIHRoZSBjb2RlIGluY29uc2lzdGVudCB3
aXRoIHRoZSBjb21tZW50DQo+ID4gPiA+ID4gPiBhYm92ZSB0aGUNCj4gPiA+ID4gPiA+IGZ1bmN0
aW9uLCB3aGljaCBzdGlsbCByZWZlcmVuY2VzIGN0eC0+ZXJyb3IuICBUaGUgaW50ZW50IG9mDQo+
ID4gPiA+ID4gPiB0aGUNCj4gPiA+ID4gPiA+IGNvbW1lbnQNCj4gPiA+ID4gPiA+IGlzIHN0aWxs
IGNvcnJlY3QsIGJ1dCB0aGUgZGV0YWlscyBoYXZlIGNoYW5nZWQuDQo+ID4gPiA+ID4gPiANCj4g
PiA+ID4gPiANCj4gPiA+ID4gPiBHb29kIGNhdGNoLiBJJ2xsIGZpeCB0aGF0IHVwIGluIGEgcmVz
cGluLg0KPiA+ID4gPiA+IA0KPiA+ID4gPiA+ID4gQWxzbywgdGhlcmUgaXMgYSBjYWxsIHRvIG1h
cHBpbmdfc2V0X2Vycm9yKCkgaW4NCj4gPiA+ID4gPiA+IG5mc19wYWdlaW9fYWRkX3JlcXVlc3Qo
KS4NCj4gPiA+ID4gPiA+IEkgd29uZGVyIGlmIHRoYXQgc2hvdWxkIGJlIGNoYW5nZWQgdG8NCj4g
PiA+ID4gPiA+ICAgbmZzX2NvbnRleHRfc2V0X3dyaXRlX2Vycm9yKHJlcS0+d2JfY29udGV4dCwg
ZGVzYy0NCj4gPiA+ID4gPiA+ID5wZ19lcnJvcikNCj4gPiA+ID4gPiA+ID8/DQo+ID4gPiA+ID4g
PiANCj4gPiA+ID4gPiANCj4gPiA+ID4gPiBUcmlja2llciBxdWVzdGlvbi4uLg0KPiA+ID4gPiA+
IA0KPiA+ID4gPiA+IEknbSBub3QgcXVpdGUgc3VyZSB3aGF0IHNlbWFudGljcyB3ZSdyZSBsb29r
aW5nIGZvciB3aXRoDQo+ID4gPiA+ID4gTkZTX0NPTlRFWFRfRVJST1JfV1JJVEUuIEkga25vdyB0
aGF0IGl0IGZvcmNlcyB3cml0ZXMgdG8gYmUNCj4gPiA+ID4gPiBzeW5jaHJvbm91cywgYnV0IEkn
bSBub3QgcXVpdGUgc3VyZSB3aHkgaXQgZ2V0cyBjbGVhcmVkIHRoZQ0KPiA+ID4gPiA+IHdheSBp
dA0KPiA+ID4gPiA+IGRvZXMuIEl0J3Mgc2V0IG9uIGFueSBlcnJvciBidXQgY2xlYXJlZCBiZWZv
cmUgaXNzdWluZyBhDQo+ID4gPiA+ID4gY29tbWl0Lg0KPiA+ID4gPiA+IA0KPiA+ID4gPiA+IEkg
YWRkZWQgYSBzaW1pbGFyIGZsYWcgdG8gQ2VwaCBpbm9kZXMgcmVjZW50bHksIGJ1dCBvbmx5DQo+
ID4gPiA+ID4gY2xlYXIgaXQgd2hlbg0KPiA+ID4gPiA+IGEgd3JpdGUgc3VjY2VlZHMuIFdvdWxk
bid0IHRoYXQgbWFrZSBtb3JlIHNlbnNlIGhlcmUgYXMgd2VsbD8NCj4gPiA+ID4gDQo+ID4gPiA+
IEl0IGlzIGEgYml0IGhhcmQgdG8gd3JhcCBvbmUncyBtaW5kIGFyb3VuZC4NCj4gPiA+ID4gDQo+
ID4gPiA+IEluIHRoZSBvcmlnaW5hbCBjb2RlIChjb21taXQgN2IxNTlmYzE4ZDQxNzk4MCkgaXQg
bG9va3MgbGlrZToNCj4gPiA+ID4gIC0gdGVzdC1hbmQtY2xlYXIgYml0DQo+ID4gPiA+ICAtIHdy
aXRlIGFuZCBzeW5jDQo+ID4gPiA+ICAtIHRlc3QtYml0DQo+ID4gPiA+IA0KPiA+ID4gPiBUaGlz
IGRvZXMsIEkgdGhpbmssIHNlZW0gc2FmZXIgdGhhbiAiY2xlYXIgb24gc3VjY2Vzc2Z1bCB3cml0
ZSINCj4gPiA+ID4gYXMgdGhlDQo+ID4gPiA+IHdyaXRlcyBjb3VsZCBjb21wbGV0ZSBvdXQtb2Yt
b3JkZXIgYW5kIEkgd291bGRuJ3QgYmUgc3VycHJpc2VkDQo+ID4gPiA+IGlmIHRoZQ0KPiA+ID4g
PiB1bnN1Y2Nlc3NmdWwgb25lcyBjb21wbGV0ZWQgd2l0aCBhbiBlcnJvciBiZWZvcmUgdGhlIHN1
Y2Nlc3NmdWwNCj4gPiA+ID4gb25lIC0NCj4gPiA+ID4gcGFydGljdWxhcmx5IHdpdGggYW4gZXJy
b3IgbGlrZSBFRFFVT1QuDQo+ID4gPiA+IA0KPiA+ID4gPiBIb3dldmVyIHRoZSBjdXJyZW50IGNv
ZGUgZG9lcyB0aGUgd3JpdGVzIGJlZm9yZSB0aGUgdGVzdC1hbmQtDQo+ID4gPiA+IGNsZWFyLCBh
bmQNCj4gPiA+ID4gb25seSBkb2VzIHRoZSBjb21taXQgYWZ0ZXJ3YXJkcy4gIFRoYXQgbWFrZXMg
aXQgbGVzcyBjbGVhciB3aHkNCj4gPiA+ID4gdGhlDQo+ID4gPiA+IGN1cnJlbnQgc2VxdWVuY2Ug
aXMgYSBnb29kIGlkZWEuDQo+ID4gPiA+IA0KPiA+ID4gPiBIb3dldmVyIC4uLiBuZnNfZmlsZV9m
c3luY19jb21taXQoKSBpcyBvbmx5IGNhbGxlZCBpZg0KPiA+ID4gPiBmaWxlbWFwX3dyaXRlX2Fu
ZF93YWl0X3JhbmdlKCkgcmV0dXJuZWQgd2l0aCBzdWNjZXNzLCBzbyB3ZQ0KPiA+ID4gPiBvbmx5
IGNsZWFyDQo+ID4gPiA+IHRoZSBmbGFnIGFmdGVyIHN1Y2Nlc3NmdWwgd3JpdGVzKD8pLg0KPiA+
ID4gPiANCj4gPiA+ID4gT2guLi4uDQo+ID4gPiA+IFRoaXMgcGF0Y2ggZnJvbSBtZToNCj4gPiA+
ID4gDQo+ID4gPiA+IENvbW1pdDogMmVkYjZiYzM4NTJjICgiTkZTIC0gZml4IHJlY2VudCBicmVh
a2FnZSB0byBORlMgZXJyb3INCj4gPiA+ID4gaGFuZGxpbmcuIikNCj4gPiA+ID4gDQo+ID4gPiA+
IHNlZW1zIHRvIGhhdmUgYmVlbiByZXZlcnRlZCBieQ0KPiA+ID4gPiANCj4gPiA+ID4gQ29tbWl0
OiA3YjI4MWVlMDI2NTUgKCJORlM6IGZzeW5jKCkgbXVzdCBleGl0IHdpdGggYW4gZXJyb3IgaWYN
Cj4gPiA+ID4gcGFnZSB3cml0ZWJhY2sgZmFpbGVkIikNCj4gPiA+ID4gDQo+ID4gPiA+IHdoaWNo
IHByb2JhYmx5IGlzbid0IGdvb2QuICBJdCBhcHBlYXJzIHRoYXQgdGhpcyBjb2RlIGlzIHZlcnkN
Cj4gPiA+ID4gZnJhZ2lsZQ0KPiA+ID4gPiBhbmQgZWFzaWx5IGJyb2tlbi4NCj4gPiANCj4gPiBP
biBmdXJ0aGVyIGludmVzdGlnYXRpb24sIEkgdGhpbmsgdGhlIHByb2JsZW0gdGhhdCBJIGZpeGVk
IGFuZCB0aGVuDQo+ID4gd2UNCj4gPiByZWludHJvZHVjZWQgd2lsbCBiZSBmaXhlZCBhZ2FpbiAt
IG1vcmUgcGVybWFuZW50bHkgLSBieSB5b3VyDQo+ID4gcGF0Y2guDQo+ID4gVGhlIHJvb3QgcHJv
YmxlbSBpcyB0aGF0IG5mcyBrZWVwcyBlcnJvciBjb2RlcyBpbiBhIGRpZmZlcmVudCB3YXkNCj4g
PiB0byB0aGUNCj4gPiBNTSBjb3JlLiAgQnkgdW5pZnlpbmcgdGhvc2UsIHRoZSBwcm9ibGVtIGdv
ZXMuDQo+ID4gKFRoZSBzcGVjaWZpYyBwcm9ibGVtIGlzIHRoYXQgd3JpdGVzIHdoaWNoIGhpdCBF
RFFVT1Qgb24gdGhlIHNlcnZlcg0KPiA+IGNhbg0KPiA+ICByZXBvcnQgRUlPIG9uIHRoZSBjbGll
bnQpLg0KPiA+IA0KPiA+IA0KPiA+ID4gPiBNYXliZSB3ZSBuZWVkIHRvIHdvcmsgb3V0IGV4YWN0
bHkgd2hhdCBpcyByZXF1aXJlZCwgYW5kDQo+ID4gPiA+IGRvY3VtZW50IGl0IC0gc28NCj4gPiA+
ID4gd2UgY2FuIHN0b3AgYnJlYWtpbmcgaXQuDQo+ID4gPiA+IE9yIG1heWJlIHdlIG5lZWQgc29t
ZSB1bml0IHRlc3RzLi4uLi4NCj4gPiA+ID4gDQo+ID4gPiANCj4gPiA+IFllcywgbGF5aW5nIG91
dCB3aGF0J3MgbmVjZXNzYXJ5IGZvciB0aGlzIHdvdWxkIGJlIHZlcnkgaGVscGZ1bC4NCj4gPiA+
IFdlDQo+ID4gPiBjbGVhcmx5IHdhbnQgdG8gc2V0IHRoZSBmbGFnIHdoZW4gYW4gZXJyb3Igb2Nj
dXJzLiBVbmRlciB3aGF0DQo+ID4gPiBjaXJjdW1zdGFuY2VzIHNob3VsZCB3ZSBiZSBjbGVhcmlu
ZyBpdD8NCj4gPiANCj4gPiBXZWxsLi4uLiBsb29raW5nIGJhY2sgYXQgIDdiMTU5ZmMxOGQ0MTc5
ODBmNTdhZSB3aGljaCBpbnRyb2R1Y2VkDQo+ID4gdGhlDQo+ID4gZmxhZywgcHJpb3IgdG8gdGhh
dCB3cml0ZSBlcnJvcnMgKGN0eC0+ZXJyb3IpIHdlcmUgb25seSByZXBvcnRlZCBieQ0KPiA+IG5m
c19maWxlX2ZsdXNoIGFuZCBuZnNfZnN5bmMsIHNvIG9ubHkgb25lIGNsb3NlKCkgYW5kIGZzeW5j
KCkuDQo+ID4gDQo+ID4gQWZ0ZXIgdGhhdCBjb21taXQsIHNldHRpbmcgdGhlIGZsYWcgd291bGQg
bWVhbiB0aGF0IGVycm9ycyBjb3VsZCBiZQ0KPiA+IHJldHVybmVkIGJ5ICd3cml0ZScuICBTbyBj
bGVhcmluZyBhcyBwYXJ0IG9mIHJldHVybmluZyB0aGUgZXJyb3INCj4gPiBtYWtlcw0KPiA+IHBl
cmZlY3Qgc2Vuc2UuDQo+ID4gDQo+ID4gQXMgbG9uZyBhcyB0aGUgZXJyb3IgZ2V0cyByZWNvcmRl
ZCwgYW5kIGdldHMgcmV0dXJuZWQgd2hlbiBpdCBpcw0KPiA+IHJlY29yZGVkLCBpdCBkb2Vzbid0
IG11Y2ggbWF0dGVyIHdoZW4gdGhlIGZsYWcgaXMgY2xlYXJlZC4gIFdpdGgNCj4gPiB5b3VyDQo+
ID4gcGF0Y2hlcyB3ZSBkb24ndCBuZWVkIHRvIGZsYWcgYW55IG1vcmUgdG8gZ2V0IGVycm9ycyBy
ZWxpYWJseQ0KPiA+IHJlcG9ydGVkLg0KPiA+IA0KPiA+IExlYXZpbmcgdGhlIGZsYWcgc2V0IG1l
YW5zIHRoYXQgd3JpdGVzIGdvIG1vcmUgc2xvd2x5IC0gd2UgZG9uJ3QNCj4gPiBnZXQNCj4gPiBs
YXJnZSBxdWV1ZSBvZiBiYWNrZ3JvdW5kIHJpZ2h0cyBidWlsZGluZyB1cCBidXQgZGVzdGluZWQg
Zm9yDQo+ID4gZmFpbHVyZS4NCj4gPiBUaGlzIGlzIHRoZSBtYWluIHBvaW50IG1hZGUgaW4gdGhl
IGNvbW1lbnQgbWVzc2FnZSB3aGVuIHRoZSBmbGFnDQo+ID4gd2FzDQo+ID4gaW50cm9kdWNlZC4N
Cj4gPiBPZiBjb3Vyc2UsIGJ5IHRoZSB0aW1lIHdlIGZpcnN0IGdldCBhbiBlcnJvciB0aGVyZSBj
b3VsZCBhbHJlYWR5DQo+ID4gYnkgYSBsYXJnZSBxdWV1ZSwgc28gd2UgcHJvYmFibHkgd2FudCB0
aGF0IHRvIGRyYWluIGNvbXBsZXRlbHkNCj4gPiBiZWZvcmUNCj4gPiBhbGxvd2luZyBhc3luYyB3
cml0ZXMgYWdhaW4uDQoNCldlIGFscmVhZHkgaGF2ZSB0aGlzIGZ1bmN0aW9uYWxpdHkgaW1wbGVt
ZW50ZWQgaW4gdGhlIGV4aXN0aW5nIGNvZGUuDQoNCj4gPiANCj4gPiBJdCBtaWdodCBtYWtlIHNl
bnNlIHRvIGhhdmUgMiBmbGFncy4gIE9uZSB3aGljaCBzYXlzICJ3cml0ZXMgc2hvdWxkDQo+ID4g
YmUNCj4gPiBzeW5jaHJvbm91cyIsIGFub3RoZXIgdGhhdCBzYXlzICJUaGVyZSB3YXMgYW4gZXJy
b3IgcmVjZW50bHkiLg0KPiA+IFdlIGNsZWFyIHRoZSBlcnJvciBmbGFnIGJlZm9yZSBjYWxsaW5n
IG5mc19mc3luYywgYW5kIGlmIGl0IGlzDQo+ID4gc3RpbGwNCj4gPiBjbGVhciBhZnRlcndhcmRz
LCB3ZSBjbGVhciB0aGUgc3luYy13cml0ZXMgZmxhZy4gIE1heWJlIHRoYXQgaXMNCj4gPiBtb3Jl
DQo+ID4gY29tcGxleCB0aGFuIG5lZWRlZCB0aG91Z2guDQo+ID4gDQoNCldlIGFsc28gbmVlZCB0
byBwcmVzZXJ2ZSB0aGUgTkZTX0NPTlRFWFRfUkVTRU5EX1dSSVRFUyBmbGFnLiBJIGRvbid0DQpz
ZWUgYW55IGdsb2JhbCBtZWNoYW5pc20gdGhhdCB3aWxsIHJlcGxhY2UgdGhhdC4NCg0KPiA+IEkn
bSBsZWFuaW5nIHRvd2FyZHMgeW91ciBzdWdnZXN0aW9uIHRoYXQgaXQgZG9lc24ndCBtYXR0ZXIg
dmVyeQ0KPiA+IG11Y2gNCj4gPiB3aGVuIGl0IGdldHMgY2xlYXJlZCwgYW5kIGNsZWFyaW5nIGl0
IG9uIGFueSBzdWNjZXNzZnVsIHdyaXRlIGlzDQo+ID4gc2ltcGxlc3QuDQo+ID4gDQo+ID4gU28g
SSdtIHN0aWxsIGluIGZhdm9yIG9mIHVzaW5nIG5mc19jb250ZXh0X3NldF93cml0ZV9lcnJvcigp
IGluDQo+ID4gbmZzX3BhZ2Vpb19hZGRfcmVxdWVzdCgpLCBwcmltYXJpbHkgYmVjYXVzZSBpdCBp
cyBtb3N0IGNvbnNpc3RlbnQgLQ0KPiA+IHdlDQo+ID4gZG9uJ3QgbmVlZCBleGNlcHRpb25zLg0K
PiANCj4gVGhhbmtzIGZvciB0YWtpbmcgYSBjbG9zZXIgbG9vay4gSSBjYW4gZWFzaWx5IG1ha2Ug
dGhlIGNoYW5nZSBhYm92ZSwNCj4gYW5kDQo+IEkgZG8gdGhpbmsgdGhhdCBrZWVwaW5nIHRoaXMg
bWVjaGFuaXNtIGFzIHNpbXBsZSBhcyBwb3NzaWJsZSB3aWxsDQo+IG1ha2UNCj4gaXQgZWFzaWVy
IHRvIHByZXZlbnQgYml0cm90Lg0KPiANCj4gVGhhdCBzYWlkLi4uIE5GU19DT05URVhUX0VSUk9S
X1dSSVRFIGlzIGEgcGVyIGN0eCBmbGFnLCBhbmQgdGhlIGN0eA0KPiBpcyBhDQo+IHBlciBvcGVu
IGZpbGUgZGVzY3JpcHRpb24gb2JqZWN0Lg0KPiANCj4gSXMgdGhhdCB0aGUgY29ycmVjdCB3YXkg
dG8gdHJhY2sgdGhpcz8gQWxsIG9mIHRoZSBjdHgncyB3aWxsIHNoYXJlDQo+IHRoZQ0KPiBzYW1l
IGlub2RlLiBJZiB3ZSdyZSBnZXR0aW5nIHdyaXRlYmFjayBlcnJvcnMgZm9yIG9uZSBjb250ZXh0
LCBpdCdzDQo+IHF1aXRlIGxpa2VseSB0aGF0IHdlJ2xsIGJlIHNlZWluZyB0aGVtIHZpYSBvdGhl
cnMuDQo+IA0KPiBJIHN1cHBvc2UgdGhlIGNvdW50ZXJhcmd1bWVudCBpcyB3aGVuIHdlIGhhdmUg
dGhpbmdzIGxpa2UgZXhwaXJpbmcNCj4ga3JiNQ0KPiB0aWNrZXRzLiBXcml0ZSBmYWlsdXJlcyB2
aWEgYW4gZXhwaXJpbmcgc2V0IG9mIGNyZWRzIG1heSBoYXZlIG5vDQo+IGVmZmVjdA0KPiBvbiB3
cml0ZWJhY2sgdmlhIG90aGVyIGNyZWRzLg0KPiANCj4gU3RpbGwsIEkgdGhpbmsgYSBwZXItaW5v
ZGUgZmxhZyBtaWdodCBtYWtlIG1vcmUgc2Vuc2UgaGVyZS4NCj4gDQo+IFRob3VnaHRzPw0KDQpB
cyBmYXIgYXMgSSdtIGNvbmNlcm5lZCwgdGhhdCB3b3VsZCBiZSBhIHJlZ3Jlc3Npb24uIFRoZSBt
b3N0IGNvbW1vbg0KcHJvYmxlbSB3aGVuIGZsdXNoaW5nIHdyaXRlYmFjayBkYXRhIHRvIHRoZSBz
ZXJ2ZXIgYXNpZGUgZnJvbSBFTk9TUEMNCihhbmQgcG9zc2libHkgRVNUQUxFKSBpcyBFQUNDRVMs
IHdoaWNoIGlzIHBhcnRpY3VsYXIgdG8gdGhlIGZpbGUNCmRlc2NyaXB0b3IgdGhhdCBvcGVuZWQg
dGhlIGZpbGUuDQoNCkZpbGUgY29udGV4dHMsIGFuZCBORlNfQ09OVEVYVF9FUlJPUl9XUklURSBz
b2x2ZSB0aGF0IHByb2JsZW0gYnkgYmVpbmcNCnByaXZhdGUgdG8gdGhlIGZpbGUgZGVzY3JpcHRv
ci4NCg0KLS0gDQpUcm9uZCBNeWtsZWJ1c3QNCkxpbnV4IE5GUyBjbGllbnQgbWFpbnRhaW5lciwg
UHJpbWFyeURhdGENCnRyb25kLm15a2xlYnVzdEBwcmltYXJ5ZGF0YS5jb20NCg==


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-07 14:54                 ` Trond Myklebust
  (?)
@ 2017-09-11  3:24                 ` NeilBrown
  2017-09-11 10:46                   ` Jeff Layton
  2017-09-12  2:24                     ` Trond Myklebust
  -1 siblings, 2 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-11  3:24 UTC (permalink / raw)
  To: Trond Myklebust, anna.schumaker, jlayton, jlayton
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 16524 bytes --]

On Thu, Sep 07 2017, Trond Myklebust wrote:

> On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
>> On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
>> > On Tue, Aug 29 2017, Jeff Layton wrote:
>> > 
>> > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
>> > > > On Mon, Aug 28 2017, Jeff Layton wrote:
>> > > > 
>> > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
>> > > > > > 
>> > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > > > > > > From: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > 
>> > > > > > > > There is some ambiguity in nfs about how writeback
>> > > > > > > > errors are
>> > > > > > > > tracked.
>> > > > > > > > 
>> > > > > > > > For instance, nfs_pageio_add_request calls
>> > > > > > > > mapping_set_error when
>> > > > > > > > the
>> > > > > > > > add fails, but we track errors that occur after adding
>> > > > > > > > the
>> > > > > > > > request
>> > > > > > > > with a dedicated int error in the open context.
>> > > > > > > > 
>> > > > > > > > Now that we have better infrastructure for the vfs
>> > > > > > > > layer, this
>> > > > > > > > latter int is now unnecessary. Just have
>> > > > > > > > nfs_context_set_write_error set
>> > > > > > > > the error in the mapping when one occurs.
>> > > > > > > > 
>> > > > > > > > Have NFS use file_write_and_wait_range to initiate and
>> > > > > > > > wait on
>> > > > > > > > writeback
>> > > > > > > > of the data, and then check again after issuing the
>> > > > > > > > commit(s).
>> > > > > > > > 
>> > > > > > > > With this, we also don't need to pay attention to the
>> > > > > > > > ERROR_WRITE
>> > > > > > > > flag for reporting, and just clear it to indicate to
>> > > > > > > > subsequent
>> > > > > > > > writers that they should try to go asynchronous again.
>> > > > > > > > 
>> > > > > > > > In nfs_page_async_flush, sample the error before
>> > > > > > > > locking and
>> > > > > > > > joining
>> > > > > > > > the requests, and check for errors since that point.
>> > > > > > > > 
>> > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > ---
>> > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > > > > > > >  fs/nfs/inode.c         |  3 +--
>> > > > > > > >  fs/nfs/write.c         |  8 ++++++--
>> > > > > > > >  include/linux/nfs_fs.h |  1 -
>> > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > > > > > > 
>> > > > > > > > I have a baling wire and duct tape solution for testing
>> > > > > > > > this with
>> > > > > > > > xfstests (using iptables REJECT targets and soft
>> > > > > > > > mounts). This
>> > > > > > > > seems to
>> > > > > > > > make nfs do the right thing.
>> > > > > > > > 
>> > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > > > > > > --- a/fs/nfs/file.c
>> > > > > > > > +++ b/fs/nfs/file.c
>> > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
>> > > > > > > > *file,
>> > > > > > > > loff_t start, loff_t end, int datasync)
>> > > > > > > >  {
>> > > > > > > >  	struct nfs_open_context *ctx =
>> > > > > > > > nfs_file_open_context(file);
>> > > > > > > >  	struct inode *inode = file_inode(file);
>> > > > > > > > -	int have_error, do_resend, status;
>> > > > > > > > -	int ret = 0;
>> > > > > > > > +	int do_resend, status;
>> > > > > > > > +	int ret;
>> > > > > > > >  
>> > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
>> > > > > > > > file,
>> > > > > > > > datasync);
>> > > > > > > >  
>> > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > > > > > > >  	do_resend =
>> > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
>> > > > > > > > >flags);
>> > > > > > > > -	have_error =
>> > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > > > > > > &ctx->flags);
>> > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > -	have_error |=
>> > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > > flags);
>> > > > > > > > 
>> > > > > > > > -	if (have_error) {
>> > > > > > > > -		ret = xchg(&ctx->error, 0);
>> > > > > > > > -		if (ret)
>> > > > > > > > -			goto out;
>> > > > > > > > -	}
>> > > > > > > > -	if (status < 0) {
>> > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > >flags);
>> > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > +
>> > > > > > > > +	/* Recheck and advance after the commit */
>> > > > > > > > +	status = file_check_and_advance_wb_err(file);
>> > > > > > 
>> > > > > > This change makes the code inconsistent with the comment
>> > > > > > above the
>> > > > > > function, which still references ctx->error.  The intent of
>> > > > > > the
>> > > > > > comment
>> > > > > > is still correct, but the details have changed.
>> > > > > > 
>> > > > > 
>> > > > > Good catch. I'll fix that up in a respin.
>> > > > > 
>> > > > > > Also, there is a call to mapping_set_error() in
>> > > > > > nfs_pageio_add_request().
>> > > > > > I wonder if that should be changed to
>> > > > > >   nfs_context_set_write_error(req->wb_context, desc-
>> > > > > > >pg_error)
>> > > > > > ??
>> > > > > > 
>> > > > > 
>> > > > > Trickier question...
>> > > > > 
>> > > > > I'm not quite sure what semantics we're looking for with
>> > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
>> > > > > synchronous, but I'm not quite sure why it gets cleared the
>> > > > > way it
>> > > > > does. It's set on any error but cleared before issuing a
>> > > > > commit.
>> > > > > 
>> > > > > I added a similar flag to Ceph inodes recently, but only
>> > > > > clear it when
>> > > > > a write succeeds. Wouldn't that make more sense here as well?
>> > > > 
>> > > > It is a bit hard to wrap one's mind around.
>> > > > 
>> > > > In the original code (commit 7b159fc18d417980) it looks like:
>> > > >  - test-and-clear bit
>> > > >  - write and sync
>> > > >  - test-bit
>> > > > 
>> > > > This does, I think, seem safer than "clear on successful write"
>> > > > as the
>> > > > writes could complete out-of-order and I wouldn't be surprised
>> > > > if the
>> > > > unsuccessful ones completed with an error before the successful
>> > > > one -
>> > > > particularly with an error like EDQUOT.
>> > > > 
>> > > > However the current code does the writes before the test-and-
>> > > > clear, and
>> > > > only does the commit afterwards.  That makes it less clear why
>> > > > the
>> > > > current sequence is a good idea.
>> > > > 
>> > > > However ... nfs_file_fsync_commit() is only called if
>> > > > filemap_write_and_wait_range() returned with success, so we
>> > > > only clear
>> > > > the flag after successful writes(?).
>> > > > 
>> > > > Oh....
>> > > > This patch from me:
>> > > > 
>> > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
>> > > > handling.")
>> > > > 
>> > > > seems to have been reverted by
>> > > > 
>> > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
>> > > > page writeback failed")
>> > > > 
>> > > > which probably isn't good.  It appears that this code is very
>> > > > fragile
>> > > > and easily broken.
>> > 
>> > On further investigation, I think the problem that I fixed and then
>> > we
>> > reintroduced will be fixed again - more permanently - by your
>> > patch.
>> > The root problem is that nfs keeps error codes in a different way
>> > to the
>> > MM core.  By unifying those, the problem goes.
>> > (The specific problem is that writes which hit EDQUOT on the server
>> > can
>> >  report EIO on the client).
>> > 
>> > 
>> > > > Maybe we need to work out exactly what is required, and
>> > > > document it - so
>> > > > we can stop breaking it.
>> > > > Or maybe we need some unit tests.....
>> > > > 
>> > > 
>> > > Yes, laying out what's necessary for this would be very helpful.
>> > > We
>> > > clearly want to set the flag when an error occurs. Under what
>> > > circumstances should we be clearing it?
>> > 
>> > Well.... looking back at  7b159fc18d417980f57ae which introduced
>> > the
>> > flag, prior to that write errors (ctx->error) were only reported by
>> > nfs_file_flush and nfs_fsync, so only one close() and fsync().
>> > 
>> > After that commit, setting the flag would mean that errors could be
>> > returned by 'write'.  So clearing as part of returning the error
>> > makes
>> > perfect sense.
>> > 
>> > As long as the error gets recorded, and gets returned when it is
>> > recorded, it doesn't much matter when the flag is cleared.  With
>> > your
>> > patches we don't need to flag any more to get errors reliably
>> > reported.
>> > 
>> > Leaving the flag set means that writes go more slowly - we don't
>> > get
>> > large queue of background rights building up but destined for
>> > failure.
>> > This is the main point made in the comment message when the flag
>> > was
>> > introduced.
>> > Of course, by the time we first get an error there could already
>> > by a large queue, so we probably want that to drain completely
>> > before
>> > allowing async writes again.
>
> We already have this functionality implemented in the existing code.
>
>> > 
>> > It might make sense to have 2 flags.  One which says "writes should
>> > be
>> > synchronous", another that says "There was an error recently".
>> > We clear the error flag before calling nfs_fsync, and if it is
>> > still
>> > clear afterwards, we clear the sync-writes flag.  Maybe that is
>> > more
>> > complex than needed though.
>> > 
>
> We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
> see any global mechanism that will replace that.
>
>> > I'm leaning towards your suggestion that it doesn't matter very
>> > much
>> > when it gets cleared, and clearing it on any successful write is
>> > simplest.
>> > 
>> > So I'm still in favor of using nfs_context_set_write_error() in
>> > nfs_pageio_add_request(), primarily because it is most consistent -
>> > we
>> > don't need exceptions.
>> 
>> Thanks for taking a closer look. I can easily make the change above,
>> and
>> I do think that keeping this mechanism as simple as possible will
>> make
>> it easier to prevent bitrot.
>> 
>> That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
>> is a
>> per open file description object.
>> 
>> Is that the correct way to track this? All of the ctx's will share
>> the
>> same inode. If we're getting writeback errors for one context, it's
>> quite likely that we'll be seeing them via others.
>> 
>> I suppose the counterargument is when we have things like expiring
>> krb5
>> tickets. Write failures via an expiring set of creds may have no
>> effect
>> on writeback via other creds.
>> 
>> Still, I think a per-inode flag might make more sense here.
>> 
>> Thoughts?
>
> As far as I'm concerned, that would be a regression. The most common
> problem when flushing writeback data to the server aside from ENOSPC
> (and possibly ESTALE) is EACCES, which is particular to the file
> descriptor that opened the file.
>
> File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
> private to the file descriptor.

Thanks for the reminder that errors are per-context and this patch drops
this.  The per-context nature of errors in NFS was the reason that I
nagged Jeff to make errseq_t a stand-alone type rather than just a part
of address_space.  I had envisaged that it would be embedded in the
open_context as well.
We still could do that, but as there is precisely one open-file for each
open_context, the gains are not great.

However, while looking over the code to make sure I really understood it
and all the possible consequences of changing to errseq_t I found a few
anomalies.  The patch below addresses them all.

Would you see if they make sense to you?

Thanks,
NeilBrown


From: NeilBrown <neilb@suse.com>
Date: Mon, 11 Sep 2017 13:15:50 +1000
Subject: [PATCH] NFS: various changes relating to reporting IO errors.

1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
   They aren't used.

2/ Make nfs_context_set_write_error() a "static inline" in internal.h
   so we can...

3/ Use nfs_context_set_write_error() instead of mapping_set_error()
   if nfs_pageio_add_request() fails before sending any request.
   NFS generally keeps errors in the open_context, not the mapping,
   so this is more consistent.

4/ If filemap_write_and_wait_range() reports any error, still
   check ctx->error.  The value in ctx->error is likely to be
   more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
   cleared slightly earlier, before nfs_file_fsync_commit() is called,
   rather than at the start of that function.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/nfs/file.c     | 16 ++++++++++------
 fs/nfs/internal.h |  7 +++++++
 fs/nfs/pagelist.c |  4 ++--
 fs/nfs/write.c    |  7 -------
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index af330c31f627..ab324f14081f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
  * fall back to doing a synchronous write.
  */
 static int
-nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
+nfs_file_fsync_commit(struct file *file, int datasync)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = file_inode(file);
-	int have_error, do_resend, status;
+	int do_resend, status;
 	int ret = 0;
 
 	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
-	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 	status = nfs_commit_inode(inode, FLUSH_SYNC);
-	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	if (have_error) {
+	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
 		ret = xchg(&ctx->error, 0);
 		if (ret)
 			goto out;
@@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_nfs_fsync_enter(inode);
 
 	do {
+		struct nfs_open_context *ctx = nfs_file_open_context(file);
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
+			int ret2 = xchg(&ctx->error, 0);
+			if (ret2)
+				ret = ret2;
+		}
 		if (ret != 0)
 			break;
-		ret = nfs_file_fsync_commit(file, start, end, datasync);
+		ret = nfs_file_fsync_commit(file, datasync);
 		if (!ret)
 			ret = pnfs_sync_inode(inode, !!datasync);
 		/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index dc456416d2be..44c8962fec91 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
 		return false;
 	}
 }
+
+static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
+{
+	ctx->error = error;
+	smp_wmb();
+	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index de9066a92c0d..0ebd26b9a6bd 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1198,8 +1198,8 @@ out_failed:
 
 		/* remember fatal errors */
 		if (nfs_error_is_fatal(desc->pg_error))
-			mapping_set_error(desc->pg_inode->i_mapping,
-					  desc->pg_error);
+			nfs_context_set_write_error(req->wb_context,
+						    desc->pg_error);
 
 		func = desc->pg_completion_ops->error_cleanup;
 		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..f702bf2def79 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
 		kref_put(&ioc->refcount, nfs_io_completion_release);
 }
 
-static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
-{
-	ctx->error = error;
-	smp_wmb();
-	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-}
-
 /*
  * nfs_page_find_head_request_locked - find head request associated with @page
  *
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-11  3:24                 ` NeilBrown
@ 2017-09-11 10:46                   ` Jeff Layton
  2017-09-11 21:52                     ` NeilBrown
  2017-09-12  2:24                     ` Trond Myklebust
  1 sibling, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-09-11 10:46 UTC (permalink / raw)
  To: NeilBrown, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-nfs, linux-fsdevel

On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
> On Thu, Sep 07 2017, Trond Myklebust wrote:
> 
> > On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
> > > On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> > > > On Tue, Aug 29 2017, Jeff Layton wrote:
> > > > 
> > > > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > > > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > > > > 
> > > > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > > > > 
> > > > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > 
> > > > > > > > > > There is some ambiguity in nfs about how writeback
> > > > > > > > > > errors are
> > > > > > > > > > tracked.
> > > > > > > > > > 
> > > > > > > > > > For instance, nfs_pageio_add_request calls
> > > > > > > > > > mapping_set_error when
> > > > > > > > > > the
> > > > > > > > > > add fails, but we track errors that occur after adding
> > > > > > > > > > the
> > > > > > > > > > request
> > > > > > > > > > with a dedicated int error in the open context.
> > > > > > > > > > 
> > > > > > > > > > Now that we have better infrastructure for the vfs
> > > > > > > > > > layer, this
> > > > > > > > > > latter int is now unnecessary. Just have
> > > > > > > > > > nfs_context_set_write_error set
> > > > > > > > > > the error in the mapping when one occurs.
> > > > > > > > > > 
> > > > > > > > > > Have NFS use file_write_and_wait_range to initiate and
> > > > > > > > > > wait on
> > > > > > > > > > writeback
> > > > > > > > > > of the data, and then check again after issuing the
> > > > > > > > > > commit(s).
> > > > > > > > > > 
> > > > > > > > > > With this, we also don't need to pay attention to the
> > > > > > > > > > ERROR_WRITE
> > > > > > > > > > flag for reporting, and just clear it to indicate to
> > > > > > > > > > subsequent
> > > > > > > > > > writers that they should try to go asynchronous again.
> > > > > > > > > > 
> > > > > > > > > > In nfs_page_async_flush, sample the error before
> > > > > > > > > > locking and
> > > > > > > > > > joining
> > > > > > > > > > the requests, and check for errors since that point.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > I have a baling wire and duct tape solution for testing
> > > > > > > > > > this with
> > > > > > > > > > xfstests (using iptables REJECT targets and soft
> > > > > > > > > > mounts). This
> > > > > > > > > > seems to
> > > > > > > > > > make nfs do the right thing.
> > > > > > > > > > 
> > > > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > > > > --- a/fs/nfs/file.c
> > > > > > > > > > +++ b/fs/nfs/file.c
> > > > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
> > > > > > > > > > *file,
> > > > > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > > > > >  {
> > > > > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > > > > nfs_file_open_context(file);
> > > > > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > > > > -	int have_error, do_resend, status;
> > > > > > > > > > -	int ret = 0;
> > > > > > > > > > +	int do_resend, status;
> > > > > > > > > > +	int ret;
> > > > > > > > > >  
> > > > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
> > > > > > > > > > file,
> > > > > > > > > > datasync);
> > > > > > > > > >  
> > > > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > > > > >  	do_resend =
> > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
> > > > > > > > > > > flags);
> > > > > > > > > > 
> > > > > > > > > > -	have_error =
> > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > > > > &ctx->flags);
> > > > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > -	have_error |=
> > > > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > flags);
> > > > > > > > > > 
> > > > > > > > > > -	if (have_error) {
> > > > > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > > > > -		if (ret)
> > > > > > > > > > -			goto out;
> > > > > > > > > > -	}
> > > > > > > > > > -	if (status < 0) {
> > > > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > flags);
> > > > > > > > > > 
> > > > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > +
> > > > > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > > > > 
> > > > > > > > This change makes the code inconsistent with the comment
> > > > > > > > above the
> > > > > > > > function, which still references ctx->error.  The intent of
> > > > > > > > the
> > > > > > > > comment
> > > > > > > > is still correct, but the details have changed.
> > > > > > > > 
> > > > > > > 
> > > > > > > Good catch. I'll fix that up in a respin.
> > > > > > > 
> > > > > > > > Also, there is a call to mapping_set_error() in
> > > > > > > > nfs_pageio_add_request().
> > > > > > > > I wonder if that should be changed to
> > > > > > > >   nfs_context_set_write_error(req->wb_context, desc-
> > > > > > > > > pg_error)
> > > > > > > > 
> > > > > > > > ??
> > > > > > > > 
> > > > > > > 
> > > > > > > Trickier question...
> > > > > > > 
> > > > > > > I'm not quite sure what semantics we're looking for with
> > > > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > > > > synchronous, but I'm not quite sure why it gets cleared the
> > > > > > > way it
> > > > > > > does. It's set on any error but cleared before issuing a
> > > > > > > commit.
> > > > > > > 
> > > > > > > I added a similar flag to Ceph inodes recently, but only
> > > > > > > clear it when
> > > > > > > a write succeeds. Wouldn't that make more sense here as well?
> > > > > > 
> > > > > > It is a bit hard to wrap one's mind around.
> > > > > > 
> > > > > > In the original code (commit 7b159fc18d417980) it looks like:
> > > > > >  - test-and-clear bit
> > > > > >  - write and sync
> > > > > >  - test-bit
> > > > > > 
> > > > > > This does, I think, seem safer than "clear on successful write"
> > > > > > as the
> > > > > > writes could complete out-of-order and I wouldn't be surprised
> > > > > > if the
> > > > > > unsuccessful ones completed with an error before the successful
> > > > > > one -
> > > > > > particularly with an error like EDQUOT.
> > > > > > 
> > > > > > However the current code does the writes before the test-and-
> > > > > > clear, and
> > > > > > only does the commit afterwards.  That makes it less clear why
> > > > > > the
> > > > > > current sequence is a good idea.
> > > > > > 
> > > > > > However ... nfs_file_fsync_commit() is only called if
> > > > > > filemap_write_and_wait_range() returned with success, so we
> > > > > > only clear
> > > > > > the flag after successful writes(?).
> > > > > > 
> > > > > > Oh....
> > > > > > This patch from me:
> > > > > > 
> > > > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
> > > > > > handling.")
> > > > > > 
> > > > > > seems to have been reverted by
> > > > > > 
> > > > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
> > > > > > page writeback failed")
> > > > > > 
> > > > > > which probably isn't good.  It appears that this code is very
> > > > > > fragile
> > > > > > and easily broken.
> > > > 
> > > > On further investigation, I think the problem that I fixed and then
> > > > we
> > > > reintroduced will be fixed again - more permanently - by your
> > > > patch.
> > > > The root problem is that nfs keeps error codes in a different way
> > > > to the
> > > > MM core.  By unifying those, the problem goes.
> > > > (The specific problem is that writes which hit EDQUOT on the server
> > > > can
> > > >  report EIO on the client).
> > > > 
> > > > 
> > > > > > Maybe we need to work out exactly what is required, and
> > > > > > document it - so
> > > > > > we can stop breaking it.
> > > > > > Or maybe we need some unit tests.....
> > > > > > 
> > > > > 
> > > > > Yes, laying out what's necessary for this would be very helpful.
> > > > > We
> > > > > clearly want to set the flag when an error occurs. Under what
> > > > > circumstances should we be clearing it?
> > > > 
> > > > Well.... looking back at  7b159fc18d417980f57ae which introduced
> > > > the
> > > > flag, prior to that write errors (ctx->error) were only reported by
> > > > > nfs_file_flush and nfs_fsync, so only on close() and fsync().
> > > > 
> > > > After that commit, setting the flag would mean that errors could be
> > > > returned by 'write'.  So clearing as part of returning the error
> > > > makes
> > > > perfect sense.
> > > > 
> > > > As long as the error gets recorded, and gets returned when it is
> > > > recorded, it doesn't much matter when the flag is cleared.  With
> > > > your
> > > > patches we don't need to flag any more to get errors reliably
> > > > reported.
> > > > 
> > > > Leaving the flag set means that writes go more slowly - we don't
> > > > get
> > > > > large queue of background writes building up but destined for
> > > > failure.
> > > > This is the main point made in the comment message when the flag
> > > > was
> > > > introduced.
> > > > Of course, by the time we first get an error there could already
> > > > > be a large queue, so we probably want that to drain completely
> > > > before
> > > > allowing async writes again.
> > 
> > We already have this functionality implemented in the existing code.
> > 
> > > > 
> > > > It might make sense to have 2 flags.  One which says "writes should
> > > > be
> > > > synchronous", another that says "There was an error recently".
> > > > We clear the error flag before calling nfs_fsync, and if it is
> > > > still
> > > > clear afterwards, we clear the sync-writes flag.  Maybe that is
> > > > more
> > > > complex than needed though.
> > > > 
> > 
> > We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
> > see any global mechanism that will replace that.
> > 
> > > > I'm leaning towards your suggestion that it doesn't matter very
> > > > much
> > > > when it gets cleared, and clearing it on any successful write is
> > > > simplest.
> > > > 
> > > > So I'm still in favor of using nfs_context_set_write_error() in
> > > > nfs_pageio_add_request(), primarily because it is most consistent -
> > > > we
> > > > don't need exceptions.
> > > 
> > > Thanks for taking a closer look. I can easily make the change above,
> > > and
> > > I do think that keeping this mechanism as simple as possible will
> > > make
> > > it easier to prevent bitrot.
> > > 
> > > That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
> > > is a
> > > per open file description object.
> > > 
> > > Is that the correct way to track this? All of the ctx's will share
> > > the
> > > same inode. If we're getting writeback errors for one context, it's
> > > quite likely that we'll be seeing them via others.
> > > 
> > > I suppose the counterargument is when we have things like expiring
> > > krb5
> > > tickets. Write failures via an expiring set of creds may have no
> > > effect
> > > on writeback via other creds.
> > > 
> > > Still, I think a per-inode flag might make more sense here.
> > > 
> > > Thoughts?
> > 
> > As far as I'm concerned, that would be a regression. The most common
> > problem when flushing writeback data to the server aside from ENOSPC
> > (and possibly ESTALE) is EACCES, which is particular to the file
> > descriptor that opened the file.
> > 
> > File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
> > private to the file descriptor.
> 
> Thanks for the reminder that errors are per-context and this patch drops
> this.  The per-context nature of errors in NFS was the reason that I
> nagged Jeff to make errseq_t a stand-alone type rather than just a part
> of address_space.  I had envisaged that it would be embedded in the
> open_context as well.
> We still could do that, but as there is precisely one open-file for each
> open_context, the gains are not great.
> 
> However, while looking over the code to make sure I really understood it
> and all the possible consequences of changing to errseq_t I found a few
> anomalies.  The patch below addresses them all.
> 
> Would you see if they make sense to you?
> 
> Thanks,
> NeilBrown
> 
> 
> From: NeilBrown <neilb@suse.com>
> Date: Mon, 11 Sep 2017 13:15:50 +1000
> Subject: [PATCH] NFS: various changes relating to reporting IO errors.
> 
> 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
>    They aren't used.
> 
> 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
>    so we can...
> 
> 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
>    if nfs_pageio_add_request() fails before sending any request.
>    NFS generally keeps errors in the open_context, not the mapping,
>    so this is more consistent.
> 
> 4/ If filemap_write_and_wait_range() reports any error, still
>    check ctx->error.  The value in ctx->error is likely to be
>    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
>    cleared slightly earlier, before nfs_file_fsync_commit() is called,
>    rather than at the start of that function.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  fs/nfs/file.c     | 16 ++++++++++------
>  fs/nfs/internal.h |  7 +++++++
>  fs/nfs/pagelist.c |  4 ++--
>  fs/nfs/write.c    |  7 -------
>  4 files changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index af330c31f627..ab324f14081f 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
>   * fall back to doing a synchronous write.
>   */
>  static int
> -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
> +nfs_file_fsync_commit(struct file *file, int datasync)
>  {
>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>  	struct inode *inode = file_inode(file);
> -	int have_error, do_resend, status;
> +	int do_resend, status;
>  	int ret = 0;
>  
>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>  
>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>  	status = nfs_commit_inode(inode, FLUSH_SYNC);
> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -	if (have_error) {
> +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>  		ret = xchg(&ctx->error, 0);
>  		if (ret)
>  			goto out;
> @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>  	trace_nfs_fsync_enter(inode);
>  
>  	do {
> +		struct nfs_open_context *ctx = nfs_file_open_context(file);
>  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
> +			int ret2 = xchg(&ctx->error, 0);
> +			if (ret2)
> +				ret = ret2;
> +		}
>  		if (ret != 0)
>  			break;
> -		ret = nfs_file_fsync_commit(file, start, end, datasync);
> +		ret = nfs_file_fsync_commit(file, datasync);
>  		if (!ret)
>  			ret = pnfs_sync_inode(inode, !!datasync);
>  		/*
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index dc456416d2be..44c8962fec91 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
>  		return false;
>  	}
>  }
> +
> +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> +{
> +	ctx->error = error;
> +	smp_wmb();
> +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> +}
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index de9066a92c0d..0ebd26b9a6bd 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -1198,8 +1198,8 @@ out_failed:
>  
>  		/* remember fatal errors */
>  		if (nfs_error_is_fatal(desc->pg_error))
> -			mapping_set_error(desc->pg_inode->i_mapping,
> -					  desc->pg_error);
> +			nfs_context_set_write_error(req->wb_context,
> +						    desc->pg_error);
>  
>  		func = desc->pg_completion_ops->error_cleanup;
>  		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b1af5dee5e0a..f702bf2def79 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>  		kref_put(&ioc->refcount, nfs_io_completion_release);
>  }
>  
> -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> -{
> -	ctx->error = error;
> -	smp_wmb();
> -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -}
> -
>  /*
>   * nfs_page_find_head_request_locked - find head request associated with @page
>   *

This should probably be broken out into at least 2-3 different
patches.

Ok, so to make sure I understand:

All writeback is done under the aegis of an open context, and writes
from different open contexts are not mergeable. We also flush to the
server in the case that a dirty page is written via an incompatible open
context. So with that we can always tie a writeback error back to the
open context that issued the write.

In that case, yes...mixing in errseq_t doesn't really buy us much here,
and I agree with most of the changes above.

That said...I'm still not thrilled with how NFS_CONTEXT_ERROR_WRITE is
handled in this code. That flag is set when a write fails, but is only
cleared on fsync.

That seems wrong to me. Why wait for an fsync to start doing async
writes again once they start working? What if the application never does
an fsync? Clearing that flag on a successful WRITE seems like it'd make
more sense.
-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-11 10:46                   ` Jeff Layton
@ 2017-09-11 21:52                     ` NeilBrown
  2017-09-12 15:20                       ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: NeilBrown @ 2017-09-11 21:52 UTC (permalink / raw)
  To: Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-nfs, linux-fsdevel


> On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
>> On Thu, Sep 07 2017, Trond Myklebust wrote:
>> 
>> > On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
>> > > On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
>> > > > On Tue, Aug 29 2017, Jeff Layton wrote:
>> > > > 
>> > > > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
>> > > > > > On Mon, Aug 28 2017, Jeff Layton wrote:
>> > > > > > 
>> > > > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> > > > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
>> > > > > > > > 
>> > > > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > > > > > > > > From: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > > > 
>> > > > > > > > > > There is some ambiguity in nfs about how writeback
>> > > > > > > > > > errors are
>> > > > > > > > > > tracked.
>> > > > > > > > > > 
>> > > > > > > > > > For instance, nfs_pageio_add_request calls
>> > > > > > > > > > mapping_set_error when
>> > > > > > > > > > the
>> > > > > > > > > > add fails, but we track errors that occur after adding
>> > > > > > > > > > the
>> > > > > > > > > > request
>> > > > > > > > > > with a dedicated int error in the open context.
>> > > > > > > > > > 
>> > > > > > > > > > Now that we have better infrastructure for the vfs
>> > > > > > > > > > layer, this
>> > > > > > > > > > latter int is now unnecessary. Just have
>> > > > > > > > > > nfs_context_set_write_error set
>> > > > > > > > > > the error in the mapping when one occurs.
>> > > > > > > > > > 
>> > > > > > > > > > Have NFS use file_write_and_wait_range to initiate and
>> > > > > > > > > > wait on
>> > > > > > > > > > writeback
>> > > > > > > > > > of the data, and then check again after issuing the
>> > > > > > > > > > commit(s).
>> > > > > > > > > > 
>> > > > > > > > > > With this, we also don't need to pay attention to the
>> > > > > > > > > > ERROR_WRITE
>> > > > > > > > > > flag for reporting, and just clear it to indicate to
>> > > > > > > > > > subsequent
>> > > > > > > > > > writers that they should try to go asynchronous again.
>> > > > > > > > > > 
>> > > > > > > > > > In nfs_page_async_flush, sample the error before
>> > > > > > > > > > locking and
>> > > > > > > > > > joining
>> > > > > > > > > > the requests, and check for errors since that point.
>> > > > > > > > > > 
>> > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > > > ---
>> > > > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > > > > > > > > >  fs/nfs/inode.c         |  3 +--
>> > > > > > > > > >  fs/nfs/write.c         |  8 ++++++--
>> > > > > > > > > >  include/linux/nfs_fs.h |  1 -
>> > > > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > > > > > > > > 
>> > > > > > > > > > I have a baling wire and duct tape solution for testing
>> > > > > > > > > > this with
>> > > > > > > > > > xfstests (using iptables REJECT targets and soft
>> > > > > > > > > > mounts). This
>> > > > > > > > > > seems to
>> > > > > > > > > > make nfs do the right thing.
>> > > > > > > > > > 
>> > > > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > > > > > > > > --- a/fs/nfs/file.c
>> > > > > > > > > > +++ b/fs/nfs/file.c
>> > > > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
>> > > > > > > > > > *file,
>> > > > > > > > > > loff_t start, loff_t end, int datasync)
>> > > > > > > > > >  {
>> > > > > > > > > >  	struct nfs_open_context *ctx =
>> > > > > > > > > > nfs_file_open_context(file);
>> > > > > > > > > >  	struct inode *inode = file_inode(file);
>> > > > > > > > > > -	int have_error, do_resend, status;
>> > > > > > > > > > -	int ret = 0;
>> > > > > > > > > > +	int do_resend, status;
>> > > > > > > > > > +	int ret;
>> > > > > > > > > >  
>> > > > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
>> > > > > > > > > > file,
>> > > > > > > > > > datasync);
>> > > > > > > > > >  
>> > > > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > > > > > > > > >  	do_resend =
>> > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
>> > > > > > > > > > > flags);
>> > > > > > > > > > 
>> > > > > > > > > > -	have_error =
>> > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > > > > > > > > &ctx->flags);
>> > > > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > > > -	have_error |=
>> > > > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > > > > flags);
>> > > > > > > > > > 
>> > > > > > > > > > -	if (have_error) {
>> > > > > > > > > > -		ret = xchg(&ctx->error, 0);
>> > > > > > > > > > -		if (ret)
>> > > > > > > > > > -			goto out;
>> > > > > > > > > > -	}
>> > > > > > > > > > -	if (status < 0) {
>> > > > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > > > > flags);
>> > > > > > > > > > 
>> > > > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > > > +
>> > > > > > > > > > +	/* Recheck and advance after the commit */
>> > > > > > > > > > +	status = file_check_and_advance_wb_err(file);
>> > > > > > > > 
>> > > > > > > > This change makes the code inconsistent with the comment
>> > > > > > > > above the
>> > > > > > > > function, which still references ctx->error.  The intent of
>> > > > > > > > the
>> > > > > > > > comment
>> > > > > > > > is still correct, but the details have changed.
>> > > > > > > > 
>> > > > > > > 
>> > > > > > > Good catch. I'll fix that up in a respin.
>> > > > > > > 
>> > > > > > > > Also, there is a call to mapping_set_error() in
>> > > > > > > > nfs_pageio_add_request().
>> > > > > > > > I wonder if that should be changed to
>> > > > > > > >   nfs_context_set_write_error(req->wb_context, desc-
>> > > > > > > > > pg_error)
>> > > > > > > > 
>> > > > > > > > ??
>> > > > > > > > 
>> > > > > > > 
>> > > > > > > Trickier question...
>> > > > > > > 
>> > > > > > > I'm not quite sure what semantics we're looking for with
>> > > > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
>> > > > > > > synchronous, but I'm not quite sure why it gets cleared the
>> > > > > > > way it
>> > > > > > > does. It's set on any error but cleared before issuing a
>> > > > > > > commit.
>> > > > > > > 
>> > > > > > > I added a similar flag to Ceph inodes recently, but only
>> > > > > > > clear it when
>> > > > > > > a write succeeds. Wouldn't that make more sense here as well?
>> > > > > > 
>> > > > > > It is a bit hard to wrap one's mind around.
>> > > > > > 
>> > > > > > In the original code (commit 7b159fc18d417980) it looks like:
>> > > > > >  - test-and-clear bit
>> > > > > >  - write and sync
>> > > > > >  - test-bit
>> > > > > > 
>> > > > > > This does, I think, seem safer than "clear on successful write"
>> > > > > > as the
>> > > > > > writes could complete out-of-order and I wouldn't be surprised
>> > > > > > if the
>> > > > > > unsuccessful ones completed with an error before the successful
>> > > > > > one -
>> > > > > > particularly with an error like EDQUOT.
>> > > > > > 
>> > > > > > However the current code does the writes before the test-and-
>> > > > > > clear, and
>> > > > > > only does the commit afterwards.  That makes it less clear why
>> > > > > > the
>> > > > > > current sequence is a good idea.
>> > > > > > 
>> > > > > > However ... nfs_file_fsync_commit() is only called if
>> > > > > > filemap_write_and_wait_range() returned with success, so we
>> > > > > > only clear
>> > > > > > the flag after successful writes(?).
>> > > > > > 
>> > > > > > Oh....
>> > > > > > This patch from me:
>> > > > > > 
>> > > > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
>> > > > > > handling.")
>> > > > > > 
>> > > > > > seems to have been reverted by
>> > > > > > 
>> > > > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
>> > > > > > page writeback failed")
>> > > > > > 
>> > > > > > which probably isn't good.  It appears that this code is very
>> > > > > > fragile
>> > > > > > and easily broken.
>> > > > 
>> > > > On further investigation, I think the problem that I fixed and then
>> > > > we
>> > > > reintroduced will be fixed again - more permanently - by your
>> > > > patch.
>> > > > The root problem is that nfs keeps error codes in a different way
>> > > > to the
>> > > > MM core.  By unifying those, the problem goes.
>> > > > (The specific problem is that writes which hit EDQUOT on the server
>> > > > can
>> > > >  report EIO on the client).
>> > > > 
>> > > > 
>> > > > > > Maybe we need to work out exactly what is required, and
>> > > > > > document it - so
>> > > > > > we can stop breaking it.
>> > > > > > Or maybe we need some unit tests.....
>> > > > > > 
>> > > > > 
>> > > > > Yes, laying out what's necessary for this would be very helpful.
>> > > > > We
>> > > > > clearly want to set the flag when an error occurs. Under what
>> > > > > circumstances should we be clearing it?
>> > > > 
>> > > > Well.... looking back at  7b159fc18d417980f57ae which introduced
>> > > > the
>> > > > flag, prior to that write errors (ctx->error) were only reported by
>> > > > nfs_file_flush and nfs_fsync, so only on close() and fsync().
>> > > > 
>> > > > After that commit, setting the flag would mean that errors could be
>> > > > returned by 'write'.  So clearing as part of returning the error
>> > > > makes
>> > > > perfect sense.
>> > > > 
>> > > > As long as the error gets recorded, and gets returned when it is
>> > > > recorded, it doesn't much matter when the flag is cleared.  With
>> > > > your
>> > > > patches we don't need to flag any more to get errors reliably
>> > > > reported.
>> > > > 
>> > > > Leaving the flag set means that writes go more slowly - we don't
>> > > > get
>> > > > large queue of background writes building up but destined for
>> > > > failure.
>> > > > This is the main point made in the comment message when the flag
>> > > > was
>> > > > introduced.
>> > > > Of course, by the time we first get an error there could already
>> > > > be a large queue, so we probably want that to drain completely
>> > > > before
>> > > > allowing async writes again.
>> > 
>> > We already have this functionality implemented in the existing code.
>> > 
>> > > > 
>> > > > It might make sense to have 2 flags.  One which says "writes should
>> > > > be
>> > > > synchronous", another that says "There was an error recently".
>> > > > We clear the error flag before calling nfs_fsync, and if it is
>> > > > still
>> > > > clear afterwards, we clear the sync-writes flag.  Maybe that is
>> > > > more
>> > > > complex than needed though.
>> > > > 
>> > 
>> > We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
>> > see any global mechanism that will replace that.
>> > 
>> > > > I'm leaning towards your suggestion that it doesn't matter very
>> > > > much
>> > > > when it gets cleared, and clearing it on any successful write is
>> > > > simplest.
>> > > > 
>> > > > So I'm still in favor of using nfs_context_set_write_error() in
>> > > > nfs_pageio_add_request(), primarily because it is most consistent -
>> > > > we
>> > > > don't need exceptions.
>> > > 
>> > > Thanks for taking a closer look. I can easily make the change above,
>> > > and
>> > > I do think that keeping this mechanism as simple as possible will
>> > > make
>> > > it easier to prevent bitrot.
>> > > 
>> > > That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
>> > > is a
>> > > per open file description object.
>> > > 
>> > > Is that the correct way to track this? All of the ctx's will share
>> > > the
>> > > same inode. If we're getting writeback errors for one context, it's
>> > > quite likely that we'll be seeing them via others.
>> > > 
>> > > I suppose the counterargument is when we have things like expiring
>> > > krb5
>> > > tickets. Write failures via an expiring set of creds may have no
>> > > effect
>> > > on writeback via other creds.
>> > > 
>> > > Still, I think a per-inode flag might make more sense here.
>> > > 
>> > > Thoughts?
>> > 
>> > As far as I'm concerned, that would be a regression. The most common
>> > problem when flushing writeback data to the server aside from ENOSPC
>> > (and possibly ESTALE) is EACCES, which is particular to the file
>> > descriptor that opened the file.
>> > 
>> > File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
>> > private to the file descriptor.
>> 
>> Thanks for the reminder that errors are per-context and this patch drops
>> this.  The per-context nature of errors in NFS was the reason that I
>> nagged Jeff to make errseq_t a stand-alone type rather than just a part
>> of address_space.  I had envisaged that it would be embedded in the
>> open_context as well.
>> We still could do that, but as there is precisely one open-file for each
>> open_context, the gains are not great.
>> 
>> However, while looking over the code to make sure I really understood it
>> and all the possible consequences of changing to errseq_t I found a few
>> anomalies.  The patch below addresses them all.
>> 
>> Would you see if they make sense to you?
>> 
>> Thanks,
>> NeilBrown
>> 
>> 
>> From: NeilBrown <neilb@suse.com>
>> Date: Mon, 11 Sep 2017 13:15:50 +1000
>> Subject: [PATCH] NFS: various changes relating to reporting IO errors.
>> 
>> 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
>>    They aren't used.
>> 
>> 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
>>    so we can...
>> 
>> 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
>>    if nfs_pageio_add_request() fails before sending any request.
>>    NFS generally keeps errors in the open_context, not the mapping,
>>    so this is more consistent.
>> 
>> 4/ If filemap_write_and_wait_range() reports any error, still
>>    check ctx->error.  The value in ctx->error is likely to be
>>    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
>>    cleared slightly earlier, before nfs_file_fsync_commit() is called,
>>    rather than at the start of that function.
>> 
>> Signed-off-by: NeilBrown <neilb@suse.com>
>> ---
>>  fs/nfs/file.c     | 16 ++++++++++------
>>  fs/nfs/internal.h |  7 +++++++
>>  fs/nfs/pagelist.c |  4 ++--
>>  fs/nfs/write.c    |  7 -------
>>  4 files changed, 19 insertions(+), 15 deletions(-)
>> 
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index af330c31f627..ab324f14081f 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
>>   * fall back to doing a synchronous write.
>>   */
>>  static int
>> -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>> +nfs_file_fsync_commit(struct file *file, int datasync)
>>  {
>>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>>  	struct inode *inode = file_inode(file);
>> -	int have_error, do_resend, status;
>> +	int do_resend, status;
>>  	int ret = 0;
>>  
>>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>>  
>>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>>  	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -	if (have_error) {
>> +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>>  		ret = xchg(&ctx->error, 0);
>>  		if (ret)
>>  			goto out;
>> @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>  	trace_nfs_fsync_enter(inode);
>>  
>>  	do {
>> +		struct nfs_open_context *ctx = nfs_file_open_context(file);
>>  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
>> +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>> +			int ret2 = xchg(&ctx->error, 0);
>> +			if (ret2)
>> +				ret = ret2;
>> +		}
>>  		if (ret != 0)
>>  			break;
>> -		ret = nfs_file_fsync_commit(file, start, end, datasync);
>> +		ret = nfs_file_fsync_commit(file, datasync);
>>  		if (!ret)
>>  			ret = pnfs_sync_inode(inode, !!datasync);
>>  		/*
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index dc456416d2be..44c8962fec91 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
>>  		return false;
>>  	}
>>  }
>> +
>> +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>> +{
>> +	ctx->error = error;
>> +	smp_wmb();
>> +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> +}
>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> index de9066a92c0d..0ebd26b9a6bd 100644
>> --- a/fs/nfs/pagelist.c
>> +++ b/fs/nfs/pagelist.c
>> @@ -1198,8 +1198,8 @@ out_failed:
>>  
>>  		/* remember fatal errors */
>>  		if (nfs_error_is_fatal(desc->pg_error))
>> -			mapping_set_error(desc->pg_inode->i_mapping,
>> -					  desc->pg_error);
>> +			nfs_context_set_write_error(req->wb_context,
>> +						    desc->pg_error);
>>  
>>  		func = desc->pg_completion_ops->error_cleanup;
>>  		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index b1af5dee5e0a..f702bf2def79 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>>  		kref_put(&ioc->refcount, nfs_io_completion_release);
>>  }
>>  
>> -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>> -{
>> -	ctx->error = error;
>> -	smp_wmb();
>> -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -}
>> -
>>  /*
>>   * nfs_page_find_head_request_locked - find head request associated with @page
>>   *
>
> This should probably be broken out into at least a 2-3 different
> patches.
>
> Ok, so to make sure I understand:
>
> All writeback is done under the aegis of an open context, and writes
> from different open contexts are not mergeable. We also flush to the
> server in the case that a dirty page is written via an incompatible open
> context. So with that we can always tie

Not quite.  Writes from different open contexts are sometimes mergeable,
providing the credential is the same and there are no locks that might
get in the way. (nfs_flush_incompatible() gets rid of conflicting writes
to the same page as part of nfs_write_begin().)
When writes are merged, all contexts remain reachable from the request
through an 'nfs_page'. nfs_write_completion() iterates over all the
nfs_pages attached to the nfs_pgio_header, and sets the context
write_error from the hdr->error.

>
> In that case, yes...mixing in errseq_t doesn't really buy us much here,
> and I agree with most of the changes above.
>
> That said...I'm still not thrilled with how NFS_CONTEXT_ERROR_WRITE is
> handled in this code. That flag is set when a write fails, but is only
> cleared on fsync.
>
> That seems wrong to me. Why wait for an fsync to start doing async
> writes again once they start working? What if the application never does
> an fsync? Clearing that flag on a successful WRITE seems like it'd make
> more sense.

We don't really 'wait' for an fsync.  Having NFS_CONTEXT_ERROR_WRITE
means that the very next write will force an fsync
(nfs_need_check_write()).  So we really just wait for the next write.
The current code doesn't seem "obviously right" to me, but it isn't
"obviously wrong" either, and I can only make it obviously right to me
by making it more complex, and I don't think I can justify that.

Thanks,
NeilBrown

> -- 
> Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-11  3:24                 ` NeilBrown
@ 2017-09-12  2:24                     ` Trond Myklebust
  2017-09-12  2:24                     ` Trond Myklebust
  1 sibling, 0 replies; 33+ messages in thread
From: Trond Myklebust @ 2017-09-12  2:24 UTC (permalink / raw)
  To: anna.schumaker, jlayton, neilb, jlayton; +Cc: linux-nfs, linux-fsdevel

On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
> However, while looking over the code to make sure I really understood
> it
> and all the possible consequences of changing to errseq_t I found a
> few
> anomalies.  The patch below addresses them all.
> 
> Would you see if they make sense to you?
> 
> Thanks,
> NeilBrown
> 
> 
> From: NeilBrown <neilb@suse.com>
> Date: Mon, 11 Sep 2017 13:15:50 +1000
> Subject: [PATCH] NFS: various changes relating to reporting IO
> errors.
> 
> 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
>    They aren't used.
> 
> 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
>    so we can...
> 
> 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
>    if nfs_pageio_add_request() fails before sending any request.
>    NFS generally keeps errors in the open_context, not the mapping,
>    so this is more consistent.
> 
> 4/ If filemap_write_and_wait_range() reports any error, still
>    check ctx->error.  The value in ctx->error is likely to be
>    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
>    cleared slightly earlier, before nfs_file_fsync_commit() is
> called,
>    rather than at the start of that function.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  fs/nfs/file.c     | 16 ++++++++++------
>  fs/nfs/internal.h |  7 +++++++
>  fs/nfs/pagelist.c |  4 ++--
>  fs/nfs/write.c    |  7 -------
>  4 files changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index af330c31f627..ab324f14081f 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
>   * fall back to doing a synchronous write.
>   */
>  static int
> -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end,
> int datasync)
> +nfs_file_fsync_commit(struct file *file, int datasync)
>  {
>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>  	struct inode *inode = file_inode(file);
> -	int have_error, do_resend, status;
> +	int do_resend, status;
>  	int ret = 0;
>  
>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> datasync);
>  
>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES,
> &ctx->flags);
> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> &ctx->flags);
>  	status = nfs_commit_inode(inode, FLUSH_SYNC);
> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> >flags);
> -	if (have_error) {
> +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>  		ret = xchg(&ctx->error, 0);
>  		if (ret)
>  			goto out;
> @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start,
> loff_t end, int datasync)
>  	trace_nfs_fsync_enter(inode);
>  
>  	do {
> +		struct nfs_open_context *ctx =
> nfs_file_open_context(file);
>  		ret = filemap_write_and_wait_range(inode->i_mapping, 
> start, end);
> +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> &ctx->flags)) {
> +			int ret2 = xchg(&ctx->error, 0);
> +			if (ret2)
> +				ret = ret2;
> +		}
>  		if (ret != 0)
>  			break;
> -		ret = nfs_file_fsync_commit(file, start, end,
> datasync);
> +		ret = nfs_file_fsync_commit(file, datasync);
>  		if (!ret)
>  			ret = pnfs_sync_inode(inode, !!datasync);
>  		/*
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index dc456416d2be..44c8962fec91 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
>  		return false;
>  	}
>  }
> +
> +static inline void nfs_context_set_write_error(struct
> nfs_open_context *ctx, int error)
> +{
> +	ctx->error = error;
> +	smp_wmb();
> +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> +}
> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> index de9066a92c0d..0ebd26b9a6bd 100644
> --- a/fs/nfs/pagelist.c
> +++ b/fs/nfs/pagelist.c
> @@ -1198,8 +1198,8 @@ out_failed:
>  
>  		/* remember fatal errors */
>  		if (nfs_error_is_fatal(desc->pg_error))
> -			mapping_set_error(desc->pg_inode->i_mapping,
> -					  desc->pg_error);
> +			nfs_context_set_write_error(req->wb_context,
> +						    desc->pg_error);
>  
>  		func = desc->pg_completion_ops->error_cleanup;
>  		for (midx = 0; midx < desc->pg_mirror_count; midx++)
> {
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b1af5dee5e0a..f702bf2def79 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct
> nfs_io_completion *ioc)
>  		kref_put(&ioc->refcount, nfs_io_completion_release);
>  }
>  
> -static void nfs_context_set_write_error(struct nfs_open_context
> *ctx, int error)
> -{
> -	ctx->error = error;
> -	smp_wmb();
> -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -}
> -
>  /*
>   * nfs_page_find_head_request_locked - find head request associated
> with @page
>   *

That makes sense to me. I'm applying and will send as an update to
4.14...

-- 
Trond Myklebust
Linux NFS client maintainer, PrimaryData
trond.myklebust@primarydata.com

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
@ 2017-09-12  2:24                     ` Trond Myklebust
  0 siblings, 0 replies; 33+ messages in thread
From: Trond Myklebust @ 2017-09-12  2:24 UTC (permalink / raw)
  To: anna.schumaker, jlayton, neilb, jlayton; +Cc: linux-nfs, linux-fsdevel

T24gTW9uLCAyMDE3LTA5LTExIGF0IDEzOjI0ICsxMDAwLCBOZWlsQnJvd24gd3JvdGU6DQo+IEhv
d2V2ZXIsIHdoaWxlIGxvb2tpbmcgb3ZlciB0aGUgY29kZSB0byBtYWtlIHN1cmUgSSByZWFsbHkg
dW5kZXJzdG9vZA0KPiBpdA0KPiBhbmQgYWxsIHRoZSBwb3NzaWJsZSBjb25zZXF1ZW5jZXMgb2Yg
Y2hhbmdpbmcgdG8gZXJyc2VxX3QgSSBmb3VuZCBhDQo+IGZldw0KPiBhbm9tYWxpZXMuICBUaGUg
cGF0Y2ggYmVsb3cgYWRkcmVzc2VzIHRoZW0gYWxsLg0KPiANCj4gV291bGQgeW91IHNlZSBpZiB0
aGV5IG1heSBzZW5zZSB0byB5b3U/DQo+IA0KPiBUaGFua3MsDQo+IE5laWxCcm93bg0KPiANCj4g
DQo+IEZyb206IE5laWxCcm93biA8bmVpbGJAc3VzZS5jb20+DQo+IERhdGU6IE1vbiwgMTEgU2Vw
IDIwMTcgMTM6MTU6NTAgKzEwMDANCj4gU3ViamVjdDogW1BBVENIXSBORlM6IHZhcmlvdXMgY2hh
bmdlcyByZWxhdGluZyB0byByZXBvcnRpbmcgSU8NCj4gZXJyb3JzLg0KPiANCj4gMS8gcmVtb3Zl
ICdzdGFydCcgYW5kICdlbmQnIGFyZ3MgZnJvbSBuZnNfZmlsZV9mc3luY19jb21taXQoKS4NCj4g
ICAgVGhleSBhcmVuJ3QgdXNlZC4NCj4gDQo+IDIvIE1ha2UgbmZzX2NvbnRleHRfc2V0X3dyaXRl
X2Vycm9yKCkgYSAic3RhdGljIGlubGluZSIgaW4gaW50ZXJuYWwuaA0KPiAgICBzbyB3ZSBjYW4u
Li4NCj4gDQo+IDMvIFVzZSBuZnNfY29udGV4dF9zZXRfd3JpdGVfZXJyb3IoKSBpbnN0ZWFkIG9m
IG1hcHBpbmdfc2V0X2Vycm9yKCkNCj4gICAgaWYgbmZzX3BhZ2Vpb19hZGRfcmVxdWVzdCgpIGZh
aWxzIGJlZm9yZSBzZW5kaW5nIGFueSByZXF1ZXN0Lg0KPiAgICBORlMgZ2VuZXJhbGx5IGtlZXBz
IGVycm9ycyBpbiB0aGUgb3Blbl9jb250ZXh0LCBub3QgdGhlIG1hcHBpbmcsDQo+ICAgIHNvIHRo
aXMgaXMgbW9yZSBjb25zaXN0ZW50Lg0KPiANCj4gNC8gSWYgZmlsZW1hcF93cml0ZV9hbmRfd3Jp
dGVfcmFuZ2UoKSByZXBvcnRzIGFueSBlcnJvciwgc3RpbGwNCj4gICAgY2hlY2sgY3R4LT5lcnJv
ci4gIFRoZSB2YWx1ZSBpbiBjdHgtPmVycm9yIGlzIGxpa2VseSB0byBiZQ0KPiAgICBtb3JlIHVz
ZWZ1bC4gIEFzIHBhcnQgb2YgdGhpcywgTkZTX0NPTlRFWFRfRVJST1JfV1JJVEUgaXMNCj4gICAg
Y2xlYXJlZCBzbGlnaHRseSBlYXJsaWVyLCBiZWZvcmUgbmZzX2ZpbGVfZnN5bmNfY29tbWl0KCkg
aXMNCj4gY2FsbGVkLA0KPiAgICByYXRoZXIgdGhhbiBhdCB0aGUgc3RhcnQgb2YgdGhhdCBmdW5j
dGlvbi4NCj4gDQo+IFNpZ25lZC1vZmYtYnk6IE5laWxCcm93biA8bmVpbGJAc3VzZS5jb20+DQo+
IC0tLQ0KPiAgZnMvbmZzL2ZpbGUuYyAgICAgfCAxNiArKysrKysrKysrLS0tLS0tDQo+ICBmcy9u
ZnMvaW50ZXJuYWwuaCB8ICA3ICsrKysrKysNCj4gIGZzL25mcy9wYWdlbGlzdC5jIHwgIDQgKyst
LQ0KPiAgZnMvbmZzL3dyaXRlLmMgICAgfCAgNyAtLS0tLS0tDQo+ICA0IGZpbGVzIGNoYW5nZWQs
IDE5IGluc2VydGlvbnMoKyksIDE1IGRlbGV0aW9ucygtKQ0KPiANCj4gZGlmZiAtLWdpdCBhL2Zz
L25mcy9maWxlLmMgYi9mcy9uZnMvZmlsZS5jDQo+IGluZGV4IGFmMzMwYzMxZjYyNy4uYWIzMjRm
MTQwODFmIDEwMDY0NA0KPiAtLS0gYS9mcy9uZnMvZmlsZS5jDQo+ICsrKyBiL2ZzL25mcy9maWxl
LmMNCj4gQEAgLTIwOCwyMSArMjA4LDE5IEBAIEVYUE9SVF9TWU1CT0xfR1BMKG5mc19maWxlX21t
YXApOw0KPiAgICogZmFsbCBiYWNrIHRvIGRvaW5nIGEgc3luY2hyb25vdXMgd3JpdGUuDQo+ICAg
Ki8NCj4gIHN0YXRpYyBpbnQNCj4gLW5mc19maWxlX2ZzeW5jX2NvbW1pdChzdHJ1Y3QgZmlsZSAq
ZmlsZSwgbG9mZl90IHN0YXJ0LCBsb2ZmX3QgZW5kLA0KPiBpbnQgZGF0YXN5bmMpDQo+ICtuZnNf
ZmlsZV9mc3luY19jb21taXQoc3RydWN0IGZpbGUgKmZpbGUsIGludCBkYXRhc3luYykNCj4gIHsN
Cj4gIAlzdHJ1Y3QgbmZzX29wZW5fY29udGV4dCAqY3R4ID0gbmZzX2ZpbGVfb3Blbl9jb250ZXh0
KGZpbGUpOw0KPiAgCXN0cnVjdCBpbm9kZSAqaW5vZGUgPSBmaWxlX2lub2RlKGZpbGUpOw0KPiAt
CWludCBoYXZlX2Vycm9yLCBkb19yZXNlbmQsIHN0YXR1czsNCj4gKwlpbnQgZG9fcmVzZW5kLCBz
dGF0dXM7DQo+ICAJaW50IHJldCA9IDA7DQo+ICANCj4gIAlkcHJpbnRrKCJORlM6IGZzeW5jIGZp
bGUoJXBEMikgZGF0YXN5bmMgJWRcbiIsIGZpbGUsDQo+IGRhdGFzeW5jKTsNCj4gIA0KPiAgCW5m
c19pbmNfc3RhdHMoaW5vZGUsIE5GU0lPU19WRlNGU1lOQyk7DQo+ICAJZG9fcmVzZW5kID0gdGVz
dF9hbmRfY2xlYXJfYml0KE5GU19DT05URVhUX1JFU0VORF9XUklURVMsDQo+ICZjdHgtPmZsYWdz
KTsNCj4gLQloYXZlX2Vycm9yID0gdGVzdF9hbmRfY2xlYXJfYml0KE5GU19DT05URVhUX0VSUk9S
X1dSSVRFLA0KPiAmY3R4LT5mbGFncyk7DQo+ICAJc3RhdHVzID0gbmZzX2NvbW1pdF9pbm9kZShp
bm9kZSwgRkxVU0hfU1lOQyk7DQo+IC0JaGF2ZV9lcnJvciB8PSB0ZXN0X2JpdChORlNfQ09OVEVY
VF9FUlJPUl9XUklURSwgJmN0eC0NCj4gPmZsYWdzKTsNCj4gLQlpZiAoaGF2ZV9lcnJvcikgew0K
PiArCWlmICh0ZXN0X2JpdChORlNfQ09OVEVYVF9FUlJPUl9XUklURSwgJmN0eC0+ZmxhZ3MpKSB7
DQo+ICAJCXJldCA9IHhjaGcoJmN0eC0+ZXJyb3IsIDApOw0KPiAgCQlpZiAocmV0KQ0KPiAgCQkJ
Z290byBvdXQ7DQo+IEBAIC0yNDcsMTAgKzI0NSwxNiBAQCBuZnNfZmlsZV9mc3luYyhzdHJ1Y3Qg
ZmlsZSAqZmlsZSwgbG9mZl90IHN0YXJ0LA0KPiBsb2ZmX3QgZW5kLCBpbnQgZGF0YXN5bmMpDQo+
ICAJdHJhY2VfbmZzX2ZzeW5jX2VudGVyKGlub2RlKTsNCj4gIA0KPiAgCWRvIHsNCj4gKwkJc3Ry
dWN0IG5mc19vcGVuX2NvbnRleHQgKmN0eCA9DQo+IG5mc19maWxlX29wZW5fY29udGV4dChmaWxl
KTsNCj4gIAkJcmV0ID0gZmlsZW1hcF93cml0ZV9hbmRfd2FpdF9yYW5nZShpbm9kZS0+aV9tYXBw
aW5nLCANCj4gc3RhcnQsIGVuZCk7DQo+ICsJCWlmICh0ZXN0X2FuZF9jbGVhcl9iaXQoTkZTX0NP
TlRFWFRfRVJST1JfV1JJVEUsDQo+ICZjdHgtPmZsYWdzKSkgew0KPiArCQkJaW50IHJldDIgPSB4
Y2hnKCZjdHgtPmVycm9yLCAwKTsNCj4gKwkJCWlmIChyZXQyKQ0KPiArCQkJCXJldCA9IHJldDI7
DQo+ICsJCX0NCj4gIAkJaWYgKHJldCAhPSAwKQ0KPiAgCQkJYnJlYWs7DQo+IC0JCXJldCA9IG5m
c19maWxlX2ZzeW5jX2NvbW1pdChmaWxlLCBzdGFydCwgZW5kLA0KPiBkYXRhc3luYyk7DQo+ICsJ
CXJldCA9IG5mc19maWxlX2ZzeW5jX2NvbW1pdChmaWxlLCBkYXRhc3luYyk7DQo+ICAJCWlmICgh
cmV0KQ0KPiAgCQkJcmV0ID0gcG5mc19zeW5jX2lub2RlKGlub2RlLCAhIWRhdGFzeW5jKTsNCj4g
IAkJLyoNCj4gZGlmZiAtLWdpdCBhL2ZzL25mcy9pbnRlcm5hbC5oIGIvZnMvbmZzL2ludGVybmFs
LmgNCj4gaW5kZXggZGM0NTY0MTZkMmJlLi40NGM4OTYyZmVjOTEgMTAwNjQ0DQo+IC0tLSBhL2Zz
L25mcy9pbnRlcm5hbC5oDQo+ICsrKyBiL2ZzL25mcy9pbnRlcm5hbC5oDQo+IEBAIC03NjksMyAr
NzY5LDEwIEBAIHN0YXRpYyBpbmxpbmUgYm9vbCBuZnNfZXJyb3JfaXNfZmF0YWwoaW50IGVycikN
Cj4gIAkJcmV0dXJuIGZhbHNlOw0KPiAgCX0NCj4gIH0NCj4gKw0KPiArc3RhdGljIGlubGluZSB2
b2lkIG5mc19jb250ZXh0X3NldF93cml0ZV9lcnJvcihzdHJ1Y3QNCj4gbmZzX29wZW5fY29udGV4
dCAqY3R4LCBpbnQgZXJyb3IpDQo+ICt7DQo+ICsJY3R4LT5lcnJvciA9IGVycm9yOw0KPiArCXNt
cF93bWIoKTsNCj4gKwlzZXRfYml0KE5GU19DT05URVhUX0VSUk9SX1dSSVRFLCAmY3R4LT5mbGFn
cyk7DQo+ICt9DQo+IGRpZmYgLS1naXQgYS9mcy9uZnMvcGFnZWxpc3QuYyBiL2ZzL25mcy9wYWdl
bGlzdC5jDQo+IGluZGV4IGRlOTA2NmE5MmMwZC4uMGViZDI2YjlhNmJkIDEwMDY0NA0KPiAtLS0g
YS9mcy9uZnMvcGFnZWxpc3QuYw0KPiArKysgYi9mcy9uZnMvcGFnZWxpc3QuYw0KPiBAQCAtMTE5
OCw4ICsxMTk4LDggQEAgb3V0X2ZhaWxlZDoNCj4gIA0KPiAgCQkvKiByZW1lbWJlciBmYXRhbCBl
cnJvcnMgKi8NCj4gIAkJaWYgKG5mc19lcnJvcl9pc19mYXRhbChkZXNjLT5wZ19lcnJvcikpDQo+
IC0JCQltYXBwaW5nX3NldF9lcnJvcihkZXNjLT5wZ19pbm9kZS0+aV9tYXBwaW5nLA0KPiAtCQkJ
CQkgIGRlc2MtPnBnX2Vycm9yKTsNCj4gKwkJCW5mc19jb250ZXh0X3NldF93cml0ZV9lcnJvcihy
ZXEtPndiX2NvbnRleHQsDQo+ICsJCQkJCQkgICAgZGVzYy0+cGdfZXJyb3IpOw0KPiAgDQo+ICAJ
CWZ1bmMgPSBkZXNjLT5wZ19jb21wbGV0aW9uX29wcy0+ZXJyb3JfY2xlYW51cDsNCj4gIAkJZm9y
IChtaWR4ID0gMDsgbWlkeCA8IGRlc2MtPnBnX21pcnJvcl9jb3VudDsgbWlkeCsrKQ0KPiB7DQo+
IGRpZmYgLS1naXQgYS9mcy9uZnMvd3JpdGUuYyBiL2ZzL25mcy93cml0ZS5jDQo+IGluZGV4IGIx
YWY1ZGVlNWUwYS4uZjcwMmJmMmRlZjc5IDEwMDY0NA0KPiAtLS0gYS9mcy9uZnMvd3JpdGUuYw0K
PiArKysgYi9mcy9uZnMvd3JpdGUuYw0KPiBAQCAtMTQ3LDEzICsxNDcsNiBAQCBzdGF0aWMgdm9p
ZCBuZnNfaW9fY29tcGxldGlvbl9wdXQoc3RydWN0DQo+IG5mc19pb19jb21wbGV0aW9uICppb2Mp
DQo+ICAJCWtyZWZfcHV0KCZpb2MtPnJlZmNvdW50LCBuZnNfaW9fY29tcGxldGlvbl9yZWxlYXNl
KTsNCj4gIH0NCj4gIA0KPiAtc3RhdGljIHZvaWQgbmZzX2NvbnRleHRfc2V0X3dyaXRlX2Vycm9y
KHN0cnVjdCBuZnNfb3Blbl9jb250ZXh0DQo+ICpjdHgsIGludCBlcnJvcikNCj4gLXsNCj4gLQlj
dHgtPmVycm9yID0gZXJyb3I7DQo+IC0Jc21wX3dtYigpOw0KPiAtCXNldF9iaXQoTkZTX0NPTlRF
WFRfRVJST1JfV1JJVEUsICZjdHgtPmZsYWdzKTsNCj4gLX0NCj4gLQ0KPiAgLyoNCj4gICAqIG5m
c19wYWdlX2ZpbmRfaGVhZF9yZXF1ZXN0X2xvY2tlZCAtIGZpbmQgaGVhZCByZXF1ZXN0IGFzc29j
aWF0ZWQNCj4gd2l0aCBAcGFnZQ0KPiAgICoNCg0KVGhhdCBtYWtlcyBzZW5zZSB0byBtZS4gSSdt
IGFwcGx5aW5nIGFuZCB3aWxsIHNlbmQgYXMgYW4gdXBkYXRlIHRvDQo0LjE0Li4uDQoNCi0tIA0K
VHJvbmQgTXlrbGVidXN0DQpMaW51eCBORlMgY2xpZW50IG1haW50YWluZXIsIFByaW1hcnlEYXRh
DQp0cm9uZC5teWtsZWJ1c3RAcHJpbWFyeWRhdGEuY29tDQo=


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-12  2:24                     ` Trond Myklebust
  (?)
@ 2017-09-12  5:29                     ` NeilBrown
  -1 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-12  5:29 UTC (permalink / raw)
  To: Trond Myklebust, anna.schumaker, jlayton, jlayton
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 146 bytes --]

On Tue, Sep 12 2017, Trond Myklebust wrote:
>
> That makes sense to me. I'm applying and will send as an update to
> 4.14...
>

Thanks!
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-11 21:52                     ` NeilBrown
@ 2017-09-12 15:20                       ` Jeff Layton
  2017-09-12 21:47                         ` NeilBrown
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-09-12 15:20 UTC (permalink / raw)
  To: NeilBrown, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-nfs, linux-fsdevel

On Tue, 2017-09-12 at 07:52 +1000, NeilBrown wrote:
> > On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
> > > On Thu, Sep 07 2017, Trond Myklebust wrote:
> > > 
> > > > On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
> > > > > On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> > > > > > On Tue, Aug 29 2017, Jeff Layton wrote:
> > > > > > 
> > > > > > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > > > > > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > > > > > > 
> > > > > > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > > > > > > 
> > > > > > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > > > 
> > > > > > > > > > > > There is some ambiguity in nfs about how writeback
> > > > > > > > > > > > errors are
> > > > > > > > > > > > tracked.
> > > > > > > > > > > > 
> > > > > > > > > > > > For instance, nfs_pageio_add_request calls
> > > > > > > > > > > > mapping_set_error when
> > > > > > > > > > > > the
> > > > > > > > > > > > add fails, but we track errors that occur after adding
> > > > > > > > > > > > the
> > > > > > > > > > > > request
> > > > > > > > > > > > with a dedicated int error in the open context.
> > > > > > > > > > > > 
> > > > > > > > > > > > Now that we have better infrastructure for the vfs
> > > > > > > > > > > > layer, this
> > > > > > > > > > > > latter int is now unnecessary. Just have
> > > > > > > > > > > > nfs_context_set_write_error set
> > > > > > > > > > > > the error in the mapping when one occurs.
> > > > > > > > > > > > 
> > > > > > > > > > > > Have NFS use file_write_and_wait_range to initiate and
> > > > > > > > > > > > wait on
> > > > > > > > > > > > writeback
> > > > > > > > > > > > of the data, and then check again after issuing the
> > > > > > > > > > > > commit(s).
> > > > > > > > > > > > 
> > > > > > > > > > > > With this, we also don't need to pay attention to the
> > > > > > > > > > > > ERROR_WRITE
> > > > > > > > > > > > flag for reporting, and just clear it to indicate to
> > > > > > > > > > > > subsequent
> > > > > > > > > > > > writers that they should try to go asynchronous again.
> > > > > > > > > > > > 
> > > > > > > > > > > > In nfs_page_async_flush, sample the error before
> > > > > > > > > > > > locking and
> > > > > > > > > > > > joining
> > > > > > > > > > > > the requests, and check for errors since that point.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > > > ---
> > > > > > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > I have a baling wire and duct tape solution for testing
> > > > > > > > > > > > this with
> > > > > > > > > > > > xfstests (using iptables REJECT targets and soft
> > > > > > > > > > > > mounts). This
> > > > > > > > > > > > seems to
> > > > > > > > > > > > make nfs do the right thing.
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > > > > > > --- a/fs/nfs/file.c
> > > > > > > > > > > > +++ b/fs/nfs/file.c
> > > > > > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
> > > > > > > > > > > > *file,
> > > > > > > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > > > > > > >  {
> > > > > > > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > > > > > > nfs_file_open_context(file);
> > > > > > > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > > > > > > -	int have_error, do_resend, status;
> > > > > > > > > > > > -	int ret = 0;
> > > > > > > > > > > > +	int do_resend, status;
> > > > > > > > > > > > +	int ret;
> > > > > > > > > > > >  
> > > > > > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
> > > > > > > > > > > > file,
> > > > > > > > > > > > datasync);
> > > > > > > > > > > >  
> > > > > > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > > > > > > >  	do_resend =
> > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
> > > > > > > > > > > > > flags);
> > > > > > > > > > > > 
> > > > > > > > > > > > -	have_error =
> > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > > > > > > &ctx->flags);
> > > > > > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > > > -	have_error |=
> > > > > > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > > > flags);
> > > > > > > > > > > > 
> > > > > > > > > > > > -	if (have_error) {
> > > > > > > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > > > > > > -		if (ret)
> > > > > > > > > > > > -			goto out;
> > > > > > > > > > > > -	}
> > > > > > > > > > > > -	if (status < 0) {
> > > > > > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > > > flags);
> > > > > > > > > > > > 
> > > > > > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > > > > > > 
> > > > > > > > > > This change makes the code inconsistent with the comment
> > > > > > > > > > above the
> > > > > > > > > > function, which still references ctx->error.  The intent of
> > > > > > > > > > the
> > > > > > > > > > comment
> > > > > > > > > > is still correct, but the details have changed.
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Good catch. I'll fix that up in a respin.
> > > > > > > > > 
> > > > > > > > > > Also, there is a call to mapping_set_error() in
> > > > > > > > > > nfs_pageio_add_request().
> > > > > > > > > > I wonder if that should be changed to
> > > > > > > > > >   nfs_context_set_write_error(req->wb_context, desc-
> > > > > > > > > > > pg_error)
> > > > > > > > > > 
> > > > > > > > > > ??
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Trickier question...
> > > > > > > > > 
> > > > > > > > > I'm not quite sure what semantics we're looking for with
> > > > > > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > > > > > > synchronous, but I'm not quite sure why it gets cleared the
> > > > > > > > > way it
> > > > > > > > > does. It's set on any error but cleared before issuing a
> > > > > > > > > commit.
> > > > > > > > > 
> > > > > > > > > I added a similar flag to Ceph inodes recently, but only
> > > > > > > > > clear it when
> > > > > > > > > a write succeeds. Wouldn't that make more sense here as well?
> > > > > > > > 
> > > > > > > > It is a bit hard to wrap one's mind around.
> > > > > > > > 
> > > > > > > > In the original code (commit 7b159fc18d417980) it looks like:
> > > > > > > >  - test-and-clear bit
> > > > > > > >  - write and sync
> > > > > > > >  - test-bit
> > > > > > > > 
> > > > > > > > This does, I think, seem safer than "clear on successful write"
> > > > > > > > as the
> > > > > > > > writes could complete out-of-order and I wouldn't be surprised
> > > > > > > > if the
> > > > > > > > unsuccessful ones completed with an error before the successful
> > > > > > > > one -
> > > > > > > > particularly with an error like EDQUOT.
> > > > > > > > 
> > > > > > > > However the current code does the writes before the test-and-
> > > > > > > > clear, and
> > > > > > > > only does the commit afterwards.  That makes it less clear why
> > > > > > > > the
> > > > > > > > current sequence is a good idea.
> > > > > > > > 
> > > > > > > > However ... nfs_file_fsync_commit() is only called if
> > > > > > > > filemap_write_and_wait_range() returned with success, so we
> > > > > > > > only clear
> > > > > > > > the flag after successful writes(?).
> > > > > > > > 
> > > > > > > > Oh....
> > > > > > > > This patch from me:
> > > > > > > > 
> > > > > > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
> > > > > > > > handling.")
> > > > > > > > 
> > > > > > > > seems to have been reverted by
> > > > > > > > 
> > > > > > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
> > > > > > > > page writeback failed")
> > > > > > > > 
> > > > > > > > which probably isn't good.  It appears that this code is very
> > > > > > > > fragile
> > > > > > > > and easily broken.
> > > > > > 
> > > > > > On further investigation, I think the problem that I fixed and then
> > > > > > we
> > > > > > reintroduced will be fixed again - more permanently - by your
> > > > > > patch.
> > > > > > The root problem is that nfs keeps error codes in a different way
> > > > > > to the
> > > > > > MM core.  By unifying those, the problem goes.
> > > > > > (The specific problem is that writes which hit EDQUOT on the server
> > > > > > can
> > > > > >  report EIO on the client).
> > > > > > 
> > > > > > 
> > > > > > > > Maybe we need to work out exactly what is required, and
> > > > > > > > document it - so
> > > > > > > > we can stop breaking it.
> > > > > > > > Or maybe we need some unit tests.....
> > > > > > > > 
> > > > > > > 
> > > > > > > Yes, laying out what's necessary for this would be very helpful.
> > > > > > > We
> > > > > > > clearly want to set the flag when an error occurs. Under what
> > > > > > > circumstances should we be clearing it?
> > > > > > 
> > > > > > Well.... looking back at  7b159fc18d417980f57ae which introduced
> > > > > > the
> > > > > > flag, prior to that write errors (ctx->error) were only reported by
> > > > > > nfs_file_flush and nfs_fsync, so only on close() and fsync().
> > > > > > 
> > > > > > After that commit, setting the flag would mean that errors could be
> > > > > > returned by 'write'.  So clearing as part of returning the error
> > > > > > makes
> > > > > > perfect sense.
> > > > > > 
> > > > > > As long as the error gets recorded, and gets returned when it is
> > > > > > recorded, it doesn't much matter when the flag is cleared.  With
> > > > > > your
> > > > > > patches we don't need the flag any more to get errors reliably
> > > > > > reported.
> > > > > > 
> > > > > > Leaving the flag set means that writes go more slowly - we don't
> > > > > > get
> > > > > > a large queue of background writes building up but destined for
> > > > > > failure.
> > > > > > This is the main point made in the comment message when the flag
> > > > > > was
> > > > > > introduced.
> > > > > > Of course, by the time we first get an error there could already
> > > > > > be a large queue, so we probably want that to drain completely
> > > > > > before
> > > > > > allowing async writes again.
> > > > 
> > > > We already have this functionality implemented in the existing code.
> > > > 
> > > > > > 
> > > > > > It might make sense to have 2 flags.  One which says "writes should
> > > > > > be
> > > > > > synchronous", another that says "There was an error recently".
> > > > > > We clear the error flag before calling nfs_fsync, and if it is
> > > > > > still
> > > > > > clear afterwards, we clear the sync-writes flag.  Maybe that is
> > > > > > more
> > > > > > complex than needed though.
> > > > > > 
> > > > 
> > > > We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
> > > > see any global mechanism that will replace that.
> > > > 
> > > > > > I'm leaning towards your suggestion that it doesn't matter very
> > > > > > much
> > > > > > when it gets cleared, and clearing it on any successful write is
> > > > > > simplest.
> > > > > > 
> > > > > > So I'm still in favor of using nfs_context_set_write_error() in
> > > > > > nfs_pageio_add_request(), primarily because it is most consistent -
> > > > > > we
> > > > > > don't need exceptions.
> > > > > 
> > > > > Thanks for taking a closer look. I can easily make the change above,
> > > > > and
> > > > > I do think that keeping this mechanism as simple as possible will
> > > > > make
> > > > > it easier to prevent bitrot.
> > > > > 
> > > > > That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
> > > > > is a
> > > > > per open file description object.
> > > > > 
> > > > > Is that the correct way to track this? All of the ctx's will share
> > > > > the
> > > > > same inode. If we're getting writeback errors for one context, it's
> > > > > quite likely that we'll be seeing them via others.
> > > > > 
> > > > > I suppose the counterargument is when we have things like expiring
> > > > > krb5
> > > > > tickets. Write failures via an expiring set of creds may have no
> > > > > effect
> > > > > on writeback via other creds.
> > > > > 
> > > > > Still, I think a per-inode flag might make more sense here.
> > > > > 
> > > > > Thoughts?
> > > > 
> > > > As far as I'm concerned, that would be a regression. The most common
> > > > problem when flushing writeback data to the server aside from ENOSPC
> > > > (and possibly ESTALE) is EACCES, which is particular to the file
> > > > descriptor that opened the file.
> > > > 
> > > > File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
> > > > private to the file descriptor.
> > > 
> > > Thanks for the reminder that errors are per-context and this patch drops
> > > this.  The per-context nature of errors in NFS was the reason that I
> > > nagged Jeff to make errseq_t a stand-alone type rather than just a part
> > > of address_space.  I had envisaged that it would be embedded in the
> > > open_context as well.
> > > We still could do that, but as there is precisely one open-file for each
> > > open_context, the gains are not great.
> > > 
> > > However, while looking over the code to make sure I really understood it
> > > and all the possible consequences of changing to errseq_t I found a few
> > > anomalies.  The patch below addresses them all.
> > > 
> > > > Would you see if they make sense to you?
> > > 
> > > Thanks,
> > > NeilBrown
> > > 
> > > 
> > > From: NeilBrown <neilb@suse.com>
> > > Date: Mon, 11 Sep 2017 13:15:50 +1000
> > > Subject: [PATCH] NFS: various changes relating to reporting IO errors.
> > > 
> > > 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
> > >    They aren't used.
> > > 
> > > 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
> > >    so we can...
> > > 
> > > 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
> > >    if nfs_pageio_add_request() fails before sending any request.
> > >    NFS generally keeps errors in the open_context, not the mapping,
> > >    so this is more consistent.
> > > 
> > > > 4/ If filemap_write_and_wait_range() reports any error, still
> > >    check ctx->error.  The value in ctx->error is likely to be
> > >    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
> > >    cleared slightly earlier, before nfs_file_fsync_commit() is called,
> > >    rather than at the start of that function.
> > > 
> > > Signed-off-by: NeilBrown <neilb@suse.com>
> > > ---
> > >  fs/nfs/file.c     | 16 ++++++++++------
> > >  fs/nfs/internal.h |  7 +++++++
> > >  fs/nfs/pagelist.c |  4 ++--
> > >  fs/nfs/write.c    |  7 -------
> > >  4 files changed, 19 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > index af330c31f627..ab324f14081f 100644
> > > --- a/fs/nfs/file.c
> > > +++ b/fs/nfs/file.c
> > > @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
> > >   * fall back to doing a synchronous write.
> > >   */
> > >  static int
> > > -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
> > > +nfs_file_fsync_commit(struct file *file, int datasync)
> > >  {
> > >  	struct nfs_open_context *ctx = nfs_file_open_context(file);
> > >  	struct inode *inode = file_inode(file);
> > > -	int have_error, do_resend, status;
> > > +	int do_resend, status;
> > >  	int ret = 0;
> > >  
> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
> > >  
> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > >  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > >  	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > -	if (have_error) {
> > > +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
> > >  		ret = xchg(&ctx->error, 0);
> > >  		if (ret)
> > >  			goto out;
> > > @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > >  	trace_nfs_fsync_enter(inode);
> > >  
> > >  	do {
> > > +		struct nfs_open_context *ctx = nfs_file_open_context(file);
> > >  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> > > +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
> > > +			int ret2 = xchg(&ctx->error, 0);
> > > +			if (ret2)
> > > +				ret = ret2;
> > > +		}
> > >  		if (ret != 0)
> > >  			break;
> > > -		ret = nfs_file_fsync_commit(file, start, end, datasync);
> > > +		ret = nfs_file_fsync_commit(file, datasync);
> > >  		if (!ret)
> > >  			ret = pnfs_sync_inode(inode, !!datasync);
> > >  		/*
> > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > > index dc456416d2be..44c8962fec91 100644
> > > --- a/fs/nfs/internal.h
> > > +++ b/fs/nfs/internal.h
> > > @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
> > >  		return false;
> > >  	}
> > >  }
> > > +
> > > +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> > > +{
> > > +	ctx->error = error;
> > > +	smp_wmb();
> > > +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > +}
> > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> > > index de9066a92c0d..0ebd26b9a6bd 100644
> > > --- a/fs/nfs/pagelist.c
> > > +++ b/fs/nfs/pagelist.c
> > > @@ -1198,8 +1198,8 @@ out_failed:
> > >  
> > >  		/* remember fatal errors */
> > >  		if (nfs_error_is_fatal(desc->pg_error))
> > > -			mapping_set_error(desc->pg_inode->i_mapping,
> > > -					  desc->pg_error);
> > > +			nfs_context_set_write_error(req->wb_context,
> > > +						    desc->pg_error);
> > >  
> > >  		func = desc->pg_completion_ops->error_cleanup;
> > >  		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
> > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > > index b1af5dee5e0a..f702bf2def79 100644
> > > --- a/fs/nfs/write.c
> > > +++ b/fs/nfs/write.c
> > > @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
> > >  		kref_put(&ioc->refcount, nfs_io_completion_release);
> > >  }
> > >  
> > > -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> > > -{
> > > -	ctx->error = error;
> > > -	smp_wmb();
> > > -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > -}
> > > -
> > >  /*
> > >   * nfs_page_find_head_request_locked - find head request associated with @page
> > >   *
> > 
> > > This should probably be broken out into at least 2-3 different
> > patches.
> > 
> > Ok, so to make sure I understand:
> > 
> > All writeback is done under the aegis of an open context, and writes
> > from different open contexts are not mergeable. We also flush to the
> > server in the case that a dirty page is written via an incompatible open
> > context. So with that we can always tie
> 
> Not quite.  Writes from different open contexts are sometimes mergeable,
> providing the credential is the same and there are no locks that might
> > get in the way. (nfs_flush_incompatible() gets rid of conflicting writes
> > to the same page as part of nfs_write_begin().)
> When writes are merged, all contexts remain reachable from the request
> through an 'nfs_page'. nfs_write_completion() iterates over all the
> nfs_pages attached to the nfs_pgio_header, and sets the context
> write_error from the hdr->error.
> 

Ok, by this account, NFS should already have "correct" error reporting
semantics on fsync. i.e. when the file is written via multiple fds, you
should get back an error on all fds if those writebacks failed.

I have a test for nfs for the new-style error reporting:

    https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/xfstests-dev.git/log/?h=wberr

The nfs test is still pretty rickety, using soft mounts and iptables to
cause requests to fail. With the patch I originally proposed, this test
would pass. When I run this test on normal mainline kernels, it fails:

-------------------------------8<------------------------------
FSTYP         -- nfs
PLATFORM      -- Linux/x86_64 wberr 4.12.11-300.fc26.x86_64
MKFS_OPTIONS  -- knfsdsrv:/export/scratch
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 knfsdsrv:/export/scratch /mnt/scratch

nfs/002	 - output mismatch (see /home/jlayton/git/xfstests/results//nfs/002.out.bad)
    --- tests/nfs/002.out	2017-07-19 13:12:59.354561869 -0400
    +++ /home/jlayton/git/xfstests/results//nfs/002.out.bad	2017-09-12 11:17:17.335943539 -0400
    @@ -1,3 +1,3 @@
     QA output created by 002
     Format and mount
    -Test passed!
    +Success on second fsync on fd[1]!
    ...
    (Run 'diff -u tests/nfs/002.out /home/jlayton/git/xfstests/results//nfs/002.out.bad'  to see the entire diff)
Ran: nfs/002
Failures: nfs/002
Failed 1 of 1 tests
-------------------------------8<------------------------------

I'm not sure that errors are really propagated to all struct files like
you suggest above. I'll plan to look a little more closely at what's
happening here, when I get some time.

> > 
> > In that case, yes...mixing in errseq_t doesn't really buy us much here,
> > and I agree with most of the changes above.
> > 
> > That said...I'm still not thrilled with how NFS_CONTEXT_ERROR_WRITE is
> > handled in this code. That flag is set when a write fails, but is only
> > cleared on fsync.
> > 
> > That seems wrong to me. Why wait for an fsync to start doing async
> > writes again once they start working? What if the application never does
> > an fsync? Clearing that flag on a successful WRITE seems like it'd make
> > more sense.
> 
> We don't really 'wait' for an fsync.  Having NFS_CONTEXT_ERROR_WRITE
> means that the very next write will force an fsync
> (nfs_need_check_write()).  So we really just wait for the next write.
> The current code doesn't seem "obviously right" to me, but it isn't
> "obviously wrong" either, and I can only make it obviously right to me
> by making it more complex, and I don't think I can justify that.

Thanks for pointing this out. I missed the bit about it forcing the
fsync when this fails. I agree that that should be fine.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-12 15:20                       ` Jeff Layton
@ 2017-09-12 21:47                         ` NeilBrown
  2017-09-13 12:23                           ` Jeff Layton
  0 siblings, 1 reply; 33+ messages in thread
From: NeilBrown @ 2017-09-12 21:47 UTC (permalink / raw)
  To: Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 24786 bytes --]

On Tue, Sep 12 2017, Jeff Layton wrote:

> On Tue, 2017-09-12 at 07:52 +1000, NeilBrown wrote:
>> > On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
>> > > On Thu, Sep 07 2017, Trond Myklebust wrote:
>> > > 
>> > > > On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
>> > > > > On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
>> > > > > > On Tue, Aug 29 2017, Jeff Layton wrote:
>> > > > > > 
>> > > > > > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
>> > > > > > > > On Mon, Aug 28 2017, Jeff Layton wrote:
>> > > > > > > > 
>> > > > > > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> > > > > > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
>> > > > > > > > > > 
>> > > > > > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > > > > > > > > > > From: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > > > > > 
>> > > > > > > > > > > > There is some ambiguity in nfs about how writeback
>> > > > > > > > > > > > errors are
>> > > > > > > > > > > > tracked.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > For instance, nfs_pageio_add_request calls
>> > > > > > > > > > > > mapping_set_error when
>> > > > > > > > > > > > the
>> > > > > > > > > > > > add fails, but we track errors that occur after adding
>> > > > > > > > > > > > the
>> > > > > > > > > > > > request
>> > > > > > > > > > > > with a dedicated int error in the open context.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > Now that we have better infrastructure for the vfs
>> > > > > > > > > > > > layer, this
>> > > > > > > > > > > > latter int is now unnecessary. Just have
>> > > > > > > > > > > > nfs_context_set_write_error set
>> > > > > > > > > > > > the error in the mapping when one occurs.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > Have NFS use file_write_and_wait_range to initiate and
>> > > > > > > > > > > > wait on
>> > > > > > > > > > > > writeback
>> > > > > > > > > > > > of the data, and then check again after issuing the
>> > > > > > > > > > > > commit(s).
>> > > > > > > > > > > > 
>> > > > > > > > > > > > With this, we also don't need to pay attention to the
>> > > > > > > > > > > > ERROR_WRITE
>> > > > > > > > > > > > flag for reporting, and just clear it to indicate to
>> > > > > > > > > > > > subsequent
>> > > > > > > > > > > > writers that they should try to go asynchronous again.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > In nfs_page_async_flush, sample the error before
>> > > > > > > > > > > > locking and
>> > > > > > > > > > > > joining
>> > > > > > > > > > > > the requests, and check for errors since that point.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > > > > > > > > > ---
>> > > > > > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > > > > > > > > > > >  fs/nfs/inode.c         |  3 +--
>> > > > > > > > > > > >  fs/nfs/write.c         |  8 ++++++--
>> > > > > > > > > > > >  include/linux/nfs_fs.h |  1 -
>> > > > > > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > > > > > > > > > > 
>> > > > > > > > > > > > I have a baling wire and duct tape solution for testing
>> > > > > > > > > > > > this with
>> > > > > > > > > > > > xfstests (using iptables REJECT targets and soft
>> > > > > > > > > > > > mounts). This
>> > > > > > > > > > > > seems to
>> > > > > > > > > > > > make nfs do the right thing.
>> > > > > > > > > > > > 
>> > > > > > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > > > > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > > > > > > > > > > --- a/fs/nfs/file.c
>> > > > > > > > > > > > +++ b/fs/nfs/file.c
>> > > > > > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
>> > > > > > > > > > > > *file,
>> > > > > > > > > > > > loff_t start, loff_t end, int datasync)
>> > > > > > > > > > > >  {
>> > > > > > > > > > > >  	struct nfs_open_context *ctx =
>> > > > > > > > > > > > nfs_file_open_context(file);
>> > > > > > > > > > > >  	struct inode *inode = file_inode(file);
>> > > > > > > > > > > > -	int have_error, do_resend, status;
>> > > > > > > > > > > > -	int ret = 0;
>> > > > > > > > > > > > +	int do_resend, status;
>> > > > > > > > > > > > +	int ret;
>> > > > > > > > > > > >  
>> > > > > > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
>> > > > > > > > > > > > file,
>> > > > > > > > > > > > datasync);
>> > > > > > > > > > > >  
>> > > > > > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > > > > > > > > > > >  	do_resend =
>> > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
>> > > > > > > > > > > > > flags);
>> > > > > > > > > > > > 
>> > > > > > > > > > > > -	have_error =
>> > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > > > > > > > > > > &ctx->flags);
>> > > > > > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > > > > > -	have_error |=
>> > > > > > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > > > > > > flags);
>> > > > > > > > > > > > 
>> > > > > > > > > > > > -	if (have_error) {
>> > > > > > > > > > > > -		ret = xchg(&ctx->error, 0);
>> > > > > > > > > > > > -		if (ret)
>> > > > > > > > > > > > -			goto out;
>> > > > > > > > > > > > -	}
>> > > > > > > > > > > > -	if (status < 0) {
>> > > > > > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > > > > > > > > flags);
>> > > > > > > > > > > > 
>> > > > > > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > > > > > > > > +
>> > > > > > > > > > > > +	/* Recheck and advance after the commit */
>> > > > > > > > > > > > +	status = file_check_and_advance_wb_err(file);
>> > > > > > > > > > 
>> > > > > > > > > > This change makes the code inconsistent with the comment
>> > > > > > > > > > above the
>> > > > > > > > > > function, which still references ctx->error.  The intent of
>> > > > > > > > > > the
>> > > > > > > > > > comment
>> > > > > > > > > > is still correct, but the details have changed.
>> > > > > > > > > > 
>> > > > > > > > > 
>> > > > > > > > > Good catch. I'll fix that up in a respin.
>> > > > > > > > > 
>> > > > > > > > > > Also, there is a call to mapping_set_error() in
>> > > > > > > > > > nfs_pageio_add_request().
>> > > > > > > > > > I wonder if that should be changed to
>> > > > > > > > > >   nfs_context_set_write_error(req->wb_context, desc-
>> > > > > > > > > > > pg_error)
>> > > > > > > > > > 
>> > > > > > > > > > ??
>> > > > > > > > > > 
>> > > > > > > > > 
>> > > > > > > > > Trickier question...
>> > > > > > > > > 
>> > > > > > > > > I'm not quite sure what semantics we're looking for with
>> > > > > > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
>> > > > > > > > > synchronous, but I'm not quite sure why it gets cleared the
>> > > > > > > > > way it
>> > > > > > > > > does. It's set on any error but cleared before issuing a
>> > > > > > > > > commit.
>> > > > > > > > > 
>> > > > > > > > > I added a similar flag to Ceph inodes recently, but only
>> > > > > > > > > clear it when
>> > > > > > > > > a write succeeds. Wouldn't that make more sense here as well?
>> > > > > > > > 
>> > > > > > > > It is a bit hard to wrap one's mind around.
>> > > > > > > > 
>> > > > > > > > In the original code (commit 7b159fc18d417980) it looks like:
>> > > > > > > >  - test-and-clear bit
>> > > > > > > >  - write and sync
>> > > > > > > >  - test-bit
>> > > > > > > > 
>> > > > > > > > This does, I think, seem safer than "clear on successful write"
>> > > > > > > > as the
>> > > > > > > > writes could complete out-of-order and I wouldn't be surprised
>> > > > > > > > if the
>> > > > > > > > unsuccessful ones completed with an error before the successful
>> > > > > > > > one -
>> > > > > > > > particularly with an error like EDQUOT.
>> > > > > > > > 
>> > > > > > > > However the current code does the writes before the test-and-
>> > > > > > > > clear, and
>> > > > > > > > only does the commit afterwards.  That makes it less clear why
>> > > > > > > > the
>> > > > > > > > current sequence is a good idea.
>> > > > > > > > 
>> > > > > > > > However ... nfs_file_fsync_commit() is only called if
>> > > > > > > > filemap_write_and_wait_range() returned with success, so we
>> > > > > > > > only clear
>> > > > > > > > the flag after successful writes(?).
>> > > > > > > > 
>> > > > > > > > Oh....
>> > > > > > > > This patch from me:
>> > > > > > > > 
>> > > > > > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
>> > > > > > > > handling.")
>> > > > > > > > 
>> > > > > > > > seems to have been reverted by
>> > > > > > > > 
>> > > > > > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
>> > > > > > > > page writeback failed")
>> > > > > > > > 
>> > > > > > > > which probably isn't good.  It appears that this code is very
>> > > > > > > > fragile
>> > > > > > > > and easily broken.
>> > > > > > 
>> > > > > > On further investigation, I think the problem that I fixed and then
>> > > > > > we
>> > > > > > reintroduced will be fixed again - more permanently - by your
>> > > > > > patch.
>> > > > > > The root problem is that nfs keeps error codes in a different way
>> > > > > > to the
>> > > > > > MM core.  By unifying those, the problem goes.
>> > > > > > (The specific problem is that writes which hit EDQUOT on the server
>> > > > > > can
>> > > > > >  report EIO on the client).
>> > > > > > 
>> > > > > > 
>> > > > > > > > Maybe we need to work out exactly what is required, and
>> > > > > > > > document it - so
>> > > > > > > > we can stop breaking it.
>> > > > > > > > Or maybe we need some unit tests.....
>> > > > > > > > 
>> > > > > > > 
>> > > > > > > Yes, laying out what's necessary for this would be very helpful.
>> > > > > > > We
>> > > > > > > clearly want to set the flag when an error occurs. Under what
>> > > > > > > circumstances should we be clearing it?
>> > > > > > 
>> > > > > > Well.... looking back at  7b159fc18d417980f57ae which introduced
>> > > > > > the
>> > > > > > flag, prior to that write errors (ctx->error) were only reported by
>> > > > > > nfs_file_flush and nfs_fsync, so only on close() and fsync().
>> > > > > > 
>> > > > > > After that commit, setting the flag would mean that errors could be
>> > > > > > returned by 'write'.  So clearing as part of returning the error
>> > > > > > makes
>> > > > > > perfect sense.
>> > > > > > 
>> > > > > > As long as the error gets recorded, and gets returned when it is
>> > > > > > recorded, it doesn't much matter when the flag is cleared.  With
>> > > > > > your
>> > > > > > patches we don't need the flag any more to get errors reliably
>> > > > > > reported.
>> > > > > > 
>> > > > > > Leaving the flag set means that writes go more slowly - we don't
>> > > > > > get
>> > > > > > large queue of background writes building up but destined for
>> > > > > > failure.
>> > > > > > This is the main point made in the commit message when the flag
>> > > > > > was
>> > > > > > introduced.
>> > > > > > Of course, by the time we first get an error there could already
>> > > > > > be a large queue, so we probably want that to drain completely
>> > > > > > before
>> > > > > > allowing async writes again.
>> > > > 
>> > > > We already have this functionality implemented in the existing code.
>> > > > 
>> > > > > > 
>> > > > > > It might make sense to have 2 flags.  One which says "writes should
>> > > > > > be
>> > > > > > synchronous", another that says "There was an error recently".
>> > > > > > We clear the error flag before calling nfs_fsync, and if it is
>> > > > > > still
>> > > > > > clear afterwards, we clear the sync-writes flag.  Maybe that is
>> > > > > > more
>> > > > > > complex than needed though.
>> > > > > > 
>> > > > 
>> > > > We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
>> > > > see any global mechanism that will replace that.
>> > > > 
>> > > > > > I'm leaning towards your suggestion that it doesn't matter very
>> > > > > > much
>> > > > > > when it gets cleared, and clearing it on any successful write is
>> > > > > > simplest.
>> > > > > > 
>> > > > > > So I'm still in favor of using nfs_context_set_write_error() in
>> > > > > > nfs_pageio_add_request(), primarily because it is most consistent -
>> > > > > > we
>> > > > > > don't need exceptions.
>> > > > > 
>> > > > > Thanks for taking a closer look. I can easily make the change above,
>> > > > > and
>> > > > > I do think that keeping this mechanism as simple as possible will
>> > > > > make
>> > > > > it easier to prevent bitrot.
>> > > > > 
>> > > > > That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
>> > > > > is a
>> > > > > per open file description object.
>> > > > > 
>> > > > > Is that the correct way to track this? All of the ctx's will share
>> > > > > the
>> > > > > same inode. If we're getting writeback errors for one context, it's
>> > > > > quite likely that we'll be seeing them via others.
>> > > > > 
>> > > > > I suppose the counterargument is when we have things like expiring
>> > > > > krb5
>> > > > > tickets. Write failures via an expiring set of creds may have no
>> > > > > effect
>> > > > > on writeback via other creds.
>> > > > > 
>> > > > > Still, I think a per-inode flag might make more sense here.
>> > > > > 
>> > > > > Thoughts?
>> > > > 
>> > > > As far as I'm concerned, that would be a regression. The most common
>> > > > problem when flushing writeback data to the server aside from ENOSPC
>> > > > (and possibly ESTALE) is EACCES, which is particular to the file
>> > > > descriptor that opened the file.
>> > > > 
>> > > > File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
>> > > > private to the file descriptor.
>> > > 
>> > > Thanks for the reminder that errors are per-context and this patch drops
>> > > this.  The per-context nature of errors in NFS was the reason that I
>> > > nagged Jeff to make errseq_t a stand-alone type rather than just a part
>> > > of address_space.  I had envisaged that it would be embedded in the
>> > > open_context as well.
>> > > We still could do that, but as there is precisely one open-file for each
>> > > open_context, the gains are not great.
>> > > 
>> > > However, while looking over the code to make sure I really understood it
>> > > and all the possible consequences of changing to errseq_t I found a few
>> > > anomalies.  The patch below addresses them all.
>> > > 
>> > > Would you see if they make sense to you?
>> > > 
>> > > Thanks,
>> > > NeilBrown
>> > > 
>> > > 
>> > > From: NeilBrown <neilb@suse.com>
>> > > Date: Mon, 11 Sep 2017 13:15:50 +1000
>> > > Subject: [PATCH] NFS: various changes relating to reporting IO errors.
>> > > 
>> > > 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
>> > >    They aren't used.
>> > > 
>> > > 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
>> > >    so we can...
>> > > 
>> > > 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
>> > >    if nfs_pageio_add_request() fails before sending any request.
>> > >    NFS generally keeps errors in the open_context, not the mapping,
>> > >    so this is more consistent.
>> > > 
>> > > 4/ If filemap_write_and_wait_range() reports any error, still
>> > >    check ctx->error.  The value in ctx->error is likely to be
>> > >    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
>> > >    cleared slightly earlier, before nfs_file_fsync_commit() is called,
>> > >    rather than at the start of that function.
>> > > 
>> > > Signed-off-by: NeilBrown <neilb@suse.com>
>> > > ---
>> > >  fs/nfs/file.c     | 16 ++++++++++------
>> > >  fs/nfs/internal.h |  7 +++++++
>> > >  fs/nfs/pagelist.c |  4 ++--
>> > >  fs/nfs/write.c    |  7 -------
>> > >  4 files changed, 19 insertions(+), 15 deletions(-)
>> > > 
>> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > index af330c31f627..ab324f14081f 100644
>> > > --- a/fs/nfs/file.c
>> > > +++ b/fs/nfs/file.c
>> > > @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
>> > >   * fall back to doing a synchronous write.
>> > >   */
>> > >  static int
>> > > -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>> > > +nfs_file_fsync_commit(struct file *file, int datasync)
>> > >  {
>> > >  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>> > >  	struct inode *inode = file_inode(file);
>> > > -	int have_error, do_resend, status;
>> > > +	int do_resend, status;
>> > >  	int ret = 0;
>> > >  
>> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>> > >  
>> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > >  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > >  	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > -	if (have_error) {
>> > > +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>> > >  		ret = xchg(&ctx->error, 0);
>> > >  		if (ret)
>> > >  			goto out;
>> > > @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>> > >  	trace_nfs_fsync_enter(inode);
>> > >  
>> > >  	do {
>> > > +		struct nfs_open_context *ctx = nfs_file_open_context(file);
>> > >  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
>> > > +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
>> > > +			int ret2 = xchg(&ctx->error, 0);
>> > > +			if (ret2)
>> > > +				ret = ret2;
>> > > +		}
>> > >  		if (ret != 0)
>> > >  			break;
>> > > -		ret = nfs_file_fsync_commit(file, start, end, datasync);
>> > > +		ret = nfs_file_fsync_commit(file, datasync);
>> > >  		if (!ret)
>> > >  			ret = pnfs_sync_inode(inode, !!datasync);
>> > >  		/*
>> > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> > > index dc456416d2be..44c8962fec91 100644
>> > > --- a/fs/nfs/internal.h
>> > > +++ b/fs/nfs/internal.h
>> > > @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
>> > >  		return false;
>> > >  	}
>> > >  }
>> > > +
>> > > +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>> > > +{
>> > > +	ctx->error = error;
>> > > +	smp_wmb();
>> > > +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > +}
>> > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>> > > index de9066a92c0d..0ebd26b9a6bd 100644
>> > > --- a/fs/nfs/pagelist.c
>> > > +++ b/fs/nfs/pagelist.c
>> > > @@ -1198,8 +1198,8 @@ out_failed:
>> > >  
>> > >  		/* remember fatal errors */
>> > >  		if (nfs_error_is_fatal(desc->pg_error))
>> > > -			mapping_set_error(desc->pg_inode->i_mapping,
>> > > -					  desc->pg_error);
>> > > +			nfs_context_set_write_error(req->wb_context,
>> > > +						    desc->pg_error);
>> > >  
>> > >  		func = desc->pg_completion_ops->error_cleanup;
>> > >  		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
>> > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> > > index b1af5dee5e0a..f702bf2def79 100644
>> > > --- a/fs/nfs/write.c
>> > > +++ b/fs/nfs/write.c
>> > > @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>> > >  		kref_put(&ioc->refcount, nfs_io_completion_release);
>> > >  }
>> > >  
>> > > -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>> > > -{
>> > > -	ctx->error = error;
>> > > -	smp_wmb();
>> > > -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > -}
>> > > -
>> > >  /*
>> > >   * nfs_page_find_head_request_locked - find head request associated with @page
>> > >   *
>> > 
>> > This should probably be broken out into at least 2-3 different
>> > patches.
>> > 
>> > Ok, so to make sure I understand:
>> > 
>> > All writeback is done under the aegis of an open context, and writes
>> > from different open contexts are not mergeable. We also flush to the
>> > server in the case that a dirty page is written via an incompatible open
>> > context. So with that we can always tie
>> 
>> Not quite.  Writes from different open contexts are sometimes mergeable,
>> providing the credential is the same and there are no locks that might
>> get in the way. (nfs_flush_incompatible() gets rid of conflicting writes
>> to the same page as part of nfs_write_begin().)
>> When writes are merged, all contexts remain reachable from the request
>> through an 'nfs_page'. nfs_write_completion() iterates over all the
>> nfs_pages attached to the nfs_pgio_header, and sets the context
>> write_error from the hdr->error.
>> 
>
> Ok, by this account, NFS should already have "correct" error reporting
> semantics on fsync. i.e. when the file is written via multiple fds, you
> should get back an error on all fds if those writebacks failed.
>
> I have a test for nfs for the new-style error reporting:
>
>     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/xfstests-dev.git/log/?h=wberr
>
> The nfs test is still pretty rickety, using soft mounts and iptables to
> cause requests to fail. With the patch I originally proposed, this test
> would pass. When I run this test on normal mainline kernels, it fails:
>
> -------------------------------8<------------------------------
> FSTYP         -- nfs
> PLATFORM      -- Linux/x86_64 wberr 4.12.11-300.fc26.x86_64
> MKFS_OPTIONS  -- knfsdsrv:/export/scratch
> MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 knfsdsrv:/export/scratch /mnt/scratch
>
> nfs/002	 - output mismatch (see /home/jlayton/git/xfstests/results//nfs/002.out.bad)
>     --- tests/nfs/002.out	2017-07-19 13:12:59.354561869 -0400
>     +++ /home/jlayton/git/xfstests/results//nfs/002.out.bad	2017-09-12 11:17:17.335943539 -0400
>     @@ -1,3 +1,3 @@
>      QA output created by 002
>      Format and mount
>     -Test passed!
>     +Success on second fsync on fd[1]!

Interesting.
I haven't reviewed the kernel code yet, but having looked at the test
code I see that the one file is opened 10 times and that the *same* block
in the file (65k?) is written via each fd.
If I write to a file, and then someone else over-writes all the bytes
that I wrote, then the page is written to the server and gets an error,
then you could argue that as none of the bytes I wrote were involved in
the error, I don't need to see the error status - someone else has taken
on responsibility for that range.

Maybe the test should/could write a few bytes with a different offset for each
fd ??

NeilBrown


>     ...
>     (Run 'diff -u tests/nfs/002.out /home/jlayton/git/xfstests/results//nfs/002.out.bad'  to see the entire diff)
> Ran: nfs/002
> Failures: nfs/002
> Failed 1 of 1 tests
> -------------------------------8<------------------------------
>
> I'm not sure that errors are really propagated to all struct files like
> you suggest above. I'll plan to look a little more closely at what's
> happening here, when I get some time.
>
>> > 
>> > In that case, yes...mixing in errseq_t doesn't really buy us much here,
>> > and I agree with most of the changes above.
>> > 
>> > That said...I'm still not thrilled with how NFS_CONTEXT_ERROR_WRITE is
>> > handled in this code. That flag is set when a write fails, but is only
>> > cleared on fsync.
>> > 
>> > That seems wrong to me. Why wait for an fsync to start doing async
>> > writes again once they start working? What if the application never does
>> > an fsync? Clearing that flag on a successful WRITE seems like it'd make
>> > more sense.
>> 
>> We don't really 'wait' for an fsync.  Having NFS_CONTEXT_ERROR_WRITE
>> means that the very next write will force an fsync
>> (nfs_need_check_write()).  So we really just wait for the next write.
>> The current code doesn't seem "obviously right" to me, but it isn't
>> "obviously wrong" either, and I can only make it obviously right to me
>> by making it more complex, and I don't think I can justify that.
>
> Thanks for pointing this out. I missed the bit about it forcing the
> fsync when this fails. I agree that that should be fine.
>
> -- 
> Jeff Layton <jlayton@redhat.com>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] nfs: track writeback errors with errseq_t
  2017-09-12 21:47                         ` NeilBrown
@ 2017-09-13 12:23                           ` Jeff Layton
  2017-09-13 23:50                               ` NeilBrown
  0 siblings, 1 reply; 33+ messages in thread
From: Jeff Layton @ 2017-09-13 12:23 UTC (permalink / raw)
  To: NeilBrown, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-nfs, linux-fsdevel

On Wed, 2017-09-13 at 07:47 +1000, NeilBrown wrote:
> On Tue, Sep 12 2017, Jeff Layton wrote:
> 
> > On Tue, 2017-09-12 at 07:52 +1000, NeilBrown wrote:
> > > > On Mon, 2017-09-11 at 13:24 +1000, NeilBrown wrote:
> > > > > On Thu, Sep 07 2017, Trond Myklebust wrote:
> > > > > 
> > > > > > On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
> > > > > > > On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> > > > > > > > On Tue, Aug 29 2017, Jeff Layton wrote:
> > > > > > > > 
> > > > > > > > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > > > > > > > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > > > > > > > > 
> > > > > > > > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > > > > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > > > > > > > > 
> > > > > > > > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > > > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > There is some ambiguity in nfs about how writeback
> > > > > > > > > > > > > > errors are
> > > > > > > > > > > > > > tracked.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > For instance, nfs_pageio_add_request calls
> > > > > > > > > > > > > > mapping_set_error when
> > > > > > > > > > > > > > the
> > > > > > > > > > > > > > add fails, but we track errors that occur after adding
> > > > > > > > > > > > > > the
> > > > > > > > > > > > > > request
> > > > > > > > > > > > > > with a dedicated int error in the open context.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Now that we have better infrastructure for the vfs
> > > > > > > > > > > > > > layer, this
> > > > > > > > > > > > > > latter int is now unnecessary. Just have
> > > > > > > > > > > > > > nfs_context_set_write_error set
> > > > > > > > > > > > > > the error in the mapping when one occurs.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Have NFS use file_write_and_wait_range to initiate and
> > > > > > > > > > > > > > wait on
> > > > > > > > > > > > > > writeback
> > > > > > > > > > > > > > of the data, and then check again after issuing the
> > > > > > > > > > > > > > commit(s).
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > With this, we also don't need to pay attention to the
> > > > > > > > > > > > > > ERROR_WRITE
> > > > > > > > > > > > > > flag for reporting, and just clear it to indicate to
> > > > > > > > > > > > > > subsequent
> > > > > > > > > > > > > > writers that they should try to go asynchronous again.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > In nfs_page_async_flush, sample the error before
> > > > > > > > > > > > > > locking and
> > > > > > > > > > > > > > joining
> > > > > > > > > > > > > > the requests, and check for errors since that point.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > > > > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > > > > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > > > > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > > > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > I have a baling wire and duct tape solution for testing
> > > > > > > > > > > > > > this with
> > > > > > > > > > > > > > xfstests (using iptables REJECT targets and soft
> > > > > > > > > > > > > > mounts). This
> > > > > > > > > > > > > > seems to
> > > > > > > > > > > > > > make nfs do the right thing.
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > > > > > > > > --- a/fs/nfs/file.c
> > > > > > > > > > > > > > +++ b/fs/nfs/file.c
> > > > > > > > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file
> > > > > > > > > > > > > > *file,
> > > > > > > > > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > > > > > > > > >  {
> > > > > > > > > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > > > > > > > > nfs_file_open_context(file);
> > > > > > > > > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > > > > > > > > -	int have_error, do_resend, status;
> > > > > > > > > > > > > > -	int ret = 0;
> > > > > > > > > > > > > > +	int do_resend, status;
> > > > > > > > > > > > > > +	int ret;
> > > > > > > > > > > > > >  
> > > > > > > > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",
> > > > > > > > > > > > > > file,
> > > > > > > > > > > > > > datasync);
> > > > > > > > > > > > > >  
> > > > > > > > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > > > > > > > > >  	do_resend =
> > > > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-
> > > > > > > > > > > > > > > flags);
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > -	have_error =
> > > > > > > > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > > > > > > > > &ctx->flags);
> > > > > > > > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > > > > > -	have_error |=
> > > > > > > > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > > > > > flags);
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > -	if (have_error) {
> > > > > > > > > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > > > > > > > > -		if (ret)
> > > > > > > > > > > > > > -			goto out;
> > > > > > > > > > > > > > -	}
> > > > > > > > > > > > > > -	if (status < 0) {
> > > > > > > > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > > > > > > > > flags);
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > > > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > > > > > > > > 
> > > > > > > > > > > > This change makes the code inconsistent with the comment
> > > > > > > > > > > > above the
> > > > > > > > > > > > function, which still references ctx->error.  The intent of
> > > > > > > > > > > > the
> > > > > > > > > > > > comment
> > > > > > > > > > > > is still correct, but the details have changed.
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Good catch. I'll fix that up in a respin.
> > > > > > > > > > > 
> > > > > > > > > > > > Also, there is a call to mapping_set_error() in
> > > > > > > > > > > > nfs_pageio_add_request().
> > > > > > > > > > > > I wonder if that should be changed to
> > > > > > > > > > > >   nfs_context_set_write_error(req->wb_context, desc-
> > > > > > > > > > > > > pg_error)
> > > > > > > > > > > > 
> > > > > > > > > > > > ??
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > Trickier question...
> > > > > > > > > > > 
> > > > > > > > > > > I'm not quite sure what semantics we're looking for with
> > > > > > > > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > > > > > > > > synchronous, but I'm not quite sure why it gets cleared the
> > > > > > > > > > > way it
> > > > > > > > > > > does. It's set on any error but cleared before issuing a
> > > > > > > > > > > commit.
> > > > > > > > > > > 
> > > > > > > > > > > I added a similar flag to Ceph inodes recently, but only
> > > > > > > > > > > clear it when
> > > > > > > > > > > a write succeeds. Wouldn't that make more sense here as well?
> > > > > > > > > > 
> > > > > > > > > > It is a bit hard to wrap one's mind around.
> > > > > > > > > > 
> > > > > > > > > > In the original code (commit 7b159fc18d417980) it looks like:
> > > > > > > > > >  - test-and-clear bit
> > > > > > > > > >  - write and sync
> > > > > > > > > >  - test-bit
> > > > > > > > > > 
> > > > > > > > > > This does, I think, seem safer than "clear on successful write"
> > > > > > > > > > as the
> > > > > > > > > > writes could complete out-of-order and I wouldn't be surprised
> > > > > > > > > > if the
> > > > > > > > > > unsuccessful ones completed with an error before the successful
> > > > > > > > > > one -
> > > > > > > > > > particularly with an error like EDQUOT.
> > > > > > > > > > 
> > > > > > > > > > However the current code does the writes before the test-and-
> > > > > > > > > > clear, and
> > > > > > > > > > only does the commit afterwards.  That makes it less clear why
> > > > > > > > > > the
> > > > > > > > > > current sequence is a good idea.
> > > > > > > > > > 
> > > > > > > > > > However ... nfs_file_fsync_commit() is only called if
> > > > > > > > > > filemap_write_and_wait_range() returned with success, so we
> > > > > > > > > > only clear
> > > > > > > > > > the flag after successful writes(?).
> > > > > > > > > > 
> > > > > > > > > > Oh....
> > > > > > > > > > This patch from me:
> > > > > > > > > > 
> > > > > > > > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error
> > > > > > > > > > handling.")
> > > > > > > > > > 
> > > > > > > > > > seems to have been reverted by
> > > > > > > > > > 
> > > > > > > > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if
> > > > > > > > > > page writeback failed")
> > > > > > > > > > 
> > > > > > > > > > which probably isn't good.  It appears that this code is very
> > > > > > > > > > fragile
> > > > > > > > > > and easily broken.
> > > > > > > > 
> > > > > > > > On further investigation, I think the problem that I fixed and then
> > > > > > > > we
> > > > > > > > reintroduced will be fixed again - more permanently - by your
> > > > > > > > patch.
> > > > > > > > The root problem is that nfs keeps error codes in a different way
> > > > > > > > to the
> > > > > > > > MM core.  By unifying those, the problem goes.
> > > > > > > > (The specific problem is that writes which hit EDQUOT on the server
> > > > > > > > can
> > > > > > > >  report EIO on the client).
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > > Maybe we need to work out exactly what is required, and
> > > > > > > > > > document it - so
> > > > > > > > > > we can stop breaking it.
> > > > > > > > > > Or maybe we need some unit tests.....
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > Yes, laying out what's necessary for this would be very helpful.
> > > > > > > > > We
> > > > > > > > > clearly want to set the flag when an error occurs. Under what
> > > > > > > > > circumstances should we be clearing it?
> > > > > > > > 
> > > > > > > > Well.... looking back at  7b159fc18d417980f57ae which introduced
> > > > > > > > the
> > > > > > > > flag, prior to that write errors (ctx->error) were only reported by
> > > > > > > > > nfs_file_flush and nfs_fsync, so only on close() and fsync().
> > > > > > > > 
> > > > > > > > After that commit, setting the flag would mean that errors could be
> > > > > > > > returned by 'write'.  So clearing as part of returning the error
> > > > > > > > makes
> > > > > > > > perfect sense.
> > > > > > > > 
> > > > > > > > As long as the error gets recorded, and gets returned when it is
> > > > > > > > recorded, it doesn't much matter when the flag is cleared.  With
> > > > > > > > your
> > > > > > > > > patches we don't need the flag any more to get errors reliably
> > > > > > > > reported.
> > > > > > > > 
> > > > > > > > Leaving the flag set means that writes go more slowly - we don't
> > > > > > > > get
> > > > > > > > > a large queue of background writes building up but destined for
> > > > > > > > failure.
> > > > > > > > This is the main point made in the comment message when the flag
> > > > > > > > was
> > > > > > > > introduced.
> > > > > > > > Of course, by the time we first get an error there could already
> > > > > > > > > be a large queue, so we probably want that to drain completely
> > > > > > > > before
> > > > > > > > allowing async writes again.
> > > > > > 
> > > > > > We already have this functionality implemented in the existing code.
> > > > > > 
> > > > > > > > 
> > > > > > > > It might make sense to have 2 flags.  One which says "writes should
> > > > > > > > be
> > > > > > > > synchronous", another that says "There was an error recently".
> > > > > > > > We clear the error flag before calling nfs_fsync, and if it is
> > > > > > > > still
> > > > > > > > clear afterwards, we clear the sync-writes flag.  Maybe that is
> > > > > > > > more
> > > > > > > > complex than needed though.
> > > > > > > > 
> > > > > > 
> > > > > > We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
> > > > > > see any global mechanism that will replace that.
> > > > > > 
> > > > > > > > I'm leaning towards your suggestion that it doesn't matter very
> > > > > > > > much
> > > > > > > > when it gets cleared, and clearing it on any successful write is
> > > > > > > > simplest.
> > > > > > > > 
> > > > > > > > So I'm still in favor of using nfs_context_set_write_error() in
> > > > > > > > nfs_pageio_add_request(), primarily because it is most consistent -
> > > > > > > > we
> > > > > > > > don't need exceptions.
> > > > > > > 
> > > > > > > Thanks for taking a closer look. I can easily make the change above,
> > > > > > > and
> > > > > > > I do think that keeping this mechanism as simple as possible will
> > > > > > > make
> > > > > > > it easier to prevent bitrot.
> > > > > > > 
> > > > > > > That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx
> > > > > > > is a
> > > > > > > per open file description object.
> > > > > > > 
> > > > > > > Is that the correct way to track this? All of the ctx's will share
> > > > > > > the
> > > > > > > same inode. If we're getting writeback errors for one context, it's
> > > > > > > quite likely that we'll be seeing them via others.
> > > > > > > 
> > > > > > > I suppose the counterargument is when we have things like expiring
> > > > > > > krb5
> > > > > > > tickets. Write failures via an expiring set of creds may have no
> > > > > > > effect
> > > > > > > on writeback via other creds.
> > > > > > > 
> > > > > > > Still, I think a per-inode flag might make more sense here.
> > > > > > > 
> > > > > > > Thoughts?
> > > > > > 
> > > > > > As far as I'm concerned, that would be a regression. The most common
> > > > > > problem when flushing writeback data to the server aside from ENOSPC
> > > > > > (and possibly ESTALE) is EACCES, which is particular to the file
> > > > > > descriptor that opened the file.
> > > > > > 
> > > > > > File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
> > > > > > private to the file descriptor.
> > > > > 
> > > > > Thanks for the reminder that errors are per-context and this patch drops
> > > > > this.  The per-context nature of errors in NFS was the reason that I
> > > > > nagged Jeff to make errseq_t a stand-alone type rather than just a part
> > > > > of address_space.  I had envisaged that it would be embedded in the
> > > > > open_context as well.
> > > > > We still could do that, but as there is precisely one open-file for each
> > > > > open_context, the gains are not great.
> > > > > 
> > > > > However, while looking over the code to make sure I really understood it
> > > > > and all the possible consequences of changing to errseq_t I found a few
> > > > > anomalies.  The patch below addresses them all.
> > > > > 
> > > > > > Would you see if they make sense to you?
> > > > > 
> > > > > Thanks,
> > > > > NeilBrown
> > > > > 
> > > > > 
> > > > > From: NeilBrown <neilb@suse.com>
> > > > > Date: Mon, 11 Sep 2017 13:15:50 +1000
> > > > > Subject: [PATCH] NFS: various changes relating to reporting IO errors.
> > > > > 
> > > > > 1/ remove 'start' and 'end' args from nfs_file_fsync_commit().
> > > > >    They aren't used.
> > > > > 
> > > > > 2/ Make nfs_context_set_write_error() a "static inline" in internal.h
> > > > >    so we can...
> > > > > 
> > > > > 3/ Use nfs_context_set_write_error() instead of mapping_set_error()
> > > > >    if nfs_pageio_add_request() fails before sending any request.
> > > > >    NFS generally keeps errors in the open_context, not the mapping,
> > > > >    so this is more consistent.
> > > > > 
> > > > > > 4/ If filemap_write_and_wait_range() reports any error, still
> > > > >    check ctx->error.  The value in ctx->error is likely to be
> > > > >    more useful.  As part of this, NFS_CONTEXT_ERROR_WRITE is
> > > > >    cleared slightly earlier, before nfs_file_fsync_commit() is called,
> > > > >    rather than at the start of that function.
> > > > > 
> > > > > Signed-off-by: NeilBrown <neilb@suse.com>
> > > > > ---
> > > > >  fs/nfs/file.c     | 16 ++++++++++------
> > > > >  fs/nfs/internal.h |  7 +++++++
> > > > >  fs/nfs/pagelist.c |  4 ++--
> > > > >  fs/nfs/write.c    |  7 -------
> > > > >  4 files changed, 19 insertions(+), 15 deletions(-)
> > > > > 
> > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > index af330c31f627..ab324f14081f 100644
> > > > > --- a/fs/nfs/file.c
> > > > > +++ b/fs/nfs/file.c
> > > > > @@ -208,21 +208,19 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
> > > > >   * fall back to doing a synchronous write.
> > > > >   */
> > > > >  static int
> > > > > -nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
> > > > > +nfs_file_fsync_commit(struct file *file, int datasync)
> > > > >  {
> > > > >  	struct nfs_open_context *ctx = nfs_file_open_context(file);
> > > > >  	struct inode *inode = file_inode(file);
> > > > > -	int have_error, do_resend, status;
> > > > > +	int do_resend, status;
> > > > >  	int ret = 0;
> > > > >  
> > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
> > > > >  
> > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > >  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > >  	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > -	if (have_error) {
> > > > > +	if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
> > > > >  		ret = xchg(&ctx->error, 0);
> > > > >  		if (ret)
> > > > >  			goto out;
> > > > > @@ -247,10 +245,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > > > >  	trace_nfs_fsync_enter(inode);
> > > > >  
> > > > >  	do {
> > > > > +		struct nfs_open_context *ctx = nfs_file_open_context(file);
> > > > >  		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> > > > > +		if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) {
> > > > > +			int ret2 = xchg(&ctx->error, 0);
> > > > > +			if (ret2)
> > > > > +				ret = ret2;
> > > > > +		}
> > > > >  		if (ret != 0)
> > > > >  			break;
> > > > > -		ret = nfs_file_fsync_commit(file, start, end, datasync);
> > > > > +		ret = nfs_file_fsync_commit(file, datasync);
> > > > >  		if (!ret)
> > > > >  			ret = pnfs_sync_inode(inode, !!datasync);
> > > > >  		/*
> > > > > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> > > > > index dc456416d2be..44c8962fec91 100644
> > > > > --- a/fs/nfs/internal.h
> > > > > +++ b/fs/nfs/internal.h
> > > > > @@ -769,3 +769,10 @@ static inline bool nfs_error_is_fatal(int err)
> > > > >  		return false;
> > > > >  	}
> > > > >  }
> > > > > +
> > > > > +static inline void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> > > > > +{
> > > > > +	ctx->error = error;
> > > > > +	smp_wmb();
> > > > > +	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > +}
> > > > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> > > > > index de9066a92c0d..0ebd26b9a6bd 100644
> > > > > --- a/fs/nfs/pagelist.c
> > > > > +++ b/fs/nfs/pagelist.c
> > > > > @@ -1198,8 +1198,8 @@ out_failed:
> > > > >  
> > > > >  		/* remember fatal errors */
> > > > >  		if (nfs_error_is_fatal(desc->pg_error))
> > > > > -			mapping_set_error(desc->pg_inode->i_mapping,
> > > > > -					  desc->pg_error);
> > > > > +			nfs_context_set_write_error(req->wb_context,
> > > > > +						    desc->pg_error);
> > > > >  
> > > > >  		func = desc->pg_completion_ops->error_cleanup;
> > > > >  		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
> > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > > > > index b1af5dee5e0a..f702bf2def79 100644
> > > > > --- a/fs/nfs/write.c
> > > > > +++ b/fs/nfs/write.c
> > > > > @@ -147,13 +147,6 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
> > > > >  		kref_put(&ioc->refcount, nfs_io_completion_release);
> > > > >  }
> > > > >  
> > > > > -static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
> > > > > -{
> > > > > -	ctx->error = error;
> > > > > -	smp_wmb();
> > > > > -	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > -}
> > > > > -
> > > > >  /*
> > > > >   * nfs_page_find_head_request_locked - find head request associated with @page
> > > > >   *
> > > > 
> > > > This should probably be broken out into at least a 2-3 different
> > > > patches.
> > > > 
> > > > Ok, so to make sure I understand:
> > > > 
> > > > All writeback is done under the aegis of an open context, and writes
> > > > from different open contexts are not mergeable. We also flush to the
> > > > server in the case that a dirty page is written via an incompatible open
> > > > context. So with that we can always tie
> > > 
> > > Not quite.  Writes from different open contexts are sometimes mergeable,
> > > providing the credential is the same and there are no locks that might
> > > > get in the way. (nfs_flush_incompatible() gets rid of conflicting writes
> > > > to the same page as part of nfs_write_begin().)
> > > When writes are merged, all contexts remain reachable from the request
> > > through an 'nfs_page'. nfs_write_completion() iterates over all the
> > > nfs_pages attached to the nfs_pgio_header, and sets the context
> > > write_error from the hdr->error.
> > > 
> > 
> > Ok, by this account, NFS should already have "correct" error reporting
> > semantics on fsync. i.e. when the file is written via multiple fds, you
> > should get back an error on all fds if those writebacks failed.
> > 
> > I have a test for nfs for the new-style error reporting:
> > 
> >     https://git.kernel.org/pub/scm/linux/kernel/git/jlayton/xfstests-dev.git/log/?h=wberr
> > 
> > The nfs test is still pretty rickety, using soft mounts and iptables to
> > cause requests to fail. With the patch I originally proposed, this test
> > would pass. When I run this test on normal mainline kernels, it fails:
> > 
> > -------------------------------8<------------------------------
> > FSTYP         -- nfs
> > PLATFORM      -- Linux/x86_64 wberr 4.12.11-300.fc26.x86_64
> > MKFS_OPTIONS  -- knfsdsrv:/export/scratch
> > MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 knfsdsrv:/export/scratch /mnt/scratch
> > 
> > nfs/002	 - output mismatch (see /home/jlayton/git/xfstests/results//nfs/002.out.bad)
> >     --- tests/nfs/002.out	2017-07-19 13:12:59.354561869 -0400
> >     +++ /home/jlayton/git/xfstests/results//nfs/002.out.bad	2017-09-12 11:17:17.335943539 -0400
> >     @@ -1,3 +1,3 @@
> >      QA output created by 002
> >      Format and mount
> >     -Test passed!
> >     +Success on second fsync on fd[1]!
> 
> Interesting.
> I haven't reviewed the kernel code yet, but having looked at the test
> code I see that the one file is opened 10 times and that the *same* block
> in the file (65k?) is written via each fd.
> If I write to a file, and then someone else over-writes all the bytes
> that I wrote, then the page is written to the server and gets an error,
> then you could argue that as none of the bytes I wrote were involved in
> the error, I don't need to see the error status - someone else has taken
> on responsibility for that range.
> 
> Maybe the test should/could write a few bytes with a different offset for each
> fd ??
> 
> NeilBrown
> 

Yes, that seems to do the trick! I'll send out a patch to xfstests to
change that in the test there as it's probably more representative of a
real workload.

Ok, so I guess from both this result and staring at the code for a bit
that the nfs client will replace one nfs_page with another if the second
one covers the entire range of the first.

We should note that that behavior is a little different from other
filesystems that use errseq_t now. If two tasks are writing to the same
file, and one attempts to overwrite the other's range, then the first
one will not see an error on writeback.

Maybe the above is worth a blurb in vfs.txt?

I do (idly) wonder if that behavior could be exploited in some fashion.
A process running in a different container that has write access to a
file could potentially mask writeback errors for a task running in a
different container. It's probably not useful on its own to an attacker,
but could be part of a wider exploit? In any case, it doesn't seem like
a big deal.

Thanks for all of the patience here. I think your patch looks fine as
well. You can add:

Reviewed-by: Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
  2017-09-13 12:23                           ` Jeff Layton
@ 2017-09-13 23:50                               ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-13 23:50 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages),
	Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-man, linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 4344 bytes --]


Since 4.13, errors from writeback are more reliably reported
to all file descriptors that might be relevant.

Add notes to this effect, and also add details about ENOSPC and EDQUOT
which can be delayed in a similar manner to EIO - for NFS in particular.

Signed-off-by: NeilBrown <neilb@suse.com>
---

This is my summary of recent changes, and details that have been made
clear during the exploration of those changes.

I haven't mentioned the fact that EPERM can be returned by
write/fsync/close on NFS if the permissions on the server are changed.
We probably should ... are there other errors that are worth mentioning
along with EPERM, ENOSPC, EDQUOT ??

Thanks,
NeilBrown


 man2/close.2 |  9 +++++++++
 man2/fsync.2 | 19 ++++++++++++++++++-
 man2/write.2 | 20 +++++++++++++++++---
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/man2/close.2 b/man2/close.2
index 751ec322b1f1..9776c839b8b6 100644
--- a/man2/close.2
+++ b/man2/close.2
@@ -82,6 +82,15 @@ call was interrupted by a signal; see
 .TP
 .B EIO
 An I/O error occurred.
+.TP
+.BR ENOSPC ", " EDQUOT
+On NFS, these errors are not normally reported against the first write
+which exceeds the available storage space, but instead against a
+subsequent
+.BR write (2),
+.BR fsync (2),
+or
+.BR close (2).
 .PP
 See NOTES for a discussion of why
 .BR close ()
diff --git a/man2/fsync.2 b/man2/fsync.2
index f1a01301da0f..e706a08d360d 100644
--- a/man2/fsync.2
+++ b/man2/fsync.2
@@ -120,12 +120,29 @@ is set appropriately.
 is not a valid open file descriptor.
 .TP
 .B EIO
-An error occurred during synchronization.
+An error occurred during synchronization.  This error may relate
+to data written to some other file descriptor on the same file.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+Since Linux 4.13 errors from write-back will be reported to
+all file descriptors that might have written the data which triggered
+the error, and which are still open.  Some filesystems (e.g. NFS)
+keep close track of which data came through which file descriptor,
+and give more precise reporting.  Other filesystems (e.g. most local
+filesystems) will report errors to all file descriptors on the same
+file.
 .TP
 .BR EROFS ", " EINVAL
 .I fd
 is bound to a special file (e.g., a pipe, FIFO, or socket)
 which does not support synchronization.
+.TP
+.BR ENOSPC ", " EDQUOT
+.I fd
+is bound to a file on NFS or another filesystem which does not allocate
+space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
 .SH CONFORMING TO
 POSIX.1-2001, POSIX.1-2008, 4.3BSD.
 .SH AVAILABILITY
diff --git a/man2/write.2 b/man2/write.2
index 6a39b5b5541d..1a9a86b03b04 100644
--- a/man2/write.2
+++ b/man2/write.2
@@ -47,7 +47,7 @@ write \- write to a file descriptor
 .BR write ()
 writes up to
 .I count
-bytes from the buffer pointed
+bytes from the buffer starting at
 .I buf
 to the file referred to by the file descriptor
 .IR fd .
@@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
 .TP
 .B EIO
 A low-level I/O error occurred while modifying the inode.
+This error may relate to data written by an earlier
+.BR write (2),
+which may have been issued to a different file descriptor on
+the same file.  Since Linux 4.13 errors from write-back will
+be reported to all file descriptors that might have
+written the data which triggered the error, and which are still
+open.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
 .TP
 .B ENOSPC
 The device containing the file referred to by
@@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
 A successful return from
 .BR write ()
 does not make any guarantee that data has been committed to disk.
-In fact, on some buggy implementations, it does not even guarantee
-that space has successfully been reserved for the data.
+On some filesystems, including NFS, it does not even guarantee
+that space has successfully been reserved for the data.  In that case,
+some errors might be delayed to a future
+.BR write (2)
+or to
+.BR fsync (2)
+or even
+.BR close (2).
 The only way to be sure is to call
 .BR fsync (2)
 after you are done writing all your data.
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-13 23:50                               ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-13 23:50 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages),
	Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-man, linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 4344 bytes --]


Since 4.13, errors from writeback are more reliably reported
to all file descriptors that might be relevant.

Add notes to this effect, and also add details about ENOSPC and EDQUOT
which can be delayed in a similar manner to EIO - for NFS in particular.

Signed-off-by: NeilBrown <neilb@suse.com>
---

This is my summary of recent changes, and details that have been made
clear during the exploration of those changes.

I haven't mentioned the fact that EPERM can be returned by
write/fsync/close on NFS if the permissions on the server are changed.
We probably should ... are there other errors that are worth mentioning
along with EPERM, ENOSPC, EDQUOT ??

Thanks,
NeilBrown


 man2/close.2 |  9 +++++++++
 man2/fsync.2 | 19 ++++++++++++++++++-
 man2/write.2 | 20 +++++++++++++++++---
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/man2/close.2 b/man2/close.2
index 751ec322b1f1..9776c839b8b6 100644
--- a/man2/close.2
+++ b/man2/close.2
@@ -82,6 +82,15 @@ call was interrupted by a signal; see
 .TP
 .B EIO
 An I/O error occurred.
+.TP
+.BR ENOSPC ", " EDQUOT
+On NFS, these errors are not normally reported against the first write
+which exceeds the available storage space, but instead against a
+subsequent
+.BR write (2),
+.BR fsync (2),
+or
+.BR close (2).
 .PP
 See NOTES for a discussion of why
 .BR close ()
diff --git a/man2/fsync.2 b/man2/fsync.2
index f1a01301da0f..e706a08d360d 100644
--- a/man2/fsync.2
+++ b/man2/fsync.2
@@ -120,12 +120,29 @@ is set appropriately.
 is not a valid open file descriptor.
 .TP
 .B EIO
-An error occurred during synchronization.
+An error occurred during synchronization.  This error may relate
+to data written to some other file descriptor on the same file.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+Since Linux 4.13 errors from write-back will be reported to
+all file descriptors that might have written the data which triggered
+the error, and which are still open.  Some filesystems (e.g. NFS)
+keep close track of which data came through which file descriptor,
+and give more precise reporting.  Other filesystems (e.g. most local
+filesystems) will report errors to all file descriptors on the same
+file.
 .TP
 .BR EROFS ", " EINVAL
 .I fd
 is bound to a special file (e.g., a pipe, FIFO, or socket)
 which does not support synchronization.
+.TP
+.BR ENOSPC ", " EDQUOT
+.I fd
+is bound to a file on NFS or another filesystem which does not allocate
+space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
 .SH CONFORMING TO
 POSIX.1-2001, POSIX.1-2008, 4.3BSD.
 .SH AVAILABILITY
diff --git a/man2/write.2 b/man2/write.2
index 6a39b5b5541d..1a9a86b03b04 100644
--- a/man2/write.2
+++ b/man2/write.2
@@ -47,7 +47,7 @@ write \- write to a file descriptor
 .BR write ()
 writes up to
 .I count
-bytes from the buffer pointed
+bytes from the buffer starting at
 .I buf
 to the file referred to by the file descriptor
 .IR fd .
@@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
 .TP
 .B EIO
 A low-level I/O error occurred while modifying the inode.
+This error may relate to data written by an earlier
+.BR write (2),
+which may have been issued to a different file descriptor on
+the same file.  Since Linux 4.13 errors from write-back will
+be reported to all file descriptors that might have
+written the data which triggered the error, and which are still
+open.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
 .TP
 .B ENOSPC
 The device containing the file referred to by
@@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
 A successful return from
 .BR write ()
 does not make any guarantee that data has been committed to disk.
-In fact, on some buggy implementations, it does not even guarantee
-that space has successfully been reserved for the data.
+On some filesystems, including NFS, it does not even guarantee
+that space has successfully been reserved for the data.  In that case,
+some errors might be delayed to a future
+.BR write (2)
+or to
+.BR fsync (2)
+or even
+.BR close (2).
 The only way to be sure is to call
 .BR fsync (2)
 after you are done writing all your data.
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
       [not found]                               ` <87ingm9n04.fsf-wvvUuzkyo1HefUI2i7LXDhCRmIWqnp/j@public.gmane.org>
@ 2017-09-14  7:59                                 ` walter harms
       [not found]                                   ` <59BA36C5.9000506-fPG8STNUNVg@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: walter harms @ 2017-09-14  7:59 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-man-u79uwXL29TY76Z2rM5mHXA



Am 14.09.2017 01:50, schrieb NeilBrown:
> 
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add details about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in particular.
> 

I never read EDQUOT  before but it is real:

see:
http://elixir.free-electrons.com/linux/latest/source/security/keys/request_key.c

-EDQUOT
 * if insufficient key quota was available to create a new key

> Signed-off-by: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
> ---
> 
> This is my summary of recent changes, and details that have been made
> clear during the exploration of those changes.
> 
> I haven't mentioned the fact that EPERM can be returned by
> write/fsync/close on NFS if the permissions on the server are changed.
> We probably should ... are there other errors that are worth mentioning
> along with EPERM, ENOSPC, EDQUOT ??
> 
> Thanks,
> NeilBronw
> 
> 
>  man2/close.2 |  9 +++++++++
>  man2/fsync.2 | 19 ++++++++++++++++++-
>  man2/write.2 | 20 +++++++++++++++++---
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 751ec322b1f1..9776c839b8b6 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>  .TP
>  .B EIO
>  An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>  .PP
>  See NOTES for a discussion of why
>  .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index f1a01301da0f..e706a08d360d 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -120,12 +120,29 @@ is set appropriately.
>  is not a valid open file descriptor.
>  .TP
>  .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13 errors from write-back will be reported to
> +all file descriptors that might have written the data which triggered
> +the error, and which are still open.  Some filesystems (e.g. NFS)
> +keep close track of which data came through which file descriptor,
> +and give more precise reporting.  Other filesystems (e.g. most local
> +filesystems) will report errors to all file descriptors on the same
> +file.
>  .TP
>  .BR EROFS ", " EINVAL
>  .I fd
>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>  which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>  .SH CONFORMING TO
>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>  .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 6a39b5b5541d..1a9a86b03b04 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>  .BR write ()
>  writes up to
>  .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>  .I buf
>  to the file referred to by the file descriptor
>  .IR fd .
> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>  .TP
>  .B EIO
>  A low-level I/O error occurred while modifying the inode.
> +This error may relate to data written by an earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13 errors from write-back will
> +be reported to all file descriptors that might have
> +written the data which triggered the error, and which are still
> +open.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>  .TP
>  .B ENOSPC
>  The device containing the file referred to by
> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>  A successful return from
>  .BR write ()
>  does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In the case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>  The only way to be sure is to call
>  .BR fsync (2)
>  after you are done writing all your data.
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
       [not found]                               ` <87ingm9n04.fsf-wvvUuzkyo1HefUI2i7LXDhCRmIWqnp/j@public.gmane.org>
@ 2017-09-14 10:48                                 ` Jeff Layton
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff Layton @ 2017-09-14 10:48 UTC (permalink / raw)
  To: NeilBrown, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-man, linux-nfs, linux-fsdevel

On Thu, 2017-09-14 at 09:50 +1000, NeilBrown wrote:
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add details about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in particular.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
> 
> This is my summary of recent changes, and details that have been made
> clear during the exploration of those changes.
> 
> I haven't mentioned the fact that EPERM can be returned by
> write/fsync/close on NFS if the permissions on the server are changed.
> We probably should ... are there other errors that are worth mentioning
> along with EPERM, ENOSPC, EDQUOT ??
> 
> Thanks,
> NeilBronw
> 

Many thanks for doing this! It was on my to-do list. Comments below:

> 
>  man2/close.2 |  9 +++++++++
>  man2/fsync.2 | 19 ++++++++++++++++++-
>  man2/write.2 | 20 +++++++++++++++++---
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 751ec322b1f1..9776c839b8b6 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>  .TP
>  .B EIO
>  An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>  .PP
>  See NOTES for a discussion of why
>  .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index f1a01301da0f..e706a08d360d 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -120,12 +120,29 @@ is set appropriately.
>  is not a valid open file descriptor.
>  .TP
>  .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13 errors from write-back will be reported to
> +all file descriptors that might have written the data which triggered
> +the error, and which are still open.

This is a little awkward. How could we report to a fd that was no longer
open? How about:

"Since Linux 4.13, errors from write-back will be reported to all file
descriptors that were open at the time that the error was recorded."

>   Some filesystems (e.g. NFS)
> +keep close track of which data came through which file descriptor,
> +and give more precise reporting.  Other filesystems (e.g. most local
> +filesystems) will report errors to all file descriptors on the same
> +file.
>  .TP
>  .BR EROFS ", " EINVAL
>  .I fd
>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>  which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>  .SH CONFORMING TO
>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>  .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 6a39b5b5541d..1a9a86b03b04 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>  .BR write ()
>  writes up to
>  .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>  .I buf
>  to the file referred to by the file descriptor
>  .IR fd .
> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>  .TP
>  .B EIO
>  A low-level I/O error occurred while modifying the inode.
> +This error may relate to data written by an earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13 errors from write-back will
> +be reported to all file descriptors that might have
> +written the data which triggered the error, and which are still
> +open.


This is where things get a little more vague.

Some filesystems will return errors on a subsequent write(2) when
previous writeback has failed -- some don't. In either case though,
write(2) should never advance your errseq_t cursor, so only an fsync
will "clear" an earlier error.

I'm not sure how best to convey that in the manpages though.

> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>  .TP
>  .B ENOSPC
>  The device containing the file referred to by
> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>  A successful return from
>  .BR write ()
>  does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In the case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>  The only way to be sure is to call
>  .BR fsync (2)
>  after you are done writing all your data.

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-14 10:48                                 ` Jeff Layton
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff Layton @ 2017-09-14 10:48 UTC (permalink / raw)
  To: NeilBrown, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker-HgOvQuBEEgTQT0dZR+AlfA,
	jlayton-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-man-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA

On Thu, 2017-09-14 at 09:50 +1000, NeilBrown wrote:
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add details about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in particular.
> 
> Signed-off-by: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
> ---
> 
> This is my summary of recent changes, and details that have been made
> clear during the exploration of those changes.
> 
> I haven't mentioned the fact that EPERM can be returned by
> write/fsync/close on NFS if the permissions on the server are changed.
> We probably should ... are there other errors that are worth mentioning
> along with EPERM, ENOSPC, EDQUOT ??
> 
> Thanks,
> NeilBronw
> 

Many thanks for doing this! It was on my to-do list. Comments below:

> 
>  man2/close.2 |  9 +++++++++
>  man2/fsync.2 | 19 ++++++++++++++++++-
>  man2/write.2 | 20 +++++++++++++++++---
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 751ec322b1f1..9776c839b8b6 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>  .TP
>  .B EIO
>  An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>  .PP
>  See NOTES for a discussion of why
>  .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index f1a01301da0f..e706a08d360d 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -120,12 +120,29 @@ is set appropriately.
>  is not a valid open file descriptor.
>  .TP
>  .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13 errors from write-back will be reported to
> +all file descriptors that might have written the data which triggered
> +the error, and which are still open.

This is a little awkward. How could we report to a fd that was no longer
open? How about:

"Since Linux 4.13, errors from write-back will be reported to all file
descriptors that were open at the time that the error was recorded."

>   Some filesystems (e.g. NFS)
> +keep close track of which data came through which file descriptor,
> +and give more precise reporting.  Other filesystems (e.g. most local
> +filesystems) will report errors to all file descriptors on the same
> +file.
>  .TP
>  .BR EROFS ", " EINVAL
>  .I fd
>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>  which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>  .SH CONFORMING TO
>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>  .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 6a39b5b5541d..1a9a86b03b04 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>  .BR write ()
>  writes up to
>  .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>  .I buf
>  to the file referred to by the file descriptor
>  .IR fd .
> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>  .TP
>  .B EIO
>  A low-level I/O error occurred while modifying the inode.
> +This error may relate to data written by an earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13 errors from write-back will
> +be reported to all file descriptors that might have
> +written the data which triggered the error, and which are still
> +open.


This is where things get a little more vague.

Some filesystems will return errors on a subsequent write(2) when
previous writeback has failed -- some don't. In either case though,
write(2) should never advance your errseq_t cursor, so only an fsync
will "clear" an earlier error.

I'm not sure how best to convey that in the manpages though.

> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>  .TP
>  .B ENOSPC
>  The device containing the file referred to by
> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>  A successful return from
>  .BR write ()
>  does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In the case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>  The only way to be sure is to call
>  .BR fsync (2)
>  after you are done writing all your data.

-- 
Jeff Layton <jlayton-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
       [not found]                                   ` <59BA36C5.9000506-fPG8STNUNVg@public.gmane.org>
@ 2017-09-14 22:36                                     ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-14 22:36 UTC (permalink / raw)
  To: wharms-fPG8STNUNVg; +Cc: linux-man-u79uwXL29TY76Z2rM5mHXA

[-- Attachment #1: Type: text/plain, Size: 865 bytes --]

On Thu, Sep 14 2017, walter harms wrote:

> Am 14.09.2017 01:50, schrieb NeilBrown:
>> 
>> Since 4.13, errors from writeback are more reliably reported
>> to all file descriptors that might be relevant.
>> 
>> Add notes to this effect, and also add details about ENOSPC and EDQUOT
>> which can be delayed in a similar manner to EIO - for NFS in particular.
>> 
>
> I never read EDQUOT  before but it is real:
>
> see:
> http://elixir.free-electrons.com/linux/latest/source/security/keys/request_key.c
>
> -EDQUOT
>  * if insufficient key quota was available to create a new key

EDQUOT existed long before security keys.
Its original purpose is to report Disk QUOTa problems.  If you write to
a file and you have exhausted your quota, you get EDQUOT.

http://elixir.free-electrons.com/linux/latest/source/fs/quota/dquot.c#L1318

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
  2017-09-14 10:48                                 ` Jeff Layton
  (?)
@ 2017-09-15  7:50                                 ` Michael Kerrisk (man-pages)
  2017-09-15  8:25                                     ` NeilBrown
  -1 siblings, 1 reply; 33+ messages in thread
From: Michael Kerrisk (man-pages) @ 2017-09-15  7:50 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Trond Myklebust, anna.schumaker, jlayton
  Cc: mtk.manpages, linux-man, linux-nfs, linux-fsdevel

Hi Neil,

Will you revise this patch to incorporate Jeff's comments, or
should I try manually editing them in? (I'd prefer the former.)

Cheers,

Michael


On 09/14/2017 12:48 PM, Jeff Layton wrote:
> On Thu, 2017-09-14 at 09:50 +1000, NeilBrown wrote:
>> Since 4.13, errors from writeback are more reliably reported
>> to all file descriptors that might be relevant.
>>
>> Add notes to this effect, and also add details about ENOSPC and EDQUOT
>> which can be delayed in a similar manner to EIO - for NFS in particular.
>>
>> Signed-off-by: NeilBrown <neilb@suse.com>
>> ---
>>
>> This is my summary of recent changes, and details that have been made
>> clear during the exploration of those changes.
>>
>> I haven't mentioned the fact that EPERM can be returned by
>> write/fsync/close on NFS if the permissions on the server are changed.
>> We probably should ... are there other errors that are worth mentioning
>> along with EPERM, ENOSPC, EDQUOT ??
>>
>> Thanks,
>> NeilBronw
>>
> 
> Many thanks for doing this! It was on my to-do list. Comments below:
> 
>>
>>  man2/close.2 |  9 +++++++++
>>  man2/fsync.2 | 19 ++++++++++++++++++-
>>  man2/write.2 | 20 +++++++++++++++++---
>>  3 files changed, 44 insertions(+), 4 deletions(-)
>>
>> diff --git a/man2/close.2 b/man2/close.2
>> index 751ec322b1f1..9776c839b8b6 100644
>> --- a/man2/close.2
>> +++ b/man2/close.2
>> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>>  .TP
>>  .B EIO
>>  An I/O error occurred.
>> +.TP
>> +.BR ENOSPC ", " EDQUOT
>> +On NFS, these errors are not normally reported against the first write
>> +which exceeds the available storage space, but instead against a
>> +subsequent
>> +.BR write (2),
>> +.BR fsync (2),
>> +or
>> +.BR close (2).
>>  .PP
>>  See NOTES for a discussion of why
>>  .BR close ()
>> diff --git a/man2/fsync.2 b/man2/fsync.2
>> index f1a01301da0f..e706a08d360d 100644
>> --- a/man2/fsync.2
>> +++ b/man2/fsync.2
>> @@ -120,12 +120,29 @@ is set appropriately.
>>  is not a valid open file descriptor.
>>  .TP
>>  .B EIO
>> -An error occurred during synchronization.
>> +An error occurred during synchronization.  This error may relate
>> +to data written to some other file descriptor on the same file.
>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>> +Since Linux 4.13 errors from write-back will be reported to
>> +all file descriptors that might have written the data which triggered
>> +the error, and which are still open.
> 
> This is a little awkward. How could we report to a fd that was no longer
> open? How about:
> 
> "Since Linux 4.13, errors from write-back will be reported to all file
> descriptors that were open at the time that the error was recorded."
> 
>>   Some filesystems (e.g. NFS)
>> +keep close track of which data came through which file descriptor,
>> +and give more precise reporting.  Other filesystems (e.g. most local
>> +filesystems) will report errors to all file descriptors on the same
>> +file.
>>  .TP
>>  .BR EROFS ", " EINVAL
>>  .I fd
>>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>>  which does not support synchronization.
>> +.TP
>> +.BR ENOSPC ", " EDQUOT
>> +.I fd
>> +is bound to a file on NFS or another filesystem which does not allocate
>> +space at the time of a
>> +.BR write (2)
>> +system call, and some previous write failed due to insufficient
>> +storage space.
>>  .SH CONFORMING TO
>>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>>  .SH AVAILABILITY
>> diff --git a/man2/write.2 b/man2/write.2
>> index 6a39b5b5541d..1a9a86b03b04 100644
>> --- a/man2/write.2
>> +++ b/man2/write.2
>> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>>  .BR write ()
>>  writes up to
>>  .I count
>> -bytes from the buffer pointed
>> +bytes from the buffer starting at
>>  .I buf
>>  to the file referred to by the file descriptor
>>  .IR fd .
>> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>>  .TP
>>  .B EIO
>>  A low-level I/O error occurred while modifying the inode.
>> +This error may relate to data written by an earlier
>> +.BR write (2),
>> +which may have been issued to a different file descriptor on
>> +the same file.  Since Linux 4.13 errors from write-back will
>> +be reported to all file descriptors that might have
>> +written the data which triggered the error, and which are still
>> +open.
> 
> 
> This is where things get a little more vague.
> 
> Some filesystems will return errors on a subsequent write(2) when
> previous writeback has failed -- some don't. In either case though,
> write(2) should never advance your errseq_t cursor, so only an fsync
> will "clear" an earlier error.
> 
> I'm not sure how best to convey that in the manpages though.
> 
>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>  .TP
>>  .B ENOSPC
>>  The device containing the file referred to by
>> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>>  A successful return from
>>  .BR write ()
>>  does not make any guarantee that data has been committed to disk.
>> -In fact, on some buggy implementations, it does not even guarantee
>> -that space has successfully been reserved for the data.
>> +On some filesystems, including NFS, it does not even guarantee
>> +that space has successfully been reserved for the data.  In the case,
>> +some errors might be delayed to a future
>> +.BR write (2)
>> +or to
>> +.BR fsync (2)
>> +or even
>> +.BR close (2).
>>  The only way to be sure is to call
>>  .BR fsync (2)
>>  after you are done writing all your data.
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-15  8:25                                     ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-15  8:25 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages),
	Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: mtk.manpages, linux-man, linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 6201 bytes --]


I'll do something, but maybe not for a few days (24 hour sardine
impersonation pending - Europe, here I come...)

NeilBrown

On Fri, Sep 15 2017, Michael Kerrisk (man-pages) wrote:

> Hi Neil,
>
> Will you revise this patch to incorporate Jeff's comments, or
> should I try manually editing them in? (I'd prefer the former.)
>
> Cheers,
>
> Michael
>
>
> On 09/14/2017 12:48 PM, Jeff Layton wrote:
>> On Thu, 2017-09-14 at 09:50 +1000, NeilBrown wrote:
>>> Since 4.13, errors from writeback are more reliably reported
>>> to all file descriptors that might be relevant.
>>>
>>> Add notes to this effect, and also add details about ENOSPC and EDQUOT
>>> which can be delayed in a similar manner to EIO - for NFS in particular.
>>>
>>> Signed-off-by: NeilBrown <neilb@suse.com>
>>> ---
>>>
>>> This is my summary of recent changes, and details that have been made
>>> clear during the exploration of those changes.
>>>
>>> I haven't mentioned the fact that EPERM can be returned by
>>> write/fsync/close on NFS if the permissions on the server are changed.
>>> We probably should ... are there other errors that are worth mentioning
>>> along with EPERM, ENOSPC, EDQUOT ??
>>>
>>> Thanks,
>>> NeilBrown
>>>
>> 
>> Many thanks for doing this! It was on my to-do list. Comments below:
>> 
>>>
>>>  man2/close.2 |  9 +++++++++
>>>  man2/fsync.2 | 19 ++++++++++++++++++-
>>>  man2/write.2 | 20 +++++++++++++++++---
>>>  3 files changed, 44 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/man2/close.2 b/man2/close.2
>>> index 751ec322b1f1..9776c839b8b6 100644
>>> --- a/man2/close.2
>>> +++ b/man2/close.2
>>> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>>>  .TP
>>>  .B EIO
>>>  An I/O error occurred.
>>> +.TP
>>> +.BR ENOSPC ", " EDQUOT
>>> +On NFS, these errors are not normally reported against the first write
>>> +which exceeds the available storage space, but instead against a
>>> +subsequent
>>> +.BR write (2),
>>> +.BR fsync (2),
>>> +or
>>> +.BR close (2).
>>>  .PP
>>>  See NOTES for a discussion of why
>>>  .BR close ()
>>> diff --git a/man2/fsync.2 b/man2/fsync.2
>>> index f1a01301da0f..e706a08d360d 100644
>>> --- a/man2/fsync.2
>>> +++ b/man2/fsync.2
>>> @@ -120,12 +120,29 @@ is set appropriately.
>>>  is not a valid open file descriptor.
>>>  .TP
>>>  .B EIO
>>> -An error occurred during synchronization.
>>> +An error occurred during synchronization.  This error may relate
>>> +to data written to some other file descriptor on the same file.
>>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>> +Since Linux 4.13 errors from write-back will be reported to
>>> +all file descriptors that might have written the data which triggered
>>> +the error, and which are still open.
>> 
>> This is a little awkward. How could we report to a fd that was no longer
>> open? How about:
>> 
>> "Since Linux 4.13, errors from write-back will be reported to all file
>> descriptors that were open at the time that the error was recorded."
>> 
>>>   Some filesystems (e.g. NFS)
>>> +keep close track of which data came through which file descriptor,
>>> +and give more precise reporting.  Other filesystems (e.g. most local
>>> +filesystems) will report errors to all file descriptors on the same
>>> +file.
>>>  .TP
>>>  .BR EROFS ", " EINVAL
>>>  .I fd
>>>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>>>  which does not support synchronization.
>>> +.TP
>>> +.BR ENOSPC ", " EDQUOT
>>> +.I fd
>>> +is bound to a file on NFS or another filesystem which does not allocate
>>> +space at the time of a
>>> +.BR write (2)
>>> +system call, and some previous write failed due to insufficient
>>> +storage space.
>>>  .SH CONFORMING TO
>>>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>>>  .SH AVAILABILITY
>>> diff --git a/man2/write.2 b/man2/write.2
>>> index 6a39b5b5541d..1a9a86b03b04 100644
>>> --- a/man2/write.2
>>> +++ b/man2/write.2
>>> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>>>  .BR write ()
>>>  writes up to
>>>  .I count
>>> -bytes from the buffer pointed
>>> +bytes from the buffer starting at
>>>  .I buf
>>>  to the file referred to by the file descriptor
>>>  .IR fd .
>>> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>>>  .TP
>>>  .B EIO
>>>  A low-level I/O error occurred while modifying the inode.
>>> +This error may relate to data written by an earlier
>>> +.BR write (2),
>>> +which may have been issued to a different file descriptor on
>>> +the same file.  Since Linux 4.13 errors from write-back will
>>> +be reported to all file descriptors that might have
>>> +written the data which triggered the error, and which are still
>>> +open.
>> 
>> 
>> This is where things get a little more vague.
>> 
>> Some filesystems will return errors on a subsequent write(2) when
>> previous writeback has failed -- some don't. In either case though,
>> write(2) should never advance your errseq_t cursor, so only an fsync
>> will "clear" an earlier error.
>> 
>> I'm not sure how best to convey that in the manpages though.
>> 
>>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>>  .TP
>>>  .B ENOSPC
>>>  The device containing the file referred to by
>>> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>>>  A successful return from
>>>  .BR write ()
>>>  does not make any guarantee that data has been committed to disk.
>>> -In fact, on some buggy implementations, it does not even guarantee
>>> -that space has successfully been reserved for the data.
>>> +On some filesystems, including NFS, it does not even guarantee
>>> +that space has successfully been reserved for the data.  In that case,
>>> +some errors might be delayed to a future
>>> +.BR write (2)
>>> +or to
>>> +.BR fsync (2)
>>> +or even
>>> +.BR close (2).
>>>  The only way to be sure is to call
>>>  .BR fsync (2)
>>>  after you are done writing all your data.
>> 
>
>
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-15  8:25                                     ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-15  8:25 UTC (permalink / raw)
  To: Jeff Layton, Trond Myklebust, anna.schumaker@netapp.com,
	jlayton@kernel.org
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	linux-man-u79uwXL29TY76Z2rM5mHXA, linux-nfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 6221 bytes --]


I'll do something, but maybe not for a few days (24 hour sardine
impersonation pending - Europe, here I come...)

NeilBrown

On Fri, Sep 15 2017, Michael Kerrisk (man-pages) wrote:

> Hi Neil,
>
> Will you revise this patch to incorporate Jeff's comments, or
> should I try manually editing them in? (I'd prefer the former.)
>
> Cheers,
>
> Michael
>
>
> On 09/14/2017 12:48 PM, Jeff Layton wrote:
>> On Thu, 2017-09-14 at 09:50 +1000, NeilBrown wrote:
>>> Since 4.13, errors from writeback are more reliably reported
>>> to all file descriptors that might be relevant.
>>>
>>> Add notes to this effect, and also add details about ENOSPC and EDQUOT
>>> which can be delayed in a similar manner to EIO - for NFS in particular.
>>>
>>> Signed-off-by: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
>>> ---
>>>
>>> This is my summary of recent changes, and details that have been made
>>> clear during the exploration of those changes.
>>>
>>> I haven't mentioned the fact that EPERM can be returned by
>>> write/fsync/close on NFS if the permissions on the server are changed.
>>> We probably should ... are there other errors that are worth mentioning
>>> along with EPERM, ENOSPC, EDQUOT ??
>>>
>>> Thanks,
>>> NeilBrown
>>>
>> 
>> Many thanks for doing this! It was on my to-do list. Comments below:
>> 
>>>
>>>  man2/close.2 |  9 +++++++++
>>>  man2/fsync.2 | 19 ++++++++++++++++++-
>>>  man2/write.2 | 20 +++++++++++++++++---
>>>  3 files changed, 44 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/man2/close.2 b/man2/close.2
>>> index 751ec322b1f1..9776c839b8b6 100644
>>> --- a/man2/close.2
>>> +++ b/man2/close.2
>>> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>>>  .TP
>>>  .B EIO
>>>  An I/O error occurred.
>>> +.TP
>>> +.BR ENOSPC ", " EDQUOT
>>> +On NFS, these errors are not normally reported against the first write
>>> +which exceeds the available storage space, but instead against a
>>> +subsequent
>>> +.BR write (2),
>>> +.BR fsync (2),
>>> +or
>>> +.BR close (2).
>>>  .PP
>>>  See NOTES for a discussion of why
>>>  .BR close ()
>>> diff --git a/man2/fsync.2 b/man2/fsync.2
>>> index f1a01301da0f..e706a08d360d 100644
>>> --- a/man2/fsync.2
>>> +++ b/man2/fsync.2
>>> @@ -120,12 +120,29 @@ is set appropriately.
>>>  is not a valid open file descriptor.
>>>  .TP
>>>  .B EIO
>>> -An error occurred during synchronization.
>>> +An error occurred during synchronization.  This error may relate
>>> +to data written to some other file descriptor on the same file.
>>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>> +Since Linux 4.13 errors from write-back will be reported to
>>> +all file descriptors that might have written the data which triggered
>>> +the error, and which are still open.
>> 
>> This is a little awkward. How could we report to a fd that was no longer
>> open? How about:
>> 
>> "Since Linux 4.13, errors from write-back will be reported to all file
>> descriptors that were open at the time that the error was recorded."
>> 
>>>   Some filesystems (e.g. NFS)
>>> +keep close track of which data came through which file descriptor,
>>> +and give more precise reporting.  Other filesystems (e.g. most local
>>> +filesystems) will report errors to all file descriptors on the same
>>> +file.
>>>  .TP
>>>  .BR EROFS ", " EINVAL
>>>  .I fd
>>>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>>>  which does not support synchronization.
>>> +.TP
>>> +.BR ENOSPC ", " EDQUOT
>>> +.I fd
>>> +is bound to a file on NFS or another filesystem which does not allocate
>>> +space at the time of a
>>> +.BR write (2)
>>> +system call, and some previous write failed due to insufficient
>>> +storage space.
>>>  .SH CONFORMING TO
>>>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>>>  .SH AVAILABILITY
>>> diff --git a/man2/write.2 b/man2/write.2
>>> index 6a39b5b5541d..1a9a86b03b04 100644
>>> --- a/man2/write.2
>>> +++ b/man2/write.2
>>> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>>>  .BR write ()
>>>  writes up to
>>>  .I count
>>> -bytes from the buffer pointed
>>> +bytes from the buffer starting at
>>>  .I buf
>>>  to the file referred to by the file descriptor
>>>  .IR fd .
>>> @@ -181,6 +181,14 @@ or the file offset is not suitably aligned.
>>>  .TP
>>>  .B EIO
>>>  A low-level I/O error occurred while modifying the inode.
>>> +This error may relate to data written by an earlier
>>> +.BR write (2),
>>> +which may have been issued to a different file descriptor on
>>> +the same file.  Since Linux 4.13 errors from write-back will
>>> +be reported to all file descriptors that might have
>>> +written the data which triggered the error, and which are still
>>> +open.
>> 
>> 
>> This is where things get a little more vague.
>> 
>> Some filesystems will return errors on a subsequent write(2) when
>> previous writeback has failed -- some don't. In either case though,
>> write(2) should never advance your errseq_t cursor, so only an fsync
>> will "clear" an earlier error.
>> 
>> I'm not sure how best to convey that in the manpages though.
>> 
>>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>>  .TP
>>>  .B ENOSPC
>>>  The device containing the file referred to by
>>> @@ -222,8 +230,14 @@ unsigned and signed integer data types specified by POSIX.1.
>>>  A successful return from
>>>  .BR write ()
>>>  does not make any guarantee that data has been committed to disk.
>>> -In fact, on some buggy implementations, it does not even guarantee
>>> -that space has successfully been reserved for the data.
>>> +On some filesystems, including NFS, it does not even guarantee
>>> +that space has successfully been reserved for the data.  In that case,
>>> +some errors might be delayed to a future
>>> +.BR write (2)
>>> +or to
>>> +.BR fsync (2)
>>> +or even
>>> +.BR close (2).
>>>  The only way to be sure is to call
>>>  .BR fsync (2)
>>>  after you are done writing all your data.
>> 
>
>
> -- 
> Michael Kerrisk
> Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
> Linux/UNIX System Programming Training: http://man7.org/training/

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-28  3:01                                   ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-28  3:01 UTC (permalink / raw)
  To: Jeff Layton, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-man, linux-nfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 7147 bytes --]

On Thu, Sep 14 2017, Jeff Layton wrote:

>>  .TP
>>  .B EIO
>> -An error occurred during synchronization.
>> +An error occurred during synchronization.  This error may relate
>> +to data written to some other file descriptor on the same file.
>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>> +Since Linux 4.13 errors from write-back will be reported to
>> +all file descriptors that might have written the data which triggered
>> +the error, and which are still open.
>
> This is a little awkward. How could we report to a fd that was no longer
> open? How about:
>
> "Since Linux 4.13, errors from write-back will be reported to all file
> descriptors that were open at the time that the error was recorded."

That might be simpler, but it is less correct.  As I go on to say, NFS
*doesn't* report on all file descriptors that were open at that time.

I've changed it to

-------------------
Since Linux 4.13, errors from write-back will be reported to
all file descriptors that might have written the data which triggered
the error.  Some filesystems (e.g. NFS) keep close track of which data
came through which file descriptor, and give precise reporting.
Other filesystems (e.g. most local filesystems) will report errors to
all file descriptors that were open on the file when the error was recorded.
------------------

which includes some of your text, and removes the "that are still open"
which probably doesn't help.

>>  .TP
>>  .B EIO
>>  A low-level I/O error occurred while modifying the inode.
>> +This error may relate to data written by an earlier
>> +.BR write (2),
>> +which may have been issued to a different file descriptor on
>> +the same file.  Since Linux 4.13 errors from write-back will
>> +be reported to all file descriptors that might have
>> +written the data which triggered the error, and which are still
>> +open.
>
>
> This is where things get a little more vague.
>
> Some filesystems will return errors on a subsequent write(2) when
> previous writeback has failed -- some don't. In either case though,
> write(2) should never advance your errseq_t cursor, so only an fsync
> will "clear" an earlier error.
>
> I'm not sure how best to convey that in the manpages though.

How about:

-------------
This error may relate to the write-back of data written by an
earlier
.BR write (2),
which may have been issued to a different file descriptor on
the same file.  Since Linux 4.13, errors from write-back come
with a promise that they
.I may
be reported by subsequent
.BR write (2)
requests, and
.I will
be reported by a subsequent
.BR fsync (2)
(whether or not they were also reported by
.BR write (2)).
------------
??

Those changes are included in the following.

Thanks,
NeilBrown

From: NeilBrown <neilb@suse.com>
Date: Thu, 14 Sep 2017 09:44:43 +1000
Subject: [PATCH] write.2, fsync.2, close.2: update description of error codes

Since 4.13, errors from writeback are more reliably reported
to all file descriptors that might be relevant.

Add notes to this effect, and also add detail about ENOSPC and EDQUOT
which can be delayed in a similar manner to EIO - for NFS in particular.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 man2/close.2 |  9 +++++++++
 man2/fsync.2 | 18 +++++++++++++++++-
 man2/write.2 | 28 +++++++++++++++++++++++++---
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/man2/close.2 b/man2/close.2
index 55d89ed3dbc7..136bd0be3f67 100644
--- a/man2/close.2
+++ b/man2/close.2
@@ -82,6 +82,15 @@ call was interrupted by a signal; see
 .TP
 .B EIO
 An I/O error occurred.
+.TP
+.BR ENOSPC ", " EDQUOT
+On NFS, these errors are not normally reported against the first write
+which exceeds the available storage space, but instead against a
+subsequent
+.BR write (2),
+.BR fsync (2),
+or
+.BR close (2).
 .PP
 See NOTES for a discussion of why
 .BR close ()
diff --git a/man2/fsync.2 b/man2/fsync.2
index eed3c460bea9..c7878bf3496f 100644
--- a/man2/fsync.2
+++ b/man2/fsync.2
@@ -121,7 +121,15 @@ is set appropriately.
 is not a valid open file descriptor.
 .TP
 .B EIO
-An error occurred during synchronization.
+An error occurred during synchronization.  This error may relate
+to data written to some other file descriptor on the same file.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+Since Linux 4.13, errors from write-back will be reported to
+all file descriptors that might have written the data which triggered
+the error.  Some filesystems (e.g. NFS) keep close track of which data
+came through which file descriptor, and give more precise reporting.
+Other filesystems (e.g. most local filesystems) will report errors to
+all file descriptors that were open on the file when the error was recorded.
 .TP
 .B ENOSPC
 Disk space was exhausted while synchronizing.
@@ -130,6 +138,14 @@ Disk space was exhausted while synchronizing.
 .I fd
 is bound to a special file (e.g., a pipe, FIFO, or socket)
 which does not support synchronization.
+.TP
+.BR ENOSPC ", " EDQUOT
+.I fd
+is bound to a file on NFS or another filesystem which does not allocate
+space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
 .SH CONFORMING TO
 POSIX.1-2001, POSIX.1-2008, 4.3BSD.
 .SH AVAILABILITY
diff --git a/man2/write.2 b/man2/write.2
index 061aa70cf590..b1cc3a2cfb17 100644
--- a/man2/write.2
+++ b/man2/write.2
@@ -47,7 +47,7 @@ write \- write to a file descriptor
 .BR write ()
 writes up to
 .I count
-bytes from the buffer pointed
+bytes from the buffer starting at
 .I buf
 to the file referred to by the file descriptor
 .IR fd .
@@ -181,6 +181,22 @@ or the file offset is not suitably aligned.
 .TP
 .B EIO
 A low-level I/O error occurred while modifying the inode.
+This error may relate to the write-back of data written by an
+earlier
+.BR write (2),
+which may have been issued to a different file descriptor on
+the same file.  Since Linux 4.13, errors from write-back come
+with a promise that they
+.I may
+be reported by subsequent
+.BR write (2)
+requests, and
+.I will
+be reported by a subsequent
+.BR fsync (2)
+(whether or not they were also reported by
+.BR write (2)).
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
 .TP
 .B ENOSPC
 The device containing the file referred to by
@@ -222,8 +238,14 @@ unsigned and signed integer data types specified by POSIX.1.
 A successful return from
 .BR write ()
 does not make any guarantee that data has been committed to disk.
-In fact, on some buggy implementations, it does not even guarantee
-that space has successfully been reserved for the data.
+On some filesystems, including NFS, it does not even guarantee
+that space has successfully been reserved for the data.  In that case,
+some errors might be delayed to a future
+.BR write (2)
+or to
+.BR fsync (2)
+or even
+.BR close (2).
 The only way to be sure is to call
 .BR fsync (2)
 after you are done writing all your data.
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-28  3:01                                   ` NeilBrown
  0 siblings, 0 replies; 33+ messages in thread
From: NeilBrown @ 2017-09-28  3:01 UTC (permalink / raw)
  To: Jeff Layton, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker@netapp.com, jlayton@kernel.org
  Cc: linux-man-u79uwXL29TY76Z2rM5mHXA, linux-nfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 7187 bytes --]

On Thu, Sep 14 2017, Jeff Layton wrote:

>>  .TP
>>  .B EIO
>> -An error occurred during synchronization.
>> +An error occurred during synchronization.  This error may relate
>> +to data written to some other file descriptor on the same file.
>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>> +Since Linux 4.13 errors from write-back will be reported to
>> +all file descriptors that might have written the data which triggered
>> +the error, and which are still open.
>
> This is a little awkward. How could we report to a fd that was no longer
> open? How about:
>
> "Since Linux 4.13, errors from write-back will be reported to all file
> descriptors that were open at the time that the error was recorded."

That might be simpler, but it is less correct.  As I go on to say, NFS
*doesn't* report on all file descriptors that were open at that time.

I've changed it to

-------------------
Since Linux 4.13, errors from write-back will be reported to
all file descriptors that might have written the data which triggered
the error.  Some filesystems (e.g. NFS) keep close track of which data
came through which file descriptor, and give precise reporting.
Other filesystems (e.g. most local filesystems) will report errors to
all file descriptors that were open on the file when the error was recorded.
------------------

which includes some of your text, and removes the "that are still open"
which probably doesn't help.

>>  .TP
>>  .B EIO
>>  A low-level I/O error occurred while modifying the inode.
>> +This error may relate to data written by an earlier
>> +.BR write (2),
>> +which may have been issued to a different file descriptor on
>> +the same file.  Since Linux 4.13 errors from write-back will
>> +be reported to all file descriptors that might have
>> +written the data which triggered the error, and which are still
>> +open.
>
>
> This is where things get a little more vague.
>
> Some filesystems will return errors on a subsequent write(2) when
> previous writeback has failed -- some don't. In either case though,
> write(2) should never advance your errseq_t cursor, so only an fsync
> will "clear" an earlier error.
>
> I'm not sure how best to convey that in the manpages though.

How about:

-------------
This error may relate to the write-back of data written by an
earlier
.BR write (2),
which may have been issued to a different file descriptor on
the same file.  Since Linux 4.13, errors from write-back come
with a promise that they
.I may
be reported by subsequent
.BR write (2)
requests, and
.I will
be reported by a subsequent
.BR fsync (2)
(whether or not they were also reported by
.BR write (2)).
------------
??

Those changes are included in the following.

Thanks,
NeilBrown

From: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
Date: Thu, 14 Sep 2017 09:44:43 +1000
Subject: [PATCH] write.2, fsync.2, close.2: update description of error codes

Since 4.13, errors from writeback are more reliably reported
to all file descriptors that might be relevant.

Add notes to this effect, and also add detail about ENOSPC and EDQUOT
which can be delayed in a similar manner to EIO - for NFS in particular.

Signed-off-by: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
---
 man2/close.2 |  9 +++++++++
 man2/fsync.2 | 18 +++++++++++++++++-
 man2/write.2 | 28 +++++++++++++++++++++++++---
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/man2/close.2 b/man2/close.2
index 55d89ed3dbc7..136bd0be3f67 100644
--- a/man2/close.2
+++ b/man2/close.2
@@ -82,6 +82,15 @@ call was interrupted by a signal; see
 .TP
 .B EIO
 An I/O error occurred.
+.TP
+.BR ENOSPC ", " EDQUOT
+On NFS, these errors are not normally reported against the first write
+which exceeds the available storage space, but instead against a
+subsequent
+.BR write (2),
+.BR fsync (2),
+or
+.BR close (2).
 .PP
 See NOTES for a discussion of why
 .BR close ()
diff --git a/man2/fsync.2 b/man2/fsync.2
index eed3c460bea9..c7878bf3496f 100644
--- a/man2/fsync.2
+++ b/man2/fsync.2
@@ -121,7 +121,15 @@ is set appropriately.
 is not a valid open file descriptor.
 .TP
 .B EIO
-An error occurred during synchronization.
+An error occurred during synchronization.  This error may relate
+to data written to some other file descriptor on the same file.
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+Since Linux 4.13, errors from write-back will be reported to
+all file descriptors that might have written the data which triggered
+the error.  Some filesystems (e.g. NFS) keep close track of which data
+came through which file descriptor, and give more precise reporting.
+Other filesystems (e.g. most local filesystems) will report errors to
+all file descriptors that were open on the file when the error was recorded.
 .TP
 .B ENOSPC
 Disk space was exhausted while synchronizing.
@@ -130,6 +138,14 @@ Disk space was exhausted while synchronizing.
 .I fd
 is bound to a special file (e.g., a pipe, FIFO, or socket)
 which does not support synchronization.
+.TP
+.BR ENOSPC ", " EDQUOT
+.I fd
+is bound to a file on NFS or another filesystem which does not allocate
+space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
 .SH CONFORMING TO
 POSIX.1-2001, POSIX.1-2008, 4.3BSD.
 .SH AVAILABILITY
diff --git a/man2/write.2 b/man2/write.2
index 061aa70cf590..b1cc3a2cfb17 100644
--- a/man2/write.2
+++ b/man2/write.2
@@ -47,7 +47,7 @@ write \- write to a file descriptor
 .BR write ()
 writes up to
 .I count
-bytes from the buffer pointed
+bytes from the buffer starting at
 .I buf
 to the file referred to by the file descriptor
 .IR fd .
@@ -181,6 +181,22 @@ or the file offset is not suitably aligned.
 .TP
 .B EIO
 A low-level I/O error occurred while modifying the inode.
+This error may relate to the write-back of data written by an
+earlier
+.BR write (2),
+which may have been issued to a different file descriptor on
+the same file.  Since Linux 4.13, errors from write-back come
+with a promise that they
+.I may
+be reported by subsequent
+.BR write (2)
+requests, and
+.I will
+be reported by a subsequent
+.BR fsync (2)
+(whether or not they were also reported by
+.BR write (2)).
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
 .TP
 .B ENOSPC
 The device containing the file referred to by
@@ -222,8 +238,14 @@ unsigned and signed integer data types specified by POSIX.1.
 A successful return from
 .BR write ()
 does not make any guarantee that data has been committed to disk.
-In fact, on some buggy implementations, it does not even guarantee
-that space has successfully been reserved for the data.
+On some filesystems, including NFS, it does not even guarantee
+that space has successfully been reserved for the data.  In that case,
+some errors might be delayed to a future
+.BR write (2)
+or to
+.BR fsync (2)
+or even
+.BR close (2).
 The only way to be sure is to call
 .BR fsync (2)
 after you are done writing all your data.
-- 
2.14.0.rc0.dirty


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-28 12:20                                     ` Jeff Layton
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff Layton @ 2017-09-28 12:20 UTC (permalink / raw)
  To: NeilBrown, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker, jlayton
  Cc: linux-man, linux-nfs, linux-fsdevel

On Thu, 2017-09-28 at 13:01 +1000, NeilBrown wrote:
> On Thu, Sep 14 2017, Jeff Layton wrote:
> 
> > >  .TP
> > >  .B EIO
> > > -An error occurred during synchronization.
> > > +An error occurred during synchronization.  This error may relate
> > > +to data written to some other file descriptor on the same file.
> > > +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> > > +Since Linux 4.13 errors from write-back will be reported to
> > > +all file descriptors that might have written the data which
> > > triggered
> > > +the error, and which are still open.
> > 
> > This is a little awkward. How could we report to a fd that was no
> > longer
> > open? How about:
> > 
> > "Since Linux 4.13, errors from write-back will be reported to all
> > file
> > descriptors that were open at the time that the error was
> > recorded."
> 
> That might be simpler, but it is less correct.  As I go on to say,
> NFS
> *doesn't* report on all file descriptors that were open at that time.
> 
> I've changed it to
> 
> -------------------
> Since Linux 4.13, errors from write-back will be reported to
> all file descriptors that might have written the data which triggered
> the error.  Some filesystems (e.g. NFS) keep close track of which
> data
> came through which file descriptor, and give precise reporting.
> Other filesystems (e.g. most local filesystems) will report errors to
> all file descriptors that were open on the file when the error was
> recorded.
> ------------------
> 
> which includes some of your text, and removes the "that are still
> open"
> which probably doesn't help.
> 
> > >  .TP
> > >  .B EIO
> > >  A low-level I/O error occurred while modifying the inode.
> > > +This error may relate to data written by an earlier
> > > +.BR write (2),
> > > +which may have been issued to a different file descriptor on
> > > +the same file.  Since Linux 4.13 errors from write-back will
> > > +be reported to all file descriptors that might have
> > > +written the data which triggered the error, and which are still
> > > +open.
> > 
> > 
> > This is where things get a little more vague.
> > 
> > Some filesystems will return errors on a subsequent write(2) when
> > previous writeback has failed -- some don't. In either case though,
> > write(2) should never advance your errseq_t cursor, so only an
> > fsync
> > will "clear" an earlier error.
> > 
> > I'm not sure how best to convey that in the manpages though.
> 
> How about:
> 
> -------------
> This error may relate to the write-back of data written by an
> earlier
> .BR write (2),
> which may have been issued to a different file descriptor on
> the same file.  Since Linux 4.13, errors from write-back come
> with a promise that they
> .I may
> be reported by subsequent
> .BR write (2)
> requests, and
> .I will
> be reported by a subsequent
> .BR fsync (2)
> (whether or not they were also reported by
> .BR write (2)).
> ------------
> ??
> 
> Those changes are included in the following.
> 
> Thanks,
> NeilBrown
> 
> From: NeilBrown <neilb@suse.com>
> Date: Thu, 14 Sep 2017 09:44:43 +1000
> Subject: [PATCH] write.2, fsync.2, close.2: update description of
> error codes
> 
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add detail about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in
> particular.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  man2/close.2 |  9 +++++++++
>  man2/fsync.2 | 18 +++++++++++++++++-
>  man2/write.2 | 28 +++++++++++++++++++++++++---
>  3 files changed, 51 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 55d89ed3dbc7..136bd0be3f67 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>  .TP
>  .B EIO
>  An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first
> write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>  .PP
>  See NOTES for a discussion of why
>  .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index eed3c460bea9..c7878bf3496f 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -121,7 +121,15 @@ is set appropriately.
>  is not a valid open file descriptor.
>  .TP
>  .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13, errors from write-back will be reported to
> +all file descriptors that might have written the data which
> triggered
> +the error.  Some filesystems (e.g. NFS) keep close track of which
> data
> +came through which file descriptor, and give more precise reporting.
> +Other filesystems (e.g. most local filesystems) will report errors
> to
> +all file descriptors that were open on the file when the error was
> recorded.
>  .TP
>  .B ENOSPC
>  Disk space was exhausted while synchronizing.
> @@ -130,6 +138,14 @@ Disk space was exhausted while synchronizing.
>  .I fd
>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>  which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not
> allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>  .SH CONFORMING TO
>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>  .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 061aa70cf590..b1cc3a2cfb17 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>  .BR write ()
>  writes up to
>  .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>  .I buf
>  to the file referred to by the file descriptor
>  .IR fd .
> @@ -181,6 +181,22 @@ or the file offset is not suitably aligned.
>  .TP
>  .B EIO
>  A low-level I/O error occurred while modifying the inode.
> +This error may relate to the write-back of data written by an
> +earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13, errors from write-back come
> +with a promise that they
> +.I may
> +be reported by subsequent
> +.BR write (2)
> +requests, and
> +.I will
> +be reported by a subsequent
> +.BR fsync (2)
> +(whether or not they were also reported by
> +.BR write (2)).
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>  .TP
>  .B ENOSPC
>  The device containing the file referred to by
> @@ -222,8 +238,14 @@ unsigned and signed integer data types specified
> by POSIX.1.
>  A successful return from
>  .BR write ()
>  does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In this
> case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>  The only way to be sure is to call
>  .BR fsync (2)
>  after you are done writing all your data.

Looks good to me!

Reviewed-by: Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
@ 2017-09-28 12:20                                     ` Jeff Layton
  0 siblings, 0 replies; 33+ messages in thread
From: Jeff Layton @ 2017-09-28 12:20 UTC (permalink / raw)
  To: NeilBrown, Michael Kerrisk (man-pages),
	Trond Myklebust, anna.schumaker-HgOvQuBEEgTQT0dZR+AlfA,
	jlayton-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-man-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA

On Thu, 2017-09-28 at 13:01 +1000, NeilBrown wrote:
> On Thu, Sep 14 2017, Jeff Layton wrote:
> 
> > >  .TP
> > >  .B EIO
> > > -An error occurred during synchronization.
> > > +An error occurred during synchronization.  This error may relate
> > > +to data written to some other file descriptor on the same file.
> > > +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> > > +Since Linux 4.13 errors from write-back will be reported to
> > > +all file descriptors that might have written the data which
> > > triggered
> > > +the error, and which are still open.
> > 
> > This is a little awkward. How could we report to a fd that was no
> > longer
> > open? How about:
> > 
> > "Since Linux 4.13, errors from write-back will be reported to all
> > file
> > descriptors that were open at the time that the error was
> > recorded."
> 
> That might be simpler, but it is less correct.  As I go on to say,
> NFS
> *doesn't* report on all file descriptors that were open at that time.
> 
> I've changed it to
> 
> -------------------
> Since Linux 4.13, errors from write-back will be reported to
> all file descriptors that might have written the data which triggered
> the error.  Some filesystems (e.g. NFS) keep close track of which
> data
> came through which file descriptor, and give precise reporting.
> Other filesystems (e.g. most local filesystems) will report errors to
> all file descriptors that were open on the file when the error was
> recorded.
> ------------------
> 
> which includes some of your text, and removes the "that are still
> open"
> which probably doesn't help.
> 
> > >  .TP
> > >  .B EIO
> > >  A low-level I/O error occurred while modifying the inode.
> > > +This error may relate to data written by an earlier
> > > +.BR write (2),
> > > +which may have been issued to a different file descriptor on
> > > +the same file.  Since Linux 4.13 errors from write-back will
> > > +be reported to all file descriptors that might have
> > > +written the data which triggered the error, and which are still
> > > +open.
> > 
> > 
> > This is where things get a little more vague.
> > 
> > Some filesystems will return errors on a subsequent write(2) when
> > previous writeback has failed -- some don't. In either case though,
> > write(2) should never advance your errseq_t cursor, so only an
> > fsync
> > will "clear" an earlier error.
> > 
> > I'm not sure how best to convey that in the manpages though.
> 
> How about:
> 
> -------------
> This error may relate to the write-back of data written by an
> earlier
> .BR write (2),
> which may have been issued to a different file descriptor on
> the same file.  Since Linux 4.13, errors from write-back come
> with a promise that they
> .I may
> be reported by subsequent
> .BR write (2)
> requests, and
> .I will
> be reported by a subsequent
> .BR fsync (2)
> (whether or not they were also reported by
> .BR write (2)).
> ------------
> ??
> 
> Those changes are included in the following.
> 
> Thanks,
> NeilBrown
> 
> From: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
> Date: Thu, 14 Sep 2017 09:44:43 +1000
> Subject: [PATCH] write.2, fsync.2, close.2: update description of
> error codes
> 
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add detail about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in
> particular.
> 
> Signed-off-by: NeilBrown <neilb-IBi9RG/b67k@public.gmane.org>
> ---
>  man2/close.2 |  9 +++++++++
>  man2/fsync.2 | 18 +++++++++++++++++-
>  man2/write.2 | 28 +++++++++++++++++++++++++---
>  3 files changed, 51 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 55d89ed3dbc7..136bd0be3f67 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>  .TP
>  .B EIO
>  An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first
> write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>  .PP
>  See NOTES for a discussion of why
>  .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index eed3c460bea9..c7878bf3496f 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -121,7 +121,15 @@ is set appropriately.
>  is not a valid open file descriptor.
>  .TP
>  .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13, errors from write-back will be reported to
> +all file descriptors that might have written the data which
> triggered
> +the error.  Some filesystems (e.g. NFS) keep close track of which
> data
> +came through which file descriptor, and give more precise reporting.
> +Other filesystems (e.g. most local filesystems) will report errors
> to
> +all file descriptors that were open on the file when the error was
> recorded.
>  .TP
>  .B ENOSPC
>  Disk space was exhausted while synchronizing.
> @@ -130,6 +138,14 @@ Disk space was exhausted while synchronizing.
>  .I fd
>  is bound to a special file (e.g., a pipe, FIFO, or socket)
>  which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not
> allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>  .SH CONFORMING TO
>  POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>  .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 061aa70cf590..b1cc3a2cfb17 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>  .BR write ()
>  writes up to
>  .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>  .I buf
>  to the file referred to by the file descriptor
>  .IR fd .
> @@ -181,6 +181,22 @@ or the file offset is not suitably aligned.
>  .TP
>  .B EIO
>  A low-level I/O error occurred while modifying the inode.
> +This error may relate to the write-back of data written by an
> +earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13, errors from write-back come
> +with a promise that they
> +.I may
> +be reported by subsequent
> +.BR write (2)
> +requests, and
> +.I will
> +be reported by a subsequent
> +.BR fsync (2)
> +(whether or not they were also reported by
> +.BR write (2)).
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>  .TP
>  .B ENOSPC
>  The device containing the file referred to by
> @@ -222,8 +238,14 @@ unsigned and signed integer data types specified
> by POSIX.1.
>  A successful return from
>  .BR write ()
>  does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In this
> case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>  The only way to be sure is to call
>  .BR fsync (2)
>  after you are done writing all your data.

Looks good to me!

Reviewed-by: Jeff Layton <jlayton-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes
  2017-09-28  3:01                                   ` NeilBrown
  (?)
  (?)
@ 2017-09-28 16:19                                   ` Michael Kerrisk (man-opages)
  -1 siblings, 0 replies; 33+ messages in thread
From: Michael Kerrisk (man-opages) @ 2017-09-28 16:19 UTC (permalink / raw)
  To: NeilBrown, Jeff Layton, Trond Myklebust, anna.schumaker, jlayton
  Cc: mtk.manpages, linux-man, linux-nfs, linux-fsdevel

Hi Neil,

On 09/28/2017 05:01 AM, NeilBrown wrote:
> On Thu, Sep 14 2017, Jeff Layton wrote:
> 
>>>   .TP
>>>   .B EIO
>>> -An error occurred during synchronization.
>>> +An error occurred during synchronization.  This error may relate
>>> +to data written to some other file descriptor on the same file.
>>> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>>> +Since Linux 4.13 errors from write-back will be reported to
>>> +all file descriptors that might have written the data which triggered
>>> +the error, and which are still open.
>>
>> This is a little awkward. How could we report to a fd that was no longer
>> open? How about:
>>
>> "Since Linux 4.13, errors from write-back will be reported to all file
>> descriptors that were open at the time that the error was recorded."
> 
> That might be simpler, but it is less correct.  As I go on to say, NFS
> *doesn't* report on all file descriptors that were open at that time.
> 
> I've changed it to
> 
> -------------------
> Since Linux 4.13, errors from write-back will be reported to
> all file descriptors that might have written the data which triggered
> the error.  Some filesystems (e.g. NFS) keep close track of which data
> came through which file descriptor, and give precise reporting.
> Other filesystems (e.g. most local filesystems) will report errors to
> all file descriptors that were open on the file when the error was recorded.
> ------------------
> 
> which includes some of your text, and removes the "that are still open"
> which probably doesn't help.
> 
>>>   .TP
>>>   .B EIO
>>>   A low-level I/O error occurred while modifying the inode.
>>> +This error may relate to data written by an earlier
>>> +.BR write (2),
>>> +which may have been issued to a different file descriptor on
>>> +the same file.  Since Linux 4.13 errors from write-back will
>>> +be reported to all file descriptors that might have
>>> +written the data which triggered the error, and which are still
>>> +open.
>>
>>
>> This is where things get a little more vague.
>>
>> Some filesystems will return errors on a subsequent write(2) when
>> previous writeback has failed -- some don't. In either case though,
>> write(2) should never advance your errseq_t cursor, so only an fsync
>> will "clear" an earlier error.
>>
>> I'm not sure how best to convey that in the manpages though.
> 
> How about:
> 
> -------------
> This error may relate to the write-back of data written by an
> earlier
> .BR write (2),
> which may have been issued to a different file descriptor on
> the same file.  Since Linux 4.13, errors from write-back come
> with a promise that they
> .I may
> be reported by subsequent
> .BR write (2)
> requests, and
> .I will
> be reported by a subsequent
> .BR fsync (2)
> (whether or not they were also reported by
> .BR write (2)).
> ------------
> ??
> 
> Those changes are included in the following.
> 
> Thanks,
> NeilBrown
> 
> From: NeilBrown <neilb@suse.com>
> Date: Thu, 14 Sep 2017 09:44:43 +1000
> Subject: [PATCH] write.2, fsync.2, close.2: update description of error codes
> 
> Since 4.13, errors from writeback are more reliably reported
> to all file descriptors that might be relevant.
> 
> Add notes to this effect, and also add detail about ENOSPC and EDQUOT
> which can be delayed in a similar manner to EIO - for NFS in particular.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>

Thanks! I've applied, and added Jeff's Reviewed-by.

Cheers,

Michael


> ---
>   man2/close.2 |  9 +++++++++
>   man2/fsync.2 | 18 +++++++++++++++++-
>   man2/write.2 | 28 +++++++++++++++++++++++++---
>   3 files changed, 51 insertions(+), 4 deletions(-)
> 
> diff --git a/man2/close.2 b/man2/close.2
> index 55d89ed3dbc7..136bd0be3f67 100644
> --- a/man2/close.2
> +++ b/man2/close.2
> @@ -82,6 +82,15 @@ call was interrupted by a signal; see
>   .TP
>   .B EIO
>   An I/O error occurred.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +On NFS, these errors are not normally reported against the first write
> +which exceeds the available storage space, but instead against a
> +subsequent
> +.BR write (2),
> +.BR fsync (2),
> +or
> +.BR close (2).
>   .PP
>   See NOTES for a discussion of why
>   .BR close ()
> diff --git a/man2/fsync.2 b/man2/fsync.2
> index eed3c460bea9..c7878bf3496f 100644
> --- a/man2/fsync.2
> +++ b/man2/fsync.2
> @@ -121,7 +121,15 @@ is set appropriately.
>   is not a valid open file descriptor.
>   .TP
>   .B EIO
> -An error occurred during synchronization.
> +An error occurred during synchronization.  This error may relate
> +to data written to some other file descriptor on the same file.
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
> +Since Linux 4.13, errors from write-back will be reported to
> +all file descriptors that might have written the data which triggered
> +the error.  Some filesystems (e.g. NFS) keep close track of which data
> +came through which file descriptor, and give more precise reporting.
> +Other filesystems (e.g. most local filesystems) will report errors to
> +all file descriptors that were open on the file when the error was recorded.
>   .TP
>   .B ENOSPC
>   Disk space was exhausted while synchronizing.
> @@ -130,6 +138,14 @@ Disk space was exhausted while synchronizing.
>   .I fd
>   is bound to a special file (e.g., a pipe, FIFO, or socket)
>   which does not support synchronization.
> +.TP
> +.BR ENOSPC ", " EDQUOT
> +.I fd
> +is bound to a file on NFS or another filesystem which does not allocate
> +space at the time of a
> +.BR write (2)
> +system call, and some previous write failed due to insufficient
> +storage space.
>   .SH CONFORMING TO
>   POSIX.1-2001, POSIX.1-2008, 4.3BSD.
>   .SH AVAILABILITY
> diff --git a/man2/write.2 b/man2/write.2
> index 061aa70cf590..b1cc3a2cfb17 100644
> --- a/man2/write.2
> +++ b/man2/write.2
> @@ -47,7 +47,7 @@ write \- write to a file descriptor
>   .BR write ()
>   writes up to
>   .I count
> -bytes from the buffer pointed
> +bytes from the buffer starting at
>   .I buf
>   to the file referred to by the file descriptor
>   .IR fd .
> @@ -181,6 +181,22 @@ or the file offset is not suitably aligned.
>   .TP
>   .B EIO
>   A low-level I/O error occurred while modifying the inode.
> +This error may relate to the write-back of data written by an
> +earlier
> +.BR write (2),
> +which may have been issued to a different file descriptor on
> +the same file.  Since Linux 4.13, errors from write-back come
> +with a promise that they
> +.I may
> +be reported by subsequent
> +.BR write (2)
> +requests, and
> +.I will
> +be reported by a subsequent
> +.BR fsync (2)
> +(whether or not they were also reported by
> +.BR write (2)).
> +.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
>   .TP
>   .B ENOSPC
>   The device containing the file referred to by
> @@ -222,8 +238,14 @@ unsigned and signed integer data types specified by POSIX.1.
>   A successful return from
>   .BR write ()
>   does not make any guarantee that data has been committed to disk.
> -In fact, on some buggy implementations, it does not even guarantee
> -that space has successfully been reserved for the data.
> +On some filesystems, including NFS, it does not even guarantee
> +that space has successfully been reserved for the data.  In this case,
> +some errors might be delayed to a future
> +.BR write (2)
> +or to
> +.BR fsync (2)
> +or even
> +.BR close (2).
>   The only way to be sure is to call
>   .BR fsync (2)
>   after you are done writing all your data.
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-09-28 16:19 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-20 19:42 [PATCH] nfs: track writeback errors with errseq_t Jeff Layton
2017-08-25 17:59 ` Jeff Layton
2017-08-27 23:24   ` NeilBrown
2017-08-28 11:47     ` Jeff Layton
2017-08-29  1:23       ` NeilBrown
2017-08-29 10:54         ` Jeff Layton
2017-09-07  3:37           ` NeilBrown
2017-09-07 11:35             ` Jeff Layton
2017-09-07 14:54               ` Trond Myklebust
2017-09-07 14:54                 ` Trond Myklebust
2017-09-11  3:24                 ` NeilBrown
2017-09-11 10:46                   ` Jeff Layton
2017-09-11 21:52                     ` NeilBrown
2017-09-12 15:20                       ` Jeff Layton
2017-09-12 21:47                         ` NeilBrown
2017-09-13 12:23                           ` Jeff Layton
2017-09-13 23:50                             ` [RFC PATCH manpages] write.2, fsync.2, close.2: update description of error codes NeilBrown
2017-09-13 23:50                               ` NeilBrown
     [not found]                               ` <87ingm9n04.fsf-wvvUuzkyo1HefUI2i7LXDhCRmIWqnp/j@public.gmane.org>
2017-09-14  7:59                                 ` walter harms
     [not found]                                   ` <59BA36C5.9000506-fPG8STNUNVg@public.gmane.org>
2017-09-14 22:36                                     ` NeilBrown
2017-09-14 10:48                               ` Jeff Layton
2017-09-14 10:48                                 ` Jeff Layton
2017-09-15  7:50                                 ` Michael Kerrisk (man-pages)
2017-09-15  8:25                                   ` NeilBrown
2017-09-15  8:25                                     ` NeilBrown
2017-09-28  3:01                                 ` NeilBrown
2017-09-28  3:01                                   ` NeilBrown
2017-09-28 12:20                                   ` Jeff Layton
2017-09-28 12:20                                     ` Jeff Layton
2017-09-28 16:19                                   ` Michael Kerrisk (man-opages)
2017-09-12  2:24                   ` [PATCH] nfs: track writeback errors with errseq_t Trond Myklebust
2017-09-12  2:24                     ` Trond Myklebust
2017-09-12  5:29                     ` NeilBrown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.