All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter
@ 2021-08-11 11:23 Jeff Layton
  2021-08-11 14:08 ` Luis Henriques
  2021-08-20  5:16 ` Yan, Zheng
  0 siblings, 2 replies; 5+ messages in thread
From: Jeff Layton @ 2021-08-11 11:23 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, lhenriques, xiubli, Jozef Kováč

The current code will update the mtime and then try to get caps to
handle the write. If we end up having to request caps from the MDS, then
the mtime in the cap grant will clobber the updated mtime and it'll be
lost.

This is most noticable when two clients are alternately writing to the
same file. Fw caps are continually being granted and revoked, and the
mtime ends up stuck because the updated mtimes are always being
overwritten with the old one.

Fix this by changing the order of operations in ceph_write_iter. Get the
caps much earlier, and only update the times afterward. Also, make sure
we check the NEARFULL conditions before making any changes to the inode.

URL: https://tracker.ceph.com/issues/46574
Reported-by: Jozef Kováč <kovac@firma.zoznam.sk>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/file.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f55ca2c4c7de..5867acfc6a51 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-	err = file_remove_privs(file);
-	if (err)
-		goto out;
-
-	err = file_update_time(file);
-	if (err)
-		goto out;
-
-	inode_inc_iversion_raw(inode);
-
-	if (ci->i_inline_version != CEPH_INLINE_NONE) {
-		err = ceph_uninline_data(file, NULL);
-		if (err < 0)
-			goto out;
-	}
-
 	down_read(&osdc->lock);
 	map_flags = osdc->osdmap->flags;
 	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
@@ -1748,6 +1732,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
+	if (ci->i_inline_version != CEPH_INLINE_NONE) {
+		err = ceph_uninline_data(file, NULL);
+		if (err < 0)
+			goto out;
+	}
+
 	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
 	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -1759,6 +1749,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err < 0)
 		goto out;
 
+	err = file_remove_privs(file);
+	if (err)
+		goto out_caps;
+
+	err = file_update_time(file);
+	if (err)
+		goto out_caps;
+
+	inode_inc_iversion_raw(inode);
+
 	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
 
@@ -1822,7 +1822,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
 			ceph_check_caps(ci, 0, NULL);
 	}
-
+out_caps:
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)count,
 	     ceph_cap_string(got));
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter
  2021-08-11 11:23 [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter Jeff Layton
@ 2021-08-11 14:08 ` Luis Henriques
  2021-08-11 14:15   ` Jeff Layton
  2021-08-20  5:16 ` Yan, Zheng
  1 sibling, 1 reply; 5+ messages in thread
From: Luis Henriques @ 2021-08-11 14:08 UTC (permalink / raw)
  To: Jeff Layton; +Cc: ceph-devel, idryomov, xiubli, Jozef Kováč

Jeff Layton <jlayton@kernel.org> writes:

> The current code will update the mtime and then try to get caps to
> handle the write. If we end up having to request caps from the MDS, then
> the mtime in the cap grant will clobber the updated mtime and it'll be
> lost.
>
> This is most noticable when two clients are alternately writing to the
> same file. Fw caps are continually being granted and revoked, and the
> mtime ends up stuck because the updated mtimes are always being
> overwritten with the old one.
>
> Fix this by changing the order of operations in ceph_write_iter. Get the
> caps much earlier, and only update the times afterward. Also, make sure
> we check the NEARFULL conditions before making any changes to the inode.
>
> URL: https://tracker.ceph.com/issues/46574
> Reported-by: Jozef Kováč <kovac@firma.zoznam.sk>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/file.c | 34 +++++++++++++++++-----------------
>  1 file changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f55ca2c4c7de..5867acfc6a51 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		goto out;
>  	}
>  
> -	err = file_remove_privs(file);
> -	if (err)
> -		goto out;
> -
> -	err = file_update_time(file);
> -	if (err)
> -		goto out;
> -
> -	inode_inc_iversion_raw(inode);
> -
> -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -		err = ceph_uninline_data(file, NULL);
> -		if (err < 0)
> -			goto out;
> -	}
> -
>  	down_read(&osdc->lock);
>  	map_flags = osdc->osdmap->flags;
>  	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
> @@ -1748,6 +1732,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		goto out;
>  	}
>  
> +	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> +		err = ceph_uninline_data(file, NULL);
> +		if (err < 0)
> +			goto out;
> +	}
> +
>  	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
>  	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
>  	if (fi->fmode & CEPH_FILE_MODE_LAZY)
> @@ -1759,6 +1749,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (err < 0)
>  		goto out;
>  
> +	err = file_remove_privs(file);
> +	if (err)
> +		goto out_caps;
> +
> +	err = file_update_time(file);
> +	if (err)
> +		goto out_caps;

Unless I'm missing something (which happens quite frequently!) i_rwsem
still needs to be released through either ceph_end_io_write() or
ceph_end_io_direct().  And this isn't being done if we jump to out_caps
(yeah, goto's spaghetti fun).

Also, this patch is probably worth adding to stable@ too, although I
haven't checked how easy is it to cherry-pick to older kernel versions.

Cheers,
-- 
Luis

> +
> +	inode_inc_iversion_raw(inode);
> +
>  	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
>  	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
>  
> @@ -1822,7 +1822,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
>  			ceph_check_caps(ci, 0, NULL);
>  	}
> -
> +out_caps:
>  	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
>  	     inode, ceph_vinop(inode), pos, (unsigned)count,
>  	     ceph_cap_string(got));
> -- 
>
> 2.31.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter
  2021-08-11 14:08 ` Luis Henriques
@ 2021-08-11 14:15   ` Jeff Layton
  0 siblings, 0 replies; 5+ messages in thread
From: Jeff Layton @ 2021-08-11 14:15 UTC (permalink / raw)
  To: Luis Henriques; +Cc: ceph-devel, idryomov, xiubli, Jozef Kováč

On Wed, 2021-08-11 at 15:08 +0100, Luis Henriques wrote:
> Jeff Layton <jlayton@kernel.org> writes:
> 
> > The current code will update the mtime and then try to get caps to
> > handle the write. If we end up having to request caps from the MDS, then
> > the mtime in the cap grant will clobber the updated mtime and it'll be
> > lost.
> > 
> > This is most noticable when two clients are alternately writing to the
> > same file. Fw caps are continually being granted and revoked, and the
> > mtime ends up stuck because the updated mtimes are always being
> > overwritten with the old one.
> > 
> > Fix this by changing the order of operations in ceph_write_iter. Get the
> > caps much earlier, and only update the times afterward. Also, make sure
> > we check the NEARFULL conditions before making any changes to the inode.
> > 
> > URL: https://tracker.ceph.com/issues/46574
> > Reported-by: Jozef Kováč <kovac@firma.zoznam.sk>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/file.c | 34 +++++++++++++++++-----------------
> >  1 file changed, 17 insertions(+), 17 deletions(-)
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index f55ca2c4c7de..5867acfc6a51 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >  		goto out;
> >  	}
> >  
> > -	err = file_remove_privs(file);
> > -	if (err)
> > -		goto out;
> > -
> > -	err = file_update_time(file);
> > -	if (err)
> > -		goto out;
> > -
> > -	inode_inc_iversion_raw(inode);
> > -
> > -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > -		err = ceph_uninline_data(file, NULL);
> > -		if (err < 0)
> > -			goto out;
> > -	}
> > -
> >  	down_read(&osdc->lock);
> >  	map_flags = osdc->osdmap->flags;
> >  	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
> > @@ -1748,6 +1732,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >  		goto out;
> >  	}
> >  
> > +	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > +		err = ceph_uninline_data(file, NULL);
> > +		if (err < 0)
> > +			goto out;
> > +	}
> > +
> >  	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
> >  	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> >  	if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > @@ -1759,6 +1749,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >  	if (err < 0)
> >  		goto out;
> >  
> > +	err = file_remove_privs(file);
> > +	if (err)
> > +		goto out_caps;
> > +
> > +	err = file_update_time(file);
> > +	if (err)
> > +		goto out_caps;
> 
> Unless I'm missing something (which happens quite frequently!) i_rwsem
> still needs to be released through either ceph_end_io_write() or
> ceph_end_io_direct().  And this isn't being done if we jump to out_caps
> (yeah, goto's spaghetti fun).
> 

Good catch! I'll send a v2 in a bit after I test it.

> Also, this patch is probably worth adding to stable@ too, although I
> haven't checked how easy is it to cherry-pick to older kernel versions.
> 

I'm not sure it qualifies for stable. We do have an open tracker bug for
it, but the only real problem is that the mtime/change_attr stall out
while there is competing I/O. Definitely broken, but I'm not sure it's
really affecting that many people.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter
  2021-08-11 11:23 [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter Jeff Layton
  2021-08-11 14:08 ` Luis Henriques
@ 2021-08-20  5:16 ` Yan, Zheng
  2021-08-20 10:40   ` Jeff Layton
  1 sibling, 1 reply; 5+ messages in thread
From: Yan, Zheng @ 2021-08-20  5:16 UTC (permalink / raw)
  To: Jeff Layton
  Cc: ceph-devel, Ilya Dryomov, Luis Henriques, Xiubo Li,
	Jozef Kováč

On Wed, Aug 11, 2021 at 7:24 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> The current code will update the mtime and then try to get caps to
> handle the write. If we end up having to request caps from the MDS, then
> the mtime in the cap grant will clobber the updated mtime and it'll be
> lost.
>
> This is most noticable when two clients are alternately writing to the
> same file. Fw caps are continually being granted and revoked, and the
> mtime ends up stuck because the updated mtimes are always being
> overwritten with the old one.
>
> Fix this by changing the order of operations in ceph_write_iter. Get the
> caps much earlier, and only update the times afterward. Also, make sure
> we check the NEARFULL conditions before making any changes to the inode.
>
> URL: https://tracker.ceph.com/issues/46574
> Reported-by: Jozef Kováč <kovac@firma.zoznam.sk>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/file.c | 34 +++++++++++++++++-----------------
>  1 file changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f55ca2c4c7de..5867acfc6a51 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 goto out;
>         }
>
> -       err = file_remove_privs(file);
> -       if (err)
> -               goto out;
> -
> -       err = file_update_time(file);
> -       if (err)
> -               goto out;
> -
> -       inode_inc_iversion_raw(inode);
> -
> -       if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -               err = ceph_uninline_data(file, NULL);
> -               if (err < 0)
> -                       goto out;
> -       }
> -
>         down_read(&osdc->lock);
>         map_flags = osdc->osdmap->flags;
>         pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
> @@ -1748,6 +1732,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 goto out;
>         }
>
> +       if (ci->i_inline_version != CEPH_INLINE_NONE) {
> +               err = ceph_uninline_data(file, NULL);
> +               if (err < 0)
> +                       goto out;
> +       }
> +
>         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
>              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
>         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> @@ -1759,6 +1749,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>         if (err < 0)
>                 goto out;
>
> +       err = file_remove_privs(file);
> +       if (err)
> +               goto out_caps;

this may send setattr request to mds. holding cap here may cause deadlock.

> +
> +       err = file_update_time(file);
> +       if (err)
> +               goto out_caps;
> +
> +       inode_inc_iversion_raw(inode);
> +
>         dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
>              inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
>
> @@ -1822,7 +1822,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
>                 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
>                         ceph_check_caps(ci, 0, NULL);
>         }
> -
> +out_caps:
>         dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
>              inode, ceph_vinop(inode), pos, (unsigned)count,
>              ceph_cap_string(got));
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter
  2021-08-20  5:16 ` Yan, Zheng
@ 2021-08-20 10:40   ` Jeff Layton
  0 siblings, 0 replies; 5+ messages in thread
From: Jeff Layton @ 2021-08-20 10:40 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: ceph-devel, Ilya Dryomov, Luis Henriques, Xiubo Li,
	Jozef Kováč

On Fri, 2021-08-20 at 13:16 +0800, Yan, Zheng wrote:
> On Wed, Aug 11, 2021 at 7:24 PM Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > The current code will update the mtime and then try to get caps to
> > handle the write. If we end up having to request caps from the MDS, then
> > the mtime in the cap grant will clobber the updated mtime and it'll be
> > lost.
> > 
> > This is most noticable when two clients are alternately writing to the
> > same file. Fw caps are continually being granted and revoked, and the
> > mtime ends up stuck because the updated mtimes are always being
> > overwritten with the old one.
> > 
> > Fix this by changing the order of operations in ceph_write_iter. Get the
> > caps much earlier, and only update the times afterward. Also, make sure
> > we check the NEARFULL conditions before making any changes to the inode.
> > 
> > URL: https://tracker.ceph.com/issues/46574
> > Reported-by: Jozef Kováč <kovac@firma.zoznam.sk>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/file.c | 34 +++++++++++++++++-----------------
> >  1 file changed, 17 insertions(+), 17 deletions(-)
> > 
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index f55ca2c4c7de..5867acfc6a51 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -1722,22 +1722,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 goto out;
> >         }
> > 
> > -       err = file_remove_privs(file);
> > -       if (err)
> > -               goto out;
> > -
> > -       err = file_update_time(file);
> > -       if (err)
> > -               goto out;
> > -
> > -       inode_inc_iversion_raw(inode);
> > -
> > -       if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > -               err = ceph_uninline_data(file, NULL);
> > -               if (err < 0)
> > -                       goto out;
> > -       }
> > -
> >         down_read(&osdc->lock);
> >         map_flags = osdc->osdmap->flags;
> >         pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
> > @@ -1748,6 +1732,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 goto out;
> >         }
> > 
> > +       if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > +               err = ceph_uninline_data(file, NULL);
> > +               if (err < 0)
> > +                       goto out;
> > +       }
> > +
> >         dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
> >              inode, ceph_vinop(inode), pos, count, i_size_read(inode));
> >         if (fi->fmode & CEPH_FILE_MODE_LAZY)
> > @@ -1759,6 +1749,16 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >         if (err < 0)
> >                 goto out;
> > 
> > +       err = file_remove_privs(file);
> > +       if (err)
> > +               goto out_caps;
> 
> this may send setattr request to mds. holding cap here may cause deadlock.
> 

Thanks, Zheng -- good point. I guess we can move this call to before the
cap acquisition. I'll test that out and send a v3.

> > +
> > +       err = file_update_time(file);
> > +       if (err)
> > +               goto out_caps;
> > +
> > +       inode_inc_iversion_raw(inode);
> > +
> >         dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
> >              inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
> > 
> > @@ -1822,7 +1822,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
> >                 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
> >                         ceph_check_caps(ci, 0, NULL);
> >         }
> > -
> > +out_caps:
> >         dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
> >              inode, ceph_vinop(inode), pos, (unsigned)count,
> >              ceph_cap_string(got));
> > --
> > 2.31.1
> > 

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-08-20 10:40 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-11 11:23 [PATCH] ceph: request Fw caps before updating the mtime in ceph_write_iter Jeff Layton
2021-08-11 14:08 ` Luis Henriques
2021-08-11 14:15   ` Jeff Layton
2021-08-20  5:16 ` Yan, Zheng
2021-08-20 10:40   ` Jeff Layton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.