All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t
@ 2022-06-07 20:10 Leah Rumancik
  2022-06-13  7:58 ` Greg KH
  0 siblings, 1 reply; 4+ messages in thread
From: Leah Rumancik @ 2022-06-07 20:10 UTC (permalink / raw)
  To: stable; +Cc: Trond Myklebust, Chuck Lever, Leah Rumancik

From: Trond Myklebust <trond.myklebust@hammerspace.com>

[ Upstream commit 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb ]

The nfsd_file nf_rwsem is currently being used to separate file write
and commit instances to ensure that we catch errors and apply them to
the correct write/commit.
We can improve scalability at the expense of a little accuracy (some
extra false positives) by replacing the nf_rwsem with more careful
use of the errseq_t mechanism to track errors across the different
operations.

[Leah: This patch is for 5.10. 5011af4c698a ("nfsd: Fix stable writes")
introduced a 75% performance regression on parallel random write
workloads. With this commit, the performance is restored to 90% of what
it was prior to 5011af4c698a. The changes to the fsync for asynchronous
copies were not included in this backport version as the fsync was not
added until 5.14 (eac0b17a77fb).]

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
[ cel: rebased on zero-verifier fix ]
---
 fs/nfsd/filecache.c |  1 -
 fs/nfsd/filecache.h |  1 -
 fs/nfsd/nfs4proc.c  |  7 ++++---
 fs/nfsd/vfs.c       | 40 +++++++++++++++-------------------------
 4 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index acd0898e3866..e30e1ddc1ace 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -194,7 +194,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
 				__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
 		}
 		nf->nf_mark = NULL;
-		init_rwsem(&nf->nf_rwsem);
 		trace_nfsd_file_alloc(nf);
 	}
 	return nf;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 7872df5a0fe3..435ceab27897 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -46,7 +46,6 @@ struct nfsd_file {
 	refcount_t		nf_ref;
 	unsigned char		nf_may;
 	struct nfsd_file_mark	*nf_mark;
-	struct rw_semaphore	nf_rwsem;
 };
 
 int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7850d141c762..735ee8a79870 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1380,6 +1380,8 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
 
 static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
 {
+	struct file *dst = copy->nf_dst->nf_file;
+	struct file *src = copy->nf_src->nf_file;
 	ssize_t bytes_copied = 0;
 	size_t bytes_total = copy->cp_count;
 	u64 src_pos = copy->cp_src_pos;
@@ -1388,9 +1390,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
 	do {
 		if (kthread_should_stop())
 			break;
-		bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
-				src_pos, copy->nf_dst->nf_file, dst_pos,
-				bytes_total);
+		bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
+						    bytes_total);
 		if (bytes_copied <= 0)
 			break;
 		bytes_total -= bytes_copied;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 011cd570b50d..548ebc913f92 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -535,10 +535,11 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 {
 	struct file *src = nf_src->nf_file;
 	struct file *dst = nf_dst->nf_file;
+	errseq_t since;
 	loff_t cloned;
 	__be32 ret = 0;
 
-	down_write(&nf_dst->nf_rwsem);
+	since = READ_ONCE(dst->f_wb_err);
 	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
 	if (cloned < 0) {
 		ret = nfserrno(cloned);
@@ -552,6 +553,8 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 		loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
 		int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
 
+		if (!status)
+			status = filemap_check_wb_err(dst->f_mapping, since);
 		if (!status)
 			status = commit_inode_metadata(file_inode(src));
 		if (status < 0) {
@@ -561,7 +564,6 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
 		}
 	}
 out_err:
-	up_write(&nf_dst->nf_rwsem);
 	return ret;
 }
 
@@ -980,6 +982,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 	struct file		*file = nf->nf_file;
 	struct svc_export	*exp;
 	struct iov_iter		iter;
+	errseq_t		since;
 	__be32			nfserr;
 	int			host_err;
 	int			use_wgather;
@@ -1009,21 +1012,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 		flags |= RWF_SYNC;
 
 	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+	since = READ_ONCE(file->f_wb_err);
 	if (flags & RWF_SYNC) {
-		down_write(&nf->nf_rwsem);
 		host_err = vfs_iter_write(file, &iter, &pos, flags);
 		if (host_err < 0)
 			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
 						 nfsd_net_id));
-		up_write(&nf->nf_rwsem);
 	} else {
-		down_read(&nf->nf_rwsem);
 		if (verf)
 			nfsd_copy_boot_verifier(verf,
 					net_generic(SVC_NET(rqstp),
 					nfsd_net_id));
 		host_err = vfs_iter_write(file, &iter, &pos, flags);
-		up_read(&nf->nf_rwsem);
 	}
 	if (host_err < 0) {
 		nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
@@ -1033,6 +1033,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 	*cnt = host_err;
 	nfsdstats.io_write += *cnt;
 	fsnotify_modify(file);
+	host_err = filemap_check_wb_err(file->f_mapping, since);
+	if (host_err < 0)
+		goto out_nfserr;
 
 	if (stable && use_wgather) {
 		host_err = wait_for_concurrent_writes(file);
@@ -1113,19 +1116,6 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 }
 
 #ifdef CONFIG_NFSD_V3
-static int
-nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset,
-				  loff_t end)
-{
-	struct address_space *mapping = nf->nf_file->f_mapping;
-	int ret = filemap_fdatawrite_range(mapping, offset, end);
-
-	if (ret)
-		return ret;
-	filemap_fdatawait_range_keep_errors(mapping, offset, end);
-	return 0;
-}
-
 /*
  * Commit all pending writes to stable storage.
  *
@@ -1156,25 +1146,25 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end);
+		errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
+		int err2;
 
-		down_write(&nf->nf_rwsem);
-		if (!err2)
-			err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+		err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
 		switch (err2) {
 		case 0:
 			nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
 						nfsd_net_id));
+			err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
+						    since);
 			break;
 		case -EINVAL:
 			err = nfserr_notsupp;
 			break;
 		default:
-			err = nfserrno(err2);
 			nfsd_reset_boot_verifier(net_generic(nf->nf_net,
 						 nfsd_net_id));
 		}
-		up_write(&nf->nf_rwsem);
+		err = nfserrno(err2);
 	} else
 		nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
 					nfsd_net_id));
-- 
2.36.1.255.ge46751e96f-goog


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t
  2022-06-07 20:10 [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t Leah Rumancik
@ 2022-06-13  7:58 ` Greg KH
  2022-06-13 23:46   ` Leah Rumancik
  0 siblings, 1 reply; 4+ messages in thread
From: Greg KH @ 2022-06-13  7:58 UTC (permalink / raw)
  To: Leah Rumancik; +Cc: stable, Trond Myklebust, Chuck Lever

On Tue, Jun 07, 2022 at 01:10:36PM -0700, Leah Rumancik wrote:
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> [ Upstream commit 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb ]
> 
> The nfsd_file nf_rwsem is currently being used to separate file write
> and commit instances to ensure that we catch errors and apply them to
> the correct write/commit.
> We can improve scalability at the expense of a little accuracy (some
> extra false positives) by replacing the nf_rwsem with more careful
> use of the errseq_t mechanism to track errors across the different
> operations.
> 
> [Leah: This patch is for 5.10. 5011af4c698a ("nfsd: Fix stable writes")
> introduced a 75% performance regression on parallel random write
> workloads. With this commit, the performance is restored to 90% of what
> it was prior to 5011af4c698a. The changes to the fsync for asynchronous
> copies were not included in this backport version as the fsync was not
> added until 5.14 (eac0b17a77fb).]
> 
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> [ cel: rebased on zero-verifier fix ]
> ---
>  fs/nfsd/filecache.c |  1 -
>  fs/nfsd/filecache.h |  1 -
>  fs/nfsd/nfs4proc.c  |  7 ++++---
>  fs/nfsd/vfs.c       | 40 +++++++++++++++-------------------------
>  4 files changed, 19 insertions(+), 30 deletions(-)

What about 5.15?  We can't take this patch for 5.10 only as if you
upgrade to 5.15 you would have a regression.  Can you provide a version
for that tree so that I can then apply this one too?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t
  2022-06-13  7:58 ` Greg KH
@ 2022-06-13 23:46   ` Leah Rumancik
  2022-06-16 13:08     ` Greg KH
  0 siblings, 1 reply; 4+ messages in thread
From: Leah Rumancik @ 2022-06-13 23:46 UTC (permalink / raw)
  To: Greg KH; +Cc: stable, Trond Myklebust, Chuck Lever

On Mon, Jun 13, 2022 at 09:58:21AM +0200, Greg KH wrote:
> On Tue, Jun 07, 2022 at 01:10:36PM -0700, Leah Rumancik wrote:
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > [ Upstream commit 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb ]
> > 
> > The nfsd_file nf_rwsem is currently being used to separate file write
> > and commit instances to ensure that we catch errors and apply them to
> > the correct write/commit.
> > We can improve scalability at the expense of a little accuracy (some
> > extra false positives) by replacing the nf_rwsem with more careful
> > use of the errseq_t mechanism to track errors across the different
> > operations.
> > 
> > [Leah: This patch is for 5.10. 5011af4c698a ("nfsd: Fix stable writes")
> > introduced a 75% performance regression on parallel random write
> > workloads. With this commit, the performance is restored to 90% of what
> > it was prior to 5011af4c698a. The changes to the fsync for asynchronous
> > copies were not included in this backport version as the fsync was not
> > added until 5.14 (eac0b17a77fb).]
> > 
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> > [ cel: rebased on zero-verifier fix ]
> > ---
> >  fs/nfsd/filecache.c |  1 -
> >  fs/nfsd/filecache.h |  1 -
> >  fs/nfsd/nfs4proc.c  |  7 ++++---
> >  fs/nfsd/vfs.c       | 40 +++++++++++++++-------------------------
> >  4 files changed, 19 insertions(+), 30 deletions(-)
> 
> What about 5.15?  We can't take this patch for 5.10 only as if you
> upgrade to 5.15 you would have a regression.  Can you provide a version
> for that tree so that I can then apply this one too?
> 
> thanks,
> 
> greg k-h

Just sent the 5.15 version. The upstream commit
(555dbf1a9aac6d3150c8b52fa35f768a692f4eeb) actually applies cleanly on
5.15 so you can pull that or the version I just sent with the
justification for backporting. After applying this commit to 5.15, I
confirmed there was no performance regression.

Best,
Leah

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t
  2022-06-13 23:46   ` Leah Rumancik
@ 2022-06-16 13:08     ` Greg KH
  0 siblings, 0 replies; 4+ messages in thread
From: Greg KH @ 2022-06-16 13:08 UTC (permalink / raw)
  To: Leah Rumancik; +Cc: stable, Trond Myklebust, Chuck Lever

On Mon, Jun 13, 2022 at 04:46:47PM -0700, Leah Rumancik wrote:
> On Mon, Jun 13, 2022 at 09:58:21AM +0200, Greg KH wrote:
> > On Tue, Jun 07, 2022 at 01:10:36PM -0700, Leah Rumancik wrote:
> > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > 
> > > [ Upstream commit 555dbf1a9aac6d3150c8b52fa35f768a692f4eeb ]
> > > 
> > > The nfsd_file nf_rwsem is currently being used to separate file write
> > > and commit instances to ensure that we catch errors and apply them to
> > > the correct write/commit.
> > > We can improve scalability at the expense of a little accuracy (some
> > > extra false positives) by replacing the nf_rwsem with more careful
> > > use of the errseq_t mechanism to track errors across the different
> > > operations.
> > > 
> > > [Leah: This patch is for 5.10. 5011af4c698a ("nfsd: Fix stable writes")
> > > introduced a 75% performance regression on parallel random write
> > > workloads. With this commit, the performance is restored to 90% of what
> > > it was prior to 5011af4c698a. The changes to the fsync for asynchronous
> > > copies were not included in this backport version as the fsync was not
> > > added until 5.14 (eac0b17a77fb).]
> > > 
> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> > > Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
> > > [ cel: rebased on zero-verifier fix ]
> > > ---
> > >  fs/nfsd/filecache.c |  1 -
> > >  fs/nfsd/filecache.h |  1 -
> > >  fs/nfsd/nfs4proc.c  |  7 ++++---
> > >  fs/nfsd/vfs.c       | 40 +++++++++++++++-------------------------
> > >  4 files changed, 19 insertions(+), 30 deletions(-)
> > 
> > What about 5.15?  We can't take this patch for 5.10 only as if you
> > upgrade to 5.15 you would have a regression.  Can you provide a version
> > for that tree so that I can then apply this one too?
> > 
> > thanks,
> > 
> > greg k-h
> 
> Just sent the 5.15 version. The upstream commit
> (555dbf1a9aac6d3150c8b52fa35f768a692f4eeb) actually applies cleanly on
> 5.15 so you can pull that or the version I just sent with the
> justification for backporting. After applying this commit to 5.15, I
> confirmed there was no performance regression.

Now all queued up, thanks.

greg k-h

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-06-16 13:08 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-07 20:10 [PATCH 5.10] nfsd: Replace use of rwsem with errseq_t Leah Rumancik
2022-06-13  7:58 ` Greg KH
2022-06-13 23:46   ` Leah Rumancik
2022-06-16 13:08     ` Greg KH

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.