* [PATCH] nfsd: drop st_mutex before calling move_to_close_lru()
@ 2023-12-22  1:41 NeilBrown
  2023-12-22  2:12 ` [PATCH v2] nfsd: drop st_mutex and rp_mutex " NeilBrown
  0 siblings, 1 reply; 11+ messages in thread
From: NeilBrown @ 2023-12-22  1:41 UTC (permalink / raw)
  To: Chuck Lever, Jeff Layton, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs


move_to_close_lru() is currently called with ->st_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for the
mutex.  These references will never be dropped so sc_count will never
reach 2.

There can be no harm in dropping ->st_mutex before calling
move_to_close_lru() because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.

See also
 https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs4state.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 40415929e2ae..1fd0c8a59cc4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n", 
 			cstate->current_fh.fh_dentry);
@@ -7114,8 +7115,10 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	if (need_move_to_close_list)
+		move_to_close_lru(stp, net);
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2023-12-22  1:41 [PATCH] nfsd: drop st_mutex before calling move_to_close_lru() NeilBrown
@ 2023-12-22  2:12 ` NeilBrown
  2023-12-22 14:39   ` Chuck Lever
  2023-12-22 20:15   ` dai.ngo
  0 siblings, 2 replies; 11+ messages in thread
From: NeilBrown @ 2023-12-22  2:12 UTC (permalink / raw)
  To: Chuck Lever, Jeff Layton, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs


move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for either
mutex.  These references will never be dropped so sc_count will never
reach 2.

There can be no harm in dropping ->st_mutex before calling
move_to_close_lru() because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.

Similarly dropping .rp_mutex is safe after the state is closed and so
no longer usable.  Another way to look at this is that nothing
significant happens between when nfsd4_close() now calls
nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
nfsd4_cstate_clear_replay() a little later.

See also
 https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.

Signed-off-by: NeilBrown <neilb@suse.de>
---

Sorry - I posted v1 a little hastily.  I need to drop rp_mutex as well
to avoid the deadlock.  This also is safe.

 fs/nfsd/nfs4state.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 40415929e2ae..453714fbcd66 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n", 
 			cstate->current_fh.fh_dentry);
@@ -7114,8 +7115,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	nfsd4_cstate_clear_replay(cstate);
+	if (need_move_to_close_list)
+		move_to_close_lru(stp, net);
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2023-12-22  2:12 ` [PATCH v2] nfsd: drop st_mutex and rp_mutex " NeilBrown
@ 2023-12-22 14:39   ` Chuck Lever
  2023-12-22 20:15   ` dai.ngo
  1 sibling, 0 replies; 11+ messages in thread
From: Chuck Lever @ 2023-12-22 14:39 UTC (permalink / raw)
  To: NeilBrown
  Cc: Jeff Layton, J. Bruce Fields, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, linux-nfs

On Fri, Dec 22, 2023 at 01:12:10PM +1100, NeilBrown wrote:
> 
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.
> 
> There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> 
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
> 
> See also
>  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>

Hi Neil-

I would like Jeff and Dai to have a look at this one too, since they
have both worked extensively in this area. But that means it will get
deferred to v6.9 (ie, after the holiday).

I've applied it to a private branch that I will pick up after the v6.8
merge window closes.


> ---
> 
> Sorry - I posted v1 a little hastily.  I need to drop rp_mutex as well
> to avoid the deadlock.  This also is safe.
> 
>  fs/nfsd/nfs4state.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..453714fbcd66 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		list_for_each_entry(stp, &reaplist, st_locks)
>  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>  		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>  	} else {
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>  	}
>  }
>  
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct nfs4_ol_stateid *stp;
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>  
>  	dprintk("NFSD: nfsd4_close on file %pd\n", 
>  			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>  	mutex_unlock(&stp->st_mutex);
> +	nfsd4_cstate_clear_replay(cstate);
> +	if (need_move_to_close_list)
> +		move_to_close_lru(stp, net);
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
>  	 * clients should just ignore this anyway. Since this is not useful
> -- 
> 2.43.0
> 

-- 
Chuck Lever

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2023-12-22  2:12 ` [PATCH v2] nfsd: drop st_mutex and rp_mutex " NeilBrown
  2023-12-22 14:39   ` Chuck Lever
@ 2023-12-22 20:15   ` dai.ngo
  2023-12-22 23:01     ` NeilBrown
  1 sibling, 1 reply; 11+ messages in thread
From: dai.ngo @ 2023-12-22 20:15 UTC (permalink / raw)
  To: NeilBrown, Chuck Lever, Jeff Layton, J. Bruce Fields
  Cc: Olga Kornievskaia, Tom Talpey, linux-nfs


On 12/21/23 6:12 PM, NeilBrown wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.

Yes, I think there is a potential deadlock here since both nfs4_preprocess_seqid_op
and nfsd4_find_and_lock_existing_open can increment the sc_count but then
have to wait for the st_mutex, which is being held by move_to_close_lru.
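
Spelling the interleaving out (illustrative only; the second operation could
be any seqid-morphing op that looks up the same stateid):

   nfsd thread A (v4.0 CLOSE)            nfsd thread B (another seqid op)
   --------------------------            --------------------------------
   mutex_lock(&stp->st_mutex)
   nfsd4_close_open_stateid(stp)
     -> move_to_close_lru(stp, net)      nfs4_preprocess_seqid_op()
        waits for sc_count == 2            takes a reference (sc_count++)
                                           mutex_lock(&stp->st_mutex) <- blocks

B only drops its reference after it gets st_mutex, which A holds while it
waits inside move_to_close_lru(), so sc_count never reaches 2 and neither
thread can make progress.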

>
> > There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
>
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
>
> See also
>   https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
>
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>
> Sorry - I posted v1 a little hastily.  I need to drop rp_mutex as well
> to avoid the deadlock.  This also is safe.
>
>   fs/nfsd/nfs4state.c | 12 ++++++++----
>   1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..453714fbcd66 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>   	return status;
>   }
>   
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>   {
>   	struct nfs4_client *clp = s->st_stid.sc_client;
>   	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>   		list_for_each_entry(stp, &reaplist, st_locks)
>   			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>   		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>   	} else {
>   		spin_unlock(&clp->cl_lock);
>   		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>   	}
>   }
>   
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>   	struct nfs4_ol_stateid *stp;
>   	struct net *net = SVC_NET(rqstp);
>   	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>   
>   	dprintk("NFSD: nfsd4_close on file %pd\n",
>   			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>   	 */
>   	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>   
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>   	mutex_unlock(&stp->st_mutex);
> +	nfsd4_cstate_clear_replay(cstate);

should nfsd4_cstate_clear_replay be called only if need_move_to_close_list
is true?

-Dai

> +	if (need_move_to_close_list)
> +		move_to_close_lru(stp, net);
>   
>   	/* v4.1+ suggests that we send a special stateid in here, since the
>   	 * clients should just ignore this anyway. Since this is not useful

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2023-12-22 20:15   ` dai.ngo
@ 2023-12-22 23:01     ` NeilBrown
  2023-12-23 18:07       ` dai.ngo
  0 siblings, 1 reply; 11+ messages in thread
From: NeilBrown @ 2023-12-22 23:01 UTC (permalink / raw)
  To: dai.ngo
  Cc: Chuck Lever, Jeff Layton, J. Bruce Fields, Olga Kornievskaia,
	Tom Talpey, linux-nfs

On Sat, 23 Dec 2023, dai.ngo@oracle.com wrote:
> On 12/21/23 6:12 PM, NeilBrown wrote:
> > move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> > This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> > drop to 2, and some threads holding a reference might be waiting for either
> > mutex.  These references will never be dropped so sc_count will never
> > reach 2.
> 
> Yes, I think there is a potential deadlock here since both nfs4_preprocess_seqid_op
> and nfsd4_find_and_lock_existing_open can increment the sc_count but then
> have to wait for the st_mutex, which is being held by move_to_close_lru.
> 
> >
> > There can be no harm in dropping ->st_mutex before calling
> > move_to_close_lru() because the only place that takes the mutex is
> > nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> > NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> >
> > Similarly dropping .rp_mutex is safe after the state is closed and so
> > no longer usable.  Another way to look at this is that nothing
> > significant happens between when nfsd4_close() now calls
> > nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> > nfsd4_cstate_clear_replay() a little later.
> >
> > See also
> >   https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> > where this problem was raised but not successfully resolved.
> >
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >
> > Sorry - I posted v1 a little hastily.  I need to drop rp_mutex as well
> > to avoid the deadlock.  This also is safe.
> >
> >   fs/nfsd/nfs4state.c | 12 ++++++++----
> >   1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 40415929e2ae..453714fbcd66 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >   	return status;
> >   }
> >   
> > -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> > +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >   {
> >   	struct nfs4_client *clp = s->st_stid.sc_client;
> >   	bool unhashed;
> > @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >   		list_for_each_entry(stp, &reaplist, st_locks)
> >   			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
> >   		free_ol_stateid_reaplist(&reaplist);
> > +		return false;
> >   	} else {
> >   		spin_unlock(&clp->cl_lock);
> >   		free_ol_stateid_reaplist(&reaplist);
> > -		if (unhashed)
> > -			move_to_close_lru(s, clp->net);
> > +		return unhashed;
> >   	}
> >   }
> >   
> > @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >   	struct nfs4_ol_stateid *stp;
> >   	struct net *net = SVC_NET(rqstp);
> >   	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool need_move_to_close_list;
> >   
> >   	dprintk("NFSD: nfsd4_close on file %pd\n",
> >   			cstate->current_fh.fh_dentry);
> > @@ -7114,8 +7115,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >   	 */
> >   	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
> >   
> > -	nfsd4_close_open_stateid(stp);
> > +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
> >   	mutex_unlock(&stp->st_mutex);
> > +	nfsd4_cstate_clear_replay(cstate);
> 
> should nfsd4_cstate_clear_replay be called only if need_move_to_close_list
> is true?

It certainly could be done like that.

   if (need_move_to_close_list) {
         nfsd4_cstate_clear_replay(cstate);
         move_to_close_lru(stp, net);
   }

It would make almost no behavioural difference as
need_move_to_close_list is never true for v4.1 and later and almost
always true for v4.0, and nfsd4_cstate_clear_replay() does nothing for
v4.1 and later.
The only time the behaviour would be interestingly different is when
nfsd4_close_open_stateid() found the state was already unhashed.  Then
need_move_to_close_list would be false, but nfsd4_cstate_clear_replay()
wouldn't be a no-op.
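
For reference, nfsd4_cstate_clear_replay() is roughly the following
(paraphrased; check fs/nfsd/nfs4state.c for the exact text):

   void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
   {
           struct nfs4_stateowner *so = cstate->replay_owner;

           if (so != NULL) {
                   cstate->replay_owner = NULL;
                   mutex_unlock(&so->so_replay.rp_mutex);
                   nfs4_put_stateowner(so);
           }
   }

cstate->replay_owner is only set by nfsd4_cstate_assign_replay(), and only
when there is no session, which is why this is always a no-op for v4.1+.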

I thought the code was a little simpler the way I wrote it.  We don't
need the need_move_to_close_list guard on nfsd4_cstate_clear_replay(),
so I left it unguarded.
But I'm happy to change it if you can give a good reason - or even if
you just think it is clearer the other way.

Thanks,
NeilBrown

> 
> -Dai
> 
> > +	if (need_move_to_close_list)
> > +		move_to_close_lru(stp, net);
> >   
> >   	/* v4.1+ suggests that we send a special stateid in here, since the
> >   	 * clients should just ignore this anyway. Since this is not useful
> 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2023-12-22 23:01     ` NeilBrown
@ 2023-12-23 18:07       ` dai.ngo
  0 siblings, 0 replies; 11+ messages in thread
From: dai.ngo @ 2023-12-23 18:07 UTC (permalink / raw)
  To: NeilBrown
  Cc: Chuck Lever, Jeff Layton, J. Bruce Fields, Olga Kornievskaia,
	Tom Talpey, linux-nfs


On 12/22/23 3:01 PM, NeilBrown wrote:
> On Sat, 23 Dec 2023, dai.ngo@oracle.com wrote:
>> On 12/21/23 6:12 PM, NeilBrown wrote:
>>> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
>>> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
>>> drop to 2, and some threads holding a reference might be waiting for either
>>> mutex.  These references will never be dropped so sc_count will never
>>> reach 2.
>> Yes, I think there is a potential deadlock here since both nfs4_preprocess_seqid_op
>> and nfsd4_find_and_lock_existing_open can increment the sc_count but then
>> have to wait for the st_mutex, which is being held by move_to_close_lru.
>>
>>> There can be no harm in dropping ->st_mutex before calling
>>> move_to_close_lru() because the only place that takes the mutex is
>>> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
>>> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
>>>
>>> Similarly dropping .rp_mutex is safe after the state is closed and so
>>> no longer usable.  Another way to look at this is that nothing
>>> significant happens between when nfsd4_close() now calls
>>> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
>>> nfsd4_cstate_clear_replay() a little later.
>>>
>>> See also
>>>    https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
>>> where this problem was raised but not successfully resolved.
>>>
>>> Signed-off-by: NeilBrown <neilb@suse.de>
>>> ---
>>>
>>> Sorry - I posted v1 a little hastily.  I need to drop rp_mutex as well
>>> to avoid the deadlock.  This also is safe.
>>>
>>>    fs/nfsd/nfs4state.c | 12 ++++++++----
>>>    1 file changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>> index 40415929e2ae..453714fbcd66 100644
>>> --- a/fs/nfsd/nfs4state.c
>>> +++ b/fs/nfsd/nfs4state.c
>>> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>>>    	return status;
>>>    }
>>>    
>>> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>>> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>>>    {
>>>    	struct nfs4_client *clp = s->st_stid.sc_client;
>>>    	bool unhashed;
>>> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>>>    		list_for_each_entry(stp, &reaplist, st_locks)
>>>    			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>>>    		free_ol_stateid_reaplist(&reaplist);
>>> +		return false;
>>>    	} else {
>>>    		spin_unlock(&clp->cl_lock);
>>>    		free_ol_stateid_reaplist(&reaplist);
>>> -		if (unhashed)
>>> -			move_to_close_lru(s, clp->net);
>>> +		return unhashed;
>>>    	}
>>>    }
>>>    
>>> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>    	struct nfs4_ol_stateid *stp;
>>>    	struct net *net = SVC_NET(rqstp);
>>>    	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>>> +	bool need_move_to_close_list;
>>>    
>>>    	dprintk("NFSD: nfsd4_close on file %pd\n",
>>>    			cstate->current_fh.fh_dentry);
>>> @@ -7114,8 +7115,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>>>    	 */
>>>    	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>>>    
>>> -	nfsd4_close_open_stateid(stp);
>>> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>>>    	mutex_unlock(&stp->st_mutex);
>>> +	nfsd4_cstate_clear_replay(cstate);
>> should nfsd4_cstate_clear_replay be called only if need_move_to_close_list
>> is true?
> It certainly could be done like that.
>
>     if (need_move_to_close_list) {
>           nfsd4_cstate_clear_replay(cstate);
>           move_to_close_lru(stp, net);
>     }
>
> It would make almost no behavioural difference as
> need_move_to_close_list is never true for v4.1 and later and almost
> always true for v4.0, and nfsd4_cstate_clear_replay() does nothing for
> v4.1 and later.
> The only time the behaviour would be interestingly different is when
> nfsd4_close_open_stateid() found the state was already unhashed.  Then
> need_move_to_close_list would be false, but nfsd4_cstate_clear_replay()
> wouldn't be a no-op.
>
> I thought the code was a little simpler the way I wrote it.  We don't
> need the need_move_to_close_list guard on nfsd4_cstate_clear_replay(),
> so I left it unguarded.
> But I'm happy to change it if you can give a good reason - or even if
> you just think it is clearer the other way.

My thinking is that if move_to_close_lru is not called, then why bother
calling nfsd4_cstate_clear_replay?  It's just easier to understand, and
safer (against future changes), than having to go through all possible
scenarios to make sure it's safe to call nfsd4_cstate_clear_replay
regardless.

-Dai

>
> Thanks,
> NeilBrown
>
>> -Dai
>>
>>> +	if (need_move_to_close_list)
>>> +		move_to_close_lru(stp, net);
>>>    
>>>    	/* v4.1+ suggests that we send a special stateid in here, since the
>>>    	 * clients should just ignore this anyway. Since this is not useful

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2024-02-28 17:40 ` Jeff Layton
  2024-02-28 19:30   ` Jeff Layton
@ 2024-02-29 14:00   ` Jeff Layton
  1 sibling, 0 replies; 11+ messages in thread
From: Jeff Layton @ 2024-02-29 14:00 UTC (permalink / raw)
  To: NeilBrown, Chuck Lever, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs

On Wed, 2024-02-28 at 12:40 -0500, Jeff Layton wrote:
> On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> > move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> > This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> > drop to 2, and some threads holding a reference might be waiting for either
> > mutex.  These references will never be dropped so sc_count will never
> > reach 2.
> > 
> > There can be no harm in dropping ->st_mutex before calling
> > move_to_close_lru() because the only place that takes the mutex is
> > nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> > NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> > 
> > Similarly dropping .rp_mutex is safe after the state is closed and so
> > no longer usable.  Another way to look at this is that nothing
> > significant happens between when nfsd4_close() now calls
> > nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> > nfsd4_cstate_clear_replay() a little later.
> > 
> > See also
> >  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> > where this problem was raised but not successfully resolved.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
> >  1 file changed, 14 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 40415929e2ae..0850191f9920 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >  	return status;
> >  }
> >  
> > -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> > +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  {
> >  	struct nfs4_client *clp = s->st_stid.sc_client;
> >  	bool unhashed;
> > @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  		list_for_each_entry(stp, &reaplist, st_locks)
> >  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
> >  		free_ol_stateid_reaplist(&reaplist);
> > +		return false;
> >  	} else {
> >  		spin_unlock(&clp->cl_lock);
> >  		free_ol_stateid_reaplist(&reaplist);
> > -		if (unhashed)
> > -			move_to_close_lru(s, clp->net);
> > +		return unhashed;
> >  	}
> >  }
> >  
> > @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	struct nfs4_ol_stateid *stp;
> >  	struct net *net = SVC_NET(rqstp);
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool need_move_to_close_list;
> >  
> >  	dprintk("NFSD: nfsd4_close on file %pd\n", 
> >  			cstate->current_fh.fh_dentry);
> > @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	 */
> >  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
> >  
> > -	nfsd4_close_open_stateid(stp);
> > +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
> >  	mutex_unlock(&stp->st_mutex);
> > +	if (need_move_to_close_list) {
> > +		/* Drop the replay mutex early as move_to_close_lru()
> > +		 * can wait for other threads which hold that mutex.
> > +		 * This call is idempotent, so the fact that it will
> > +		 * be called twice is harmless.
> > +		 */
> > +		nfsd4_cstate_clear_replay(cstate);

Ok, I think I figured out the regression. The problem is the above line.

That clears cstate->replay_owner, which makes nfsd4_encode_operation not
update the so_replay.rp_buflen, which leaves it set to what it was in
the _prior_ seqid-morphing operation. In this case, that's an OPEN
reply, which was 40 bytes longer than the CLOSE reply.

I'm not sure of the best way to fix this, so it may be best to just
revert this patch for now.
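
To make that concrete, the relevant part of nfsd4_encode_operation() looks
roughly like this (pseudocode, not the literal source):

   so = resp->cstate.replay_owner;
   if (so) {
           so->so_replay.rp_status = op->status;
           so->so_replay.rp_buflen = <length of this op's encoded reply>;
           <copy this op's encoded reply into so->so_replay.rp_buf>;
   }

With replay_owner already cleared by nfsd4_close(), 'so' is NULL when the
CLOSE reply is encoded, so rp_buf/rp_buflen still describe the earlier OPEN
reply rather than the CLOSE reply.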

Thinking about it more, the rp_mutex has a rather nasty code smell about
it. Maybe we ought to turn the mutex_lock into a trylock and just return
NFS4ERR_DELAY if you can't get it?

In principle, contention for that lock means that the stateowner is
spraying seqid-morphing operations at us. Returning DELAY would seem
like a reasonable thing to do there if we get confused.
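
Something like this, purely as an untested sketch (the __be32 return is new,
and every caller would need to learn to hand the error back to the client):

   static __be32
   nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
                              struct nfs4_stateowner *so)
   {
           if (!nfsd4_has_session(cstate)) {
                   if (!mutex_trylock(&so->so_replay.rp_mutex))
                           return nfserr_jukebox;  /* NFS4ERR_DELAY */
                   cstate->replay_owner = nfs4_get_stateowner(so);
           }
           return nfs_ok;
   }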

Chuck, Neil, any thoughts?

> > +		move_to_close_lru(stp, net);
> > +	}
> >  
> >  	/* v4.1+ suggests that we send a special stateid in here, since the
> >  	 * clients should just ignore this anyway. Since this is not useful
> 
> There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
> branch. In the attached capture, there is an extra 40 bytes on the end
> of the CLOSE response in frame 112.
> 
> A bisect landed on this patch, though I don't see the cause just yet.
> 
> Thoughts?

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2024-02-28 17:40 ` Jeff Layton
@ 2024-02-28 19:30   ` Jeff Layton
  2024-02-29 14:00   ` Jeff Layton
  1 sibling, 0 replies; 11+ messages in thread
From: Jeff Layton @ 2024-02-28 19:30 UTC (permalink / raw)
  To: NeilBrown, Chuck Lever, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs

On Wed, 2024-02-28 at 12:40 -0500, Jeff Layton wrote:
> On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> > move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> > This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> > drop to 2, and some threads holding a reference might be waiting for either
> > mutex.  These references will never be dropped so sc_count will never
> > reach 2.
> > 
> > There can be no harm in dropping ->st_mutex before calling
> > move_to_close_lru() because the only place that takes the mutex is
> > nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> > NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> > 
> > Similarly dropping .rp_mutex is safe after the state is closed and so
> > no longer usable.  Another way to look at this is that nothing
> > significant happens between when nfsd4_close() now calls
> > nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> > nfsd4_cstate_clear_replay() a little later.
> > 
> > See also
> >  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> > where this problem was raised but not successfully resolved.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
> >  1 file changed, 14 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 40415929e2ae..0850191f9920 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >  	return status;
> >  }
> >  
> > -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> > +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  {
> >  	struct nfs4_client *clp = s->st_stid.sc_client;
> >  	bool unhashed;
> > @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  		list_for_each_entry(stp, &reaplist, st_locks)
> >  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
> >  		free_ol_stateid_reaplist(&reaplist);
> > +		return false;
> >  	} else {
> >  		spin_unlock(&clp->cl_lock);
> >  		free_ol_stateid_reaplist(&reaplist);
> > -		if (unhashed)
> > -			move_to_close_lru(s, clp->net);
> > +		return unhashed;
> >  	}
> >  }
> >  
> > @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	struct nfs4_ol_stateid *stp;
> >  	struct net *net = SVC_NET(rqstp);
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool need_move_to_close_list;
> >  
> >  	dprintk("NFSD: nfsd4_close on file %pd\n", 
> >  			cstate->current_fh.fh_dentry);
> > @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	 */
> >  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
> >  
> > -	nfsd4_close_open_stateid(stp);
> > +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
> >  	mutex_unlock(&stp->st_mutex);
> > +	if (need_move_to_close_list) {
> > +		/* Drop the replay mutex early as move_to_close_lru()
> > +		 * can wait for other threads which hold that mutex.
> > +		 * This call is idempotent, so the fact that it will
> > +		 * be called twice is harmless.
> > +		 */
> > +		nfsd4_cstate_clear_replay(cstate);
> > +		move_to_close_lru(stp, net);
> > +	}
> >  
> >  	/* v4.1+ suggests that we send a special stateid in here, since the
> >  	 * clients should just ignore this anyway. Since this is not useful
> 
> There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
> branch. In the attached capture, there is an extra 40 bytes on the end
> of the CLOSE response in frame 112.
> 
> A bisect landed on this patch, though I don't see the cause just yet.
> 
> Thoughts?

To follow up with what I found so far:

The problem seems to be with the rp_buflen in nfsd4_encode_replay. With
this patch applied, rp_buflen is 56 bytes when nfsd4_encode_replay is
called. With this patch reverted, it's 16 bytes. That accounts for the
extra 40 bytes.
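
For context, nfsd4_encode_replay() essentially just replays whatever the
cache holds, roughly (from memory, so treat this as pseudocode):

   p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
   *p++ = rp->rp_status;
   xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);

so a stale 56-byte rp_buflen turns directly into 40 extra bytes on the wire
compared with the expected 16-byte CLOSE reply.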

Still trying to determine why this is occurring though...
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2024-01-17  3:48 NeilBrown
  2024-01-17 18:19 ` Jeff Layton
@ 2024-02-28 17:40 ` Jeff Layton
  2024-02-28 19:30   ` Jeff Layton
  2024-02-29 14:00   ` Jeff Layton
  1 sibling, 2 replies; 11+ messages in thread
From: Jeff Layton @ 2024-02-28 17:40 UTC (permalink / raw)
  To: NeilBrown, Chuck Lever, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs

On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.
> 
> There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> 
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
> 
> See also
>  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..0850191f9920 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		list_for_each_entry(stp, &reaplist, st_locks)
>  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>  		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>  	} else {
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>  	}
>  }
>  
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct nfs4_ol_stateid *stp;
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>  
>  	dprintk("NFSD: nfsd4_close on file %pd\n", 
>  			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>  	mutex_unlock(&stp->st_mutex);
> +	if (need_move_to_close_list) {
> +		/* Drop the replay mutex early as move_to_close_lru()
> +		 * can wait for other threads which hold that mutex.
> +		 * This call is idempotent, so that fact that it will
> +		 * be called twice is harmless.
> +		 */
> +		nfsd4_cstate_clear_replay(cstate);
> +		move_to_close_lru(stp, net);
> +	}
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
>  	 * clients should just ignore this anyway. Since this is not useful

There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
branch. In the attached capture, there is an extra 40 bytes on the end
of the CLOSE response in frame 112.

A bisect landed on this patch, though I don't see the cause just yet.

Thoughts?
-- 
Jeff Layton <jlayton@kernel.org>

[-- Attachment #2: nfs.pcap.xz --]
[-- Type: application/x-xz, Size: 3316 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
  2024-01-17  3:48 NeilBrown
@ 2024-01-17 18:19 ` Jeff Layton
  2024-02-28 17:40 ` Jeff Layton
  1 sibling, 0 replies; 11+ messages in thread
From: Jeff Layton @ 2024-01-17 18:19 UTC (permalink / raw)
  To: NeilBrown, Chuck Lever, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs

On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.
> 
> There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> 
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
> 
> See also
>  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..0850191f9920 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		list_for_each_entry(stp, &reaplist, st_locks)
>  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>  		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>  	} else {
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>  	}
>  }
>  
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct nfs4_ol_stateid *stp;
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>  
>  	dprintk("NFSD: nfsd4_close on file %pd\n", 
>  			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>  	mutex_unlock(&stp->st_mutex);
> +	if (need_move_to_close_list) {
> +		/* Drop the replay mutex early as move_to_close_lru()
> +		 * can wait for other threads which hold that mutex.
> +		 * This call is idempotent, so that fact that it will
> +		 * be called twice is harmless.
> +		 */
> +		nfsd4_cstate_clear_replay(cstate);
> +		move_to_close_lru(stp, net);
> +	}
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
>  	 * clients should just ignore this anyway. Since this is not useful

I think I get it, though I still struggle with the v4.0 open/close
handling.

Reviewed-by: Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()
@ 2024-01-17  3:48 NeilBrown
  2024-01-17 18:19 ` Jeff Layton
  2024-02-28 17:40 ` Jeff Layton
  0 siblings, 2 replies; 11+ messages in thread
From: NeilBrown @ 2024-01-17  3:48 UTC (permalink / raw)
  To: Chuck Lever, Jeff Layton, J. Bruce Fields
  Cc: Olga Kornievskaia, Dai Ngo, Tom Talpey, linux-nfs


move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for either
mutex.  These references will never be dropped so sc_count will never
reach 2.

There can be no harm in dropping ->st_mutex before calling
move_to_close_lru() because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.

Similarly dropping .rp_mutex is safe after the state is closed and so
no longer usable.  Another way to look at this is that nothing
significant happens between when nfsd4_close() now calls
nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
nfsd4_cstate_clear_replay() a little later.

See also
 https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs4state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 40415929e2ae..0850191f9920 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n", 
 			cstate->current_fh.fh_dentry);
@@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	if (need_move_to_close_list) {
+		/* Drop the replay mutex early as move_to_close_lru()
+		 * can wait for other threads which hold that mutex.
+		 * This call is idempotent, so the fact that it will
+		 * be called twice is harmless.
+		 */
+		nfsd4_cstate_clear_replay(cstate);
+		move_to_close_lru(stp, net);
+	}
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-02-29 14:00 UTC | newest]

Thread overview: 11+ messages
2023-12-22  1:41 [PATCH] nfsd: drop st_mutex before calling move_to_close_lru() NeilBrown
2023-12-22  2:12 ` [PATCH v2] nfsd: drop st_mutex and rp_mutex " NeilBrown
2023-12-22 14:39   ` Chuck Lever
2023-12-22 20:15   ` dai.ngo
2023-12-22 23:01     ` NeilBrown
2023-12-23 18:07       ` dai.ngo
2024-01-17  3:48 NeilBrown
2024-01-17 18:19 ` Jeff Layton
2024-02-28 17:40 ` Jeff Layton
2024-02-28 19:30   ` Jeff Layton
2024-02-29 14:00   ` Jeff Layton
