* [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race
@ 2023-09-17 23:05 trondmy
  2023-09-17 23:05 ` [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression trondmy
  2023-09-18  1:17 ` [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race NeilBrown
  0 siblings, 2 replies; 19+ messages in thread
From: trondmy @ 2023-09-17 23:05 UTC (permalink / raw)
  To: Anna Schumaker; +Cc: linux-nfs, Neil Brown

From: Trond Myklebust <trond.myklebust@hammerspace.com>

If the NFS4CLNT_RUN_MANAGER flag got set just before we cleared
NFS4CLNT_MANAGER_RUNNING, then we might have won the race against
nfs4_schedule_state_manager(), and are responsible for handling the
recovery situation.

Fixes: aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4state.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e079987af4a3..0bc160fbabec 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2703,6 +2703,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		nfs4_end_drain_session(clp);
 		nfs4_clear_state_manager_bit(clp);
 
+		if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+		    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+				      &clp->cl_state)) {
+			memflags = memalloc_nofs_save();
+			continue;
+		}
+
 		if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
 			if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
 				nfs_client_return_marked_delegations(clp);
-- 
2.41.0



* [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-17 23:05 [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race trondmy
@ 2023-09-17 23:05 ` trondmy
  2023-09-18  1:25   ` NeilBrown
  2023-09-20 19:38   ` Anna Schumaker
  2023-09-18  1:17 ` [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race NeilBrown
  1 sibling, 2 replies; 19+ messages in thread
From: trondmy @ 2023-09-17 23:05 UTC (permalink / raw)
  To: Anna Schumaker; +Cc: linux-nfs, Neil Brown

From: Trond Myklebust <trond.myklebust@hammerspace.com>

Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
prevents the setup of new threads to handle reboot recovery, while the
older recovery thread is stuck returning delegations.

Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if swap is enabled")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c  |  4 +++-
 fs/nfs/nfs4state.c | 38 ++++++++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5deeaea8026e..a19e809cad16 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10652,7 +10652,9 @@ static void nfs4_disable_swap(struct inode *inode)
 	 */
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 
-	nfs4_schedule_state_manager(clp);
+	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+	clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+	wake_up_var(&clp->cl_state);
 }
 
 static const struct inode_operations nfs4_dir_inode_operations = {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0bc160fbabec..5751a6886da4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 {
 	struct task_struct *task;
 	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+	struct rpc_clnt *clnt = clp->cl_rpcclient;
+	bool swapon = false;
 
-	if (clp->cl_rpcclient->cl_shutdown)
+	if (clnt->cl_shutdown)
 		return;
 
 	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
-	if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
-		wake_up_var(&clp->cl_state);
-		return;
+
+	if (atomic_read(&clnt->cl_swapper)) {
+		swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+					   &clp->cl_state);
+		if (!swapon) {
+			wake_up_var(&clp->cl_state);
+			return;
+		}
 	}
-	set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+
+	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+		return;
+
 	__module_get(THIS_MODULE);
 	refcount_inc(&clp->cl_count);
 
@@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 			__func__, PTR_ERR(task));
 		if (!nfs_client_init_is_complete(clp))
 			nfs_mark_client_ready(clp, PTR_ERR(task));
+		if (swapon)
+			clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
 		nfs4_clear_state_manager_bit(clp);
-		clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
 		nfs_put_client(clp);
 		module_put(THIS_MODULE);
 	}
@@ -2748,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr)
 
 	allow_signal(SIGKILL);
 again:
-	set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
 	nfs4_state_manager(clp);
-	if (atomic_read(&cl->cl_swapper)) {
+
+	if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+	    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
 		wait_var_event_interruptible(&clp->cl_state,
 					     test_bit(NFS4CLNT_RUN_MANAGER,
 						      &clp->cl_state));
-		if (atomic_read(&cl->cl_swapper) &&
-		    test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+		if (!atomic_read(&cl->cl_swapper))
+			clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+		if (refcount_read(&clp->cl_count) > 1 && !signalled())
 			goto again;
 		/* Either no longer a swapper, or were signalled */
+		clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+		nfs4_clear_state_manager_bit(clp);
 	}
-	clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
 
 	if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
 	    test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
-	    !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+	    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
 		goto again;
 
 	nfs_put_client(clp);
-- 
2.41.0



* Re: [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race
  2023-09-17 23:05 [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race trondmy
  2023-09-17 23:05 ` [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression trondmy
@ 2023-09-18  1:17 ` NeilBrown
  2023-09-18  2:20   ` Trond Myklebust
  1 sibling, 1 reply; 19+ messages in thread
From: NeilBrown @ 2023-09-18  1:17 UTC (permalink / raw)
  To: trondmy; +Cc: Anna Schumaker, linux-nfs

On Mon, 18 Sep 2023, trondmy@kernel.org wrote:
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> If the NFS4CLNT_RUN_MANAGER flag got set just before we cleared
> NFS4CLNT_MANAGER_RUNNING, then we might have won the race against
> nfs4_schedule_state_manager(), and are responsible for handling the
> recovery situation.
> 
> Fixes: aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> ---
>  fs/nfs/nfs4state.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index e079987af4a3..0bc160fbabec 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -2703,6 +2703,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
>  		nfs4_end_drain_session(clp);
>  		nfs4_clear_state_manager_bit(clp);
>  
> +		if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
> +		    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
> +				      &clp->cl_state)) {
> +			memflags = memalloc_nofs_save();
> +			continue;
> +		}
> +

I cannot see what race this closes.
When we cleared MANAGER_RUNNING, the only possible consequence is that
nfs4_wait_clnt_recover could have woken up.  This leads to
nfs4_schedule_state_manager() being run, which sets RUN_MANAGER whether
it was set or not.

I understand that there are problems with MANAGER_AVAILABLE which your
next patch addresses, but I cannot see what this one actually fixes.
Can you help me?

Thanks,
NeilBrown

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-17 23:05 ` [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression trondmy
@ 2023-09-18  1:25   ` NeilBrown
  2023-09-18  2:27     ` Trond Myklebust
  2023-09-20 19:38   ` Anna Schumaker
  1 sibling, 1 reply; 19+ messages in thread
From: NeilBrown @ 2023-09-18  1:25 UTC (permalink / raw)
  To: trondmy; +Cc: Anna Schumaker, linux-nfs

On Mon, 18 Sep 2023, trondmy@kernel.org wrote:
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
> 
> Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> prevents the setup of new threads to handle reboot recovery, while the
> older recovery thread is stuck returning delegations.

I hadn't realised that the state manager thread did these two distinct
tasks and might need to be doing both at once - requiring two threads.
Thanks for highlighting that.

It seems to me that even with the new code, we can still get a deadlock
when swap is enabled, as we only ever run one thread in that case.
Is that correct, or did I miss something?

Maybe we need two threads - a state manager and a delegation recall
handler.  And when swap is enabled, both need to be running permanently
??

Thanks,
NeilBrown

* Re: [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race
  2023-09-18  1:17 ` [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race NeilBrown
@ 2023-09-18  2:20   ` Trond Myklebust
  0 siblings, 0 replies; 19+ messages in thread
From: Trond Myklebust @ 2023-09-18  2:20 UTC (permalink / raw)
  To: neilb; +Cc: linux-nfs, Anna.Schumaker

On Mon, 2023-09-18 at 11:17 +1000, NeilBrown wrote:
> On Mon, 18 Sep 2023, trondmy@kernel.org wrote:
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > If the NFS4CLNT_RUN_MANAGER flag got set just before we cleared
> > NFS4CLNT_MANAGER_RUNNING, then we might have won the race against
> > nfs4_schedule_state_manager(), and are responsible for handling the
> > recovery situation.
> > 
> > Fixes: aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
> > ---
> >  fs/nfs/nfs4state.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> > 
> > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> > index e079987af4a3..0bc160fbabec 100644
> > --- a/fs/nfs/nfs4state.c
> > +++ b/fs/nfs/nfs4state.c
> > @@ -2703,6 +2703,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
> >                 nfs4_end_drain_session(clp);
> >                 nfs4_clear_state_manager_bit(clp);
> >  
> > +               if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
> > +                   !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
> > +                                     &clp->cl_state)) {
> > +                       memflags = memalloc_nofs_save();
> > +                       continue;
> > +               }
> > +
> 
> I cannot see what race this closes.
> When we cleared MANAGER_RUNNING, the only possible consequence is that
> nfs4_wait_clnt_recover could have woken up.  This leads to
> nfs4_schedule_state_manager() being run, which sets RUN_MANAGER whether
> it was set or not.
> 
> I understand that there are problems with MANAGER_AVAILABLE which your
> next patch addresses, but I cannot see what this one actually fixes.
> Can you help me?
> 

If NFS4CLNT_RUN_MANAGER gets set while we're handling the reboot
recovery or network partition, then NFS4CLNT_MANAGER_RUNNING will be
set, so nfs4_schedule_state_manager() will just exit rather than start
a new thread. If we don't catch that situation before we start handling
the asynchronous delegation returns, then we can deadlock.

If, OTOH, nfs4_schedule_state_manager() runs after we've cleared
NFS4CLNT_MANAGER_RUNNING, then we should be OK (assuming both patches
are applied).
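
Roughly, the interleaving being described (a sketch of the scenario, not
the exact code path):

    state manager thread                 another task
    --------------------                 ------------
    busy with reboot recovery or a
    network partition
    (NFS4CLNT_MANAGER_RUNNING is set)
                                         nfs4_schedule_state_manager():
                                           sets NFS4CLNT_RUN_MANAGER,
                                           sees MANAGER_RUNNING already set,
                                           returns without a new thread
    nfs4_clear_state_manager_bit()
    goes on to the delegation-return
    section without noticing
    NFS4CLNT_RUN_MANAGER

The re-check added right after nfs4_clear_state_manager_bit() catches
that case and loops back into recovery instead of proceeding to the
delegation returns.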

Cheers
 Trond


-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-18  1:25   ` NeilBrown
@ 2023-09-18  2:27     ` Trond Myklebust
  0 siblings, 0 replies; 19+ messages in thread
From: Trond Myklebust @ 2023-09-18  2:27 UTC (permalink / raw)
  To: NeilBrown; +Cc: Anna Schumaker, linux-nfs

On Mon, 2023-09-18 at 11:25 +1000, NeilBrown wrote:
> On Mon, 18 Sep 2023, trondmy@kernel.org wrote:
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> > prevents the setup of new threads to handle reboot recovery, while the
> > older recovery thread is stuck returning delegations.
> 
> I hadn't realised that the state manager thread did these two distinct
> tasks and might need to be doing both at once - requiring two threads.
> Thanks for highlighting that.
> 
> It seems to me that even with the new code, we can still get a deadlock
> when swap is enabled, as we only ever run one thread in that case.
> Is that correct, or did I miss something?

That is correct. I did try to point this out when you were submitting
the swap patches, but my understanding was that you were assuming that
delegations would not be enabled when swap is enabled.

> 
> Maybe we need two threads - a state manager and a delegation recall
> handler.  And when swap is enabled, both need to be running permanently
> ??

Possibly.


* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-17 23:05 ` [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression trondmy
  2023-09-18  1:25   ` NeilBrown
@ 2023-09-20 19:38   ` Anna Schumaker
  2023-09-21  0:15     ` Trond Myklebust
  1 sibling, 1 reply; 19+ messages in thread
From: Anna Schumaker @ 2023-09-20 19:38 UTC (permalink / raw)
  To: trondmy; +Cc: Anna Schumaker, linux-nfs, Neil Brown

Hi Trond,

On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
>
> From: Trond Myklebust <trond.myklebust@hammerspace.com>
>
> Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> prevents the setup of new threads to handle reboot recovery, while the
> older recovery thread is stuck returning delegations.

I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x
after applying this patch. The test itself checks for various swapfile
edge cases, so it seems likely something is going on there.

Let me know if you need more info
Anna


* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-20 19:38   ` Anna Schumaker
@ 2023-09-21  0:15     ` Trond Myklebust
  2023-09-22 17:22       ` Olga Kornievskaia
  0 siblings, 1 reply; 19+ messages in thread
From: Trond Myklebust @ 2023-09-21  0:15 UTC (permalink / raw)
  To: schumaker.anna; +Cc: linux-nfs, Anna.Schumaker, neilb

On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> Hi Trond,
> 
> On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > 
> > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > 
> > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> > prevents the setup of new threads to handle reboot recovery, while the
> > older recovery thread is stuck returning delegations.
> 
> I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x
> after applying this patch. The test itself checks for various swapfile
> edge cases, so it seems likely something is going on there.
> 
> Let me know if you need more info
> Anna
> 

Did you turn off delegations on your server? If you don't, then swap
will deadlock itself under various scenarios.
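
For a Linux knfsd server, one way to do that is the lease sysctl (a
sketch; it assumes the usual setup where delegations are granted via
VFS leases, and it also disables leases for local applications on that
server; the service unit name varies by distro):

	# existing delegations are not recalled, so set this before
	# the clients mount and run their tests
	echo 0 > /proc/sys/fs/leases-enable
	systemctl restart nfs-server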

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-21  0:15     ` Trond Myklebust
@ 2023-09-22 17:22       ` Olga Kornievskaia
  2023-09-22 19:05         ` Trond Myklebust
  0 siblings, 1 reply; 19+ messages in thread
From: Olga Kornievskaia @ 2023-09-22 17:22 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: schumaker.anna, linux-nfs, Anna.Schumaker, neilb

On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> > Hi Trond,
> >
> > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > >
> > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > >
> > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> > > prevents the setup of new threads to handle reboot recovery, while the
> > > older recovery thread is stuck returning delegations.
> >
> > I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x
> > after applying this patch. The test itself checks for various swapfile
> > edge cases, so it seems likely something is going on there.
> >
> > Let me know if you need more info
> > Anna
> >
>
> Did you turn off delegations on your server? If you don't, then swap
> will deadlock itself under various scenarios.

Is there documentation somewhere that says that delegations must be
turned off on the server if NFS over swap is enabled?

If the client can't handle delegations + swap, then shouldn't this be
solved by (1) checking if we are in NFS over swap and then proactively
setting 'dont want delegation' on open and/or (2) proactively return
the delegation if received so that we don't get into the deadlock?

I think this is similar to Anna's. With this patch, I'm running into a
problem running against an ONTAP server using xfstests (no problems
without the patch). During the run two stuck threads are:
[root@unknown000c291be8aa aglo]# cat /proc/3724/stack
[<0>] nfs4_run_state_manager+0x1c0/0x1f8 [nfsv4]
[<0>] kthread+0x100/0x110
[<0>] ret_from_fork+0x10/0x20
[root@unknown000c291be8aa aglo]# cat /proc/3725/stack
[<0>] nfs_wait_bit_killable+0x1c/0x88 [nfs]
[<0>] nfs4_wait_clnt_recover+0xb4/0xf0 [nfsv4]
[<0>] nfs4_client_recover_expired_lease+0x34/0x88 [nfsv4]
[<0>] _nfs4_do_open.isra.0+0x94/0x408 [nfsv4]
[<0>] nfs4_do_open+0x9c/0x238 [nfsv4]
[<0>] nfs4_atomic_open+0x100/0x118 [nfsv4]
[<0>] nfs4_file_open+0x11c/0x240 [nfsv4]
[<0>] do_dentry_open+0x140/0x528
[<0>] vfs_open+0x30/0x38
[<0>] do_open+0x14c/0x360
[<0>] path_openat+0x104/0x250
[<0>] do_filp_open+0x84/0x138
[<0>] file_open_name+0x134/0x190
[<0>] __do_sys_swapoff+0x58/0x6e8
[<0>] __arm64_sys_swapoff+0x18/0x28
[<0>] invoke_syscall.constprop.0+0x7c/0xd0
[<0>] do_el0_svc+0xb4/0xd0
[<0>] el0_svc+0x50/0x228
[<0>] el0t_64_sync_handler+0x134/0x150
[<0>] el0t_64_sync+0x17c/0x180


* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 17:22       ` Olga Kornievskaia
@ 2023-09-22 19:05         ` Trond Myklebust
  2023-09-22 21:00           ` Olga Kornievskaia
  2023-09-25 22:28           ` NeilBrown
  0 siblings, 2 replies; 19+ messages in thread
From: Trond Myklebust @ 2023-09-22 19:05 UTC (permalink / raw)
  To: aglo; +Cc: linux-nfs, Anna.Schumaker, neilb, schumaker.anna


On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote:
> On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust
> <trondmy@hammerspace.com> wrote:
> > 
> > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> > > Hi Trond,
> > > 
> > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > > > 
> > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > 
> > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit
> > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it
> > > > prevents the setup of new threads to handle reboot recovery, while the
> > > > older recovery thread is stuck returning delegations.
> > > 
> > > I'm seeing a possible deadlock with xfstests generic/472 on NFS v4.x
> > > after applying this patch. The test itself checks for various swapfile
> > > edge cases, so it seems likely something is going on there.
> > > 
> > > Let me know if you need more info
> > > Anna
> > > 
> > Did you turn off delegations on your server? If you don't, then swap
> > will deadlock itself under various scenarios.
> 
> Is there documentation somewhere that says that delegations must be
> turned off on the server if NFS over swap is enabled?

I think the question is more generally "Is there documentation for NFS
swap?"

> 
> If the client can't handle delegations + swap, then shouldn't this be
> solved by (1) checking if we are in NFS over swap and then proactively
> setting 'dont want delegation' on open and/or (2) proactively return
> the delegation if received so that we don't get into the deadlock?

We could do that for NFSv4.1 and NFSv4.2, but the protocol will not
allow us to do that for NFSv4.0.
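
For (1), a minimal sketch of what that could look like on NFSv4.1+ --
purely illustrative; the check and where it would be wired into the
open path are hypothetical, not existing code:

	/* Hypothetical: ask the server not to hand out a delegation when
	 * the open is against a swapfile.  NFS4_SHARE_WANT_NO_DELEG is
	 * the existing v4.1 "want" bit; option (2) would roughly amount
	 * to calling nfs4_inode_return_delegation() on the swapfile's
	 * inode if a delegation shows up anyway.
	 */
	if (IS_SWAPFILE(inode))
		share_access |= NFS4_SHARE_WANT_NO_DELEG;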

> 
> I think this is similar to Anna's. With this patch, I'm running into a
> problem running against an ONTAP server using xfstests (no problems
> without the patch). During the run two stuck threads are:
> [root@unknown000c291be8aa aglo]# cat /proc/3724/stack
> [<0>] nfs4_run_state_manager+0x1c0/0x1f8 [nfsv4]
> [<0>] kthread+0x100/0x110
> [<0>] ret_from_fork+0x10/0x20
> [root@unknown000c291be8aa aglo]# cat /proc/3725/stack
> [<0>] nfs_wait_bit_killable+0x1c/0x88 [nfs]
> [<0>] nfs4_wait_clnt_recover+0xb4/0xf0 [nfsv4]
> [<0>] nfs4_client_recover_expired_lease+0x34/0x88 [nfsv4]
> [<0>] _nfs4_do_open.isra.0+0x94/0x408 [nfsv4]
> [<0>] nfs4_do_open+0x9c/0x238 [nfsv4]
> [<0>] nfs4_atomic_open+0x100/0x118 [nfsv4]
> [<0>] nfs4_file_open+0x11c/0x240 [nfsv4]
> [<0>] do_dentry_open+0x140/0x528
> [<0>] vfs_open+0x30/0x38
> [<0>] do_open+0x14c/0x360
> [<0>] path_openat+0x104/0x250
> [<0>] do_filp_open+0x84/0x138
> [<0>] file_open_name+0x134/0x190
> [<0>] __do_sys_swapoff+0x58/0x6e8
> [<0>] __arm64_sys_swapoff+0x18/0x28
> [<0>] invoke_syscall.constprop.0+0x7c/0xd0
> [<0>] do_el0_svc+0xb4/0xd0
> [<0>] el0_svc+0x50/0x228
> [<0>] el0t_64_sync_handler+0x134/0x150
> [<0>] el0t_64_sync+0x17c/0x180

Oh crap... Yes, that is a bug. Can you please apply the attached patch
on top of the original, and see if that fixes the problem?

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



[-- Attachment #2: 0001-fixup-NFSv4-Fix-a-state-manager-thread-deadlock-regr.patch --]
[-- Type: text/x-patch; name="0001-fixup-NFSv4-Fix-a-state-manager-thread-deadlock-regr.patch", Size: 1475 bytes --]

From ec89a837b71772feeb55cd9ece4e91900d4afa79 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 22 Sep 2023 15:00:08 -0400
Subject: [PATCH] fixup! NFSv4: Fix a state manager thread deadlock regression

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4state.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5751a6886da4..9a5d911a7edc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2762,17 +2762,17 @@ static int nfs4_run_state_manager(void *ptr)
 	nfs4_state_manager(clp);
 
 	if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
-	    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
+	    !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
 		wait_var_event_interruptible(&clp->cl_state,
 					     test_bit(NFS4CLNT_RUN_MANAGER,
 						      &clp->cl_state));
 		if (!atomic_read(&cl->cl_swapper))
 			clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
-		if (refcount_read(&clp->cl_count) > 1 && !signalled())
+		if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+		    !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
 			goto again;
 		/* Either no longer a swapper, or were signalled */
 		clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
-		nfs4_clear_state_manager_bit(clp);
 	}
 
 	if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
-- 
2.41.0



* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 19:05         ` Trond Myklebust
@ 2023-09-22 21:00           ` Olga Kornievskaia
  2023-09-22 21:06             ` Trond Myklebust
  2023-09-25 22:28           ` NeilBrown
  1 sibling, 1 reply; 19+ messages in thread
From: Olga Kornievskaia @ 2023-09-22 21:00 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs, Anna.Schumaker, neilb, schumaker.anna

On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
> Oh crap... Yes, that is a bug. Can you please apply the attached patch
> on top of the original, and see if that fixes the problem?

I can't consistently reproduce the problem. Out of several xfstests
runs a couple got stuck in that state. So when I apply that patch and
run, I can't tell if I'm no longer hitting or if I'm just not hitting
the right condition.

Given I don't exactly know what's caused it, I'm trying to find
something I can hit consistently. Any ideas? I mean this stack trace
seems to imply a recovery open but I'm not doing any server reboots or
connection drops.


* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 21:00           ` Olga Kornievskaia
@ 2023-09-22 21:06             ` Trond Myklebust
  2023-09-24 17:08               ` Trond Myklebust
  2023-09-26 14:31               ` Olga Kornievskaia
  0 siblings, 2 replies; 19+ messages in thread
From: Trond Myklebust @ 2023-09-22 21:06 UTC (permalink / raw)
  To: aglo; +Cc: linux-nfs, Anna.Schumaker, neilb, schumaker.anna

On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote:
> On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust 
> > 
> > Oh crap... Yes, that is a bug. Can you please apply the attached patch
> > on top of the original, and see if that fixes the problem?
> 
> I can't consistently reproduce the problem. Out of several xfstests
> runs a couple got stuck in that state. So when I apply that patch and
> run, I can't tell if I'm no longer hitting or if I'm just not hitting
> the right condition.
> 
> Given I don't exactly know what's caused it, I'm trying to find
> something I can hit consistently. Any ideas? I mean this stack trace
> seems to imply a recovery open but I'm not doing any server reboots or
> connection drops.
> 
> > 

If I'm right about the root cause, then just turning off delegations on
the server, setting up a NFS swap partition and then running some
ordinary file open/close workload against the exact same server would
probably suffice to trigger your stack trace 100% reliably.

I'll see if I can find time to test it over the weekend.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com




* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 21:06             ` Trond Myklebust
@ 2023-09-24 17:08               ` Trond Myklebust
  2023-09-26 14:55                 ` Anna Schumaker
  2023-09-26 14:31               ` Olga Kornievskaia
  1 sibling, 1 reply; 19+ messages in thread
From: Trond Myklebust @ 2023-09-24 17:08 UTC (permalink / raw)
  To: aglo; +Cc: linux-nfs, Anna.Schumaker, neilb, schumaker.anna

On Fri, 2023-09-22 at 17:06 -0400, Trond Myklebust wrote:
> If I'm right about the root cause, then just turning off delegations on
> the server, setting up a NFS swap partition and then running some
> ordinary file open/close workload against the exact same server would
> probably suffice to trigger your stack trace 100% reliably.
> 
> I'll see if I can find time to test it over the weekend.

> 

Yep... Creating a 4G empty file on /mnt/nfs/swap/swapfile, running
mkswap  and then swapon followed by a simple bash line of
	echo "foo" >/mnt/nfs/foobar

will immediately lead to a hang. When I look at the stack for the bash
process, I see the following dump, which matches yours:

[root@vmw-test-1 ~]# cat /proc/1120/stack 
[<0>] nfs_wait_bit_killable+0x11/0x60 [nfs]
[<0>] nfs4_wait_clnt_recover+0x54/0x90 [nfsv4]
[<0>] nfs4_client_recover_expired_lease+0x29/0x60 [nfsv4]
[<0>] nfs4_do_open+0x170/0xa90 [nfsv4]
[<0>] nfs4_atomic_open+0x94/0x100 [nfsv4]
[<0>] nfs_atomic_open+0x2d9/0x670 [nfs]
[<0>] path_openat+0x3c3/0xd40
[<0>] do_filp_open+0xb4/0x160
[<0>] do_sys_openat2+0x81/0xe0
[<0>] __x64_sys_openat+0x81/0xa0
[<0>] do_syscall_64+0x68/0xa0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8

With the fix I sent you:

[root@vmw-test-1 ~]# mount -t nfs -overs=4.2 vmw-test-2:/export /mnt/nfs
[root@vmw-test-1 ~]# mkswap /mnt/nfs/swap/swapfile 
mkswap: /mnt/nfs/swap/swapfile: warning: wiping old swap signature.
Setting up swapspace version 1, size = 4 GiB (4294963200 bytes)
no label, UUID=1360b0a3-833a-4ba7-b467-8a59d3723013
[root@vmw-test-1 ~]# swapon /mnt/nfs/swap/swapfile
[root@vmw-test-1 ~]# ps -efww | grep manage
root        1214       2  0 13:04 ?        00:00:00 [192.168.76.251-manager]
root        1216    1147  0 13:04 pts/0    00:00:00 grep --color=auto manage
[root@vmw-test-1 ~]# echo "foo" >/mnt/nfs/foobar
[root@vmw-test-1 ~]# cat /mnt/nfs/foobar
foo

So that returns behaviour to normal in my testing, and I no longer see
the hangs.

Let me send out a PATCHv2...
-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 19:05         ` Trond Myklebust
  2023-09-22 21:00           ` Olga Kornievskaia
@ 2023-09-25 22:28           ` NeilBrown
  2023-09-25 22:44             ` Trond Myklebust
  1 sibling, 1 reply; 19+ messages in thread
From: NeilBrown @ 2023-09-25 22:28 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: aglo, linux-nfs, Anna.Schumaker, schumaker.anna

On Sat, 23 Sep 2023, Trond Myklebust wrote:
> On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote:
> > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust
> > <trondmy@hammerspace.com> wrote:
> > > 
> > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> > > > Hi Trond,
> > > > 
> > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > > > > 
> > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > 
> > > > > Commit 4dc73c679114 reintroduces the deadlock that was fixed by
> > > > > commit
> > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
> > > > > because
> > > > > it
> > > > > prevents the setup of new threads to handle reboot recovery,
> > > > > while
> > > > > the
> > > > > older recovery thread is stuck returning delegations.
> > > > 
> > > > I'm seeing a possible deadlock with xfstests generic/472 on NFS
> > > > v4.x
> > > > after applying this patch. The test itself checks for various
> > > > swapfile
> > > > edge cases, so it seems likely something is going on there.
> > > > 
> > > > Let me know if you need more info
> > > > Anna
> > > > 
> > > 
> > > Did you turn off delegations on your server? If you don't, then
> > > swap
> > > will deadlock itself under various scenarios.
> > 
> > Is there documentation somewhere that says that delegations must be
> > turned off on the server if NFS over swap is enabled?
> 
> I think the question is more generally "Is there documentation for NFS
> swap?"

The main difference between using NFS for swap and for regular file IO
is in the handling of writes, and particularly in the style of memory
allocation that is safe while handling a write request (or anything
which might block some write request, etc).

For buffered IO, memory allocations must be GFP_NOIO or PF_MEMALLOC_NOIO.
For swap-out, memory allocations must be __GFP_MEMALLOC or PF_MEMALLOC.

That is the primary difference - all other differences are minor.  This
difference might justify documentation suggesting that 
/proc/sys/vm/min_free_kbytes could usefully be increased, but I don't
see that more is needed.
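
As a rough illustration of those two allocation contexts, here is a
minimal sketch (the helper names are invented; only the gfp flags and
the memalloc_noio_*() scope API are the real interfaces):

#include <linux/gfp.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Buffered-IO writeback style: stop allocations recursing into IO/FS. */
static void buffered_write_path_sketch(void)
{
	unsigned int flags = memalloc_noio_save();
	void *p = kmalloc(4096, GFP_KERNEL);	/* degraded to GFP_NOIO here */

	kfree(p);
	memalloc_noio_restore(flags);
}

/* Swap-out style: the allocation may dip into the pfmemalloc reserves. */
static void swap_out_path_sketch(void)
{
	void *p = kmalloc(4096, GFP_NOWAIT | __GFP_MEMALLOC);

	kfree(p);
}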

The NOIO/MEMALLOC distinction is properly plumbed through nfs, sunrpc,
and networking and all "just works".  The problem area is that
kthread_create() doesn't take a gfpflags_t argument, so it uses
GFP_KERNEL allocations to create the new thread.

This means that when a write cannot proceed without state management,
and state management requests that a thread be started, there is a risk
of memory allocation deadlock.
I believe the risk is there even for buffered IO, but I'm not 100%
certain and in practice I don't think a deadlock has ever been reported.
With swap-out it is fairly easy to trigger a deadlock if there is heavy
swap-out traffic when state management is needed.
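
The risky pattern looks roughly like the sketch below (hedged: the
helper is hypothetical, while kthread_create(), wake_up_process(), the
"%s-manager" naming and nfs4_run_state_manager() are the real pieces):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/nfs_fs_sb.h>

int nfs4_run_state_manager(void *ptr);	/* static in fs/nfs/nfs4state.c;
					   declared here only for the sketch */

/* Hypothetical: start the state-manager thread on demand. */
static int start_manager_thread_sketch(struct nfs_client *clp)
{
	struct task_struct *task;

	/*
	 * kthread_create() takes no gfp_t: the task_struct and kernel stack
	 * are allocated with GFP_KERNEL.  If the only way to reclaim memory
	 * is to write to an NFS swap file, and that writeout is waiting on
	 * this thread, the allocation can never make progress.
	 */
	task = kthread_create(nfs4_run_state_manager, clp, "%s-manager",
			      clp->cl_hostname);
	if (IS_ERR(task))
		return PTR_ERR(task);
	wake_up_process(task);
	return 0;
}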

The common pattern in the kernel when a thread might be needed to
support writeout is to keep the thread running permanently (rather than
to add a gfpflags_t to kthread_create), so that is what I added to the
nfsv4 state manager.
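
Concretely, that pattern has roughly the shape below (a simplified
sketch of what commit 4dc73c679114 does, assuming the NFS4CLNT_* bits
and clp->cl_swapper from the NFS client headers are in scope; the real
loop is more involved):

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/nfs_fs_sb.h>
#include <linux/wait_bit.h>

void nfs4_state_manager(struct nfs_client *clp);	/* one recovery pass,
							   static in nfs4state.c */

static int run_state_manager_sketch(void *ptr)
{
	struct nfs_client *clp = ptr;

	do {
		nfs4_state_manager(clp);
		/*
		 * While a swapfile is active on this client, park here rather
		 * than exit, so restarting recovery needs no new allocation.
		 */
		wait_var_event_interruptible(&clp->cl_state,
			test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) ||
			!atomic_read(&clp->cl_swapper));
	} while (atomic_read(&clp->cl_swapper) ||
		 test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state));

	return 0;
}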

However the state manager thread has a second use - returning
delegations.  This sometimes needs to run concurrently with state
management, so one thread is not enough.

What is the context for delegation return?  Does it ever block writes?
If it doesn't, would it make sense to use a work queue for returning
delegations - maybe system_wq?
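
For illustration only, such an approach might look like the sketch
below (the struct, the work function and the GFP choice are invented;
only the workqueue API itself is real):

#include <linux/kernel.h>
#include <linux/nfs_fs_sb.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct delegreturn_work {
	struct work_struct work;
	struct nfs_client *clp;
};

static void delegreturn_workfn(struct work_struct *work)
{
	struct delegreturn_work *dw =
		container_of(work, struct delegreturn_work, work);

	/* ... walk and return the marked delegations for dw->clp ... */
	kfree(dw);
}

static int schedule_delegreturn_sketch(struct nfs_client *clp)
{
	struct delegreturn_work *dw = kmalloc(sizeof(*dw), GFP_NOFS);

	if (!dw)
		return -ENOMEM;
	dw->clp = clp;
	INIT_WORK(&dw->work, delegreturn_workfn);
	queue_work(system_wq, &dw->work);
	return 0;
}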

I think it might be best to have the nfsv4 state manager thread always
be running whether swap is enabled or not.  As I say I think there is at
least a theoretical risk of a deadlock even without swap, and having a
small test matrix is usually a good thing.

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-25 22:28           ` NeilBrown
@ 2023-09-25 22:44             ` Trond Myklebust
  2023-09-25 23:04               ` NeilBrown
  0 siblings, 1 reply; 19+ messages in thread
From: Trond Myklebust @ 2023-09-25 22:44 UTC (permalink / raw)
  To: neilb; +Cc: linux-nfs, aglo, Anna.Schumaker, schumaker.anna

On Tue, 2023-09-26 at 08:28 +1000, NeilBrown wrote:
> On Sat, 23 Sep 2023, Trond Myklebust wrote:
> > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote:
> > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust
> > > <trondmy@hammerspace.com> wrote:
> > > > 
> > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> > > > > Hi Trond,
> > > > > 
> > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > > > > > 
> > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > 
> > > > > > Commit 4dc73c679114 reintroduces the deadlock that was
> > > > > > fixed by
> > > > > > commit
> > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
> > > > > > because
> > > > > > it
> > > > > > prevents the setup of new threads to handle reboot
> > > > > > recovery,
> > > > > > while
> > > > > > the
> > > > > > older recovery thread is stuck returning delegations.
> > > > > 
> > > > > I'm seeing a possible deadlock with xfstests generic/472 on
> > > > > NFS
> > > > > v4.x
> > > > > after applying this patch. The test itself checks for various
> > > > > swapfile
> > > > > edge cases, so it seems likely something is going on there.
> > > > > 
> > > > > Let me know if you need more info
> > > > > Anna
> > > > > 
> > > > 
> > > > Did you turn off delegations on your server? If you don't, then
> > > > swap
> > > > will deadlock itself under various scenarios.
> > > 
> > > Is there documentation somewhere that says that delegations must
> > > be
> > > turned off on the server if NFS over swap is enabled?
> > 
> > I think the question is more generally "Is there documentation for
> > NFS
> > swap?"
> 
> The main difference between using NFS for swap and for regular file
> IO
> is in the handling of writes, and particularly in the style of memory
> allocation that is safe while handling a write request (or anything
> which might block some write request, etc).
> 
> For buffered IO, memory allocations must be GFP_NOIO or
> PF_MEMALLOC_NOIO.
> For swap-out, memory allocations must be GFP_MEMALLOC or PG_MEMALLOC.
> 
> That is the primary difference - all other differences are minor. 
> This
> difference might justify documentation suggesting that 
> /proc/sys/vm/min_free_kbytes could usefully be increased, but I don't
> see that more is needed.
> 
> The NOIO/MEMALLOC distinction is properly plumbed through nfs,
> sunrpc,
> and networking and all "just works".  The problem area is that
> kthread_create() doesn't take a gfpflags_t argument, so it uses
> GFP_KERNEL allocations to create the new thread.
> 
> This means that when a write cannot proceed without state management,
> and state management requests that a threads be started, there is a
> risk
> of memory allocation deadlock.
> I believe the risk is there even for buffered IO, but I'm not 100%
> certain and in practice I don't think a deadlock has ever been
> reported.
> With swap-out it is fairly easy to trigger a deadlock if there is
> heavy
> swap-out traffic when state management is needed.
> 
> The common pattern in the kernel when a thread might be needed to
> support writeout is to keep the thread running permanently (rather
> than
> to add a gfpflags_t to kthread_create), so that is what I added to
> the
> nfsv4 state manager.
> 
> However the state manager thread has a second use - returning
> delegations.  This sometimes needs to run concurrently with state
> management, so one thread is not enough.
> 
> What is that context for delegation return?  Does it ever block
> writes? 
> If it doesn't, would it make sense to use a work queue for returning
> delegations - maybe system_wq?

These are potentially long-lived processes because there may be lock
recovery involved, and because of the conflict with state recovery, so
it does not make sense to put them on a workqueue.

> 
> I think it might be best to have the nfsv4 state manager thread
> always
> be running whether swap is enabled or not.  As I say I think there is
> at
> least a theoretical risk of a deadlock even without swap, and having
> a
> small test matrix is usually a good thing.
> 

That's a "prove me wrong" argument.

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-25 22:44             ` Trond Myklebust
@ 2023-09-25 23:04               ` NeilBrown
  2023-09-25 23:20                 ` Trond Myklebust
  0 siblings, 1 reply; 19+ messages in thread
From: NeilBrown @ 2023-09-25 23:04 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs, aglo, Anna.Schumaker, schumaker.anna

On Tue, 26 Sep 2023, Trond Myklebust wrote:
> On Tue, 2023-09-26 at 08:28 +1000, NeilBrown wrote:
> > On Sat, 23 Sep 2023, Trond Myklebust wrote:
> > > On Fri, 2023-09-22 at 13:22 -0400, Olga Kornievskaia wrote:
> > > > On Wed, Sep 20, 2023 at 8:27 PM Trond Myklebust
> > > > <trondmy@hammerspace.com> wrote:
> > > > > 
> > > > > On Wed, 2023-09-20 at 15:38 -0400, Anna Schumaker wrote:
> > > > > > Hi Trond,
> > > > > > 
> > > > > > On Sun, Sep 17, 2023 at 7:12 PM <trondmy@kernel.org> wrote:
> > > > > > > 
> > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com>
> > > > > > > 
> > > > > > > Commit 4dc73c679114 reintroduces the deadlock that was
> > > > > > > fixed by
> > > > > > > commit
> > > > > > > aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock")
> > > > > > > because
> > > > > > > it
> > > > > > > prevents the setup of new threads to handle reboot
> > > > > > > recovery,
> > > > > > > while
> > > > > > > the
> > > > > > > older recovery thread is stuck returning delegations.
> > > > > > 
> > > > > > I'm seeing a possible deadlock with xfstests generic/472 on
> > > > > > NFS
> > > > > > v4.x
> > > > > > after applying this patch. The test itself checks for various
> > > > > > swapfile
> > > > > > edge cases, so it seems likely something is going on there.
> > > > > > 
> > > > > > Let me know if you need more info
> > > > > > Anna
> > > > > > 
> > > > > 
> > > > > Did you turn off delegations on your server? If you don't, then
> > > > > swap
> > > > > will deadlock itself under various scenarios.
> > > > 
> > > > Is there documentation somewhere that says that delegations must
> > > > be
> > > > turned off on the server if NFS over swap is enabled?
> > > 
> > > I think the question is more generally "Is there documentation for
> > > NFS
> > > swap?"
> > 
> > The main difference between using NFS for swap and for regular file
> > IO
> > is in the handling of writes, and particularly in the style of memory
> > allocation that is safe while handling a write request (or anything
> > which might block some write request, etc).
> > 
> > For buffered IO, memory allocations must be GFP_NOIO or
> > PF_MEMALLOC_NOIO.
> > For swap-out, memory allocations must be GFP_MEMALLOC or PG_MEMALLOC.
> > 
> > That is the primary difference - all other differences are minor. 
> > This
> > difference might justify documentation suggesting that 
> > /proc/sys/vm/min_free_kbytes could usefully be increased, but I don't
> > see that more is needed.
> > 
> > The NOIO/MEMALLOC distinction is properly plumbed through nfs,
> > sunrpc,
> > and networking and all "just works".  The problem area is that
> > kthread_create() doesn't take a gfpflags_t argument, so it uses
> > GFP_KERNEL allocations to create the new thread.
> > 
> > This means that when a write cannot proceed without state management,
> > and state management requests that a threads be started, there is a
> > risk
> > of memory allocation deadlock.
> > I believe the risk is there even for buffered IO, but I'm not 100%
> > certain and in practice I don't think a deadlock has ever been
> > reported.
> > With swap-out it is fairly easy to trigger a deadlock if there is
> > heavy
> > swap-out traffic when state management is needed.
> > 
> > The common pattern in the kernel when a thread might be needed to
> > support writeout is to keep the thread running permanently (rather
> > than
> > to add a gfpflags_t to kthread_create), so that is what I added to
> > the
> > nfsv4 state manager.
> > 
> > However the state manager thread has a second use - returning
> > delegations.  This sometimes needs to run concurrently with state
> > management, so one thread is not enough.
> > 
> > What is that context for delegation return?  Does it ever block
> > writes? 
> > If it doesn't, would it make sense to use a work queue for returning
> > delegations - maybe system_wq?
> 
> These are potentially long-lived processes because there may be lock
> recovery involved, and because of the conflict with state recovery, so
> it does not make sense to put them on a workqueue.
> 

Makes sense - thanks.
Are writes blocked while the delegation return proceeds?  If not, would
it be reasonable to start a separate kthread on-demand when a return is
requested?

Thanks,
NeilBrown

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-25 23:04               ` NeilBrown
@ 2023-09-25 23:20                 ` Trond Myklebust
  0 siblings, 0 replies; 19+ messages in thread
From: Trond Myklebust @ 2023-09-25 23:20 UTC (permalink / raw)
  To: neilb; +Cc: linux-nfs, aglo, Anna.Schumaker, schumaker.anna

On Tue, 2023-09-26 at 09:04 +1000, NeilBrown wrote:
> 
> Are writes blocked while the delegation returns proceeds?  If not,
> would
> it be reasonable to start a separate kthread on-demand when a return
> is
> requested?
> 

That's what we've done historically.

We initially made it the same thread as the standard recovery thread
because we do want to serialise recovery and delegation return. However,
the recovery thread has the ability to block all other RPCs to the
server in question, so the requirement that we serialise does not
depend on the two threads being the same. In practice, therefore, we
usually ended up with multiple separate threads when reboot recovery
was required during a delegation return, or if a single sweep of the
delegations took too long.
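
So the historical shape is roughly the sketch below (hypothetical: the
helper and the "%s-delegreturn" naming are invented; only kthread_run()
and clp->cl_hostname are the real pieces):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/nfs_fs_sb.h>

/*
 * Spawn a short-lived thread for one delegation-return sweep.  As noted
 * above, serialisation with recovery comes from the recovery thread's
 * ability to block RPC traffic, not from sharing a task_struct with it.
 */
static int start_delegreturn_thread_sketch(struct nfs_client *clp,
					   int (*threadfn)(void *))
{
	struct task_struct *task;

	task = kthread_run(threadfn, clp, "%s-delegreturn", clp->cl_hostname);
	return PTR_ERR_OR_ZERO(task);
}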

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-22 21:06             ` Trond Myklebust
  2023-09-24 17:08               ` Trond Myklebust
@ 2023-09-26 14:31               ` Olga Kornievskaia
  1 sibling, 0 replies; 19+ messages in thread
From: Olga Kornievskaia @ 2023-09-26 14:31 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: linux-nfs, Anna.Schumaker, neilb, schumaker.anna

On Fri, Sep 22, 2023 at 5:07 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote:
> > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust
> > >
> > > Oh crap... Yes, that is a bug. Can you please apply the attached
> > > patch
> > > on top of the original, and see if that fixes the problem?
> >
> > I can't consistently reproduce the problem. Out of several xfstests
> > runs a couple got stuck in that state. So when I apply that patch and
> > run, I can't tell if i'm no longer hitting or if I'm just not hitting
> > the right condition.
> >
> > Given I don't exactly know what's caused it I'm trying to find
> > something I can hit consistently. Any ideas? I mean this stack trace
> > seems to imply a recovery open but I'm not doing any server reboots
> > or
> > connection drops.
> >
> > >
>
> If I'm right about the root cause, then just turning off delegations on
> the server, setting up a NFS swap partition and then running some
> ordinary file open/close workload against the exact same server would
> probably suffice to trigger your stack trace 100% reliably.

Getting back to this now. Thanks for v2; I'll test it. But I'm still
unclear: is there a requirement that delegations have to be off when
the client has NFS over swap enabled? I always run with delegations on
in ONTAP and run xfstests. I don't usually run with NFS over swap
enabled, but sometimes I do. So what should the expectations be? If I
choose this kernel configuration, is a deadlock possible?
>
> I'll see if I can find time to test it over the weekend.
>
> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression
  2023-09-24 17:08               ` Trond Myklebust
@ 2023-09-26 14:55                 ` Anna Schumaker
  0 siblings, 0 replies; 19+ messages in thread
From: Anna Schumaker @ 2023-09-26 14:55 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: aglo, linux-nfs, Anna.Schumaker, neilb

On Sun, Sep 24, 2023 at 1:08 PM Trond Myklebust <trondmy@hammerspace.com> wrote:
>
> On Fri, 2023-09-22 at 17:06 -0400, Trond Myklebust wrote:
> > On Fri, 2023-09-22 at 17:00 -0400, Olga Kornievskaia wrote:
> > > On Fri, Sep 22, 2023 at 3:05 PM Trond Myklebust
> > > >
> > > > Oh crap... Yes, that is a bug. Can you please apply the attached
> > > > patch
> > > > on top of the original, and see if that fixes the problem?
> > >
> > > I can't consistently reproduce the problem. Out of several xfstests
> > > runs a couple got stuck in that state. So when I apply that patch
> > > and
> > > run, I can't tell if i'm no longer hitting or if I'm just not
> > > hitting
> > > the right condition.
> > >
> > > Given I don't exactly know what's caused it I'm trying to find
> > > something I can hit consistently. Any ideas? I mean this stack
> > > trace
> > > seems to imply a recovery open but I'm not doing any server reboots
> > > or
> > > connection drops.
> > >
> > > >
> >
> > If I'm right about the root cause, then just turning off delegations
> > on
> > the server, setting up a NFS swap partition and then running some
> > ordinary file open/close workload against the exact same server would
> > probably suffice to trigger your stack trace 100% reliably.
> >
> > I'll see if I can find time to test it over the weekend.
>
> >
>
> Yep... Creating a 4G empty file on /mnt/nfs/swap/swapfile, running
> mkswap  and then swapon followed by a simple bash line of
>         echo "foo" >/mnt/nfs/foobar
>
> will immediately lead to a hang. When I look at the stack for the bash
> process, I see the following dump, which matches yours:
>
> [root@vmw-test-1 ~]# cat /proc/1120/stack
> [<0>] nfs_wait_bit_killable+0x11/0x60 [nfs]
> [<0>] nfs4_wait_clnt_recover+0x54/0x90 [nfsv4]
> [<0>] nfs4_client_recover_expired_lease+0x29/0x60 [nfsv4]
> [<0>] nfs4_do_open+0x170/0xa90 [nfsv4]
> [<0>] nfs4_atomic_open+0x94/0x100 [nfsv4]
> [<0>] nfs_atomic_open+0x2d9/0x670 [nfs]
> [<0>] path_openat+0x3c3/0xd40
> [<0>] do_filp_open+0xb4/0x160
> [<0>] do_sys_openat2+0x81/0xe0
> [<0>] __x64_sys_openat+0x81/0xa0
> [<0>] do_syscall_64+0x68/0xa0
> [<0>] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
>
> With the fix I sent you:
>
> [root@vmw-test-1 ~]# mount -t nfs -overs=4.2 vmw-test-2:/export /mnt/nfs
> [root@vmw-test-1 ~]# mkswap /mnt/nfs/swap/swapfile
> mkswap: /mnt/nfs/swap/swapfile: warning: wiping old swap signature.
> Setting up swapspace version 1, size = 4 GiB (4294963200 bytes)
> no label, UUID=1360b0a3-833a-4ba7-b467-8a59d3723013
> [root@vmw-test-1 ~]# swapon /mnt/nfs/swap/swapfile
> [root@vmw-test-1 ~]# ps -efww | grep manage
> root        1214       2  0 13:04 ?        00:00:00 [192.168.76.251-manager]
> root        1216    1147  0 13:04 pts/0    00:00:00 grep --color=auto manage
> [root@vmw-test-1 ~]# echo "foo" >/mnt/nfs/foobar
> [root@vmw-test-1 ~]# cat /mnt/nfs/foobar
> foo
>
> So that returns behaviour to normal in my testing, and I no longer see
> the hangs.
>
> Let me send out a PATCHv2...

I'm liking patch v2 much better! I was testing with a Linux server
with default configuration, and it's nice that the xfstests swapfile
tests aren't hanging anymore :)

Anna

> --
> Trond Myklebust
> Linux NFS client maintainer, Hammerspace
> trond.myklebust@hammerspace.com
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread

Thread overview: 19+ messages
2023-09-17 23:05 [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race trondmy
2023-09-17 23:05 ` [PATCH 2/2] NFSv4: Fix a state manager thread deadlock regression trondmy
2023-09-18  1:25   ` NeilBrown
2023-09-18  2:27     ` Trond Myklebust
2023-09-20 19:38   ` Anna Schumaker
2023-09-21  0:15     ` Trond Myklebust
2023-09-22 17:22       ` Olga Kornievskaia
2023-09-22 19:05         ` Trond Myklebust
2023-09-22 21:00           ` Olga Kornievskaia
2023-09-22 21:06             ` Trond Myklebust
2023-09-24 17:08               ` Trond Myklebust
2023-09-26 14:55                 ` Anna Schumaker
2023-09-26 14:31               ` Olga Kornievskaia
2023-09-25 22:28           ` NeilBrown
2023-09-25 22:44             ` Trond Myklebust
2023-09-25 23:04               ` NeilBrown
2023-09-25 23:20                 ` Trond Myklebust
2023-09-18  1:17 ` [PATCH 1/2] NFSv4: Fix a nfs4_state_manager() race NeilBrown
2023-09-18  2:20   ` Trond Myklebust
